**Procesamos HTML**

In [1]:
from html.parser import HTMLParser  #Módulo de Python para procesar HTML
#help(HTMLParser)
class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()                 #Restablecimiento del estado interno del analizador HTML, lo limpia y lo prepara
        self.strict = False          #Analiza el html aunque este mal formteado
        self.convert_charrefs = True #Convierte caracteres especiales a su equivalente unicode
        self.fed = []
    def handle_data(self,d):
        self.fed.append(d)
    def get_data(self):
        return "".join(self.fed)

In [2]:
def strip_tags(html):  #Elimina todas las etiquetas
    s = MLStripper()
    s.feed(html)
    return s.get_data()


In [3]:
#Cadena HTML
t = """
<!DOCTYPE html>
<html lang="es">
<head>
  <meta charset="utf-8">
  <title>HTML</title>
</head>
<body>
  <p>Este es un ejemplo simple.</p>
</body>
</html>
"""
#Cadena sin etiquetas
print(strip_tags(t))





  
  HTML


  Este es un ejemplo simple.





**Procesamos string con NLTK**

In [4]:
import nltk
from nltk.stem import PorterStemmer #sacar la raiz 

nltk.download("punkt") #Descarga de recursos para tokenizar 
stemmer = PorterStemmer()
sentence = "I love eating pizza with my friends"
#tokenizar (separa palabras en una lista)
words = nltk.word_tokenize(sentence)
print(words)
#aplicamos stemming a cada palabra de la frase
stemmed_word = [stemmer.stem(word) for word in words]
print(stemmed_word)

['I', 'love', 'eating', 'pizza', 'with', 'my', 'friends']
['i', 'love', 'eat', 'pizza', 'with', 'my', 'friend']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Uusario\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


**Palabras vacías**

In [5]:
import nltk
from nltk.corpus import stopwords #contiene recursos para procesar palabras vacías
nltk.download("punkt") #Descarga de recursos para tokenizar 
nltk.download("stopwords") #Descarga recursos para reconocer palabras vacías

#definir una frase de ejemplo
sentence = "the quick brown fox jumps over the lazy dog and enjoys the beautiful scenery"

#tokenizar (separa palabras en una lista)
words = nltk.word_tokenize(sentence)
print(words)

english_words = stopwords.words("english")
#print(english_words)

#filtrar palabras que no son vacías
filtered_words = [ word for word in words if word.lower() not in  english_words]
print(filtered_words)

['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', 'and', 'enjoys', 'the', 'beautiful', 'scenery']
['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog', 'enjoys', 'beautiful', 'scenery']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Uusario\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Uusario\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**Caracteres especiales**

In [6]:
import string
(string.punctuation)

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [7]:
import string

class TextProcessor:
    def __init__(self):
        self.punctuation = (string.punctuation)
        
processor = TextProcessor()
print(processor.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


**Email**

In [8]:
import email
with open("ejemplo_correo.txt","r") as file:
    email_content = file.read()

msg = email.message_from_string(email_content)
print(msg)

from_address = msg["FROM"]
to_address = msg["To"]
subject = msg["Subject"]
body = msg.get_payload()

#mostrar información
print("Remitente :", from_address)
print("Destinatario :", to_address)
print("Asunto :", subject)
print("Cuerpo del mensaje :", body)

From: sender@example.com
To: recipient@example.com
Subject: Ejemplo de Correo

Hola,

Este es un ejemplo de correo electrÃ³nico.

Saludos,
Remitente

Remitente : sender@example.com
Destinatario : recipient@example.com
Asunto : Ejemplo de Correo
Cuerpo del mensaje : Hola,

Este es un ejemplo de correo electrÃ³nico.

Saludos,
Remitente



In [9]:
import re

def strip_tags(html_text):
    clean_text = re.sub("<[^<]+?>",'',html_text)
    return clean_text

correo = """
<html>
    <body>
    <h1>Este es un correo</h1>
    <p>Este es un parrafo</p>
    </body>
</html>
"""
print(strip_tags(correo))



    
    Este es un correo
    Este es un parrafo
    




**Código de preprocesamiento**

*Elimina HTML, tokeniza, reduce a raíz, quita palabras vacías*

In [57]:
from html.parser import HTMLParser  #Módulo de Python para procesar HTML
#help(HTMLParser)
class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()                 #Restablecimiento del estado interno del analizador HTML, lo limpia y lo prepara
        self.strict = False          #Analiza el html aunque este mal formteado
        self.convert_charrefs = True #Convierte caracteres especiales a su equivalente unicode
        self.fed = []
    def handle_data(self,d):
        self.fed.append(d)
    def get_data(self):
        return "".join(self.fed)

In [58]:
def strip_tags(html):  #Elimina todas las etiquetas
    s = MLStripper()
    s.feed(html)
    return s.get_data()

In [59]:
import email
import string
import nltk

class Parser:
    def __init__(self):
        self.stemmer = nltk.PorterStemmer() #raíz
        self.stopwords = set(nltk.corpus.stopwords.words("english")) #palabras vacías
        self.punctuation = list(string.punctuation) #signos de puntuación
    def parser(self,email_path):
        with open(email_path, errors = "ignore") as e:
            msg = email.message_from_file(e)
        return None if not msg else self.get_email_content(msg)
    def get_email_content(self,msg):
        """Extrae el contenido del EMAIL"""
        subject = self.tokenize(msg["subject"] if msg["subject"] else [])
        body = self.get_email_body(
                                    msg.get_payload(), 
                                    msg.get_content_type()
                                )
        content_type = msg.get_content_type()
        return {
                    "Subject" : subject,
                    "body" : body,
                    "content_type" : content_type
                }
        
    def tokenize(self,text):
        for c in self.punctuation:
            text = text.replace(c,"")
        text = text.replace("\t"," ")
        text = text.replace("\n"," ")
        tokens = list(filter(None,text.split(" ")))
        return [ self.stemmer.stem(w) for w in tokens if w not in self.stopwords ]
    def get_email_body(self,payload,content_type):
        """Extraemos el cuerpo"""
        body = []
        if type(payload) is str and content_type == "text/plain":
            return self.tokenize(payload)
        elif type(payload) is str and content_type == "text/html":
            return self.tokenize(strip_tags(payload))
        elif type(payload) is list:
            for p in payload:
                body += self.get_email_body( p.get_payload(), p.get_content_type())
        return body

**lectura de correos**

In [60]:
inmail = open("datasets\\trec07p\\data\\inmail.1").read()
print(inmail)

From RickyAmes@aol.com  Sun Apr  8 13:07:32 2007
Return-Path: <RickyAmes@aol.com>
Received: from 129.97.78.23 ([211.202.101.74])
	by speedy.uwaterloo.ca (8.12.8/8.12.5) with SMTP id l38H7G0I003017;
	Sun, 8 Apr 2007 13:07:21 -0400
Received: from 0.144.152.6 by 211.202.101.74; Sun, 08 Apr 2007 19:04:48 +0100
Message-ID: <WYADCKPDFWWTWTXNFVUE@yahoo.com>
From: "Tomas Jacobs" <RickyAmes@aol.com>
Reply-To: "Tomas Jacobs" <RickyAmes@aol.com>
To: the00@speedy.uwaterloo.ca
Subject: Generic Cialis, branded quality@ 
Date: Sun, 08 Apr 2007 21:00:48 +0300
X-Mailer: Microsoft Outlook Express 6.00.2600.0000
MIME-Version: 1.0
Content-Type: multipart/alternative;
	boundary="--8896484051606557286"
X-Priority: 3
X-MSMail-Priority: Normal
Status: RO
Content-Length: 988
Lines: 24

----8896484051606557286
Content-Type: text/html;
Content-Transfer-Encoding: 7Bit

<html>
<body bgcolor="#ffffff">
<div style="border-color: #00FFFF; border-right-width: 0px; border-bottom-width: 0px; margin-bottom: 0px;" align="

In [61]:
p = Parser()
p.parser("datasets\\trec07p\\data\\inmail.1")

{'Subject': ['gener', 'ciali', 'brand', 'qualiti'],
 'body': ['do',
  'feel',
  'pressur',
  'perform',
  'rise',
  'occas',
  'tri',
  'viagra',
  'anxieti',
  'thing',
  'past',
  'back',
  'old',
  'self'],
 'content_type': 'multipart/alternative'}

In [62]:
#Lectura del indice
index = open("datasets\\trec07p\\full\\index").readlines()
index[:10]

['spam ../data/inmail.1\n',
 'ham ../data/inmail.2\n',
 'spam ../data/inmail.3\n',
 'spam ../data/inmail.4\n',
 'spam ../data/inmail.5\n',
 'spam ../data/inmail.6\n',
 'spam ../data/inmail.7\n',
 'spam ../data/inmail.8\n',
 'spam ../data/inmail.9\n',
 'ham ../data/inmail.10\n']

In [63]:
import os
DATASET_PATH = "datasets/trec07p"

def parse_index(path_to_index, n_elementos):
    ret_indexes = []
    index = open(path_to_index).readlines()
    for i in range(n_elementos):
        mail = index[i].split(" ../")
        label = mail[0]
        path = mail[1][:-1]
        ret_indexes.append({"label":label,"email_path": os.path.join(DATASET_PATH,path)})
    return ret_indexes

In [64]:
indexes = parse_index("datasets\\trec07p\\full\\index",10)
indexes

[{'label': 'spam', 'email_path': 'datasets/trec07p\\data/inmail.1'},
 {'label': 'ham', 'email_path': 'datasets/trec07p\\data/inmail.2'},
 {'label': 'spam', 'email_path': 'datasets/trec07p\\data/inmail.3'},
 {'label': 'spam', 'email_path': 'datasets/trec07p\\data/inmail.4'},
 {'label': 'spam', 'email_path': 'datasets/trec07p\\data/inmail.5'},
 {'label': 'spam', 'email_path': 'datasets/trec07p\\data/inmail.6'},
 {'label': 'spam', 'email_path': 'datasets/trec07p\\data/inmail.7'},
 {'label': 'spam', 'email_path': 'datasets/trec07p\\data/inmail.8'},
 {'label': 'spam', 'email_path': 'datasets/trec07p\\data/inmail.9'},
 {'label': 'ham', 'email_path': 'datasets/trec07p\\data/inmail.10'}]

In [65]:
open("datasets\\trec07p\\data\\inmail.1").read()

'From RickyAmes@aol.com  Sun Apr  8 13:07:32 2007\nReturn-Path: <RickyAmes@aol.com>\nReceived: from 129.97.78.23 ([211.202.101.74])\n\tby speedy.uwaterloo.ca (8.12.8/8.12.5) with SMTP id l38H7G0I003017;\n\tSun, 8 Apr 2007 13:07:21 -0400\nReceived: from 0.144.152.6 by 211.202.101.74; Sun, 08 Apr 2007 19:04:48 +0100\nMessage-ID: <WYADCKPDFWWTWTXNFVUE@yahoo.com>\nFrom: "Tomas Jacobs" <RickyAmes@aol.com>\nReply-To: "Tomas Jacobs" <RickyAmes@aol.com>\nTo: the00@speedy.uwaterloo.ca\nSubject: Generic Cialis, branded quality@ \nDate: Sun, 08 Apr 2007 21:00:48 +0300\nX-Mailer: Microsoft Outlook Express 6.00.2600.0000\nMIME-Version: 1.0\nContent-Type: multipart/alternative;\n\tboundary="--8896484051606557286"\nX-Priority: 3\nX-MSMail-Priority: Normal\nStatus: RO\nContent-Length: 988\nLines: 24\n\n----8896484051606557286\nContent-Type: text/html;\nContent-Transfer-Encoding: 7Bit\n\n<html>\n<body bgcolor="#ffffff">\n<div style="border-color: #00FFFF; border-right-width: 0px; border-bottom-width: 0

In [66]:
def parse_email(index):
    p = Parser()
    pemail = p.parser(index["email_path"])
    return pemail,index["label"]

In [67]:
mail, label = parse_email(indexes[0])
print("El correo es: ",label)
print(mail)

El correo es:  spam
{'Subject': ['gener', 'ciali', 'brand', 'qualiti'], 'body': ['do', 'feel', 'pressur', 'perform', 'rise', 'occas', 'tri', 'viagra', 'anxieti', 'thing', 'past', 'back', 'old', 'self'], 'content_type': 'multipart/alternative'}


**CountVectorizer**

In [68]:
from sklearn.feature_extraction.text import CountVectorizer
#preparar el email 
prep_email = [ " ".join(mail["Subject"]) + " ".join(mail["body"]) ]
vectorizer = CountVectorizer()
vectorizer.fit(prep_email)
print("Email",prep_email)
print("Caracteristicas de entrada",vectorizer.get_feature_names_out())

Email ['gener ciali brand qualitido feel pressur perform rise occas tri viagra anxieti thing past back old self']
Caracteristicas de entrada ['anxieti' 'back' 'brand' 'ciali' 'feel' 'gener' 'occas' 'old' 'past'
 'perform' 'pressur' 'qualitido' 'rise' 'self' 'thing' 'tri' 'viagra']


In [69]:
x = vectorizer.transform(prep_email)
print(x.toarray())

[[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]]


**oneHotEncondig**

In [70]:
#Convertimos variable categorica a un vector binario donde todos los valores son cero a excepción del correspondiente valor categorico
from sklearn.preprocessing import OneHotEncoder
prep_email = [ [w] for w in mail["Subject"] + mail["body"] ]
#print(prep_email)
enc = OneHotEncoder()
x = enc.fit_transform(prep_email)
print(enc.get_feature_names_out())
print(x.toarray())

['x0_anxieti' 'x0_back' 'x0_brand' 'x0_ciali' 'x0_do' 'x0_feel' 'x0_gener'
 'x0_occas' 'x0_old' 'x0_past' 'x0_perform' 'x0_pressur' 'x0_qualiti'
 'x0_rise' 'x0_self' 'x0_thing' 'x0_tri' 'x0_viagra']
[[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0

In [71]:
#Función auxiliar para procesar los datos y para leer una cantidad x de correos
def create_prep_dataset(index_path, n_elements):
    x = [] 
    y = [] 
    indexes = parse_index(index_path,n_elements)
    for i in range(n_elements):
        mail,label = parse_email(indexes[i])
        x.append(" ".join(mail["Subject"]) + " ".join(mail["body"]))
        y.append(label)
    return x,y

In [72]:
x_train, y_train = create_prep_dataset("datasets\\trec07p\\full\\index",200)
x_train

['gener ciali brand qualitido feel pressur perform rise occas tri viagra anxieti thing past back old self',
 'typo debianreadmhi ive updat gulu i check mirror it seem littl typo debianreadm file exampl httpgulususherbrookecadebianreadm ftpftpfrdebianorgdebianreadm test lenni access releas diststest the current test develop snapshot name etch packag test unstabl pass autom test propog releas etch replac lenni like readmehtml yan morin consult en logiciel libr yanmorinsavoirfairelinuxcom 5149941556 to unsubscrib email debianmirrorsrequestlistsdebianorg subject unsubscrib troubl contact listmasterlistsdebianorg',
 'authent viagramega authenticv i a g r a discount pricec i a l i s discount pricedo miss it click httpwwwmoujsjkhchumcom authent viagra mega authenticv i a g r a discount pricec i a l i s discount pricedo miss it click',
 'nice talk yahey billi realli fun go night talk said felt insecur manhood i notic toilet quit small area worri websit i tell secret weapon extra 3 inch trust g

In [73]:
#Aplicamos la vectorización
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(x_train)

In [74]:
print(X_train.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [75]:
import pandas as pd
pd.DataFrame(X_train.toarray(), columns = [vectorizer.get_feature_names_out()] )

Unnamed: 0,000,0000,000000,000713,00085,002,003,00450,0089,009,...,ü³á,ü¼¼êõ,ýðåï,þªø,þ²y²ä,þîñæ,þîñòµ¼,þîñôú,šè,ˆ400
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
196,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
197,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
198,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [76]:
y_train

['spam',
 'ham',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'ham',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'ham',
 'ham',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'ham',
 'spam',
 'ham',
 'spam',
 'spam',
 'ham',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'ham',
 'ham',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'ham',
 'spam',
 'ham',
 'spam',
 'spam',
 'ham',
 'spam',
 'spam',
 'spam',
 'ham',
 'ham',
 'spam',
 'spam',
 'spam',
 'ham',
 'ham',
 'ham',
 'spam',
 'ham',
 'spam',
 'ham',
 'spam',
 'spam',
 'spam',
 'ham',
 'spam',
 'spam',
 'ham',
 'spam',
 'ham',
 'spam',
 'spam',
 'ham',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'ham',
 'spam',
 'spam',
 'spam'

**1. Entrenamiento del algoritmo de regresión logística**

In [77]:
#Con el conjunto de datos preparados
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(X_train,y_train) #Crea el modelo

**2. Predicción**

In [79]:
X, Y = create_prep_dataset("datasets\\trec07p\\full\\index",250)
X_test = X[200:]
Y_test = Y[200:]
#Vectorizamos
X_test = vectorizer.transform(X_test)
#predicción
y_pred= clf.predict(X_test)
y_pred

array(['spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'ham', 'spam',
       'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam',
       'ham', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam',
       'spam', 'spam', 'spam', 'ham', 'spam', 'spam', 'spam', 'spam',
       'spam', 'spam', 'ham', 'ham', 'spam', 'ham', 'spam', 'spam',
       'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'ham', 'ham',
       'spam', 'spam'], dtype='<U4')

In [80]:
Y_test

['spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'ham',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'ham',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'ham',
 'ham',
 'spam',
 'ham',
 'spam',
 'spam',
 'ham',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'ham',
 'ham',
 'spam',
 'spam']

In [81]:
from sklearn.metrics import accuracy_score
print("Accuracy: {:.3f}".format(accuracy_score(Y_test,y_pred)))

Accuracy: 0.960
