In [70]:
import math
import os
import zipfile
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

Leer el corpus de los archivos descagados, donde estan clasificados los correos spam y no-spam

In [2]:
! cd ../

In [3]:
! pwd

/home/cesar/Documents/master/ia/nlp/aplicaciones


Descomprimir el corpus

In [4]:
for file in os.listdir('../data/datasets/email/plaintext/'):
    try:
        with zipfile.ZipFile('../data/datasets/email/plaintext/'+file,'r') as zipobj:
            zipobj.extractall('../data/datasets/email/plaintext/')
    except Exception as e:
        print(e)

[Errno 21] Is a directory: '../data/datasets/email/plaintext/enron5'
[Errno 21] Is a directory: '../data/datasets/email/plaintext/enron2'
[Errno 21] Is a directory: '../data/datasets/email/plaintext/corpus2'
[Errno 21] Is a directory: '../data/datasets/email/plaintext/enron1'
[Errno 21] Is a directory: '../data/datasets/email/plaintext/.ipynb_checkpoints'
[Errno 21] Is a directory: '../data/datasets/email/plaintext/corpus1'
[Errno 21] Is a directory: '../data/datasets/email/plaintext/enron3'
[Errno 21] Is a directory: '../data/datasets/email/plaintext/corpus3'
[Errno 21] Is a directory: '../data/datasets/email/plaintext/enron6'
[Errno 21] Is a directory: '../data/datasets/email/plaintext/enron4'


Leer el corpus

In [5]:
files_path = [i for i in os.listdir('../data/datasets/email/plaintext/') if not i.endswith('.zip') and not i.startswith('.')]

In [10]:
data = []
clases = []
# Spam
for dir_p in files_path:
    for file in os.listdir('../data/datasets/email/plaintext/'+dir_p+'/spam'):
        if not file.startswith('.'):
            with open('../data/datasets/email/plaintext/'+dir_p+'/spam/'+file, encoding='latin-1') as f:
                data.append(f.read())
                clases.append('spam')
# No-Spam
for dir_p in files_path:
    for file in os.listdir('../data/datasets/email/plaintext/'+dir_p+'/ham'):
        if not file.startswith('.'):
            with open('../data/datasets/email/plaintext/'+dir_p+'/ham/'+file, encoding='latin-1') as f:
                data.append(f.read())
                clases.append('ham')

# Creo el modelo

In [11]:
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
nlp = English()
tokenizer = Tokenizer(nlp.vocab)

Pruebo el tokenizador

In [16]:
[t.text for t in tokenizer(data[0])]

['Subject:',
 'all',
 'for',
 'free',
 '!',
 'check',
 'out',
 'these',
 '100',
 '%',
 'free',
 'adult',
 'sites',
 '!',
 '!',
 '!',
 '\n',
 '100',
 '%',
 'free',
 'porn',
 '!',
 '\n',
 'what',
 'more',
 'can',
 'you',
 'ask',
 'for',
 '?',
 '\n',
 'click',
 'here',
 '\n',
 'removal',
 'instructions',
 ':',
 'we',
 'strive',
 'to',
 'never',
 'send',
 'unsolicited',
 'mail',
 '.',
 '\n',
 'however',
 ',',
 'if',
 'you',
 "'",
 'd',
 'rather',
 'not',
 'receive',
 'future',
 'e',
 '-',
 'mails',
 'from',
 'us',
 ',',
 '\n',
 'click',
 'here',
 'to',
 'send',
 'email',
 'and',
 'add',
 'the',
 'word',
 'remove',
 'in',
 'the',
 'subject',
 'line',
 '.',
 '\n',
 'please',
 'allow',
 '48',
 'hours',
 'for',
 'processing',
 '.',
 '\n',
 '[',
 'j',
 '7',
 'bjk',
 '9',
 '^',
 '"',
 ':',
 '}',
 'h',
 '&',
 '*',
 'tgobk',
 '5',
 'nkiys',
 '5',
 ']']

### Clase principal para el algoritmo

Recuerda que la clase más probable viene dada por (en espacio de cómputo logarítmico): 


$$\hat{c} = {\arg \max}_{(c)}\log{P(c)}
 +\sum_{i=1}^n
\log{ P(f_i \vert c)}
$$

Donde, para evitar casos atípicos, usaremos el suavizado de Laplace así:

$$
P(f_i \vert c) = \frac{C(f_i, c)+1}{C(c) + \vert V \vert}
$$

siendo $\vert V \vert$ la longitud del vocabulario de nuestro conjunto de entrenamiento. 

In [57]:
class NaiveBayesClassifier():
    nlp = English()
    tokenizer = Tokenizer(nlp.vocab)

    def tokenize(self,data):
        return [t.text.lower() for t in tokenizer(data)]
    
    def words_counts(self,words):
        wordsCounts = {}
        for word in words:
            if word in wordsCounts.keys():
                wordsCounts[word] += 1
            else :
                wordsCounts[word] = 1
        return wordsCounts
    
    def fit(self, data, clases):
        n = len(data)
        self.unique_clases = set(clases)
        self.vocab = set()
        self.classCount = {} # C(c)
        self.log_classPriorProb = {} # log(P(c))
        self.wordCoditionalCounts = {} # C(w|c)
        
        #Conteo de las clases
        self.classCount = self.words_counts(clases)
            
        #Calculo de la probabilidad c
        for c in self.classCount.keys():
            #La probabilidad de la clase
            self.log_classPriorProb[c] = math.log(self.classCount[c]/n)
            #Probabilidades condicionales de las palabras por cada clase
            self.wordCoditionalCounts[c] = {}
        #C(w|c)
        for text, c in zip(data,clases):
            counts = self.words_counts(self.tokenize(text))
            
            for word, count in counts.items():
                # add las palabras al vocabulario
                if word not in self.vocab:
                    self.vocab.add(word)
                # add las palabras a el conteo condicional
                if word not in self.wordCoditionalCounts[c]:
                    self.wordCoditionalCounts[c][word] = 0.0
                self.wordCoditionalCounts[c][word] += count
    
    def predict(self, data):
        results = []
        for text in data:
            words = set(self.tokenize(text))
            scoreProb = {}
            for word in words:
                if word not in self.vocab: continue
                # Suavizado de laplace
                for c in self.unique_clases:
                    log_wordClassProb = math.log(
                        (self.wordCoditionalCounts[c].get(word,0.0)+1) / (self.classCount[c]+len(self.vocab))  )
                    scoreProb[c] = scoreProb.get(c,self.log_classPriorProb[c]) + log_wordClassProb
            # Tomar la maxima probabilidad
            arg_maxprob = np.argmax(np.array(list(scoreProb.values())))
            results.append(list(scoreProb.keys())[arg_maxprob])
        return results
                    

# Probar el modelo

In [47]:
data_train, data_test, clases_train, clases_test = train_test_split(data,clases, shuffle=True, test_size=0.2)

In [58]:
classifier = NaiveBayesClassifier()
classifier.fit(data_train, clases_train)

In [59]:
predict = classifier.predict(data_test)

# Metricas

In [60]:
accuracy_score(predict,clases_test)

0.990612204134625

In [65]:
precision_score(predict,clases_test, average=None, zero_division=1)

array([0.9912873 , 0.99004058])

De todos los ham que logré predecir el 99.12% es verdadero, de igual forma con el spam, de todos los spam que logré predecir el 99.04% fue verdadero

In [69]:
recall_score(predict,clases_test, average=None, zero_division=1)

array([0.98827362, 0.99260355])

Por otro lado en el Recall, de todos los ham del dataset logré capturar el 98.82%, y de todos los spam en el dataset se logró capturar el 99.26%

In [86]:
clases_test.count('ham')

4591

In [71]:
labels = list(set(clases_test))

In [94]:
labels

['ham', 'spam']

In [98]:
matrix = confusion_matrix(predict,clases_test, labels=['spam','ham'])

In [99]:
matrix

array([[5368,   40],
       [  54, 4551]])

In [101]:
df = pd.DataFrame(matrix, index = ['spam','ham'], columns = ['spam','ham'])

In [102]:
df

Unnamed: 0,spam,ham
spam,5368,40
ham,54,4551


$$\Large Precision = \frac{TP}{TP+FP}$$

* Cuántos de los correos clasificados como spam, realmente lo son ?

In [103]:
5368/(5368+40)

0.992603550295858

$$\Large Recall = \frac{TP}{TP+FN}$$

* Cuántos correos Spam lograron identificarse ?

In [104]:
5368/(5368+54)

0.9900405754334194