# Filtro de spam con Bayes Ingenuo
Basado en el este [artículo](https://towardsdatascience.com/spam-classifier-in-python-from-scratch-27a98ddd8e73)
Dataset obtenido de este [enlace](https://www.kaggle.com/uciml/sms-spam-collection-dataset)

## Cargando los datos

In [1]:
import pandas as pd
from math import log, sqrt
dataset = pd.read_csv("datasets/sms-spam/spam.csv", sep="\t", names=["label", "msg"])
dataset.head()

Unnamed: 0,label,msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Reemplazando etiquetas por números

In [2]:
dataset["label"] = dataset["label"].map({"spam": 1, "ham": 0})
dataset.head()

Unnamed: 0,label,msg
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


## Dividiendo dataset en train y test

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(dataset["msg"], 
                                                    dataset["label"], 
                                                    test_size=0.2, 
                                                    random_state=42)
df_test = pd.DataFrame({"label": y_test, "msg": X_test})
df_train = pd.DataFrame({"label": y_train, "msg": X_train})

In [4]:
df_train.reset_index(inplace=True)
df_train.drop(["index"], axis=1, inplace=True)
df_train.head()

Unnamed: 0,label,msg
0,1,Reply to win £100 weekly! Where will the 2006 ...
1,0,Hello. Sort of out in town already. That . So ...
2,0,How come guoyang go n tell her? Then u told her?
3,0,Hey sathya till now we dint meet not even a si...
4,1,Orange brings you ringtones from all time Char...


In [5]:
df_train['label'].value_counts()

0    3859
1     598
Name: label, dtype: int64

In [6]:
df_test.reset_index(inplace=True)
df_test.drop(["index"], axis=1, inplace=True)
df_test.head()

Unnamed: 0,label,msg
0,0,Squeeeeeze!! This is christmas hug.. If u lik ...
1,0,And also I've sorta blown him off a couple tim...
2,0,Mmm thats better now i got a roast down me! i...
3,0,Mm have some kanji dont eat anything heavy ok
4,0,So there's a ring that comes with the guys cos...


In [7]:
df_test['label'].value_counts()

0    966
1    149
Name: label, dtype: int64

## Preprocesamiento, lematizacion y stemming

In [8]:
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import nltk
nltk.download('wordnet')  # Para la lematizacion

def lematizacion_stemming(texto):
    stemmer = SnowballStemmer('english')
    return stemmer.stem(WordNetLemmatizer().lemmatize(texto, pos='v'))

def preprocesamiento(texto):
    return [lematizacion_stemming(token) for token in simple_preprocess(texto) 
            if token not in STOPWORDS and len(token) > 3]

[nltk_data] Downloading package wordnet to /home/umoqnier/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## TF-IDF y Entrenamiento 

In [9]:
mails, labels = df_train['msg'], df_train['label']
total_messages = mails.shape[0]
total_spam = labels.value_counts()[1]  # Dónde 1: spam y 0: ham
total_ham = labels.value_counts()[0]
print(total_ham, '+', total_spam, '=', total_ham + total_spam, '==', total_messages)

3859 + 598 = 4457 == 4457


### Comienza TF-IDF

In [10]:
spam_words, ham_words = 0, 0
tf_spam, tf_ham = dict(), dict()
idf_spam, idf_ham = dict(), dict()
for i, msg in enumerate(mails):
    occurrences = list()  # For IDF
    msg_processed = preprocesamiento(msg)
    for word in msg_processed:
        if labels[i]:
            tf_spam[word] = tf_spam.get(word, 0) + 1
            spam_words += 1
        else:
            tf_ham[word] = tf_ham.get(word, 0) + 1
            ham_words += 1
        if word not in occurrences:
            occurrences += [word]
        
        for word in occurrences:
            if labels[i]:
                idf_spam[word] = idf_spam.get(word, 0) + 1
            else:
                idf_ham[word] = idf_ham.get(word, 0) + 1

### Calculando probabilidades

In [11]:
prob_spam, prob_ham = dict(), dict()
tfidf_spam, tfidf_ham = 0, 0
for word in tf_spam:
    prob_spam[word] = (tf_spam[word]) * log(total_messages / (idf_spam[word] + idf_ham.get(word, 0)) + 1)
    tfidf_spam += prob_spam[word]
for word in tf_spam:
    prob_spam[word] = (prob_spam[word] + 1) / (tfidf_spam + len(list(prob_spam.keys())))


for word in tf_ham:
    prob_ham[word] = (tf_ham[word]) * log(total_messages / (idf_spam.get(word, 0) + idf_ham[word]) + 1)
    tfidf_ham += prob_ham[word]
for word in tf_ham:
    prob_ham[word] = (prob_ham[word] + 1) / (tfidf_ham + len(list(prob_ham.keys())))    

prob_spam_mail = tfidf_spam / total_messages
prob_ham_mail = tfidf_ham / total_messages

## Haciendo predicciones

In [12]:
results = dict()
def clasificador(texto, **ka):
    p_spam, p_ham = 0, 0
    for palabra in texto:
        if palabra in ka['ps']:
            p_spam += log(ka['ps'][palabra])
        else:
            p_spam -= log(ka['tis'] + len(list(ka['ps'].keys())))
            
        if palabra in ka['ph']:
            p_ham += log(ka['ph'][palabra])
        else:
            p_ham -= log(ka['tih'] + len(list(ka['ph'].keys())))
        p_spam += log(ka['psm'])
        p_ham += log(ka['phm'])
    return "spam" if p_spam >= p_ham else "ham"

for i, msg in enumerate(df_test["msg"]):
    msg_preprocessed = preprocesamiento(msg)
    results[i] = clasificador(msg_preprocessed, psm=prob_spam_mail, ps=prob_spam, 
                             tis=tfidf_spam, phm=prob_ham_mail, ph=prob_ham, tih=tfidf_ham)

In [13]:
for msg_i, tag in zip(results.keys(), results.values()):
    print("El mensaje:", df_test['msg'][msg_i][:40], "... es >>", tag.upper())
    if msg_i > 20:
        break

El mensaje: Squeeeeeze!! This is christmas hug.. If  ... es >> HAM
El mensaje: And also I've sorta blown him off a coup ... es >> HAM
El mensaje: Mmm thats better now i got a roast down  ... es >> HAM
El mensaje: Mm have some kanji dont eat anything hea ... es >> HAM
El mensaje: So there's a ring that comes with the gu ... es >> HAM
El mensaje: Sary just need Tim in the bollox &it hur ... es >> HAM
El mensaje: Love isn't a decision, it's a feeling. I ... es >> HAM
El mensaje: My supervisor find 4 me one lor i thk hi ... es >> HAM
El mensaje: Dear good morning now only i am up ... es >> HAM
El mensaje: I'm in chennai velachery:) ... es >> HAM
El mensaje: Lol grr my mom is taking forever with my ... es >> HAM
El mensaje: No other Valentines huh? The proof is on ... es >> HAM
El mensaje: I'm wif him now buying tix lar... ... es >> HAM
El mensaje: Er, hello, things didn‘t quite go to pla ... es >> HAM
El mensaje: FREE RINGTONE text FIRST to 87131 for a  ... es >> SPAM
El mensaje: Sir, i am

## Probando mensajes no vistos

In [14]:
ham_test = preprocesamiento("Hi, i'm a student from mexico and need your help on this issue because the program don't works fine for my operating system")
clasificador(ham_test, psm=prob_spam_mail, ps=prob_spam, tis=tfidf_spam, phm=prob_ham_mail, ph=prob_ham, tih=tfidf_ham)

'ham'

In [15]:
spam_test = preprocesamiento("Congratulations ur awarded $500")
clasificador(spam_test, psm=prob_spam_mail, ps=prob_spam, tis=tfidf_spam, phm=prob_ham_mail, ph=prob_ham, tih=tfidf_ham)

'spam'

## Metricas del modelo

In [33]:
def metricas(real, predicciones):
    v_pos, v_neg, f_pos, f_neg = 0, 0, 0, 0
    for i in range(len(real)):
        v_pos += int(real[i] == 1 and predicciones[i] == 1)
        v_neg += int(real[i] == 0 and predicciones[i] == 0)
        f_pos += int(real[i] == 0 and predicciones[i] == 1)
        f_neg += int(real[i] == 1 and predicciones[i] == 0)
    precision = v_pos / (v_pos + f_pos)
    recall = v_pos / (v_pos + f_neg)
    f_score = 2 * precision * recall / (precision + recall)
    exactitud = (v_pos + v_neg) / (v_pos + v_neg + f_pos + f_neg)
    
    print("***Metricas***")
    print("precision ::", precision)
    print("recall ::", recall)
    print("F-score ::", f_score)
    print("exactitud ::", exactitud)

In [34]:
df_results = pd.DataFrame(results.items(), columns=["index", "label"])
metricas(df_test["label"], df_results["label"].map({"ham": 0, "spam": 1})) # TODO

***Metricas***
precision :: 0.7735849056603774
recall :: 0.825503355704698
F-score :: 0.7987012987012987
exactitud :: 0.9443946188340807
