In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

# Abrindo e entendendo os dados

In [2]:
with open('SMSSpamCollection') as file:
    tenlines = file.readlines()[:10]
    for line in tenlines:
        print(line)
tenlines    

ham	Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...

ham	Ok lar... Joking wif u oni...

spam	Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's

ham	U dun say so early hor... U c already then say...

ham	Nah I don't think he goes to usf, he lives around here though

spam	FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv

ham	Even my brother is not like to speak with me. They treat me like aids patent.

ham	As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune

spam	WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only

['ham\tGo until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...\n',
 'ham\tOk lar... Joking wif u oni...\n',
 "spam\tFree entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's\n",
 'ham\tU dun say so early hor... U c already then say...\n',
 "ham\tNah I don't think he goes to usf, he lives around here though\n",
 "spam\tFreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv\n",
 'ham\tEven my brother is not like to speak with me. They treat me like aids patent.\n',
 "ham\tAs per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune\n",
 'spam\tWINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call

Temos um arquivo onde cada linha é uma mensagen com sua classificação em ham ou spam, separados por uma tabulação '\t'. Logo podemos transformar isso em um dataset com o Pandas

In [3]:
data = pd.read_csv('SMSSpamCollection', sep='\t', names = ['class', 'msg'])

In [4]:
data.head()

Unnamed: 0,class,msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
data.groupby('class').describe()

Unnamed: 0_level_0,msg,msg,msg,msg
Unnamed: 0_level_1,count,unique,top,freq
class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


# Tratamento dos dados

Vamos tratar os dados, removendo mensagens repetidas, as pontuações e palavras que não agregam na análise conhecidas como stopwords

In [6]:
data.drop_duplicates(inplace=True) #Removendo Duplicatas

In [7]:
data.groupby('class').describe()

Unnamed: 0_level_0,msg,msg,msg,msg
Unnamed: 0_level_1,count,unique,top,freq
class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4516,4516,Black shirt n blue jeans... I thk i c ü...,1
spam,653,653,Hard LIVE 121 chat just 60p/min. Choose your g...,1


In [8]:
def removedor(mensagem):
    '''Remove pontuação e stopwords de uma mensagem'''
    for i in string.punctuation:#Removendo pontuação
        if i in mensagem:
            mensagem = mensagem.replace(i,'')
    
    stop_removed = []
    for i in mensagem.split(): #Removendo stopwords
        if i.lower() not in stopwords.words('english'):
            stop_removed.append(i.lower())
    
    return ' '.join(stop_removed) #Retorna o texto

In [9]:
data['msg'] = data['msg'].apply(removedor) #Aplicando o removedor

# Preparação para o modelo

In [10]:
spm_unig = CountVectorizer().fit(data['msg']) #Unigramas
msg_unig = spm_unig.transform(data['msg'])

In [11]:
spm_bi = CountVectorizer(ngram_range=(2, 2)).fit(data['msg']) #Bigramas
msg_bi = spm_bi.transform(data['msg'])

In [12]:
spm_tfidf = TfidfTransformer(norm="l1").fit(msg_unig)#tfidf com unigramas
msg_tfidf = spm_tfidf.transform(msg_unig) 

In [13]:
print(msg_unig.shape,msg_bi.shape, msg_tfidf.shape)

(5169, 9437) (5169, 32056) (5169, 9437)


In [14]:
encoder = LabelEncoder()
y = encoder.fit_transform(data['class'])
print(y) # 0 para ham e 1 para spam

[0 0 1 ... 0 0 0]


Criadas as matrizes esparsas vamos ao modelo

# Modelos

In [15]:
X_train_unig, X_test_unig, y_train_unig, y_test_unig = train_test_split(msg_unig, y, test_size=0.3, random_state=19)
X_train_bi, X_test_bi, y_train_bi, y_test_bi= train_test_split(msg_bi, y,  test_size=0.3, random_state=19)
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(msg_tfidf, y, test_size=0.3, random_state=19)

In [16]:
unig_log_reg = LogisticRegression().fit(X_train_unig,y_train_unig)
bi_log_reg = LogisticRegression().fit(X_train_bi,y_train_bi)
tfidf_log_reg = LogisticRegression().fit(X_train_tfidf,y_train_tfidf)

In [17]:
unig_log_pred = unig_log_reg.predict(X_test_unig)
bi_log_pred = bi_log_reg.predict(X_test_bi)
tfidf_log_pred = tfidf_log_reg.predict(X_test_tfidf)

In [18]:
print('Regressão logística para unigramas:\n')
print(classification_report(y_test_unig, unig_log_pred, target_names = ['ham','spam']))
print('\nRegressão logística para bigramas:\n')
print(classification_report(y_test_bi, bi_log_pred, target_names = ['ham','spam']))
print('\nRegressão logística para tfidf:\n')
print(classification_report(y_test_tfidf, tfidf_log_pred, target_names = ['ham','spam']))

Regressão logística para unigramas:

              precision    recall  f1-score   support

         ham       0.97      1.00      0.98      1333
        spam       0.98      0.80      0.88       218

    accuracy                           0.97      1551
   macro avg       0.98      0.90      0.93      1551
weighted avg       0.97      0.97      0.97      1551


Regressão logística para bigramas:

              precision    recall  f1-score   support

         ham       0.92      1.00      0.96      1333
        spam       1.00      0.44      0.61       218

    accuracy                           0.92      1551
   macro avg       0.96      0.72      0.78      1551
weighted avg       0.93      0.92      0.91      1551


Regressão logística para tfidf:

              precision    recall  f1-score   support

         ham       0.86      1.00      0.92      1333
        spam       0.00      0.00      0.00       218

    accuracy                           0.86      1551
   macro avg       0

In [19]:
unig_forest = RandomForestClassifier().fit(X_train_unig,y_train_unig)
bi_forest = RandomForestClassifier().fit(X_train_bi,y_train_bi)
tfidf_forest = RandomForestClassifier().fit(X_train_tfidf,y_train_tfidf)

In [20]:
unig_forest_pred = unig_forest.predict(X_test_unig)
bi_forest_pred = bi_forest.predict(X_test_bi)
tfidf_forest_pred = tfidf_forest.predict(X_test_tfidf)

In [21]:
print('Random Forest para unigramas:\n')
print(classification_report(y_test_unig, unig_forest_pred, target_names = ['ham','spam']))
print('\nRandom Forest para bigramas:\n')
print(classification_report(y_test_bi, bi_forest_pred, target_names = ['ham','spam']))
print('\nRandom Forest para tfidf:\n')
print(classification_report(y_test_tfidf, tfidf_forest_pred, target_names = ['ham','spam']))

Random Forest para unigramas:

              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1333
        spam       1.00      0.75      0.86       218

    accuracy                           0.97      1551
   macro avg       0.98      0.88      0.92      1551
weighted avg       0.97      0.97      0.96      1551


Random Forest para bigramas:

              precision    recall  f1-score   support

         ham       0.93      1.00      0.96      1333
        spam       1.00      0.52      0.68       218

    accuracy                           0.93      1551
   macro avg       0.96      0.76      0.82      1551
weighted avg       0.94      0.93      0.92      1551


Random Forest para tfidf:

              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1333
        spam       1.00      0.76      0.86       218

    accuracy                           0.97      1551
   macro avg       0.98      0.88     

In [22]:
unig_nb = MultinomialNB().fit(X_train_unig,y_train_unig)
bi_nb= MultinomialNB().fit(X_train_bi,y_train_bi)
tfidf_nb = MultinomialNB().fit(X_train_tfidf,y_train_tfidf)

In [23]:
unig_nb_pred = unig_nb.predict(X_test_unig)
bi_nb_pred = bi_nb.predict(X_test_bi)
tfidf_nb_pred = tfidf_nb.predict(X_test_tfidf)

In [24]:
print('Naive Bayes para unigramas:\n')
print(classification_report(y_test_unig, unig_nb_pred, target_names = ['ham','spam']))
print('\nNaive Bayes para bigramas:\n')
print(classification_report(y_test_bi, bi_nb_pred, target_names = ['ham','spam']))
print('\nNaive Bayes para tfidf:\n')
print(classification_report(y_test_tfidf, tfidf_nb_pred, target_names = ['ham','spam'], zero_division=0))

Naive Bayes para unigramas:

              precision    recall  f1-score   support

         ham       0.99      0.98      0.99      1333
        spam       0.90      0.94      0.92       218

    accuracy                           0.98      1551
   macro avg       0.94      0.96      0.95      1551
weighted avg       0.98      0.98      0.98      1551


Naive Bayes para bigramas:

              precision    recall  f1-score   support

         ham       0.99      0.81      0.89      1333
        spam       0.45      0.96      0.61       218

    accuracy                           0.83      1551
   macro avg       0.72      0.88      0.75      1551
weighted avg       0.92      0.83      0.85      1551


Naive Bayes para tfidf:

              precision    recall  f1-score   support

         ham       0.86      1.00      0.92      1333
        spam       0.00      0.00      0.00       218

    accuracy                           0.86      1551
   macro avg       0.43      0.50      0.46 

# Conclusões

Todos os modelos tiveram uma boa performance quando utilizado a matriz de unigramas

o Random Forest conseguiu performar bem em todos os testes, já os outros modelos só perfomaram bem com unigramas, no entanto a performance com unigramas do Naive Bayes foi a melhor de todas, com uma acurácia de 98%, e 99% de acerto para mensagens do tipo ham que são a maioria esmagadora do conjunto.

Apesar de ter uma grande diferença de números entre ham e spam, conseguimos modelos não enviesados