In [60]:
# Natural language toolkit

# Pandas - biblioteca para manipulação de dados
import pandas as pd

In [61]:
# Load the spam/ham message dataset; the file is decoded as Latin-1 via the
# explicit `encoding` argument.
df = pd.read_csv('spam.csv', encoding='latin-1')
# Display 5 randomly chosen rows to eyeball the raw columns.
df.sample(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
4580,ham,Not course. Only maths one day one chapter wit...,,,
2308,spam,Moby Pub Quiz.Win a å£100 High Street prize if...,,,
2684,ham,Yupz... I've oredi booked slots 4 my weekends ...,,,
4720,ham,"Yup. Anything lor, if u dun wan it's ok...",,,
113,spam,GENT! We are trying to contact you. Last weeke...,,,


In [62]:
# Keep only the label (`v1`) and message text (`v2`) columns.
# `.copy()` makes the result an independent DataFrame, so the column
# assignments performed on `df` in later cells do not raise
# SettingWithCopyWarning on pandas versions without copy-on-write.
df = df[['v1', 'v2']].copy()

In [63]:
# Rename the two columns in place: first -> 'Spam ou nao' (label), second -> 'Mensagem' (text).
df.columns = ['Spam ou nao', 'Mensagem']

In [64]:
df.sample(5)

Unnamed: 0,Spam ou nao,Mensagem
4904,spam,Warner Village 83118 C Colin Farrell in SWAT t...
3352,ham,I emailed yifeng my part oredi.. Can Ì_ get it...
1307,spam,"Get 3 Lions England tone, reply lionm 4 mono o..."
1330,ham,"Aight no rush, I'll ask jay"
3249,ham,"Babe, I need your advice"


In [65]:
msg = 'What is your record for one night? :)'

In [66]:
# bag of words
# 1 - separar as palavras
# 2 - contar a frequencia de cada palavra
# 3 - criar um vetor com a frequencia de cada palavra
from sklearn.feature_extraction.text import CountVectorizer

In [67]:
# Build a bag-of-words vectorizer and learn its vocabulary from the single
# example message `msg`.
vectorizer = CountVectorizer()
vectorizer.fit([msg])

In [68]:
print(vectorizer.get_feature_names_out())

['for' 'is' 'night' 'one' 'record' 'what' 'your']


In [69]:
vectorizer.transform([msg]).toarray()

array([[1, 1, 1, 1, 1, 1, 1]], dtype=int64)

In [70]:
# Vectorize three sentences with the vocabulary that was fitted on `msg` only
# and show the counts as a table (one row per sentence, one column per
# vocabulary word). Words that are not in that vocabulary get no column.
pd.DataFrame(
    data = vectorizer.transform([
        msg,
        'You new Year is record for one night? :)',
        'What is your record for one night?',
    ]).toarray(), 
    columns=vectorizer.get_feature_names_out()
)

Unnamed: 0,for,is,night,one,record,what,your
0,1,1,1,1,1,1,1
1,1,1,1,1,1,0,0
2,1,1,1,1,1,1,1


In [71]:
def processa_spam(msg):
    """Encode a class label as an integer.

    Args:
        msg: label string, expected to be ``'ham'`` or ``'spam'``.

    Returns:
        ``0`` when the label is exactly ``'ham'``, ``1`` for any other value.
    """
    return 0 if msg == 'ham' else 1
      
      
def tamanho_msg(msg):
    """Return the length of ``msg`` as reported by ``len``."""
    n_chars = len(msg)
    return n_chars


# Numeric target column: 0 for 'ham', 1 for any other label.
df['Spam (Num)'] = df['Spam ou nao'].map(processa_spam)
# Length of each message as an extra column.
df['Tamanho'] = df['Mensagem'].map(tamanho_msg)

# Preview 5 random rows including the new columns.
df.sample(5)

Unnamed: 0,Spam ou nao,Mensagem,Spam (Num),Tamanho
5557,ham,No. I meant the calculation is the same. That ...,0,273
4046,spam,"Thanks for your ringtone order, reference numb...",1,156
773,ham,I wil be there with in &lt;#&gt; minutes. Go...,0,57
4136,ham,Just got to &lt;#&gt;,0,22
3518,ham,Hanging out with my brother and his family,0,42


In [72]:
# stopwords
# palavras que nao tem significado
# ex: a, o, e, de, da, do, ...
# ex em ingles: the, a, an, of, some, so, ...

# em bag of words, as stopwords podem atrapalhar o modelo
# pois elas aparecem com muita frequencia
# e nao tem significado

# Lemmatization / stemming
# ex: running, run, ran, runs
# ex: correr, correu, correndo, corri

# podemos usar a biblioteca spacy para fazer isso
# ! pip install spacy
# ! python -m spacy download en_core_web_sm
# import spacy

# para remover as stopwords, podemos usar a biblioteca nltk
# ! pip install nltk
from nltk.corpus import stopwords
# ! python -m nltk.downloader stopwords

# Cache of stopword sets keyed by language, so `stopwords.words` is called
# only once per language instead of once per word of every message.
_STOPWORDS_CACHE = {}


def _carrega_stopwords(idioma='english'):
    """Return the NLTK stopword list for ``idioma`` as a cached frozenset."""
    if idioma not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE[idioma] = frozenset(stopwords.words(idioma))
    return _STOPWORDS_CACHE[idioma]


def processa_mensagem(linha, stop_words=None):
    """Lower-case a message and drop its stopwords.

    Args:
        linha: row-like object (e.g. a DataFrame row or a dict) exposing the
            raw text under the ``'Mensagem'`` key.
        stop_words: optional collection of lower-case words to remove. When
            omitted, the English stopword list from NLTK is used.

    Returns:
        The lower-cased message with stopword tokens removed. Tokens are
        obtained by splitting on single spaces and re-joined with single
        spaces, so punctuation attached to a word stays part of the token.
    """
    if stop_words is None:
        stop_words = _carrega_stopwords()
    elif not isinstance(stop_words, (set, frozenset)):
        # Membership tests on a set are O(1); a list would be scanned per word.
        stop_words = frozenset(stop_words)

    palavras = linha['Mensagem'].lower().split(" ")
    return " ".join(p for p in palavras if p not in stop_words)

# Apply the cleaning function row by row (axis=1) and store the result in a
# new column next to the original text.
df['Mensagem Processada'] = df.apply(processa_mensagem, axis=1)
df

Unnamed: 0,Spam ou nao,Mensagem,Spam (Num),Tamanho,Mensagem Processada
0,ham,"Go until jurong point, crazy.. Available only ...",0,111,"go jurong point, crazy.. available bugis n gre..."
1,ham,Ok lar... Joking wif u oni...,0,29,ok lar... joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,155,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,0,49,u dun say early hor... u c already say...
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,61,"nah think goes usf, lives around though"
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1,161,2nd time tried 2 contact u. u å£750 pound priz...
5568,ham,Will Ì_ b going to esplanade fr home?,0,37,ì_ b going esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s...",0,57,"pity, * mood that. so...any suggestions?"
5570,ham,The guy did some bitching but I acted like i'd...,0,125,guy bitching acted like i'd interested buying ...


In [73]:
# Features: cleaned message text; target: numeric label (0 = ham, 1 = other).
X, y = df['Mensagem Processada'], df['Spam (Num)']

from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation. A fixed `random_state` makes the
# split (and every metric computed from it) reproducible across runs, and
# `stratify=y` keeps the label proportions the same in both subsets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Learn the vocabulary from the training messages only, so the test messages
# do not influence the feature space.
vectorizer = CountVectorizer()
vectorizer.fit(X_train)

print(vectorizer.get_feature_names_out()[1500:1510])

# Replace the raw text with its bag-of-words count matrices.
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

['brin' 'bring' 'bringing' 'brings' 'brisk' 'bristol' 'british' 'britney'
 'bro' 'broad']


In [77]:
# TF-IDF weighting will be covered later.
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes classifier on the vectorized training data.
clf = MultinomialNB()
clf.fit(X_train, y_train)

In [78]:
clf.predict(X_test)

array([0, 1, 0, ..., 1, 0, 0], dtype=int64)

In [79]:
# Evaluate the classifier on the held-out split; the value is printed below
# as a percentage labelled 'Accuracy'.
score = clf.score(X_test, y_test)

print(f'Accuracy: {score*100:.2f}%')

Accuracy: 98.48%


In [80]:
# Spot-check with text that starts like a message labelled spam in the dataset preview.
clf.predict(vectorizer.transform(['Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005.']))

array([1], dtype=int64)

In [81]:
# Spot-check with a short everyday sentence; new text goes through the same
# fitted vectorizer before prediction.
clf.predict(vectorizer.transform(['I am going to the beach']))

array([0], dtype=int64)