In [128]:
import numpy as np
import pandas as pd

In [129]:
df = pd.read_csv('SMSSpamCollection', sep = '\t', names = ['label', 'message'])

In [130]:
df

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


# Text cleaning

In [131]:
import nltk
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [132]:
lemmatizer = WordNetLemmatizer()

In [133]:
for i in range(len(df['message'])):
    review = re.sub('[^a-zA-Z]', ' ', df['message'][i])
    review = review.lower()
    review = review.split()
    review = [lemmatizer.lemmatize(word) for word in review if word not in set(stopwords.words('english'))]
    df['message'][i] = ' '.join(review)

In [134]:
df

Unnamed: 0,label,message
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry wkly comp win fa cup final tkts st ...
3,ham,u dun say early hor u c already say
4,ham,nah think go usf life around though
...,...,...
5567,spam,nd time tried contact u u pound prize claim ea...
5568,ham,b going esplanade fr home
5569,ham,pity mood suggestion
5570,ham,guy bitching acted like interested buying some...


# Creating Bag of Words Model

In [135]:
from sklearn.feature_extraction.text import CountVectorizer

In [136]:
vectorizer = CountVectorizer(max_features=5000)

In [137]:
X = vectorizer.fit_transform(df['message']).toarray()

In [138]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [139]:
X.shape

(5572, 5000)

In [140]:
df['label'] = pd.get_dummies(df['label'])

In [141]:
y = df.iloc[:,0].values

In [142]:
y

array([1, 1, 0, ..., 1, 1, 1], dtype=uint8)

# Train Test Split

In [143]:
from sklearn.model_selection import train_test_split

In [144]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

# Training The Model

In [145]:
from sklearn.naive_bayes import MultinomialNB

In [146]:
model = MultinomialNB()

In [147]:
model.fit(X_train, y_train)

MultinomialNB()

In [148]:
y_pred = model.predict(X_test)

In [149]:
y_pred

array([1, 1, 1, ..., 1, 0, 1], dtype=uint8)

In [150]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [151]:
cm = confusion_matrix(y_test, y_pred)

In [152]:
cm

array([[132,   6],
       [ 16, 961]], dtype=int64)

In [153]:
accuracy = accuracy_score(y_test, y_pred)

In [154]:
accuracy

0.9802690582959641