In [1]:
import nltk
import pandas as pd
import numpy as np

In [2]:
# Read spam.csv, decoding it as Windows-1252. As the displayed frame shows,
# column v1 holds the ham/spam label and column v2 holds the message text.
data = pd.read_csv("spam.csv", encoding='Windows-1252')
data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


# Preprocessing

In [3]:
# Keep only the raw message text (column v2) for the text-cleaning steps.
messages = data['v2']
messages

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: v2, Length: 5572, dtype: object

In [4]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [5]:
# NLTK word normalisers. The cleaning step in this notebook uses `lemmatizer`.
# NOTE(review): `stemmer` is not referenced in any visible cell — confirm it is
# unused elsewhere before removing it.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [6]:
# Will hold one cleaned, space-joined string per message.
corpus = []

In [7]:
# Number of messages that will be cleaned.
len(messages)

5572

In [8]:
# Build the stopword set once. The original rebuilt it (re-reading the NLTK
# stopword list) for every single token, which is loop-invariant work.
english_stopwords = set(stopwords.words('english'))


def clean_message(text):
    """Return `text` normalised for bag-of-words modelling.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lower-cased and split on whitespace, English stopwords are
    dropped, and each remaining token is lemmatized.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces.
    """
    tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in english_stopwords
    )


# Rebind `corpus` instead of appending to a list created in another cell, so
# re-running this cell cannot duplicate entries and misalign them with the labels.
# Iterating over the Series directly also avoids label-based `messages[i]`
# lookups, which only work while the index is the default 0..n-1 range.
corpus = [clean_message(message) for message in messages]

In [9]:
# Spot-check the first five cleaned messages.
corpus[:5]

['go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 'free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply',
 'u dun say early hor u c already say',
 'nah think go usf life around though']

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words token counts, with the vocabulary capped at 5000 terms.
cv = CountVectorizer(max_features=5000)
# Learn the vocabulary from the cleaned corpus and convert the resulting
# document-term matrix to a dense NumPy array.
x = cv.fit_transform(corpus).toarray()

In [11]:
# (number of messages, number of vocabulary terms)
x.shape

(5572, 5000)

In [12]:
# Inspect the count matrix (NumPy abbreviates large arrays when displaying them).
x

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [16]:
# Encode the labels as integers: ham -> 1, spam -> 0.
#
# The column is reassigned rather than changed through
# `data['v1'].replace(..., inplace=True)`. That form mutates an intermediate
# Series: it raises a FutureWarning in pandas 2.2 and does not update `data`
# once Copy-on-Write is enabled. Values that are already encoded pass through
# unchanged, so re-running this cell is harmless. Any unexpected label makes
# `astype` raise instead of silently leaving a mixed-type column.
label_codes = {'ham': 1, 'spam': 0}
data['v1'] = data['v1'].map(lambda label: label_codes.get(label, label)).astype('int64')
data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,1,"Go until jurong point, crazy.. Available only ...",,,
1,1,Ok lar... Joking wif u oni...,,,
2,0,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,1,U dun say so early hor... U c already then say...,,,
4,1,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,0,This is the 2nd time we have tried 2 contact u...,,,
5568,1,Will Ì_ b going to esplanade fr home?,,,
5569,1,"Pity, * was in mood for that. So...any other s...",,,
5570,1,The guy did some bitching but I acted like i'd...,,,


In [15]:
# Target vector: the integer-encoded labels (1 = ham, 0 = spam).
y = data['v1']
y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: v1, Length: 5572, dtype: int64

# Model

In [18]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=101)

In [19]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes classifier, fitted on the training word-count rows.
spam_model = MultinomialNB()
spam_model.fit(X_train, y_train)

In [23]:
# Predicted labels for the held-out rows.
y_pred = spam_model.predict(X_test)

In [24]:
from sklearn.metrics import confusion_matrix

# Per scikit-learn's convention, rows are true labels and columns are predicted
# labels, in sorted label order. With the encoding used in this notebook
# (0 = spam, 1 = ham), row/column 0 is spam and row/column 1 is ham.
confusion_m = confusion_matrix(y_test, y_pred)
confusion_m

array([[128,  10],
       [ 15, 962]], dtype=int64)

# Accuracy

In [25]:
from sklearn.metrics import accuracy_score

# scikit-learn metrics take (y_true, y_pred), matching the confusion_matrix
# call in this notebook. Accuracy is symmetric, so the value is unchanged, but
# the same swapped order would silently flip asymmetric metrics such as
# precision and recall. Multiplied by 100 to report a percentage.
accuracy_score(y_test, y_pred) * 100

97.75784753363229