In [1]:
import nltk

In [2]:
import pandas as pd

In [3]:
import numpy as np

In [5]:
data = pd.read_csv('SMSSpamCollection', sep='\t', names = ['label', 'message'])

In [6]:
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
data.label.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [9]:
data['target'] = np.where(data['label'] == 'spam', 1,0)

In [11]:
data.head(20)

Unnamed: 0,label,message,target
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
5,spam,FreeMsg Hey there darling it's been 3 week's n...,1
6,ham,Even my brother is not like to speak with me. ...,0
7,ham,As per your request 'Melle Melle (Oru Minnamin...,0
8,spam,WINNER!! As a valued network customer you have...,1
9,spam,Had your mobile 11 months or more? U R entitle...,1


In [12]:
data.drop(columns='label', axis = 1, inplace=True)

In [13]:
data.head()

Unnamed: 0,message,target
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [14]:
data.shape

(5572, 2)

In [15]:
from nltk.corpus import stopwords

In [16]:
from nltk.stem import PorterStemmer

In [17]:
import re

In [18]:
len(data['message'])

5572

In [19]:
corpus = []
ps = PorterStemmer()

In [20]:
data['message'][0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [21]:
for i in range(len(data)):
    review = re.sub('[^a-zA-Z]', ' ', data['message'][i])
    review = review.lower()
    review = review.split()
    
    words = [ps.stem(word) for word in review if word not in set(stopwords.words('english'))]
    
    words = ' '.join(words)
    corpus.append(words)
    

In [23]:
len(corpus)

5572

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [33]:
cv = TfidfVectorizer(max_features=2500)

In [34]:
X = cv.fit_transform(corpus).toarray()

In [35]:
X.shape

(5572, 2500)

In [36]:
y = data.target

In [39]:
y.shape

(5572,)

In [40]:
from sklearn.model_selection import train_test_split

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [42]:
from sklearn.naive_bayes import MultinomialNB

In [43]:
spam_model = MultinomialNB()

In [44]:
spam_model.fit(X_train, y_train)

MultinomialNB()

In [45]:
y_pred = spam_model.predict(X_test)

In [48]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

In [49]:
confusion_matrix(y_test, y_pred)

array([[957,   2],
       [ 24, 132]], dtype=int64)

In [50]:
accuracy_score(y_test, y_pred)

0.9766816143497757

In [52]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       959
           1       0.99      0.85      0.91       156

    accuracy                           0.98      1115
   macro avg       0.98      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115

