In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Load the labelled message dataset; 'data.csv' is resolved relative to the
# notebook's working directory.
df = pd.read_csv('data.csv')
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
# (rows, columns) of the loaded frame.
df.shape

(5572, 2)

In [4]:
# Column dtypes and non-null counts, as a quick check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [5]:
# Summary statistics; for these object columns that is count / unique / top / freq.
df.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [6]:
# Encode the text labels as integers: ham -> 1, spam -> 0.
# Building a fresh column with map() avoids writing ints into a string column
# cell by cell, which leaves a mixed object column and may be rejected by
# pandas versions that enforce a strict string dtype (TODO confirm on upgrade).
# Values that are not in the mapping (including the codes left by a previous
# run of this cell) pass through unchanged, so re-running the cell is harmless.
label_codes = {'ham': 1, 'spam': 0}
df['Category'] = df['Category'].map(lambda label: label_codes.get(label, label))

In [7]:
# Re-display the frame to check that Category now holds the numeric codes.
df

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,0,This is the 2nd time we have tried 2 contact u...
5568,1,Will ü b going to esplanade fr home?
5569,1,"Pity, * was in mood for that. So...any other s..."
5570,1,The guy did some bitching but I acted like i'd...


In [8]:
# Message text is the model input; the encoded Category column is the target.
X, Y = df['Message'], df['Category']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

# Peek at the training messages.
X_train

3075                  Don know. I did't msg him recently.
1787    Do you know why god created gap between your f...
1614                         Thnx dude. u guys out 2nite?
4304                                      Yup i'm free...
3266    44 7732584351, Do you want a New Nokia 3510i c...
                              ...                        
789     5 Free Top Polyphonic Tones call 087018728737,...
968     What do u want when i come back?.a beautiful n...
1667    Guess who spent all last night phasing in and ...
3321    Eh sorry leh... I din c ur msg. Not sad alread...
1688    Free Top ringtone -sub to weekly ringtone-get ...
Name: Message, Length: 4457, dtype: object

In [9]:
# TF-IDF vectorizer: lower-cases the text, drops English stop words, and keeps
# every term that appears in at least one document.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

# Fit the vocabulary on the training messages only, then apply the same
# fitted vectorizer to the test messages.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Make sure both target vectors have an integer dtype.
Y_train, Y_test = Y_train.astype(int), Y_test.astype(int)

In [10]:
# Convert both TF-IDF feature matrices to dense arrays; these dense copies are
# what the GaussianNB cells consume.
X_train_dense, X_test_dense = (
    features.toarray() for features in (X_train_features, X_test_features)
)

In [11]:
# First classifier: Gaussian Naive Bayes with default settings.
model = GaussianNB()

In [12]:
# Train on the dense training features and their integer labels.
model.fit(X_train_dense,Y_train)

In [13]:
# Accuracy on the same rows the model was fitted on.
prediction_train = model.predict(X=X_train_dense)
accuracy_train = accuracy_score(y_true=Y_train, y_pred=prediction_train)

In [14]:
# Show the training-set accuracy computed in the previous cell.
print(accuracy_train)

0.934709445815571


In [15]:
# Accuracy on the held-out rows.
prediction_test = model.predict(X=X_test_dense)
accuracy_test = accuracy_score(y_true=Y_test, y_pred=prediction_test)

In [16]:
# Show the held-out accuracy computed in the previous cell.
print(accuracy_test)

0.8878923766816144


In [17]:
# Second classifier: logistic regression with default settings.
model2 = LogisticRegression()

In [18]:
# Fit on the TF-IDF features directly (not the densified copies).
model2.fit(X_train_features, Y_train)

# Predict on both splits first, then score each against its true labels.
prediction_train = model2.predict(X_train_features)
prediction_test = model2.predict(X_test_features)
accuracy_train = accuracy_score(y_true=Y_train, y_pred=prediction_train)
accuracy_test = accuracy_score(y_true=Y_test, y_pred=prediction_test)

# Training accuracy on the first line, held-out accuracy on the second.
print(accuracy_train, accuracy_test, sep='\n')

0.9670181736594121
0.9659192825112107


In [21]:
# Third classifier: multinomial Naive Bayes with default settings.
model3 = MultinomialNB()

In [22]:
# Fit on the TF-IDF features directly (not the densified copies).
model3.fit(X_train_features, Y_train)

# Score on the rows used for fitting.
prediction_train = model3.predict(X=X_train_features)
accuracy_train = accuracy_score(y_true=Y_train, y_pred=prediction_train)
print(accuracy_train)

# Score on the held-out rows.
prediction_test = model3.predict(X=X_test_features)
accuracy_test = accuracy_score(y_true=Y_test, y_pred=prediction_test)
print(accuracy_test)

0.9807045097599282
0.9730941704035875


In [31]:
# Classify one hand-written example with the multinomial Naive Bayes model.
input_mail = ['WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.']

# Reuse the already-fitted vectorizer so the features line up with training.
input_data = feature_extraction.transform(input_mail)
prediction = model3.predict(input_data)
print(prediction)

# Label 1 was assigned to 'ham' during encoding; any other label is reported as spam.
if prediction[0] == 1:
    verdict = 'NOT SPAM'
else:
    verdict = 'SPAM MAIL'
print(verdict)

[0]
SPAM MAIL
