In [95]:
from sklearn.neighbors import KNeighborsClassifier

In [96]:
import pandas as pd
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [97]:
df=pd.read_csv('mail_data.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [98]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [99]:
df['Message']=df['Message'].str.lower()

In [100]:
df.head()

Unnamed: 0,Category,Message
0,ham,"go until jurong point, crazy.. available only ..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor... u c already then say...
4,ham,"nah i don't think he goes to usf, he lives aro..."


In [101]:
y=df['Category']
y.head()

0     ham
1     ham
2    spam
3     ham
4     ham
Name: Category, dtype: object

In [102]:
encoder=LabelEncoder()

In [103]:
y=encoder.fit_transform(y)
y

array([0, 0, 1, ..., 0, 0, 0])

In [104]:
import re

In [105]:
def remove_punctuation(text):
    return re.sub(r'/w/s','',text)

In [106]:
df['Message']=df['Message'].apply(lambda x: remove_punctuation(x))
df

Unnamed: 0,Category,Message
0,ham,"go until jurong point, crazy.. available only ..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor... u c already then say...
4,ham,"nah i don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,this is the 2nd time we have tried 2 contact u...
5568,ham,will ü b going to esplanade fr home?
5569,ham,"pity, * was in mood for that. so...any other s..."
5570,ham,the guy did some bitching but i acted like i'd...


In [107]:
Stopwords=stopwords.words("english")

In [108]:
print(Stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [109]:
def remove_stopwords(text):
    return " ".join([word for word in text.split() if word not in Stopwords])

In [110]:
df['Message']=df['Message'].apply(lambda x: remove_stopwords(x))
df.head()

Unnamed: 0,Category,Message
0,ham,"go jurong point, crazy.. available bugis n gre..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor... u c already say...
4,ham,"nah think goes usf, lives around though"


In [111]:
x=df['Message']
x

0       go jurong point, crazy.. available bugis n gre...
1                           ok lar... joking wif u oni...
2       free entry 2 wkly comp win fa cup final tkts 2...
3               u dun say early hor... u c already say...
4                 nah think goes usf, lives around though
                              ...                        
5567    2nd time tried 2 contact u. u £750 pound prize...
5568                         ü b going esplanade fr home?
5569             pity, * mood that. so...any suggestions?
5570    guy bitching acted like i'd interested buying ...
5571                                      rofl. true name
Name: Message, Length: 5572, dtype: object

In [112]:
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.2,random_state=2)

In [113]:
print(xtrain.shape,xtest.shape)

(4457,) (1115,)


In [114]:
featexact=TfidfVectorizer(min_df=1,stop_words="english")

In [115]:
xtrain_features=featexact.fit_transform(xtrain)
xtest_features=featexact.transform(xtest)

In [116]:
print(xtrain_features)

  (0, 4332)	0.42941702167641554
  (0, 3956)	0.6161071828926097
  (0, 6584)	0.44333254982109394
  (0, 6925)	0.48935591439341625
  (1, 2120)	0.3573617143022146
  (1, 1427)	0.5869421390016223
  (1, 6969)	0.42812434651556874
  (1, 3167)	0.5869421390016223
  (2, 5113)	0.3408491178137899
  (2, 7349)	0.31988118061968496
  (2, 3850)	0.3408491178137899
  (2, 4882)	0.35749230587184955
  (2, 5693)	0.35749230587184955
  (2, 806)	0.26730249393705324
  (2, 5892)	0.35749230587184955
  (2, 1875)	0.28751725124107325
  (2, 6876)	0.35749230587184955
  (3, 197)	0.36080819973690814
  (3, 2434)	0.26375694689833057
  (3, 1824)	0.2653371482419713
  (3, 5229)	0.2239434291771947
  (3, 300)	0.2880726714985597
  (3, 7245)	0.23287012353693343
  (3, 5003)	0.3130726739894397
  (3, 2059)	0.24613918464731635
  :	:
  (4454, 7082)	0.27308082807957357
  (4454, 2243)	0.25661108723747245
  (4454, 666)	0.290981042280966
  (4454, 1574)	0.21271210403002053
  (4454, 1093)	0.2524837657613177
  (4454, 5066)	0.2263000795387692
  

In [117]:
model=LogisticRegression()

In [118]:
model.fit(xtrain_features,ytrain)

In [119]:
pred=model.predict(xtrain_features)

In [120]:
accuracy_score(pred,ytrain)

0.9681400044873233

In [121]:
pred=model.predict(xtest_features)

In [122]:
accuracy_score(pred,ytest)

0.9533632286995516

In [123]:
model2=SVC()

In [124]:
model2.fit(xtrain_features,ytrain)

In [125]:
pred=model2.predict(xtrain_features)

In [126]:
accuracy_score(pred,ytrain)

0.9984294368409243

In [127]:
pred=model2.predict(xtest_features)

In [128]:
accuracy_score(pred,ytest)

0.9721973094170404

In [141]:
knn=KNeighborsClassifier(n_neighbors=3)

In [142]:
knn.fit(xtrain_features,ytrain)

In [143]:
pred=knn.predict(xtest_features)

In [144]:
accuracy_score(pred,ytest)

0.9112107623318386