## Importing Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

## Reading Data

In [2]:
data = pd.read_csv("SPAM-210331-134237.csv")
data['spam'] = data['type'].map( {'spam':1,'ham':0} ).astype(int)
data

Unnamed: 0,type,text,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
111,ham,What is the plural of the noun research?,0
112,ham,Going for dinner.msg you after.,0
113,ham,I'm ok wif it cos i like 2 try new things. But...,0
114,spam,GENT! We are trying to contact you. Last weeke...,1


## Tokenization

In [3]:
data['text'][1]

'Ok lar... Joking wif u oni...'

In [4]:
data['text'][1].split()

['Ok', 'lar...', 'Joking', 'wif', 'u', 'oni...']

In [5]:
def tokenization(text):
    return text.split()

data['text'] = data['text'].apply(tokenization)

## Stemming

In [6]:
from nltk.stem.snowball import SnowballStemmer
SS = SnowballStemmer("english",ignore_stopwords = False)

In [7]:
data['text'][1]

['Ok', 'lar...', 'Joking', 'wif', 'u', 'oni...']

In [8]:
def Stemming(text):
    return [SS.stem(word) for word in text]

data['text'] = data['text'].apply(Stemming)

In [9]:
data['text'][1]

['ok', 'lar...', 'joke', 'wif', 'u', 'oni...']

## Lemmitization

In [10]:
from nltk.stem import WordNetLemmatizer
LM = WordNetLemmatizer()

In [11]:
def Lemmatization(text):
    return [LM.lemmatize(word,pos = "a") for word in text]

data['text'] = data['text'].apply(Lemmatization)

## Stopword Removal

In [12]:
from nltk.corpus import stopwords
SW = stopwords.words("english") 

In [13]:
def StopRemoval(text):
    rev = [word for word in text if not word in SW]
    return (rev)

data['text'] = data['text'].apply(StopRemoval)

In [14]:
data.head(10)

Unnamed: 0,type,text,spam
0,ham,"[go, jurong, point,, crazy.., avail, onli, bug...",0
1,ham,"[ok, lar..., joke, wif, u, oni...]",0
2,spam,"[free, entri, 2, wkli, comp, win, fa, cup, fin...",1
3,ham,"[u, dun, say, earli, hor..., u, c, alreadi, sa...",0
4,ham,"[nah, think, goe, usf,, live, around, though]",0
5,spam,"[freemsg, hey, darl, 3, week, word, back!, i'd...",1
6,ham,"[even, brother, like, speak, me., treat, like,...",0
7,ham,"[per, request, mell, mell, (oru, minnaminungin...",0
8,spam,"[winner!!, valu, network, custom, select, rece...",1
9,spam,"[mobil, 11, month, more?, u, r, entitl, updat,...",1


In [15]:
data['text'] = data['text'].apply(' '.join)

In [16]:
data.head(10)

Unnamed: 0,type,text,spam
0,ham,"go jurong point, crazy.. avail onli bugi n gre...",0
1,ham,ok lar... joke wif u oni...,0
2,spam,free entri 2 wkli comp win fa cup final tkts 2...,1
3,ham,u dun say earli hor... u c alreadi say...,0
4,ham,"nah think goe usf, live around though",0
5,spam,freemsg hey darl 3 week word back! i'd like fu...,1
6,ham,even brother like speak me. treat like aid pat...,0
7,ham,per request mell mell (oru minnaminungint nuru...,0
8,spam,winner!! valu network custom select receivea £...,1
9,spam,mobil 11 month more? u r entitl updat late col...,1


## Transforming into TDF/TF-IDF vectors

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
y = data.spam.values
x = tfidf.fit_transform(data['text'])

In [18]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=10,test_size=0.2)

## Applying Logistics Regression


In [19]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression()
LR.fit(x_train,y_train)
y_pred = LR.predict(x_test)

from sklearn.metrics import accuracy_score
Acc_log = accuracy_score(y_pred, y_test) * 100
print("Accuracy using Logistics:",Acc_log)

Accuracy using Logistics: 83.33333333333334


## Applying LinearSVC

In [20]:
from sklearn.svm import LinearSVC

LC = LinearSVC(random_state=0)
LC.fit(x_train, y_train)
y_pred = LC.predict(x_test)
Acc_lvc = accuracy_score(y_pred, y_test) * 100
print("Accuracy using SVC:",Acc_lvc)

Accuracy using SVC: 83.33333333333334


## Applying KNN

In [21]:
from sklearn.neighbors import KNeighborsClassifier


knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(x_train,y_train)
y_pred = knn.predict(x_test)
Acc_knn = accuracy_score(y_pred,y_test) * 100
print("Accuracy using Knn:",Acc_knn)

Accuracy using Knn: 87.5
