# EMAIL SPAM DETECTION WITH MACHINE LEARNING

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the raw message table; the file is decoded as ISO-8859-1.
dt = pd.read_csv(filepath_or_buffer="spam.csv", encoding="ISO-8859-1")

In [4]:
# Preview the first five rows of the raw frame.
dt.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
# Summarise columns, non-null counts and dtypes of the raw frame.
dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 108.9+ KB


In [6]:
# Encode the text label in "v1" as an integer target: spam -> 1, ham -> 0.
dt["spam"] = (
    dt["v1"]
    .map({"spam": 1, "ham": 0})
    .astype(int)
)

In [7]:
# Confirm the new "spam" column on the first five rows.
dt.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4,spam
0,ham,"Go until jurong point, crazy.. Available only ...",,,,0
1,ham,Ok lar... Joking wif u oni...,,,,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,,1
3,ham,U dun say so early hor... U c already then say...,,,,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,,0


# Tokenization

In [8]:
# Show the raw message at row label 1 before tokenization.
dt.loc[1, "v2"]

'Ok lar... Joking wif u oni...'

In [9]:
def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the pieces as a list.

    Punctuation is not separated; it stays attached to the neighbouring word.
    """
    tokens = text.split()
    return tokens

In [10]:
# Replace each message string in "v2" with its list of whitespace tokens.
# NOTE: "v2" is overwritten in place, so running this cell a second time would
# hand lists (not strings) to tokenizer and fail.
dt["v2"] = dt["v2"].map(tokenizer)

In [11]:
# Same row after tokenization: now a list of tokens.
dt.loc[1, "v2"]

['Ok', 'lar...', 'Joking', 'wif', 'u', 'oni...']

# Stemming

In [12]:
from nltk.stem.snowball import SnowballStemmer
# English Snowball stemmer. The variable is called "porter" although the object
# is a SnowballStemmer; the name is kept because stem_te refers to it.
porter = SnowballStemmer(language="english")

In [13]:
def stem_te(text):
    """Return a new list holding ``porter.stem(token)`` for every token in ``text``.

    ``text`` is an iterable of tokens; the order of the tokens is preserved.
    """
    stemmed = []
    for token in text:
        stemmed.append(porter.stem(token))
    return stemmed

In [14]:
# Stem every token list in "v2" (overwrites the column in place).
dt["v2"] = dt["v2"].map(stem_te)

In [15]:
# Same row after stemming.
dt.loc[1, "v2"]

['ok', 'lar...', 'joke', 'wif', 'u', 'oni...']

# Lemmatization

In [16]:
# Token list at row label 40 before lemmatization.
dt.loc[40, "v2"]

['pls',
 'go',
 'ahead',
 'with',
 'watts.',
 'i',
 'just',
 'want',
 'to',
 'be',
 'sure.',
 'do',
 'have',
 'a',
 'great',
 'weekend.',
 'abiola']

In [17]:
from nltk.stem import WordNetLemmatizer

In [18]:
# Single shared lemmatizer instance; lemmit_it calls its .lemmatize method per token.
lemmatizer=WordNetLemmatizer()

In [19]:
def lemmit_it(text):
    """Return a new list holding ``lemmatizer.lemmatize(token)`` for every token in ``text``.

    ``text`` is an iterable of tokens; the order of the tokens is preserved.
    """
    lemmas = []
    for token in text:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

In [20]:
# Lemmatize every token list in "v2" (overwrites the column in place).
# NOTE(review): the tokens were already stemmed above, and the row-40 sample shown
# before and after this step is identical — confirm this stage adds anything here.
dt["v2"] = dt["v2"].map(lemmit_it)

In [21]:
# Same row after lemmatization, for comparison with the list above.
dt.loc[40, "v2"]

['pls',
 'go',
 'ahead',
 'with',
 'watts.',
 'i',
 'just',
 'want',
 'to',
 'be',
 'sure.',
 'do',
 'have',
 'a',
 'great',
 'weekend.',
 'abiola']

# Stopword Removal

In [22]:
from nltk.corpus import stopwords
# Keep the English stop words in a set: stop_it only tests membership, and a set
# lookup is constant-time whereas the list returned by stopwords.words() is
# scanned linearly for every token.
stop_words=set(stopwords.words("english"))

In [23]:
def stop_it(text):
    """Return the tokens of ``text`` that are not contained in ``stop_words``.

    The comparison is an exact match against the entries of ``stop_words``;
    token order is preserved and a new list is returned.
    """
    kept = []
    for token in text:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

In [24]:
# Drop stop words from every token list in "v2" (overwrites the column in place).
# NOTE(review): tokens are already stemmed and still carry punctuation at this
# point, so entries such as 'onli' and 'me.' in the preview below presumably slip
# past the exact-match stop-word test — verify whether filtering should run earlier.
dt["v2"] = dt["v2"].map(stop_it)

In [25]:
# Preview the first ten rows after stop-word removal.
dt.head(n=10)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4,spam
0,ham,"[go, jurong, point,, crazy.., avail, onli, bug...",,,,0
1,ham,"[ok, lar..., joke, wif, u, oni...]",,,,0
2,spam,"[free, entri, 2, wkli, comp, win, fa, cup, fin...",,,,1
3,ham,"[u, dun, say, earli, hor..., u, c, alreadi, sa...",,,,0
4,ham,"[nah, think, goe, usf,, live, around, though]",,,,0
5,spam,"[freemsg, hey, darl, 3, week, word, back!, i'd...",,,,1
6,ham,"[even, brother, like, speak, me., treat, like,...",,,,0
7,ham,"[per, request, mell, mell, (oru, minnaminungin...",,,,0
8,spam,"[winner!!, valu, network, custom, select, rece...",,,,1
9,spam,"[mobil, 11, month, more?, u, r, entitl, updat,...",,,,1


In [26]:
# Collapse each token list back into one space-separated string, the form
# passed to the TF-IDF vectorizer further down.
dt["v2"] = dt["v2"].str.join(sep=" ")

In [27]:
# Preview the re-joined, cleaned messages.
dt.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4,spam
0,ham,"go jurong point, crazy.. avail onli bugi n gre...",,,,0
1,ham,ok lar... joke wif u oni...,,,,0
2,spam,free entri 2 wkli comp win fa cup final tkts 2...,,,,1
3,ham,u dun say earli hor... u c alreadi say...,,,,0
4,ham,"nah think goe usf, live around though",,,,0


# Transform Text Data into TF-IDF Vectors

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer constructed with no arguments, i.e. with the library defaults.
tfidf=TfidfVectorizer()

In [29]:
# Target vector: the integer "spam" column as a NumPy array.
y = dt["spam"].to_numpy()

In [30]:
# Fit the vectorizer on the whole cleaned "v2" column and transform it into the feature matrix x.
# NOTE(review): the fit happens before the train/test split below, so the vectorizer
# also sees the rows that are later held out for testing — consider fitting on the
# training rows only and calling transform on the held-out rows.
x=tfidf.fit_transform(dt["v2"])

In [31]:
from sklearn.model_selection import train_test_split
# Ordered hold-out split: with shuffle=False the leading 80% of rows form the
# training set and the trailing 20% the hold-out set, so the split is deterministic.
# random_state was removed because it only controls shuffling and has no effect
# when shuffle=False. If a randomized split was intended, use shuffle=True with a
# seed (and consider stratify=y, since the classes are imbalanced).
# The hold-out variables keep the names x_text / y_text because the
# classification cell reads them under those names.
x_train,x_text,y_train,y_text=train_test_split(x,y,test_size=0.2,shuffle=False)

# Classification using Logistic Regression

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fit a logistic-regression classifier (library defaults) on the training rows.
clf=LogisticRegression()
clf.fit(x_train,y_train)
# Predict labels for the held-out rows (x_text / y_text are the hold-out split).
y_pred=clf.predict(x_text)
# accuracy_score expects (y_true, y_pred): pass the ground-truth labels first.
# The value is unchanged because accuracy is symmetric in its two arguments.
acc_log=accuracy_score(y_text,y_pred)*100
print("accuracy:",acc_log)

accuracy: 95.69506726457399
