# SPAM DETECTION

This model will predict if the mail is spam or not by using Natural Language Processing(NLP). 

In [40]:
import numpy as np 
import pandas as pd

In [58]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


## Importing dataset

In [4]:
dt=pd.read_csv("spam.csv")
dt.head(10)

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [5]:
dt['spam'] = dt['type'].map( {'spam': 1, 'ham': 0} ).astype(int)
dt.head(5)

Unnamed: 0,type,text,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [6]:
print("COLUMS IN THE GIVEN DATA:")
for col in dt.columns: 
    print(col) 

COLUMS IN THE GIVEN DATA:
type
text
spam


In [7]:
t=len(dt['type'])
print("NO OF ROWS IN REVIEW COLUMN:",t)
t=len(dt['text'])
print("NO OF ROWS IN liked COLUMN:",t)

NO OF ROWS IN REVIEW COLUMN: 116
NO OF ROWS IN liked COLUMN: 116


## Pre-processing dataset

### 1) Tokenization

In [10]:
dt['text'][10]

"I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today."

In [11]:
def tokenizer(text):
    return text.split()

In [12]:
dt['text']=dt['text'].apply(tokenizer)

In [13]:
dt['text'][10]

["I'm",
 'gonna',
 'be',
 'home',
 'soon',
 'and',
 'i',
 "don't",
 'want',
 'to',
 'talk',
 'about',
 'this',
 'stuff',
 'anymore',
 'tonight,',
 'k?',
 "I've",
 'cried',
 'enough',
 'today.']

### 2) Stemming

In [15]:
from nltk.stem.snowball import SnowballStemmer
porter = SnowballStemmer("english", ignore_stopwords=False)

In [16]:
def stemmer(text):
    return [porter.stem(word) for word in text]

In [17]:
dt['text']=dt['text'].apply(stemmer)

In [18]:
dt['text'][10]

["i'm",
 'gonna',
 'be',
 'home',
 'soon',
 'and',
 'i',
 "don't",
 'want',
 'to',
 'talk',
 'about',
 'this',
 'stuff',
 'anymor',
 'tonight,',
 'k?',
 "i'v",
 'cri',
 'enough',
 'today.']

### 3) Lemmitization 

In [19]:
dt['text'][50]

['what',
 'you',
 'think',
 'about',
 'me.',
 'first',
 'time',
 'you',
 'saw',
 'me',
 'in',
 'class.']

In [20]:
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()

In [22]:
def lemmitize_it(text):
    return [lemmatizer.lemmatize(word, pos = "a") for word in text]

In [23]:
dt['text']=dt['text'].apply(lemmitize_it)

In [24]:
dt['text'][50]

['what',
 'you',
 'think',
 'about',
 'me.',
 'first',
 'time',
 'you',
 'saw',
 'me',
 'in',
 'class.']

### 4)Stopword Removal

In [25]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [26]:
def stop_it(text):
    review = [word for word in text if not word in stop_words ] 
    return review

In [27]:
dt['text']=dt['text'].apply(stop_it)

In [28]:
dt['text'][2]

['free',
 'entri',
 '2',
 'wkli',
 'comp',
 'win',
 'fa',
 'cup',
 'final',
 'tkts',
 '21st',
 'may',
 '2005.',
 'text',
 'fa',
 '87121',
 'receiv',
 'entri',
 'question(std',
 'txt',
 'rate)t&c',
 'appli',
 '08452810075over18']

In [29]:
dt.head(10)

Unnamed: 0,type,text,spam
0,ham,"[go, jurong, point,, crazy.., avail, onli, bug...",0
1,ham,"[ok, lar..., joke, wif, u, oni...]",0
2,spam,"[free, entri, 2, wkli, comp, win, fa, cup, fin...",1
3,ham,"[u, dun, say, earli, hor..., u, c, alreadi, sa...",0
4,ham,"[nah, think, goe, usf,, live, around, though]",0
5,spam,"[freemsg, hey, darl, 3, week, word, back!, i'd...",1
6,ham,"[even, brother, like, speak, me., treat, like,...",0
7,ham,"[per, request, mell, mell, (oru, minnaminungin...",0
8,spam,"[winner!!, valu, network, custom, select, rece...",1
9,spam,"[mobil, 11, month, more?, u, r, entitl, updat,...",1


In [30]:
dt['text']=dt['text'].apply(' '.join)

In [31]:
dt.head()

Unnamed: 0,type,text,spam
0,ham,"go jurong point, crazy.. avail onli bugi n gre...",0
1,ham,ok lar... joke wif u oni...,0
2,spam,free entri 2 wkli comp win fa cup final tkts 2...,1
3,ham,u dun say earli hor... u c alreadi say...,0
4,ham,"nah think goe usf, live around though",0


## Vectorization

### Transform Text Data into TDF /TF-IDF Vectors 

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer 
tfidf = TfidfVectorizer()
y = dt.spam.values
x = tfidf.fit_transform(dt['text'])

In [36]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=1,test_size=0.2,shuffle=False)

## Classification  

### Logistic Regression

In [38]:
from sklearn.linear_model import LogisticRegression
clf=LogisticRegression()
clf.fit(x_train,y_train)
y_pred=clf.predict(x_test)

from sklearn.metrics import accuracy_score
acc_log = accuracy_score(y_pred, y_test)*100
print("accuracy:",acc_log )

accuracy: 87.5


###  Linear SVC Accuracy 

In [94]:
from sklearn.svm import LinearSVC
linear_svc = LinearSVC(random_state=0)
linear_svc.fit(x_train, y_train)
y_pred = linear_svc.predict(x_test)
acc_linear_svc =accuracy_score(y_pred, y_test) * 100
print("accuracy:",acc_linear_svc)


accuracy: 87.5


## Random Forest 

In [39]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 6, criterion = 'entropy', random_state = 0)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
acc_logreg2 = round(accuracy_score(y_pred, y_test) , 2)*100
print("Accuracy : ",acc_logreg2)

Accuracy :  88.0
