# A classifier model to accurately classify if an email is 'spam' or 'ham'

In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import string
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
data= pd.read_csv('SMSSpamCollection',sep='\t',names=['label','text'] )

In [3]:
data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Text cleaning

In [4]:
# a function to clean text using the 're'library

def text_cleanup(text):
    text=text.lower()
    #removes numbers
    text = re.sub(r"\b\d+\b","",text)
    #replacing some common English contraction:
    text = re.sub(r"what's","what is",text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"\'don't", " do not ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    return text

In [5]:
data['text']= data['text'].apply(lambda x: text_cleanup(x))

In [6]:
data.head()

Unnamed: 0,label,text
0,ham,"go until jurong point, crazy.. available only ..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry in a wkly comp to win fa cup final...
3,ham,u dun say so early hor... u c already then say...
4,ham,"nah i do not think he goes to usf, he lives a..."


### Remove punctuation

In [7]:
def remove_punctuation(text):
    no_punctuation= [words for words in text if words not in string.punctuation]
    text_no_punct= ''.join(no_punctuation)
    return text_no_punct


In [8]:
data['text']= data['text'].apply(lambda x: remove_punctuation(x))

In [9]:
data.head()

Unnamed: 0,label,text
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in a wkly comp to win fa cup final...
3,ham,u dun say so early hor u c already then say
4,ham,nah i do not think he goes to usf he lives ar...


### Tokenization

In [10]:
# splitting the sentences into individual words

def tokenization(text):
    split= re.split("\W+",text)
    return split

In [11]:
data['text']=data['text'].apply(lambda x: tokenization(x))

In [12]:
data.head()

Unnamed: 0,label,text
0,ham,"[go, until, jurong, point, crazy, available, o..."
1,ham,"[ok, lar, joking, wif, u, oni]"
2,spam,"[free, entry, in, a, wkly, comp, to, win, fa, ..."
3,ham,"[u, dun, say, so, early, hor, u, c, already, t..."
4,ham,"[nah, i, do, not, think, he, goes, to, usf, he..."


### Remove stopwords

In [13]:
stopword = nltk.corpus.stopwords.words('english')

In [14]:
def remove_stopwords(text):
    text=[word for word in text if word not in stopword]
    return text

In [15]:
data['text']=data['text'].apply(lambda x: remove_stopwords(x))

In [16]:
data['text'].head()

0    [go, jurong, point, crazy, available, bugis, n...
1                       [ok, lar, joking, wif, u, oni]
2    [free, entry, wkly, comp, win, fa, cup, final,...
3        [u, dun, say, early, hor, u, c, already, say]
4       [nah, think, goes, usf, lives, around, though]
Name: text, dtype: object

## Lemmatization

In [17]:
lemmatizer=nltk.stem.WordNetLemmatizer()

In [18]:
def lemmatize_text(text):
    lemmatized_text= ' '.join([lemmatizer.lemmatize(w) for w in text ])
    return lemmatized_text

In [19]:
data['text']=data['text'].apply(lambda x: lemmatize_text(x))

In [20]:
data['text'].head()

0    go jurong point crazy available bugis n great ...
1                              ok lar joking wif u oni
2    free entry wkly comp win fa cup final tkts 21s...
3                  u dun say early hor u c already say
4                  nah think go usf life around though
Name: text, dtype: object

In [21]:
# converted the target variable into binary form using label encoder
le= preprocessing.LabelEncoder()
data['label']=le.fit_transform(data['label'])

In [22]:
x=data['text']
y=data['label']

In [23]:
data['label'].value_counts()

0    4825
1     747
Name: label, dtype: int64

In [24]:
x.shape

(5572,)

In [25]:
y.shape

(5572,)

## Vectorizer

In [26]:
vectorizer= TfidfVectorizer(lowercase=False)
x= vectorizer.fit_transform(x).toarray()

In [27]:
from sklearn.model_selection import train_test_split

In [28]:
x_train,x_test,y_train,y_test= train_test_split(x,y,test_size=.2,random_state=5)

# Naive_Bayes Model

In [29]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [30]:
spam_detect_model = MultinomialNB().fit(x_train, y_train)
y_pred=spam_detect_model.predict(x_test)

In [31]:
y_pred[0:15]

array([1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0])

In [49]:
print('Accuracy of the spam_detector_model(Naive_Bayes): {} '.format(accuracy_score(y_test,y_pred)))

Accuracy of the spam_detector_model(Naive_Bayes): 0.9614349775784753 


# LogisticRegression Model

In [39]:
from sklearn.linear_model import LogisticRegression

In [40]:
spam_detect_model_LR= LogisticRegression()

In [41]:
spam_detect_model_LR.fit(x_train,y_train)

LogisticRegression()

In [42]:
y_pred_LR=spam_detect_model_LR.predict(x_test)

In [48]:
print('Accuracy of the spam_detector_model(LogisticRregression): {} '.format(accuracy_score(y_test,y_pred_LR)))

Accuracy of the spam_detector_model(LogisticRregression): 0.9533632286995516 


From my observation, the Naive_Bayes Model performed better than the LogisticRegression Model