# Problem Statement:
problem statement is to build model to predict a model which can identify a mail is spam or ham.

In [3]:
#Import necessary dependencies:

import numpy as np              # to create numpy array
import pandas as pd             # to create pandas dataframe
import nltk                     
import matplotlib.pyplot as plt

In [4]:
#Import dataset:

data_path=r"C:\Users\Abhinandan\Desktop\spam1.csv"

messages = pd.read_csv(data_path)

messages.shape

(6776, 5)

In [5]:
messages.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [6]:
#preview of dataset:

messages.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [7]:
messages.isnull().sum()*100/messages.shape[0]

v1             0.000000
v2             0.000000
Unnamed: 2    99.173554
Unnamed: 3    99.763872
Unnamed: 4    99.881936
dtype: float64

Removing the last 3 columns because OF more numbers of NULLS

In [8]:
messages = messages.iloc[:,[0,1]]

In [9]:
messages.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Renaming the columns for easy simplification of target variable and feature.

In [10]:
messages.rename(columns={"v1":"label", "v2":"message"}, inplace=True)

In [11]:
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
messages.label.value_counts()

ham     5854
spam     922
Name: label, dtype: int64

In [13]:
messages.label.replace({"ham":0,"spam":1},inplace=True)

In [14]:
messages.label.value_counts()

0    5854
1     922
Name: label, dtype: int64

In [15]:
messages.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


# Lets convert whole data in lower case

In [16]:
messages.message= messages.message.str.lower()

In [17]:
messages.head()

Unnamed: 0,label,message
0,0,"go until jurong point, crazy.. available only ..."
1,0,ok lar... joking wif u oni...
2,1,free entry in 2 a wkly comp to win fa cup fina...
3,0,u dun say so early hor... u c already then say...
4,0,"nah i don't think he goes to usf, he lives aro..."


# Remove stop words and punctuations

nltk = nat. lang. tool kit

In [18]:
from nltk.corpus import stopwords

In [19]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Abhinandan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
stopwords.words("english")

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [21]:
import string

In [22]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

We are using a user defined function to remove punctuation and stop words

In [23]:
def text_process(mess):            ### creating a function
    """                                                        ## a docstring
    1. remove the punctuation
    2. remove the stopwords
    3. return the list of clean textwords
    
    """
    nopunc = [char for char in mess if char not in string.punctuation]
    nopunc = "".join(nopunc)
    
    return [ word for word in nopunc.split() if word not in stopwords.words("english")]

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

# Count vectorizer is used to get the count of each and every word.

In [25]:
bow_transformer =    CountVectorizer(analyzer = text_process   ).fit(messages["message"])

In [26]:
len(bow_transformer.vocabulary_)  # total number of unique words

9419

In [27]:
messages_bow = bow_transformer.transform(messages.message)   # creating the TDM

# this will act as my  X variables.

In [28]:
messages_bow.shape

(6776, 9419)

In [29]:
from sklearn.model_selection import train_test_split

In [30]:
x_train, x_test, y_train, y_test = train_test_split(messages_bow, messages.label, test_size=.2)

# Naive Bayes

In [31]:
from sklearn.naive_bayes import MultinomialNB

In [32]:
nb =MultinomialNB()

In [33]:
nb.fit(x_train, y_train)

MultinomialNB()

In [34]:
pred = nb.predict(x_test)

In [35]:
from sklearn.metrics import confusion_matrix

tab_nb = confusion_matrix(pred , y_test)

tab_nb

array([[1146,    9],
       [  18,  183]], dtype=int64)

In [36]:
from sklearn.metrics import accuracy_score

acc = accuracy_score(pred , y_test)

acc*100

98.00884955752213

# Decision Tree

In [37]:
from sklearn.tree import DecisionTreeClassifier
dtc =DecisionTreeClassifier(criterion ="gini", max_depth= 3)

dtc.fit(x_train, y_train)
        
pred_dtc = dtc.predict(x_test)

In [38]:
tab_dtc= confusion_matrix(pred_dtc , y_test)

tab_dtc

array([[1087,   53],
       [  77,  139]], dtype=int64)

In [39]:
acc_dtc = accuracy_score(pred_dtc , y_test)

acc_dtc*100

90.41297935103245

# Logistic ReGRESSION

In [40]:
from sklearn.linear_model import LogisticRegression
logireg = LogisticRegression()

In [41]:
logireg.fit(x_train, y_train)
    
pred_logireg = logireg.predict(x_test)

In [42]:
tab_logireg= confusion_matrix(pred_logireg , y_test)

tab_logireg

array([[1161,   26],
       [   3,  166]], dtype=int64)

In [43]:
acc_logireg = accuracy_score(pred_logireg , y_test)

acc_logireg*100

97.86135693215338

# Random forest

In [44]:
from sklearn.ensemble import RandomForestClassifier
rfc=RandomForestClassifier(n_estimators=140,class_weight='balanced')

In [45]:
rfc.fit(x_train, y_train)
        
pred_rfc = rfc.predict(x_test)

In [46]:
tab_rfc= confusion_matrix(pred_rfc , y_test)

tab_rfc

array([[1164,   42],
       [   0,  150]], dtype=int64)

In [47]:
acc_rfc = accuracy_score(pred_rfc , y_test)

acc_rfc*100

96.90265486725663

As we can see above Logistic Regression gives best result on the given dataset. i.e. 98.08

 # -------------------------------------------------The End-----------------------------------------------