### Spam Email Classifier

In [1]:
from functools import lru_cache

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
msgs = pd.read_csv("emails.csv")

In [3]:
msgs.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
msgs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [5]:
msgs.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [6]:
msgs.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [7]:
msgs.shape

(5572, 2)

### Label Encoding

In [8]:
from sklearn.preprocessing import LabelEncoder

# Map the string categories to integer labels in a single fit-and-transform step.
# `le` stays fitted afterwards, so it can translate labels back if needed.
le = LabelEncoder()
y = le.fit_transform(msgs['Category'])

### 0 for ham, 1 for spam

### Features(x) and labels(y)

In [9]:
x = msgs['Message']

In [10]:
x

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object

In [11]:
y

array([0, 0, 1, ..., 0, 0, 0])

### Splitting into train and test sets

In [17]:
# Hold out 30% of the messages as a test set; random_state pins the shuffle so the split is reproducible.
# NOTE(review): the classes are imbalanced (4825 ham vs 747 spam in the groupby output above);
# consider passing stratify=y so both splits keep that ratio.
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3, random_state=3)

In [18]:
print(x.shape)
print(x_train.shape)
print(x_test.shape)

(5572,)
(3900,)
(1672,)


In [19]:
print(y.shape)
print(y_train.shape)
print(y_test.shape)

(5572,)
(3900,)
(1672,)


### Removing stopwords and punctuation

In [20]:
# create a function for removing punctuation and stopwords
import string
from nltk.corpus import stopwords

@lru_cache(maxsize=None)
def _nltk_stopwords(language='english'):
    """Return NLTK's stopword list for `language` as a frozenset, loaded once and cached."""
    return frozenset(stopwords.words(language))


def text_processing(mess, stop_words=None):
    """
    Strip punctuation and stopwords from a single message.

    1. remove every character that appears in string.punctuation
    2. split the remaining text on whitespace
    3. drop words whose lowercase form is a stopword
    4. return list of clean words (original casing is kept)

    Parameters
    ----------
    mess : str
        The raw message text.
    stop_words : collection of str, optional
        Lowercase words to drop. When omitted, NLTK's English stopword list
        is used (loaded once and cached instead of once per word).

    Returns
    -------
    list of str
        The surviving words, in their original order.
    """
    if stop_words is None:
        stop_words = _nltk_stopwords('english')

    nopunc = ''.join(char for char in mess if char not in string.punctuation)

    # Membership is tested on the lowercased word, so "The" is removed by "the".
    return [word for word in nopunc.split() if word.lower() not in stop_words]

In [21]:
# Preview the cleaned tokens for every message. The result is only displayed, not
# assigned, so `msgs` and the features built later are unaffected by this cell.
msgs['Message'].apply(text_processing)

0       [Go, jurong, point, crazy, Available, bugis, n...
1                          [Ok, lar, Joking, wif, u, oni]
2       [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3           [U, dun, say, early, hor, U, c, already, say]
4       [Nah, dont, think, goes, usf, lives, around, t...
                              ...                        
5567    [2nd, time, tried, 2, contact, u, U, £750, Pou...
5568                   [ü, b, going, esplanade, fr, home]
5569                     [Pity, mood, Soany, suggestions]
5570    [guy, bitching, acted, like, id, interested, b...
5571                                   [Rofl, true, name]
Name: Message, Length: 5572, dtype: object

----------------------------------------------

### Text transformation into Vectors

In [24]:
# TF-IDF vectorizer with default settings.
# NOTE(review): text_processing is not passed in here (e.g. via `analyzer=`), so the
# features below come from the vectorizer's own default tokenization rather than from
# the punctuation/stopword cleaning step defined earlier — confirm this is intended.
vectorizer = TfidfVectorizer()

In [25]:
# Fit the vectorizer on the training messages only, then reuse that fitted
# vectorizer on the test messages so nothing is learned from the test set.
x_train_features = vectorizer.fit_transform(x_train)
x_test_features = vectorizer.transform(x_test)

In [26]:
print(x_train_features)

  (0, 3775)	0.33483588036026024
  (0, 3193)	0.25574937190375646
  (0, 2923)	0.2562178155770592
  (0, 5694)	0.37649307209075566
  (0, 4316)	0.18286966923588044
  (0, 1852)	0.29988532907982507
  (0, 2569)	0.4688491076477821
  (0, 2021)	0.42022774069926544
  (0, 837)	0.23049528677866618
  (0, 1497)	0.20535614453072223
  (1, 4383)	0.2146606176403117
  (1, 4420)	0.2805841679498683
  (1, 3946)	0.20524824226388863
  (1, 7065)	0.21934258772468185
  (1, 6)	0.2805841679498683
  (1, 6360)	0.13932469195433209
  (1, 3711)	0.17156117804480228
  (1, 2116)	0.15908360317248268
  (1, 3795)	0.13741825116932593
  (1, 3457)	0.10389326687862002
  (1, 2707)	0.10309077905812075
  (1, 1192)	0.26731816994369884
  (1, 4001)	0.16379627994781112
  (1, 4550)	0.11207359058156051
  (1, 3655)	0.2805841679498683
  :	:
  (3898, 5837)	0.2066600421117241
  (3898, 2354)	0.31883726651760114
  (3898, 6630)	0.17778488496740757
  (3898, 5971)	0.2103563154628509
  (3898, 4272)	0.22601328943158244
  (3898, 3343)	0.13363520999013

### Model training

In [27]:
import time

##### MultinomialNB Classifier

In [32]:
mnb_model = MultinomialNB()

# Time only the fit. perf_counter() is a monotonic, high-resolution clock intended
# for measuring short intervals; time.time() follows the wall clock and can jump.
start_time = time.perf_counter()
mnb_model.fit(x_train_features, y_train)
end_time = time.perf_counter()

training_time_multinomialNB = end_time - start_time
print("Training time for MultinomialNB: {:.2f} seconds".format(training_time_multinomialNB))

Training time for MultinomialNB: 0.01 seconds


##### DecisionTree Classifier

In [36]:
# max_depth=12 caps how deep the tree may grow.
# NOTE(review): no random_state is set, so ties between equally good splits may be
# broken differently from run to run — consider fixing one for reproducibility.
dt_model = DecisionTreeClassifier(max_depth=12)

# Time only the fit. perf_counter() is a monotonic, high-resolution clock intended
# for measuring short intervals; time.time() follows the wall clock and can jump.
start_time = time.perf_counter()
dt_model.fit(x_train_features, y_train)
end_time = time.perf_counter()

training_time_decision_tree = end_time - start_time
print("Training time for Decision Tree: {:.2f} seconds".format(training_time_decision_tree))

Training time for Decision Tree: 0.26 seconds


##### Logistic Regression Classifier

In [37]:
lr_model = LogisticRegression()

# Time only the fit. perf_counter() is a monotonic, high-resolution clock intended
# for measuring short intervals; time.time() follows the wall clock and can jump.
start_time = time.perf_counter()
lr_model.fit(x_train_features, y_train)
end_time = time.perf_counter()

training_time_logistic_regression = end_time - start_time
print("Training time for Logistic Regression: {:.2f} seconds".format(training_time_logistic_regression))

Training time for Logistic Regression: 0.12 seconds


### Predictions on Training Data USING MULTINOMIAL NB

In [38]:
mnb_pred_on_train_data = mnb_model.predict(x_train_features)

In [39]:
mnb_pred_on_train_data

array([0, 0, 0, ..., 0, 0, 1])

In [40]:
print("Accuracy on training data : ", round(accuracy_score(y_train, mnb_pred_on_train_data),2))

Accuracy on training data :  0.97


In [41]:
# Rows are the true labels and columns the predicted labels, both ordered 0 (ham), 1 (spam).
print(confusion_matrix(y_train, mnb_pred_on_train_data))

[[3385    0]
 [ 116  399]]


### Predictions on Testing data USING MultinomialNB

In [45]:
mnb_pred_on_test_data = mnb_model.predict(x_test_features)

In [46]:
mnb_pred_on_test_data

array([1, 0, 1, ..., 0, 0, 0])

In [47]:
print("Accuracy on testing data : ", round(accuracy_score(y_test, mnb_pred_on_test_data),2))

Accuracy on testing data :  0.95


In [48]:
# Rows are the true labels and columns the predicted labels, both ordered 0 (ham), 1 (spam).
print(confusion_matrix(y_test, mnb_pred_on_test_data))

[[1440    0]
 [  79  153]]


#### So the accuracy of MultinomialNB is 97% on the training data and 95% on the test data

--------------------------

### Predictions on Training Data USING Decision Tree

In [49]:
dt_pred_on_train_data = dt_model.predict(x_train_features)
dt_pred_on_train_data 

array([0, 1, 0, ..., 0, 0, 1])

In [50]:
print("Accuracy on training data : ", round(accuracy_score(y_train, dt_pred_on_train_data),2))

Accuracy on training data :  0.98


In [51]:
# Rows are the true labels and columns the predicted labels, both ordered 0 (ham), 1 (spam).
print(confusion_matrix(y_train, dt_pred_on_train_data))

[[3380    5]
 [  58  457]]


### Predictions on Testing Data USING Decision Tree

In [54]:
dt_pred_on_test_data = dt_model.predict(x_test_features)
dt_pred_on_test_data

array([1, 0, 1, ..., 0, 0, 0])

In [55]:
print("Accuracy on testing data : ", round(accuracy_score(y_test, dt_pred_on_test_data),2))

Accuracy on testing data :  0.96


In [56]:
# Rows are the true labels and columns the predicted labels, both ordered 0 (ham), 1 (spam).
print(confusion_matrix(y_test, dt_pred_on_test_data))

[[1422   18]
 [  47  185]]


#### So the accuracy of the Decision Tree is 98% on the training data and 96% on the test data when the depth is 12. If we don't specify the depth, or set it to 15 or more, the model overfits.

-------------------------------------

### Predictions on Training Data USING Logistic Regression

In [57]:
lr_pred_on_train_data = lr_model.predict(x_train_features)
lr_pred_on_train_data 

array([0, 0, 0, ..., 0, 0, 1])

In [58]:
print("Accuracy on training data : ", round(accuracy_score(y_train, lr_pred_on_train_data),2))

Accuracy on training data :  0.97


In [59]:
# Rows are the true labels and columns the predicted labels, both ordered 0 (ham), 1 (spam).
print(confusion_matrix(y_train, lr_pred_on_train_data))

[[3384    1]
 [ 112  403]]


### Predictions on Testing Data USING Logistic Regression

In [60]:
lr_pred_on_test_data = lr_model.predict(x_test_features)
lr_pred_on_test_data

array([1, 0, 1, ..., 0, 0, 0])

In [61]:
print("Accuracy on testing data : ", round(accuracy_score(y_test, lr_pred_on_test_data),2))

Accuracy on testing data :  0.97


In [62]:
# Rows are the true labels and columns the predicted labels, both ordered 0 (ham), 1 (spam).
print(confusion_matrix(y_test, lr_pred_on_test_data))

[[1439    1]
 [  44  188]]


#### So the accuracy of Logistic Regression is 97% on the training data and 97% on the test data

----------------------------------------------

## Entering two sample messages for testing the model

In [64]:
# Two hand-written messages, each wrapped in a one-element list before being vectorized.
mail_1 = ["Congratulations! you have been awarded with scholarship. Kindly meet the director of ehsas scholarships for further details in university tomorrow."]
mail_2 = ["Hi John How are you? and Congratulation for winning under 19 championship"]

# Reuse the already-fitted vectorizer (transform, not fit) for both messages.
vector_1, vector_2 = (vectorizer.transform(mail) for mail in (mail_1, mail_2))

#### Testing MultinomialNB

In [65]:
pred_1 = mnb_model.predict(vector_1)
pred_2 = mnb_model.predict(vector_2)

# Report each single-message prediction: label 0 is ham, label 1 is spam.
for mail_name, pred in (("mail_1", pred_1), ("mail_2", pred_2)):
    if pred == 0:
        print(f"{mail_name} is a Ham mail")
    elif pred == 1:
        print(f"{mail_name} is a Spam mail")

mail_1 is a Ham mail
mail_2 is a Ham mail


--------------------------------------------------------

#### Testing Decision Tree

In [66]:
pred_1 = dt_model.predict(vector_1)
pred_2 = dt_model.predict(vector_2)

# Report each single-message prediction: label 0 is ham, label 1 is spam.
for mail_name, pred in (("mail_1", pred_1), ("mail_2", pred_2)):
    if pred == 0:
        print(f"{mail_name} is a Ham mail")
    elif pred == 1:
        print(f"{mail_name} is a Spam mail")

mail_1 is a Ham mail
mail_2 is a Ham mail


-----------------------------------------------------

#### Testing Logistic Regression

In [67]:
pred_1 = lr_model.predict(vector_1)
pred_2 = lr_model.predict(vector_2)

# Report each single-message prediction: label 0 is ham, label 1 is spam.
for mail_name, pred in (("mail_1", pred_1), ("mail_2", pred_2)):
    if pred == 0:
        print(f"{mail_name} is a Ham mail")
    elif pred == 1:
        print(f"{mail_name} is a Spam mail")

mail_1 is a Ham mail
mail_2 is a Ham mail


-----------------------------------------------------------------

# Comparison 

### Accuracy on training data: 
#### Multinomial 97%, Decision Tree 98%, logistic regression 97%

### Accuracy on testing data: 
#### Multinomial 95%, Decision Tree 96%, logistic regression 97% 

### Training time: 
#### Multinomial 0.01 seconds, Decision Tree 0.26 seconds, logistic regression 0.12 seconds 