## **Import Dependencies**

In [1]:
import pandas as pd
import numpy as np
import joblib
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer #Used to convert textual data into numerical values to be used by model
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score

### **Data Loading and Preprocessing**

In [2]:
df = pd.read_csv('./mail_data.csv')

In [3]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [5]:
df.tail()

Unnamed: 0,Category,Message
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [6]:
df.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [7]:
df.isnull().sum()

Category    0
Message     0
dtype: int64

In [8]:
df.shape

(5572, 2)

In [9]:
# Label encoding spam=0 and ham=1
df.loc[df['Category']=='spam', 'Category'] = 0
df.loc[df['Category']=='ham', 'Category'] = 1

In [10]:
df.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
#separating dataset into text (i.e. input variable X) and labels (i.e. target variable Y)
X = df['Message']
Y = df['Category']

In [12]:
X.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: Message, dtype: object

In [13]:
Y.head()

0    1
1    1
2    0
3    1
4    1
Name: Category, dtype: object

### **Splitting data into train and test**

In [14]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=8)

In [15]:
print(X.shape)
print(X_train.shape)
print(X_test.shape)

(5572,)
(4457,)
(1115,)


### **Feature Extraction**

In [16]:
#transform textual data into feature vectors to be used as input for the model
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

#convert Y_train and Y_test to integers
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [17]:
print(X_train_features)
print(X_test_features)

  (0, 3918)	0.326670145310277
  (0, 3486)	0.25080812644381334
  (0, 1597)	0.3802796623931952
  (0, 2454)	0.4052066682952275
  (0, 1889)	0.3911685065737249
  (0, 7096)	0.2108564302218432
  (0, 5810)	0.3911685065737249
  (0, 1666)	0.3151349673711989
  (0, 6587)	0.2679193000060343
  (1, 5228)	0.3796284575636846
  (1, 6073)	0.5414291410131468
  (1, 2484)	0.4479255811687707
  (1, 2378)	0.41215707891575754
  (1, 2906)	0.34650574698488357
  (1, 7096)	0.268625581079466
  (2, 3610)	0.5678135062600733
  (2, 3148)	0.31002553289736334
  (2, 4105)	0.31002553289736334
  (2, 3968)	0.3144749831062941
  (2, 1851)	0.3128261358200171
  (2, 4013)	0.3073262662946963
  (2, 1949)	0.440626848711727
  (3, 2573)	0.23268529721595982
  (3, 3152)	0.14414068195988233
  (3, 4651)	0.22185254614327257
  :	:
  (4455, 4398)	0.3225032158948276
  (4455, 754)	0.25758822353899113
  (4455, 5892)	0.26513334808504574
  (4455, 7364)	0.29817848067874686
  (4455, 1524)	0.267256049872684
  (4455, 4312)	0.331737702018799
  (4455, 7

### **Training Model**

In [18]:
#Logistic Regression
model = LogisticRegression()
model.fit(X_train_features, Y_train)
print('Accuracy Score on training data:  ', accuracy_score(Y_train, model.predict(X_train_features)))
print('Accuracy Score on test data:  ', accuracy_score(Y_test, model.predict(X_test_features)))

Accuracy Score on training data:   0.9681400044873233
Accuracy Score on test data:   0.9632286995515695


In [19]:
#Support Vector Machine
model = SVC(C=1.0, kernel='rbf', gamma='auto')
model.fit(X_train_features, Y_train)
print('Accuracy Score on training data:  ', accuracy_score(Y_train, model.predict(X_train_features)))
print('Accuracy Score on test data:  ', accuracy_score(Y_test, model.predict(X_test_features)))

Accuracy Score on training data:   0.8671752299753197
Accuracy Score on test data:   0.8609865470852018


In [20]:
#Naive Bayes Classifier
model = MultinomialNB()
model.fit(X_train_features, Y_train)
print('Accuracy Score on training data:  ', accuracy_score(Y_train, model.predict(X_train_features)))
print('Accuracy Score on test data:  ', accuracy_score(Y_test, model.predict(X_test_features)))

Accuracy Score on training data:   0.9831725375813327
Accuracy Score on test data:   0.9704035874439462


In [21]:
#Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, criterion='gini')
model.fit(X_train_features, Y_train)
print('Accuracy Score on training data:  ', accuracy_score(Y_train, model.predict(X_train_features)))
print('Accuracy Score on test data:  ', accuracy_score(Y_test, model.predict(X_test_features)))

Accuracy Score on training data:   1.0
Accuracy Score on test data:   0.9766816143497757


### **Saving The Model**

In [22]:
# Saving Random Forest as the best performing model
#pickle.dump(model, "RandomForestModel.joblib")
# Saving the vectorizer
pickle.dump(model, open('RandomForestModel.pickle', 'wb'))

In [23]:
# Saving the vectorizer
pickle.dump(feature_extraction, open('Vectorizer.pickle', 'wb'))