# IMPORT LIBRARIES

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import streamlit as st
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords


## DATA PROCESSING

In [2]:
# Read the CSV file into a pandas DataFrame; the file name is kept in one
# named constant so it is easy to spot and change.
MAIL_DATA_CSV = 'mail_data.csv'
raw_mail_data = pd.read_csv(MAIL_DATA_CSV)

In [3]:
# Preview the raw frame (columns: Category, Message); pandas truncates the
# printed text to the first and last rows and reports the overall shape.
print(raw_mail_data)

     Category                                            Message
0         ham  Go until jurong point, crazy.. Available only ...
1         ham                      Ok lar... Joking wif u oni...
2        spam  Free entry in 2 a wkly comp to win FA Cup fina...
3         ham  U dun say so early hor... U c already then say...
4         ham  Nah I don't think he goes to usf, he lives aro...
...       ...                                                ...
5567     spam  This is the 2nd time we have tried 2 contact u...
5568      ham               Will ü b going to esplanade fr home?
5569      ham  Pity, * was in mood for that. So...any other s...
5570      ham  The guy did some bitching but I acted like i'd...
5571      ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [4]:
# Substitute an empty string for every missing (null) cell, leaving
# raw_mail_data itself untouched.
mail_data = raw_mail_data.fillna('')

In [5]:
# Display the leading five rows of the cleaned dataframe
mail_data.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Dimensions of the cleaned dataframe as a (rows, columns) tuple
mail_data.shape

(5572, 2)

Label Encoding

__Splitting the data into training data & test data__

In [11]:
# X and Y must exist before the split can run, so the labels are encoded and
# the feature/target series are selected here. This lets the notebook execute
# top-to-bottom on a fresh kernel instead of relying on leftover kernel state.
#
# Label encoding: spam -> 0, ham -> 1. The assignments are idempotent: once a
# label has been replaced by a number, neither comparison matches any more,
# so running them again leaves the column unchanged.
mail_data.loc[mail_data['Category'] == 'spam', 'Category'] = 0
mail_data.loc[mail_data['Category'] == 'ham', 'Category'] = 1

# Message text is the model input; the encoded category is the target.
X = mail_data['Message']
Y = mail_data['Category']

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [12]:
# Report the shape of the full message series, then the training split,
# then the test split.
for split in (X, X_train, X_test):
    print(split.shape)

(5572,)
(4457,)
(1115,)


__Feature Extraction__

In [13]:
# Label encoding followed by TF-IDF feature extraction: the message text is
# turned into numeric feature vectors that the classifiers can consume.

# Encode the target column in place: spam -> 0, ham -> 1.
spam_rows = mail_data['Category'] == 'spam'
mail_data.loc[spam_rows, 'Category'] = 0
ham_rows = mail_data['Category'] == 'ham'
mail_data.loc[ham_rows, 'Category'] = 1

# Message text is the model input; the encoded category is the target.
X = mail_data['Message']
Y = mail_data['Category']

# TF-IDF vectorizer that lowercases the text and drops English stop words.
# It is fitted on the training messages only and then applied unchanged to
# the test messages.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Cast the training and test labels to integers.
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

In [16]:
# Show the raw training messages (a pandas Series of text, indexed by the
# original row labels).
print(X_train)

1978    Reply to win £100 weekly! Where will the 2006 ...
3989    Hello. Sort of out in town already. That . So ...
3935     How come guoyang go n tell her? Then u told her?
4078    Hey sathya till now we dint meet not even a si...
4086    Orange brings you ringtones from all time Char...
                              ...                        
3772    Hi, wlcome back, did wonder if you got eaten b...
5191                               Sorry, I'll call later
5226        Prabha..i'm soryda..realy..frm heart i'm sory
5390                           Nt joking seriously i told
860               Did he just say somebody is named tampa
Name: Message, Length: 4457, dtype: object


In [17]:
# Show the TF-IDF training matrix; it prints as "(row, column)  weight"
# entries, one per stored value.
print(X_train_features)

  (0, 5818)	0.22682143517864364
  (0, 2497)	0.2442158912653505
  (0, 694)	0.3171299579602537
  (0, 6264)	0.1898892037332199
  (0, 5800)	0.17558937755823417
  (0, 3262)	0.33791755486732394
  (0, 2049)	0.3034375179183143
  (0, 7300)	0.24288153842988894
  (0, 2724)	0.3544175987866074
  (0, 354)	0.3544175987866074
  (0, 7162)	0.2550284465664535
  (0, 258)	0.2379428657041507
  (0, 7222)	0.2173884735352799
  (0, 5512)	0.1898892037332199
  (1, 2555)	0.3840709491751004
  (1, 3804)	0.1902902346515268
  (1, 3932)	0.24325511357721427
  (1, 4509)	0.4028245991060671
  (1, 2440)	0.33870544648398715
  (1, 3333)	0.20665394084233096
  (1, 5650)	0.360444144470318
  (1, 2335)	0.2162321275166079
  (1, 6738)	0.28986069568918
  (1, 6109)	0.3239762634465801
  (1, 3267)	0.2678713077029217
  :	:
  (4452, 2438)	0.4574160733416501
  (4452, 7280)	0.3968991650168732
  (4452, 3978)	0.4574160733416501
  (4452, 3290)	0.26370969643076225
  (4452, 3084)	0.22948428918295163
  (4452, 2236)	0.2676662072392096
  (4453, 387

Training the Model

__MODELLING__

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier, XGBRegressor
from sklearn.neighbors import KNeighborsClassifier


# Train four classifiers on the TF-IDF features and keep their predictions
# on both the training and the test split for later evaluation.
#
# Random forests and decision trees are randomized learners: without a fixed
# seed every run yields slightly different trees, metrics, and a different
# saved model. The seed is pinned here (same value as the train/test split)
# so the results are reproducible.
RANDOM_STATE = 42

# RANDOM FOREST
rf = RandomForestClassifier(random_state=RANDOM_STATE)
rf.fit(X_train_features, Y_train)
y_train_pred_rf = rf.predict(X_train_features)
y_test_pred_rf = rf.predict(X_test_features)

# DECISION TREE
dt = DecisionTreeClassifier(random_state=RANDOM_STATE)
dt.fit(X_train_features, Y_train)
y_train_pred_dt = dt.predict(X_train_features)
y_test_pred_dt = dt.predict(X_test_features)

# XGBOOST
xgb = XGBClassifier(random_state=RANDOM_STATE)
xgb.fit(X_train_features, Y_train)
y_train_pred_xgb = xgb.predict(X_train_features)
y_test_pred_xgb = xgb.predict(X_test_features)

# KNN (KNeighborsClassifier takes no random_state parameter)
knn = KNeighborsClassifier()
knn.fit(X_train_features, Y_train)
y_train_pred_knn = knn.predict(X_train_features)
y_test_pred_knn = knn.predict(X_test_features)

In [19]:
# Import necessary metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

def evaluate_model(y_train, y_train_pred, y_test, y_test_pred):
    """Print train/test metrics for one model, then its test classification report.

    Accuracy, precision, recall and F1 are each printed for the training
    split and then for the test split. Precision, recall and F1 use
    ``average='weighted'``.

    Parameters
    ----------
    y_train, y_train_pred
        True and predicted labels for the training split.
    y_test, y_test_pred
        True and predicted labels for the test split.

    Returns
    -------
    None
        All results are written to standard output.
    """
    train_pair = (y_train, y_train_pred)
    test_pair = (y_test, y_test_pred)
    weighted = {'average': 'weighted'}

    print("Training Accuracy: ", accuracy_score(*train_pair))
    print("Testing Accuracy: ", accuracy_score(*test_pair))

    print("Training Precision: ", precision_score(*train_pair, **weighted))
    print("Testing Precision: ", precision_score(*test_pair, **weighted))

    print("Training Recall: ", recall_score(*train_pair, **weighted))
    print("Testing Recall: ", recall_score(*test_pair, **weighted))

    print("Training F1 Score: ", f1_score(*train_pair, **weighted))
    print("Testing F1 Score: ", f1_score(*test_pair, **weighted))

    # Per-class breakdown is reported for the test split only.
    print("Classification Report:\n", classification_report(*test_pair))

# Evaluate every trained model in turn: print its heading, then its metrics.
# Each entry is (heading, training predictions, test predictions); the
# leading "\n" on later headings separates the reports with a blank line.
model_reports = (
    ("Random Forest Evaluation", y_train_pred_rf, y_test_pred_rf),
    ("\nDecision Tree Evaluation", y_train_pred_dt, y_test_pred_dt),
    ("\nXGBoost Evaluation", y_train_pred_xgb, y_test_pred_xgb),
    ("\nKNN Evaluation", y_train_pred_knn, y_test_pred_knn),
)

for heading, train_pred, test_pred in model_reports:
    print(heading)
    evaluate_model(Y_train, train_pred, Y_test, test_pred)

Random Forest Evaluation
Training Accuracy:  1.0
Testing Accuracy:  0.9811659192825112
Training Precision:  1.0
Testing Precision:  0.9815666444041599
Training Recall:  1.0
Testing Recall:  0.9811659192825112
Training F1 Score:  1.0
Testing F1 Score:  0.9805532505335786
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.86      0.92       149
           1       0.98      1.00      0.99       966

    accuracy                           0.98      1115
   macro avg       0.99      0.93      0.96      1115
weighted avg       0.98      0.98      0.98      1115


Decision Tree Evaluation
Training Accuracy:  1.0
Testing Accuracy:  0.968609865470852
Training Precision:  1.0
Testing Precision:  0.9681264020552649
Training Recall:  1.0
Testing Recall:  0.968609865470852
Training F1 Score:  1.0
Testing F1 Score:  0.9682889817901293
Classification Report:
               precision    recall  f1-score   support

           0       0.90      

SAVE THE MODEL

In [22]:
import joblib
# Persist the fitted TfidfVectorizer (only the vectorizer, not the feature
# matrices) to a joblib file.
joblib.dump(value=feature_extraction, filename='feature_extraction.joblib')

['feature_extraction.joblib']

In [23]:
# Persist the trained random forest classifier to a joblib file.
joblib.dump(value=rf, filename='random_forest_model.joblib')

['random_forest_model.joblib']