### Introduction

About the Dataset

The csv file contains 5172 rows, each row for each email. There are 3002 columns. The first column indicates Email name. The name has been set with numbers and not recipients' name to protect privacy. The last column has the labels for prediction : 1 for spam, 0 for not spam. The remaining 3000 columns are the 3000 most common words in all the emails, after excluding the non-alphabetical characters/words. For each row, the count of each word(column) in that email(row) is stored in the respective cells. Thus, information regarding all 5172 emails are stored in a compact dataframe rather than as separate text files.

### Import libraries

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

### Import Data

In [2]:
# Import data

df = pd.read_csv("/content/drive/MyDrive/DS Course Uploads/Datasets/emails.csv")
df.head()

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0


### EDA

In [3]:
df.shape

(5172, 3002)

In [4]:
df.isnull().sum().sum()

0

In [5]:
df.duplicated().sum()

0

In [6]:
df.drop('Email No.', axis=1, inplace=True)

In [7]:
df.shape

(5172, 3001)

In [8]:
# Separating input and output

X = df.drop('Prediction', axis=1)
y = df['Prediction']

In [9]:
# Perform train test split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### Modeling

In [10]:
# Create model and fit

model = DecisionTreeClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [11]:
# Evaluate the model

print("Accuracy:", metrics.accuracy_score(y_test, y_pred).round(2))
print("Precision:", metrics.precision_score(y_test, y_pred).round(2))
print("Recall:", metrics.recall_score(y_test, y_pred).round(2))
print("F1 Score:", metrics.f1_score(y_test, y_pred).round(2))

Accuracy: 0.92
Precision: 0.87
Recall: 0.84
F1 Score: 0.86


In [12]:
# Evaluate training model

y_pred_train = model.predict(X_train)

print("Accuracy:", metrics.accuracy_score(y_train, y_pred_train).round(2))
print("Precision:", metrics.precision_score(y_train, y_pred_train).round(2))
print("Recall:", metrics.recall_score(y_train, y_pred_train).round(2))
print("F1 Score:", metrics.f1_score(y_train, y_pred_train).round(2))

Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0


### Using GridSearchCV for better result

In [13]:
# Implement PCA

from sklearn.decomposition import PCA

# Apply PCA to get 5 principal components

pca = PCA(n_components=10)
X_pca = pca.fit_transform(X)

# Step 4: Convert the PCA-transformed data back into a DataFrame
df_pca = pd.DataFrame(X_pca, columns=[f'PC{i+1}' for i in range(10)])
df_pca.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10
0,-171.454343,-0.504813,2.292223,-0.194811,3.410022,3.647227,-5.303192,-2.606793,4.312173,0.260953
1,147.124377,32.352998,-15.867053,-9.687412,19.591446,3.208663,26.153818,-32.013139,-6.808709,-8.442912
2,-164.429274,-0.896863,8.214613,-0.250605,4.552191,5.611631,-6.69556,6.247112,1.845562,-4.009097
3,-27.271078,36.202936,-25.137564,-3.407249,-4.223474,-1.716627,-4.077654,-20.334683,0.607694,-2.933297
4,-17.538224,25.219895,-17.442093,6.747271,0.995793,-0.547351,-3.234115,3.108532,3.31163,-15.640367


In [16]:
# Creating parameter dictionary

params = {
    'max_leaf_nodes': list(range(2, 10)),
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'min_samples_split': [2, 3, 4, 5],
    'min_impurity_decrease': [0.0001, 0.001, 0.01, 0.1]
}

# import libraries

from sklearn.model_selection import GridSearchCV

# optimizing the algorithm

clf = DecisionTreeClassifier()
grid_search_cv = GridSearchCV(estimator=clf, param_grid=params, cv=5, verbose=1)
grid_search_cv.fit(df_pca, y)

print("Best parameters:", grid_search_cv.best_params_)
print("Best score:", grid_search_cv.best_score_.round(2))

Fitting 5 folds for each of 512 candidates, totalling 2560 fits
Best parameters: {'criterion': 'gini', 'max_leaf_nodes': 9, 'min_impurity_decrease': 0.0001, 'min_samples_split': 2, 'splitter': 'best'}
Best score: 0.79


### Model optimisation

In [17]:
# Use Optimised model

dtc = DecisionTreeClassifier(criterion='gini', max_leaf_nodes=9, min_impurity_decrease=0.0001, splitter='best', min_samples_split=2)
dtc.fit(X_train, y_train)
y_opt_pred = dtc.predict(X_test)

In [18]:
# Evaluate optimised model

print("Accuracy:", metrics.accuracy_score(y_test, y_opt_pred).round(2))
print("Precision:", metrics.precision_score(y_test, y_opt_pred).round(2))
print("Recall:", metrics.recall_score(y_test, y_opt_pred).round(2))
print("F1 Score:", metrics.f1_score(y_test, y_opt_pred).round(2))

Accuracy: 0.85
Precision: 0.67
Recall: 0.94
F1 Score: 0.78


In [19]:
# Evaluate training model

y_opt_pred_train = dtc.predict(X_train)

print("Accuracy:", metrics.accuracy_score(y_train, y_opt_pred_train).round(2))
print("Precision:", metrics.precision_score(y_train, y_opt_pred_train).round(2))
print("Recall:", metrics.recall_score(y_train, y_opt_pred_train).round(2))
print("F1 Score:", metrics.f1_score(y_train, y_opt_pred_train).round(2))

Accuracy: 0.86
Precision: 0.69
Recall: 0.96
F1 Score: 0.8
