In [1]:
import pandas as pd
import numpy as np

### Visualisation libraries
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px

### To ignore warnings
import warnings
warnings.filterwarnings('ignore')

### Machine Learning libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error
#Importing SVM model
from sklearn import svm
from xgboost import XGBClassifier, plot_tree


In [2]:
# Load the dataset into a pandas DataFrame
Emails=pd.read_csv('emails.csv')


In [3]:
Emails.head()


Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [4]:
Emails.tail()

Unnamed: 0,text,spam
5723,Subject: re : research and development charges...,0
5724,"Subject: re : receipts from visit jim , than...",0
5725,Subject: re : enron case study update wow ! a...,0
5726,"Subject: re : interest david , please , call...",0
5727,Subject: news : aurora 5 . 2 update aurora ve...,0


In [5]:
Emails.dtypes

text    object
spam     int64
dtype: object

In [6]:
Emails.describe(include='all')

Unnamed: 0,text,spam
count,5728,5728.0
unique,5695,
top,"Subject: re : contact info glenn , please , ...",
freq,2,
mean,,0.238827
std,,0.426404
min,,0.0
25%,,0.0
50%,,0.0
75%,,0.0


In [7]:
Emails.nunique()

text    5695
spam       2
dtype: int64

In [8]:
Emails.isnull().sum()

text    0
spam    0
dtype: int64

In [9]:
Emails.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5728 entries, 0 to 5727
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5728 non-null   object
 1   spam    5728 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 89.6+ KB


In [10]:
Emails.text[Emails.text==0]=np.nan

In [11]:
#checking duplicates
Emails.duplicated().sum()

33

In [12]:
Duplicated_rows= Emails[Emails.duplicated()]
print(Duplicated_rows)

                                                   text  spam
2155  Subject: research allocations to egm  hi becky...     0
2260  Subject: departure of grant masson  the resear...     0
2412  Subject: re : schedule and more . .  jinbaek ,...     0
2473  Subject: day off tuesday  stinson ,  i would l...     0
2763  Subject: re : your mail  zhendong ,  dr . kami...     0
3123  Subject: re : grades  pam ,  the students rese...     0
3152  Subject: tiger evals - attachment  tiger hosts...     0
3248  Subject: re : i am zhendong  zhendong ,  thank...     0
3249  Subject: hello from enron  dear dr . mcmullen ...     0
3387  Subject: term paper  dr . kaminski ,  attached...     0
3573  Subject: telephone interview with the enron re...     0
3660  Subject: re : summer work . .  jinbaek ,  this...     0
3690  Subject: re : weather and energy price data  m...     0
3823  Subject: research get - together at sandeep ko...     0
4203  Subject: re : willow and pathstar evaluations ...     0
4390  Su

In [13]:
Emails=Emails.drop_duplicates()
Emails

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
...,...,...
5723,Subject: re : research and development charges...,0
5724,"Subject: re : receipts from visit jim , than...",0
5725,Subject: re : enron case study update wow ! a...,0
5726,"Subject: re : interest david , please , call...",0


In [14]:
Emails.isnull().sum()

text    0
spam    0
dtype: int64

Independent features are declared in X and dependent feature is declared in y.

In [15]:
Emails.columns

Index(['text', 'spam'], dtype='object')

In [16]:
X=Emails[['text']]
X.head()
y=Emails[['spam']]
y.head()

Unnamed: 0,spam
0,1
1,1
2,1
3,1
4,1


In [17]:
X.head()

Unnamed: 0,text
0,Subject: naturally irresistible your corporate...
1,Subject: the stock trading gunslinger fanny i...
2,Subject: unbelievable new homes made easy im ...
3,Subject: 4 color printing special request add...
4,"Subject: do not have money , get software cds ..."


In [18]:
#transforming X variable
from sklearn.feature_extraction.text import CountVectorizer
vectorizer= CountVectorizer()

# fit and transform the 'test' column
X_encoded= vectorizer.fit_transform(Emails['text'])
#X_encoded

In [19]:
#X_encoded = pd.get_dummies(X,columns=['text'])
#X_encoded_list= [X_encoded]
#X_encoded_list

In [20]:
#X_encoded_list.shape()

In [21]:
X_encoded.dtype

dtype('int64')

In [22]:
### random state train test split will be same with all people using random_state=16
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.15, random_state=16)

In [23]:
#print(X_train.head())
#print(y_train.head())
#print(X_test.head())
#print(y_test.head())

In [24]:
### Crating a standard scaler object
scaler=StandardScaler()
scaler

In [32]:
### using fit_transform to Standardize the train data
X_train=scaler.fit_transform(X_train)
print(X_train)

### here using transform only to avoid data leakage
### (training mean and training std will be used for standardisation when we use transform)
X_test=scaler.transform(X_test)
print(X_test)

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, fbeta_score
#from sklearn.metrics import accuracy_score, precision_score, recall_score

LogisticRegression

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, fbeta_score
#from sklearn.metrics import accuracy_score, precision_score, recall_score

In [27]:
### Creating a Logistic regression object
logistic_reg=LogisticRegression(random_state = 0)

### Passing independant and dependant training data to the model
logistic_reg.fit(X_train,y_train)

logistic_reg_pred=logistic_reg.predict(X_test)
logistic_reg_pred

logistic_reg_pred_train=logistic_reg.predict(X_train)
logistic_reg_pred_train

# Evaluate the model performance
accuracy =accuracy_score(y_test, logistic_reg_pred)
precision = precision_score(y_test, logistic_reg_pred)
recall = recall_score(y_test, logistic_reg_pred)

# Print the model's coefficients and evaluation metrics
print("Coefficients:", logistic_reg.coef_)
print("Intercept:", logistic_reg.intercept_)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

### accuracy using accuracy_score
print('logistic_reg_accuracy_test:',round(accuracy_score(y_test, logistic_reg_pred), 2))

print('logistic_reg_accuracy_train:',round(accuracy_score(y_train, logistic_reg_pred_train), 2))

Coefficients: [[-2.10094694e-01  8.43470241e-02 -4.72746103e-04 ... -3.06892986e-06
  -1.33807864e-05  2.02592533e-03]]
Intercept: [-0.72023459]
Accuracy: 0.9929824561403509
Precision: 0.9901477832512315
Recall: 0.9804878048780488
logistic_reg_accuracy_test: 0.99
logistic_reg_accuracy_train: 1.0


Random Forest

In [28]:
### Creating a Logistic regression object
from sklearn.ensemble import RandomForestClassifier
RandomForsestReg=RandomForestClassifier(n_estimators= 10, criterion="entropy")

### Passing independant and dependant training data to the model
RandomForsestReg.fit(X_train,y_train)

RandomForsestReg_pred=RandomForsestReg.predict(X_test)
RandomForsestReg_pred

RandomForsestReg_pred_train=RandomForsestReg.predict(X_train)
RandomForsestReg_pred_train

# Evaluate the model performance
accuracy =accuracy_score(y_test, RandomForsestReg_pred)
precision = precision_score(y_test, RandomForsestReg_pred)
recall = recall_score(y_test, RandomForsestReg_pred)

# Print the model's coefficients and evaluation metrics
#print("Coefficients:", RandomForsestReg_pred.coef_)
#print("Intercept:", RandomForsestReg_pred.intercept_)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

### accuracy using accuracy_score
print('logistic_reg_accuracy_test:',round(accuracy_score(y_test, RandomForsestReg_pred), 2))

print('logistic_reg_accuracy_train:',round(accuracy_score(y_train, RandomForsestReg_pred_train), 2))

Accuracy: 0.9415204678362573
Precision: 1.0
Recall: 0.7560975609756098
logistic_reg_accuracy_test: 0.94
logistic_reg_accuracy_train: 1.0


Support vector Machine

In [29]:
#Importing SVM model
from sklearn import svm

#Creating a svm Classifier
SupportVM = svm.SVC(kernel='linear') # Linear Kernel

### Passing independant and dependant training data to the model
SupportVM.fit(X_train,y_train)

SupportVM_pred=SupportVM.predict(X_test)
SupportVM_pred

SupportVM_pred_train=SupportVM.predict(X_train)
SupportVM_pred_train

# Evaluate the model performance
accuracy =accuracy_score(y_test, SupportVM_pred)
precision = precision_score(y_test, SupportVM_pred)
recall = recall_score(y_test,SupportVM_pred)

# Print the model's coefficients and evaluation metrics
#print("Coefficients:", RandomForsestReg_pred.coef_)
#print("Intercept:", RandomForsestReg_pred.intercept_)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

### accuracy using accuracy_score
print('logistic_reg_accuracy_test:',round(accuracy_score(y_test, SupportVM_pred), 2))

print('logistic_reg_accuracy_train:',round(accuracy_score(y_train, SupportVM_pred_train), 2))

Accuracy: 0.991812865497076
Precision: 0.9900990099009901
Recall: 0.975609756097561
logistic_reg_accuracy_test: 0.99
logistic_reg_accuracy_train: 1.0


Gadient Boosting

In [30]:
# initialzing model
GradientBoosting= GradientBoostingClassifier(n_estimators=100, random_state=50)

### Passing independant and dependant training data to the model
GradientBoosting.fit(X_train,y_train)

GradientBoosting_pred=GradientBoosting.predict(X_test)
GradientBoosting_pred

GradientBoosting_pred_train=GradientBoosting.predict(X_train)
GradientBoosting_pred_train

# Evaluate the model performance
accuracy =accuracy_score(y_test, GradientBoosting_pred)
precision = precision_score(y_test, GradientBoosting_pred)
recall = recall_score(y_test,GradientBoosting_pred)

# Print the model's coefficients and evaluation metrics
#print("Coefficients:", RandomForsestReg_pred.coef_)
#print("Intercept:", RandomForsestReg_pred.intercept_)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

### accuracy using accuracy_score
print('logistic_reg_accuracy_test:',round(accuracy_score(y_test, GradientBoosting_pred), 2))

print('logistic_reg_accuracy_train:',round(accuracy_score(y_train, GradientBoosting_pred_train), 2))

Accuracy: 0.9754385964912281
Precision: 0.9693877551020408
Recall: 0.926829268292683
logistic_reg_accuracy_test: 0.98
logistic_reg_accuracy_train: 0.99


In [31]:
#printing Results
print('logistic_reg_accuracy_test:',round(accuracy_score(y_test, logistic_reg_pred), 2))
print('logistic_reg_accuracy_test:',round(accuracy_score(y_test, RandomForsestReg_pred), 2))
print('logistic_reg_accuracy_test:',round(accuracy_score(y_test, SupportVM_pred), 2))
print('logistic_reg_accuracy_test:',round(accuracy_score(y_test, GradientBoosting_pred), 2))

logistic_reg_accuracy_test: 0.99
logistic_reg_accuracy_test: 0.94
logistic_reg_accuracy_test: 0.99
logistic_reg_accuracy_test: 0.98


We have analyzed the different models and their accuracy.