# Downloading the dependencies

#### Uncoment and run the following code to install the required libraries

In [1]:
# !pip install numpy
# !pip install pandas
# !pip install sklearn

### Path to the Dataset : https://www.kaggle.com/code/ayhampar/spam-ham-dataset/data

Importing the Dependencies

In [42]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import accuracy_score

Data Collection & Pre-Processing

In [45]:


# for file in the content folder of the drive => used for colab
raw_mail_data = pd.read_csv('spam_ham_dataset.csv')


In [47]:
raw_mail_data.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [49]:
# replace the null values with a null string
mail_data = raw_mail_data.where((pd.notnull(raw_mail_data)),'')

In [51]:
# printing the first 5 rows of the dataframe
mail_data.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [53]:
# checking the number of rows and columns in the dataframe
mail_data.shape

(5171, 4)

Label Encoding

In [56]:
# label spam mail as 1;  ham mail as 0;

mail_data.loc[mail_data['label'] == 'spam', 'Category',] = 1
mail_data.loc[mail_data['label'] == 'ham', 'Category',] = 0

spam  -  0

ham  -  1

In [59]:
# separating the data as texts and label

X = mail_data['text']

Y = mail_data['label_num']

In [61]:
print(X)

0       Subject: enron methanol ; meter # : 988291\r\n...
1       Subject: hpl nom for january 9 , 2001\r\n( see...
2       Subject: neon retreat\r\nho ho ho , we ' re ar...
3       Subject: photoshop , windows , office . cheap ...
4       Subject: re : indian springs\r\nthis deal is t...
                              ...                        
5166    Subject: put the 10 on the ft\r\nthe transport...
5167    Subject: 3 / 4 / 2000 and following noms\r\nhp...
5168    Subject: calpine daily gas nomination\r\n>\r\n...
5169    Subject: industrial worksheets for august 2000...
5170    Subject: important online banking alert\r\ndea...
Name: text, Length: 5171, dtype: object


In [63]:
print(Y)

0       0
1       0
2       0
3       1
4       0
       ..
5166    0
5167    0
5168    0
5169    0
5170    1
Name: label_num, Length: 5171, dtype: int64


Splitting the data into training data & test data

In [66]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=21)

In [68]:
print(X.shape)
print(X_train.shape)
print(X_test.shape)

(5171,)
(4136,)
(1035,)


Feature Extraction

In [71]:
# transform the text data to feature vectors that can be used as input to the Logistic regression
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# convert Y_train and Y_test values as integers

Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [72]:
print(X_train)

3670    Subject: ami , , , ,\r\ni agree ! !\r\nthanks ...
2420    Subject: gas fundamentals website update\r\n- ...
3395    Subject: re : coastal ctr # 96008903 meter 098...
3739    Subject: best choice rx - free online prescrip...
347     Subject: enron / hpl noms for november 16 , 20...
                              ...                        
4298    Subject: enron actuals for june 22 , 2000\r\nt...
4706    Subject: air products - plant down for 4 days ...
1144    Subject: hpl nom for august 2 , 2000\r\n( see ...
48      Subject: instant download 1300 popular softwar...
772     Subject: escalation procedures - gas logistics...
Name: text, Length: 4136, dtype: object


In [75]:
print(X_train_features)

  (0, 38807)	0.044267716062679825
  (0, 5547)	0.2783505997212704
  (0, 4975)	0.10143491262613565
  (0, 39951)	0.050297273882500174
  (0, 18921)	0.06031782367850875
  (0, 40225)	0.26868864099740764
  (0, 32549)	0.3109869503405231
  (0, 26468)	0.22359889199585714
  (0, 16848)	0.22421803098437884
  (0, 146)	0.1657485042131736
  (0, 47)	0.13185525133445244
  (0, 988)	0.09760409465214956
  (0, 328)	0.13185525133445244
  (0, 1418)	0.08628620136931787
  (0, 10929)	0.09661166824590095
  (0, 16832)	0.1498120974660256
  (0, 11768)	0.11564244844904173
  (0, 479)	0.16744479281794494
  (0, 41019)	0.2206267884011947
  (0, 10690)	0.10735782931351906
  (0, 38566)	0.10613465302929395
  (0, 39894)	0.0774415119510861
  (0, 41799)	0.1086525760475817
  (0, 13678)	0.05776763132174826
  (0, 17954)	0.06877290702753514
  :	:
  (4135, 26880)	0.1127542040495392
  (4135, 26235)	0.09209325721800932
  (4135, 34483)	0.0850746272962729
  (4135, 34574)	0.19269167990190333
  (4135, 33214)	0.1102429586977326
  (4135, 16

# Training the Model

## Logistic Regression

In [80]:
model = LogisticRegression()

In [82]:
# training the Logistic Regression model with the training data
model.fit(X_train_features, Y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


Evaluating the trained model

In [85]:
# prediction on training data

prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(Y_train, prediction_on_training_data)

In [87]:
print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  0.9963733075435203


In [89]:
# prediction on test data

prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(Y_test, prediction_on_test_data)

In [91]:
print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on test data :  0.9826086956521739


In [93]:
model.score(X_test_features,Y_test)

0.9826086956521739

## Logistic Regression with Cross Validation

In [96]:
logcv = LogisticRegressionCV(cv=5)

In [98]:
logcv.fit(X_train_features,Y_train)

0,1,2
,Cs,10
,fit_intercept,True
,cv,5
,dual,False
,penalty,'l2'
,scoring,
,solver,'lbfgs'
,tol,0.0001
,max_iter,100
,class_weight,


In [99]:
accuracy_score(Y_train,model.predict(X_train_features))

0.9963733075435203

In [102]:
accuracy_score(Y_test,model.predict(X_test_features))

0.9826086956521739

In [104]:
logcv.score(X_test_features,Y_test)

0.9874396135265701

## Decision Tree


In [107]:
from sklearn.tree import DecisionTreeClassifier

In [109]:
tree = DecisionTreeClassifier(random_state=3, criterion="gini", splitter="random" )
tree.fit(X_train_features,Y_train)

0,1,2
,criterion,'gini'
,splitter,'random'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,3
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [110]:
accuracy_score(Y_train,tree.predict(X_train_features))

1.0

In [111]:
accuracy_score(Y_test,tree.predict(X_test_features))

0.9632850241545894

In [115]:
tree.score(X_test_features,Y_test)

0.9632850241545894

In [117]:
# !pip install scipy

## SVM - Support Vector Machine

**By far the best test accuracy : 98.7%**

In [121]:
from sklearn.svm import SVC

In [123]:
svm_model = SVC()

In [125]:
svm_model.fit(X_train_features,Y_train)

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [126]:
accuracy_score(Y_train,svm_model.predict(X_train_features))

1.0

In [127]:
accuracy_score(Y_test,svm_model.predict(X_test_features))

0.9874396135265701

In [128]:
svm_model.score(X_test_features,Y_test)

0.9874396135265701

## Random Forest Classifier

In [132]:
from sklearn.ensemble import RandomForestClassifier

In [134]:
rfc = RandomForestClassifier(n_estimators=10)

In [138]:
rfc.fit(X_train_features,Y_train)

0,1,2
,n_estimators,10
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [140]:
accuracy_score(Y_train,rfc.predict(X_train_features))

1.0

In [142]:
accuracy_score(Y_test,rfc.predict(X_test_features))

0.970048309178744

In [144]:
rfc.score(X_test_features,Y_test)

0.970048309178744

## Extra Tree Classifier

In [147]:
from sklearn.ensemble import ExtraTreesClassifier

In [149]:
xtc = ExtraTreesClassifier(n_estimators=10)

In [151]:
xtc.fit(X_train_features,Y_train)

0,1,2
,n_estimators,10
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,False


In [153]:
accuracy_score(Y_train,rfc.predict(X_train_features))

1.0

In [155]:
accuracy_score(Y_train,rfc.predict(X_train_features))

1.0

In [157]:
rfc.score(X_test_features,Y_test)

0.970048309178744

Model saving

In [160]:
import joblib

# Save Logistic Regression
joblib.dump(model, 'logistic_regression_model.pkl')

# Save Logistic Regression CV
joblib.dump(logcv, 'logistic_regression_cv_model.pkl')

# Save Decision Tree
joblib.dump(tree, 'decision_tree_model.pkl')

# Save SVM (if defined as svm_model)
joblib.dump(svm_model, 'svm_model.pkl')

# Save Random Forest
joblib.dump(rfc, 'random_forest_model.pkl')

# Save Extra Trees (assuming you've fitted it later)
xtc.fit(X_train_features, Y_train)  # Make sure to fit before saving
joblib.dump(xtc, 'extra_trees_model.pkl')


['extra_trees_model.pkl']

In [164]:
# You already have this code in your notebook
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)
X_train_features = feature_extraction.fit_transform(X_train)

In [166]:
# ADD THIS CELL AND RUN IT
import joblib

joblib.dump(feature_extraction, 'tfidf_vectorizer.pkl')

print("✅ Vectorizer has been successfully saved as 'tfidf_vectorizer.pkl'!")

✅ Vectorizer has been successfully saved as 'tfidf_vectorizer.pkl'!


## A basic interface to take an email as input and then classify it as a spam mail or a ham mail

In [162]:
input_mail = ["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times"]

# convert text to feature vectors
input_data_features = feature_extraction.transform(input_mail)

# making prediction

prediction = model.predict(input_data_features)

print(prediction)


if (prediction[0]==1):
  print('Ham mail')

else:
  print('Spam mail')

[1]
Ham mail
