# Downloading the dependencies

#### Uncoment and run the following code to install the required libraries

In [None]:
# !pip install numpy
# !pip install pandas
# !pip install sklearn

### Path to the Dataset : https://www.kaggle.com/code/ayhampar/spam-ham-dataset/data

Importing the Dependencies

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import accuracy_score

Data Collection & Pre-Processing

In [3]:
# loading the data from csv file to a pandas Dataframe

# for file in the content folder of the drive => used for colab
raw_mail_data = pd.read_csv('/content/spam_ham_dataset.csv')

# if file is present in Local directory
# raw_mail_data = pd.read_csv('./spam_ham_dataset.csv')

In [6]:
raw_mail_data.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [7]:
# replace the null values with a null string
mail_data = raw_mail_data.where((pd.notnull(raw_mail_data)),'')

In [12]:
# printing the first 5 rows of the dataframe
mail_data.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num,Category
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0,0.0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0,0.0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0,0.0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1,1.0
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0,0.0


In [9]:
# checking the number of rows and columns in the dataframe
mail_data.shape

(5171, 4)

Label Encoding

In [11]:
# label spam mail as 1;  ham mail as 0;

mail_data.loc[mail_data['label'] == 'spam', 'Category',] = 1
mail_data.loc[mail_data['label'] == 'ham', 'Category',] = 0

spam  -  0

ham  -  1

In [13]:
# separating the data as texts and label

X = mail_data['text']

Y = mail_data['label_num']

In [14]:
print(X)

0       Subject: enron methanol ; meter # : 988291\r\n...
1       Subject: hpl nom for january 9 , 2001\r\n( see...
2       Subject: neon retreat\r\nho ho ho , we ' re ar...
3       Subject: photoshop , windows , office . cheap ...
4       Subject: re : indian springs\r\nthis deal is t...
                              ...                        
5166    Subject: put the 10 on the ft\r\nthe transport...
5167    Subject: 3 / 4 / 2000 and following noms\r\nhp...
5168    Subject: calpine daily gas nomination\r\n>\r\n...
5169    Subject: industrial worksheets for august 2000...
5170    Subject: important online banking alert\r\ndea...
Name: text, Length: 5171, dtype: object


In [15]:
print(Y)

0       0
1       0
2       0
3       1
4       0
       ..
5166    0
5167    0
5168    0
5169    0
5170    1
Name: label_num, Length: 5171, dtype: int64


Splitting the data into training data & test data

In [16]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=21)

In [17]:
print(X.shape)
print(X_train.shape)
print(X_test.shape)

(5171,)
(4136,)
(1035,)


Feature Extraction

In [18]:
# transform the text data to feature vectors that can be used as input to the Logistic regression

feature_extraction = TfidfVectorizer(min_df = 1, stop_words='english', lowercase='True')

X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# convert Y_train and Y_test values as integers

Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [19]:
print(X_train)

3670    Subject: ami , , , ,\r\ni agree ! !\r\nthanks ...
2420    Subject: gas fundamentals website update\r\n- ...
3395    Subject: re : coastal ctr # 96008903 meter 098...
3739    Subject: best choice rx - free online prescrip...
347     Subject: enron / hpl noms for november 16 , 20...
                              ...                        
4298    Subject: enron actuals for june 22 , 2000\r\nt...
4706    Subject: air products - plant down for 4 days ...
1144    Subject: hpl nom for august 2 , 2000\r\n( see ...
48      Subject: instant download 1300 popular softwar...
772     Subject: escalation procedures - gas logistics...
Name: text, Length: 4136, dtype: object


In [20]:
print(X_train_features)

  (0, 4762)	0.10012704693997622
  (0, 1495)	0.07871430826807463
  (0, 3276)	0.18214203829480605
  (0, 1972)	0.15782550986594066
  (0, 92)	0.17577452414499667
  (0, 13586)	0.0818181246004286
  (0, 19528)	0.055895326759121025
  (0, 1297)	0.08707678033878398
  (0, 1098)	0.18214203829480605
  (0, 3611)	0.16680001700546868
  (0, 871)	0.08451454744122779
  (0, 13779)	0.20527094274941132
  (0, 3046)	0.18214203829480605
  (0, 42545)	0.21657657436365757
  (0, 1)	0.1238746742608378
  (0, 1205)	0.1401514538809242
  (0, 22393)	0.10786667604611345
  (0, 29757)	0.09821627339592419
  (0, 18361)	0.11475501088500686
  (0, 10335)	0.052801855768404946
  (0, 17954)	0.06877290702753515
  (0, 13678)	0.05776763132174827
  (0, 41799)	0.10865257604758172
  (0, 39894)	0.07744151195108612
  (0, 38566)	0.10613465302929398
  :	:
  (4135, 22636)	0.12004667938584257
  (4135, 16779)	0.12718351462611455
  (4135, 33214)	0.11024295869773258
  (4135, 34574)	0.1926916799019033
  (4135, 34483)	0.08507462729627288
  (4135, 

# Training the Model

## Logistic Regression

In [21]:
model = LogisticRegression()

In [22]:
# training the Logistic Regression model with the training data
model.fit(X_train_features, Y_train)

LogisticRegression()

Evaluating the trained model

In [23]:
# prediction on training data

prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(Y_train, prediction_on_training_data)

In [24]:
print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  0.996615087040619


In [25]:
# prediction on test data

prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(Y_test, prediction_on_test_data)

In [26]:
print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on test data :  0.9826086956521739


In [39]:
model.score(X_test_features,Y_test)

0.9826086956521739

## Logistic Regression with Cross Validation

In [27]:
logcv = LogisticRegressionCV(cv=5)

In [28]:
logcv.fit(X_train_features,Y_train)

LogisticRegressionCV(cv=5)

In [29]:
accuracy_score(Y_train,model.predict(X_train_features))

0.996615087040619

In [30]:
accuracy_score(Y_test,model.predict(X_test_features))

0.9826086956521739

In [36]:
logcv.score(X_test_features,Y_test)

0.9884057971014493

## Decision Tree


In [31]:
from sklearn.tree import DecisionTreeClassifier

In [32]:
tree = DecisionTreeClassifier(random_state=3, criterion="gini", splitter="random" )
tree.fit(X_train_features,Y_train)

DecisionTreeClassifier(random_state=3, splitter='random')

In [33]:
accuracy_score(Y_train,tree.predict(X_train_features))

1.0

In [34]:
accuracy_score(Y_test,tree.predict(X_test_features))

0.9632850241545894

In [37]:
tree.score(X_test_features,Y_test)

0.9632850241545894

In [42]:
# !pip install scipy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## SVM - Support Vector Machine

**By far the best test accuracy : 98.7%**

In [43]:
from sklearn.svm import SVC

In [46]:
svm_model = SVC()

In [47]:
svm_model.fit(X_train_features,Y_train)

SVC()

In [49]:
accuracy_score(Y_train,svm_model.predict(X_train_features))

1.0

In [50]:
accuracy_score(Y_test,svm_model.predict(X_test_features))

0.9874396135265701

In [51]:
svm_model.score(X_test_features,Y_test)

0.9874396135265701

## Random Forest Classifier

In [54]:
from sklearn.ensemble import RandomForestClassifier

In [55]:
rfc = RandomForestClassifier(n_estimators=10)

In [56]:
rfc.fit(X_train_features,Y_train)

RandomForestClassifier(n_estimators=10)

In [57]:
accuracy_score(Y_train,rfc.predict(X_train_features))

0.9983075435203095

In [58]:
accuracy_score(Y_test,rfc.predict(X_test_features))

0.9642512077294686

In [59]:
rfc.score(X_test_features,Y_test)

0.9642512077294686

## Extra Tree Classifier

In [60]:
from sklearn.ensemble import ExtraTreesClassifier

In [62]:
xtc = ExtraTreesClassifier(n_estimators=10)

In [63]:
rfc.fit(X_train_features,Y_train)

RandomForestClassifier(n_estimators=10)

In [64]:
accuracy_score(Y_train,rfc.predict(X_train_features))

0.9990328820116054

In [65]:
accuracy_score(Y_train,rfc.predict(X_train_features))

0.9990328820116054

In [66]:
rfc.score(X_test_features,Y_test)

0.9642512077294686

In [67]:
input_mail = ["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times"]

# convert text to feature vectors
input_data_features = feature_extraction.transform(input_mail)

# making prediction

prediction = model.predict(input_data_features)

print(prediction)


if (prediction[0]==1):
  print('Ham mail')

else:
  print('Spam mail')

[1]
Ham mail
