<a href="https://colab.research.google.com/github/Devshray/SIMBT/blob/main/Email_Spam_Detetion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing the Dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer

Data Collection & Pre-Processing

In [2]:
# Absolute path of the labelled e-mail dataset (CSV file under /content).
DATASET_PATH = '/content/spam_ham_dataset.csv'

# Load the CSV into a DataFrame.
raw_mail_data = pd.read_csv(DATASET_PATH)

In [3]:
# Print the plain-text rendering of the loaded frame as a quick look at its columns and rows.
print(raw_mail_data)

     Category                                          Message\n
0         ham  enron methanol ; meter # : 988291\r\nthis is a...
1         ham  hpl nom for january 9 , 2001\r\n( see attached...
2         ham  neon retreat\r\nho ho ho , we ' re around to t...
3        spam  photoshop , windows , office . cheap . main tr...
4         ham  re : indian springs\r\nthis deal is to book th...
...       ...                                                ...
5152      ham  put the 10 on the ft\r\nthe transport volumes ...
5153      ham  3 / 4 / 2000 and following noms\r\nhpl can ' t...
5154      ham  calpine daily gas nomination\r\n>\r\n>\r\njuli...
5155      ham  industrial worksheets for august 2000 activity...
5156     spam  important online banking alert\r\ndear valued ...

[5157 rows x 2 columns]


In [4]:
# Build a new frame in which every missing value is replaced by an empty string;
# the raw frame itself is not modified.
mail_data = raw_mail_data.fillna('')

In [5]:
# Display the first rows of the null-free frame (last expression of the cell is rendered).
mail_data.head()

Unnamed: 0,Category,Message\n
0,ham,enron methanol ; meter # : 988291\r\nthis is a...
1,ham,"hpl nom for january 9 , 2001\r\n( see attached..."
2,ham,"neon retreat\r\nho ho ho , we ' re around to t..."
3,spam,"photoshop , windows , office . cheap . main tr..."
4,ham,re : indian springs\r\nthis deal is to book th...


In [6]:
# Display the (rows, columns) dimensions of the frame.
mail_data.shape

(5157, 2)

Label Encoding

In [7]:
# Numeric code written in place of each textual class label.
LABEL_CODES = {'spam': 0, 'ham': 1}

# Overwrite the 'Category' value of every matching row, one class at a time.
# Rows whose label is neither 'spam' nor 'ham' are left unchanged.
for label_name, label_code in LABEL_CODES.items():
    rows_with_label = mail_data['Category'] == label_name
    mail_data.loc[rows_with_label, 'Category'] = label_code

spam  -  0

ham  -  1

In [8]:
# X holds the message text (the column name itself ends with a newline character),
# Y holds the matching 'Category' labels.
X, Y = mail_data['Message\n'], mail_data['Category']

In [9]:
# Print the message series to check the selected feature column.
print(X)

0       enron methanol ; meter # : 988291\r\nthis is a...
1       hpl nom for january 9 , 2001\r\n( see attached...
2       neon retreat\r\nho ho ho , we ' re around to t...
3       photoshop , windows , office . cheap . main tr...
4       re : indian springs\r\nthis deal is to book th...
                              ...                        
5152    put the 10 on the ft\r\nthe transport volumes ...
5153    3 / 4 / 2000 and following noms\r\nhpl can ' t...
5154    calpine daily gas nomination\r\n>\r\n>\r\njuli...
5155    industrial worksheets for august 2000 activity...
5156    important online banking alert\r\ndear valued ...
Name: Message\n, Length: 5157, dtype: object


In [10]:
# Print the label series to check the selected target column.
print(Y)

0       1
1       1
2       1
3       0
4       1
       ..
5152    1
5153    1
5154    1
5155    1
5156    0
Name: Category, Length: 5157, dtype: object


Splitting the data into training data & test data

In [11]:
# Share of the rows held out for testing, and the seed that makes the shuffle repeatable.
TEST_FRACTION = 0.2
SPLIT_SEED = 3

# Split messages and labels together so each message stays aligned with its label.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [12]:
# Show the size of the full message series followed by the train and test parts, one per line.
print(X.shape, X_train.shape, X_test.shape, sep='\n')

(5157,)
(4125,)
(1032,)


Feature Extraction

In [13]:
# TF-IDF vectorizer that drops English stop words; min_df=1 means a term only has
# to occur in a single document to be kept.
feature_extraction = TfidfVectorizer(stop_words='english', min_df=1)

# Fit on the training messages only, then reuse the fitted vectorizer for the
# test messages so both matrices share the same feature columns.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Cast both label series to an integer dtype.
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

In [14]:
# Print the training messages (still raw text, before vectorization).
print(X_train)

1723    urgent reply\r\noverseas stake lottery\r\ninte...
3370    resume - dart arnaez\r\ninternal candidate res...
413     note ! citibank account suspend in process\r\n...
4796    panenergy marketing march 2000 production\r\nd...
3167    holiday on - call data\r\npipeline contact pho...
                              ...                        
789     vacation\r\ni will be on vacation friday , jun...
968     software for home and office .\r\nlet your chi...
1667    ok\r\nnew offshore pharmacy - not a single med...
3321    carthage ( american central ) nomination\r\n- ...
1688    do you like computers\r\nincredible offers :\r...
Name: Message\n, Length: 4125, dtype: object


In [15]:
# Print the vectorized training data; the text form lists (row, column) positions
# together with the value stored at each position.
print(X_train_features)

  (0, 42937)	0.038440894091013596
  (0, 21637)	0.03296005020038876
  (0, 26292)	0.03672923058523835
  (0, 18723)	0.038915892528590255
  (0, 12461)	0.07427023998930961
  (0, 20622)	0.07427023998930961
  (0, 42451)	0.05703927457560205
  (0, 28117)	0.05860068354328377
  (0, 36617)	0.09129177138899644
  (0, 33538)	0.06265123721624569
  (0, 11648)	0.08245624959545476
  (0, 43236)	0.05813055849743684
  (0, 21588)	0.09129177138899644
  (0, 32286)	0.05163674594897915
  (0, 4923)	0.06164322988293046
  (0, 43300)	0.05163674594897915
  (0, 14858)	0.0870035431722878
  (0, 6799)	0.09129177138899644
  (0, 39176)	0.0387674539922684
  (0, 12111)	0.07149545661227642
  (0, 31869)	0.04542027180253812
  (0, 37144)	0.047307124050864596
  (0, 4929)	0.061327502792310674
  (0, 10532)	0.03640010418698775
  (0, 18963)	0.069982011772601
  :	:
  (4124, 37819)	0.07676172951770091
  (4124, 1840)	0.1450058077653909
  (4124, 779)	0.05710655749036146
  (4124, 170)	0.05813199520913208
  (4124, 41110)	0.0759966433112099

Training the Model

Logistic Regression

In [16]:
# Create a logistic regression classifier; no arguments are passed, so the
# library defaults are used.
model = LogisticRegression()

In [17]:
# Train the classifier on the vectorized training messages and their labels.
model.fit(X_train_features, Y_train)

Evaluating the trained model

In [18]:
# Predict on the same rows the model was fitted on and measure how many labels match.
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training_data,
)

In [19]:
# Report the training accuracy (print inserts its default space separator after the label text).
print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  0.9963636363636363


In [20]:
# Predict on the held-out rows and measure how many labels match.
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
)

In [21]:
# Report the held-out accuracy (print inserts its default space separator after the label text).
print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on test data :  0.9874031007751938


Building a Predictive System

In [22]:
# Message to classify, wrapped in a list because the vectorizer is given a collection of documents.
input_mail = ["it is my understanding that teco just sends us a check , i haven  t received an answer as to whether there is a predermined price associated with this deal or if teco just lets us know what we are giving . i can continue to chase this deal down if you need ."]

# Convert the message to features with the already-fitted vectorizer.
input_data_features = feature_extraction.transform(input_mail)

# Run the trained model and show its raw output.
prediction = model.predict(input_data_features)
print(prediction)

# Label 1 is reported as ham; any other predicted value is reported as spam.
print('Ham mail' if prediction[0] == 1 else 'Spam mail')

[1]
Ham mail


In [23]:
# Second message to classify, again wrapped in a list for the vectorizer.
input_mail = ["looking for medication ? we ` re the best source .it is difficult to make our material condition better by the best law , but it is easy enough to ruin it by bad laws .excuse me . . . : ) you just found thebest and simpliest site for medication on the net . no perscription , easy delivery ."]

# Convert the message to features with the already-fitted vectorizer.
input_data_features = feature_extraction.transform(input_mail)

# Run the trained model and show its raw output.
prediction = model.predict(input_data_features)
print(prediction)

# Label 1 is reported as ham; any other predicted value is reported as spam.
print('Ham mail' if prediction[0] == 1 else 'Spam mail')

[0]
Spam mail
