<a href="https://colab.research.google.com/github/1998456/Doc.Daves-Portfolio-/blob/main/Spam_Email_Detection_Model(with_Predictive_system).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Dependencies

In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Data_Collection

In [7]:
# Read the raw e-mail dataset into a DataFrame; the archive path is handed straight to pandas.
# NOTE(review): absolute "/content/..." path — presumably the file must be uploaded to the Colab session first.
raw_mail_data = pd.read_csv(filepath_or_buffer="/content/archive.zip")

In [8]:
# Preview the first five raw rows.
raw_mail_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


# Data_Preprocessing

In [9]:
# Data cleaning: keep every non-null cell and put an empty string wherever a value is missing.
not_missing = raw_mail_data.notna()
mail_data = raw_mail_data.where(not_missing, other="")

In [10]:
# Preview the first five rows after null replacement.
mail_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [11]:
# Keep only the two columns used downstream ('label' and 'text').
# Selecting them by name, instead of dropping 'Unnamed: 0' and 'label_num',
# gives the same two-column frame but makes the cell safe to re-run: the drop
# raised KeyError on a second execution because `mail_data` is reassigned here
# and the dropped columns are already gone.
# .copy() yields an independent frame so the later in-place label encoding
# does not write into a slice of the cleaned data.
mail_data = mail_data[['label', 'text']].copy()

In [12]:
# Preview the first five rows of the trimmed frame.
mail_data.head(n=5)

Unnamed: 0,label,text
0,ham,Subject: enron methanol ; meter # : 988291\r\n...
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see..."
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar..."
3,spam,"Subject: photoshop , windows , office . cheap ..."
4,ham,Subject: re : indian springs\r\nthis deal is t...


In [13]:
# Show the (rows, columns) dimensions of the working frame.
mail_data.shape

(5171, 2)

Label_Encoding

In [14]:
# Encode the target in place: spam -> 0, ham -> 1.
# The column keeps its object dtype here; it is cast to int later, after the split.
is_spam = mail_data['label'] == 'spam'
is_ham = mail_data['label'] == 'ham'
mail_data.loc[is_spam, 'label'] = 0
mail_data.loc[is_ham, 'label'] = 1

Separating Features & Target

In [15]:
# Feature = e-mail text, target = encoded label (0 = spam, 1 = ham).
x, y = mail_data['text'], mail_data['label']

In [16]:
# Inspect the encoded target column.
print(y)

0       1
1       1
2       1
3       0
4       1
       ..
5166    1
5167    1
5168    1
5169    1
5170    0
Name: label, Length: 5171, dtype: object


# Data_Splitting

In [17]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=3,
)

In [18]:
# Confirm the split: print the shape of the full feature series and of each partition.
print(*(part.shape for part in (x, x_train, x_test, y_train, y_test)))

(5171,) (4136,) (1035,) (4136,) (1035,)


# Feature_Data_Transformation

In [19]:
# TF-IDF vectoriser: lower-cases the text, removes English stop words and
# keeps every term that appears in at least one document (min_df=1).
Feature_trans = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

In [20]:
# Fit the vectoriser on the training text only, then apply the fitted
# transformation to the test text so both share the same feature space.
x_train_feat = Feature_trans.fit_transform(x_train)
x_test_feat = Feature_trans.transform(x_test)

# The encoded labels are still object dtype; cast both targets to integers.
y_train, y_test = y_train.astype('int'), y_test.astype('int')

In [21]:
# Inspect the TF-IDF feature matrix built from the training text.
print(x_train_feat)

  (0, 3871)	0.13387711316973605
  (0, 531)	0.14556222812251965
  (0, 30451)	0.08468916670398006
  (0, 43273)	0.14556222812251965
  (0, 3890)	0.14556222812251965
  (0, 548)	0.14556222812251965
  (0, 37262)	0.11275796314501375
  (0, 2908)	0.11535664415295803
  (0, 456)	0.14556222812251965
  (0, 26297)	0.09506000151609588
  (0, 36190)	0.11400727959297849
  (0, 2478)	0.13872687405852518
  (0, 521)	0.14556222812251965
  (0, 16808)	0.11843023142166303
  (0, 22041)	0.13387711316973605
  (0, 2706)	0.14556222812251965
  (0, 522)	0.14556222812251965
  (0, 32060)	0.07311834410351342
  (0, 19411)	0.04211028825505044
  (0, 2537)	0.13872687405852518
  (0, 517)	0.14556222812251965
  (0, 19429)	0.14556222812251965
  (0, 16637)	0.24438399643390496
  (0, 3875)	0.14556222812251965
  (0, 836)	0.14556222812251965
  :	:
  (4135, 8873)	0.07154271542163933
  (4135, 16161)	0.07327358549803296
  (4135, 26936)	0.07154271542163933
  (4135, 15691)	0.07812310961344454
  (4135, 14167)	0.07812310961344454
  (4135, 11

# Model_Building

In [22]:
# Logistic-regression classifier created with its default settings.
model = LogisticRegression()

In [23]:
# Train the classifier on the TF-IDF training features and integer labels.
model.fit(X=x_train_feat, y=y_train)

# Model_Evaluation

In [24]:
# Evaluate the trained model on the data it was fitted on.
pred_train_data = model.predict(x_train_feat)
accuracy_train_data = accuracy_score(y_true=y_train, y_pred=pred_train_data)

In [25]:
# Report the fraction of training e-mails classified correctly.
print('Accuracy on training data:', accuracy_train_data)

Accuracy on training data: 0.9970986460348162


In [26]:
# Evaluate the trained model on the held-out test split.
pred_test_data = model.predict(x_test_feat)
accuracy_test_data = accuracy_score(y_true=y_test, y_pred=pred_test_data)

In [27]:
# Report the fraction of held-out test e-mails classified correctly.
print('Accuracy of test data:',accuracy_test_data)

Accuracy of test data: 0.9806763285024155


# Build_Predictive_System

In [30]:
# Single message to classify, wrapped in a list because the vectoriser expects an iterable of documents.
# NOTE(review): unlike the training rows previewed earlier, this text does not start with
# "Subject:" — it may be out of distribution for this model; confirm the prediction is meaningful.
input_mail = ["Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."]

# Convert the text to feature vectors with the already-fitted vectoriser (transform only, no refit).
input_data_feat = Feature_trans.transform(input_mail)

# Predict and show the raw label array.
prediction = model.predict(input_data_feat)
print(prediction)

# Label 1 was assigned to ham during encoding; any other value is reported as spam.
print("Ham mail" if prediction[0] == 1 else "Spam mail")

[0]
Spam mail
