In [125]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

**DATA COLLECTION AND PRE-PROCESSING**

In [126]:
df = pd.read_csv('mail_data.csv')
df

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...,...,...
5166,1518,ham,Subject: put the 10 on the ft\r\nthe transport...,0
5167,404,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,2933,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,1409,ham,Subject: industrial worksheets for august 2000...,0


Ham -> 0

Spam -> 1

In [127]:
df.columns

Index(['Unnamed: 0', 'label', 'text', 'label_num'], dtype='object')

In [128]:
df.drop(columns=['label'],inplace = True)

In [129]:
df = df.where((pd.notnull(df)),'')

In [130]:
df.head()

Unnamed: 0.1,Unnamed: 0,text,label_num
0,605,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,"Subject: photoshop , windows , office . cheap ...",1
4,2030,Subject: re : indian springs\r\nthis deal is t...,0


In [131]:
df.shape

(5171, 3)

In [132]:
X = df['text']
Y = df['label_num']

**Feature Extraction**

In [133]:
#transforming the data  to feature vectors that can be used as input to the logistic regression
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)
X_features = feature_extraction.fit_transform(X)
#X_train_features = feature_extraction.fit_transform(X_train)
#X_test_features = feature_extraction.fit_transform(X_test)
X_train_features, X_test_features, Y_train, Y_test = train_test_split(X_features, Y, test_size=0.2, random_state=2)

**Model Training : Logistic Regression**

In [134]:
model =   LogisticRegression()

In [135]:

#training the model with training data
model.fit(X_train_features,Y_train)

In [136]:
#Evaluating the trained model
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(Y_train,prediction_on_training_data)
print('Accuracy on train data :',accuracy_on_training_data)

Accuracy on train data : 0.9961315280464217


In [137]:
#prediction on test data
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(Y_test,prediction_on_test_data)
print('Accuracy on test data : ',accuracy_on_test_data)

Accuracy on test data :  0.9874396135265701


In [138]:
#predicting mails
input_mail = ['We facilitate more than 80 billion transactions per year! Thats a lot of satisfied users. Its our imperative to give you the best experience.']
#converting text to feature vectors
intput_data_features = feature_extraction.transform(input_mail)
#making prediction
prediction = model.predict(intput_data_features)
print(prediction)
if(prediction[0]==0):
    print('Ham mail')
else:
    print('Spam mail')

[1]
Spam mail
