IMPORTING DEPENDENCIES

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer # convert text into featured vectors(numbers)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

DATA COLLECTION & PRE-PROCESSING

In [4]:
# Load the labelled e-mail corpus from disk.
# The path is written as a raw string so that every backslash is taken
# literally; in a normal string literal "\D", "\M" and "\s" are invalid escape
# sequences that newer Python versions warn about.
# NOTE(review): this is an absolute, machine-specific path — consider moving it
# to a configurable data directory.
raw_mail_data = pd.read_csv(r'C:\Dhanesh\ML\ML_DBMS\spam_ham_dataset.csv\spam_ham_dataset.csv')

# Discard the 'Unnamed: 0' column; only label, text and label_num are kept.
raw_mail_data = raw_mail_data.drop(columns='Unnamed: 0')
print(raw_mail_data)

# Encoding used by the label_num column:
# HAM --> 0
# SPAM --> 1



     label                                               text  label_num
0      ham  Subject: enron methanol ; meter # : 988291\r\n...          0
1      ham  Subject: hpl nom for january 9 , 2001\r\n( see...          0
2      ham  Subject: neon retreat\r\nho ho ho , we ' re ar...          0
3     spam  Subject: photoshop , windows , office . cheap ...          1
4      ham  Subject: re : indian springs\r\nthis deal is t...          0
...    ...                                                ...        ...
5166   ham  Subject: put the 10 on the ft\r\nthe transport...          0
5167   ham  Subject: 3 / 4 / 2000 and following noms\r\nhp...          0
5168   ham  Subject: calpine daily gas nomination\r\n>\r\n...          0
5169   ham  Subject: industrial worksheets for august 2000...          0
5170  spam  Subject: important online banking alert\r\ndea...          1

[5171 rows x 3 columns]


In [5]:
# Summarise column dtypes and non-null counts, then build a copy of the frame
# in which every missing entry is swapped for an empty string.
raw_mail_data.info()

not_missing = raw_mail_data.notnull()
mail_data = raw_mail_data.where(not_missing, other='')
mail_data.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   label      5171 non-null   object
 1   text       5171 non-null   object
 2   label_num  5171 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 121.3+ KB


Unnamed: 0,label,text,label_num
0,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,spam,"Subject: photoshop , windows , office . cheap ...",1
4,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [6]:
# Dimensions of the cleaned frame, shown as (rows, columns).
n_rows, n_cols = mail_data.shape
(n_rows, n_cols)

(5171, 3)

In [7]:
# X holds the mail text, Y the numeric label (label_num) that goes with it.
X, Y = mail_data['text'], mail_data['label_num']

In [8]:
# Hold out 20% of the mails for testing; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [9]:
# Sanity-check the split sizes: full frame first, then the test and training
# portions of the text column.
for split in (mail_data, X_test, X_train):
    print(split.shape)

(5171, 3)
(1035,)
(4136,)


FEATURE EXTRACTION
--> TRANSFORM TEXT DATA INTO FEATURE VECTORS TO USE AS INPUT TO LOGISTIC REGRESSION

In [10]:
# Make sure both label series are stored as integers before modelling.
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

# TF-IDF vectoriser with English stop words removed (lowercasing is left at
# its default, which is on).
feature_extraction = TfidfVectorizer(stop_words='english', min_df=1)

# The vocabulary and weights are fitted on the training text only; the test
# text is then transformed with that same fitted vectoriser.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)


In [11]:
# Inspect the TF-IDF matrix built from X_test; the printout lists
# (row, column) positions together with their weights.
print(X_test_features)

  (0, 43624)	0.13512686321787962
  (0, 43268)	0.10833660960605353
  (0, 41075)	0.12096674600753769
  (0, 39565)	0.10188725144329445
  (0, 39501)	0.3118923210296496
  (0, 39154)	0.12422340135939985
  (0, 38946)	0.018950541070485577
  (0, 38525)	0.08481685902012227
  (0, 38018)	0.08352915721085842
  (0, 37918)	0.09087024496070707
  (0, 37715)	0.1504944295855849
  (0, 36849)	0.06211973550290613
  (0, 36826)	0.07205513495571784
  (0, 36678)	0.13132404496188121
  (0, 34678)	0.11576601929181199
  (0, 34313)	0.09107511767034959
  (0, 34303)	0.12422340135939985
  (0, 32725)	0.12675389349953
  (0, 32230)	0.1281582796694795
  (0, 32223)	0.11907011031567737
  (0, 32218)	0.2137183924250268
  (0, 30502)	0.11999471032560592
  (0, 29300)	0.055923031890344886
  (0, 28045)	0.22723675877287497
  (0, 27061)	0.08684642663655563
  :	:
  (1034, 16167)	0.1215291152353375
  (1034, 16052)	0.41400159737005826
  (1034, 14047)	0.11692216400145229
  (1034, 13755)	0.05148325585044388
  (1034, 13656)	0.0436812108792

TRAINING THE LOGISTIC REGRESSION MODEL

In [12]:
# Fit a logistic-regression classifier (library defaults) on the TF-IDF
# training matrix and its integer labels.
model = LogisticRegression()
model.fit(X=X_train_features, y=Y_train)

EVALUATING TRAINED MODEL

In [13]:
# Score the model on the same rows it was fitted on.
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training_data,
)

print('Accuracy on training model :', accuracy_on_training_data)

Accuracy on training model : 0.9970986460348162


In [14]:
# Score the model on the held-out test split, which was not used for fitting.
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(Y_test, prediction_on_test_data)

# The label names the test split explicitly so this figure cannot be confused
# with the training accuracy reported by the previous cell.
print('Accuracy on test data :', accuracy_on_test_data)

Accuracy on training model : 0.9806763285024155


In [15]:
# Classify one hand-written mail with the fitted vectoriser and model.
input_mail = ["you will get 60percent discount on your first bookings"]

# The new text must go through the already-fitted TF-IDF vectoriser so that it
# is expressed in the same feature space the model was trained on.
input_data_features = feature_extraction.transform(input_mail)
prediction = model.predict(input_data_features)

# A predicted label of 1 means spam; anything else is reported as ham.
print("Spam mail" if prediction[0] == 1 else "Ham mail")


Spam mail
