IMPORTING REQUIRED LIBRARIES

In [144]:
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [145]:
# Location of the raw spam dataset. The SPAM_CSV_PATH environment variable
# overrides it so the notebook can run on other machines; the original
# local path is kept as the default.
DATA_PATH = os.environ.get('SPAM_CSV_PATH', 'C:/Users/alvis/Documents/spam_1.csv')

# Read the CSV as latin-1 text.
data = pd.read_csv(DATA_PATH, encoding='latin-1')

LOADING AND UNDERSTANDING THE DATA

In [146]:
# Display the freshly loaded DataFrame
data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [147]:
# (rows, columns) of the raw frame
data.shape

(5572, 5)

In [148]:
# Per-column non-null counts and dtypes
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


CLEANING THE DATA

In [149]:
# Replace every missing value with an empty string.
# The mask is built from `data` itself: the previous mask referenced `df`,
# which is never defined in this notebook and fails on a fresh kernel.
data = data.where(data.notna(), '')

In [150]:
# Display the frame again after the missing-value replacement
data


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [151]:
# Count remaining missing values per column
data.isna().sum()

v1            0
v2            0
Unnamed: 2    0
Unnamed: 3    0
Unnamed: 4    0
dtype: int64

In [152]:
# Remove the three extra 'Unnamed' columns.
# Reassigning with errors='ignore' keeps this cell safe to re-run: the
# in-place drop raised KeyError once the columns were already gone.
data = data.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [153]:
# We have dropped the 'Unnamed: 2', 'Unnamed: 3' and 'Unnamed: 4' columns

In [154]:
# (rows, columns) after dropping the extra columns
data.shape

(5572, 2)

In [155]:
# Preview the first five rows
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [156]:
# renaming the columns

In [157]:
# Map the dataset's terse column names to descriptive ones
column_names = {'v1': 'Category', 'v2': 'Message'}
data.rename(columns=column_names, inplace=True)

In [158]:
# Labelling spam mails as 0 and ham mails as 1

In [159]:
# Encode the text labels as numbers: spam -> 0, ham -> 1
for raw_label, numeric_code in (('spam', 0), ('ham', 1)):
    data.loc[data['Category'] == raw_label, 'Category'] = numeric_code

In [160]:
# Model input: the message text column
X = data['Message']

In [161]:
# Model target: the label column (0 = spam, 1 = ham)
Y = data['Category']

In [162]:
# Display the message series
X

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object

In [163]:
# Display the label series
Y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object

In [164]:

# Hold out a fixed fraction of the rows for testing; the seed makes the
# shuffle reproducible.
TEST_FRACTION = 0.2
SPLIT_SEED = 4
split_parts = train_test_split(X, Y, test_size=TEST_FRACTION, random_state=SPLIT_SEED)
X_train, X_test, Y_train, Y_test = split_parts

In [165]:
# Here, 80% of the data is used for training and 20% is held out for testing

In [166]:
# Row counts of the full message series, then the train and test splits
for messages in (X, X_train, X_test):
    print(messages.shape)


(5572,)
(4457,)
(1115,)


In [167]:
# Here we can see the total number of rows, the 80% used for training and the 20% used for testing

In [168]:
# Row counts of the train and test label splits
for labels in (Y_train, Y_test):
    print(labels.shape)

(4457,)
(1115,)


In [169]:
# The same split sizes apply to Y

FEATURE EXTRACTION

In [170]:
# TF-IDF vectorizer for the raw message text. English stop words are
# removed; min_df=1 keeps any term that occurs in at least one message.
feature_extraction = TfidfVectorizer(stop_words='english', min_df=1)

In [171]:
# Fit the vectorizer on the training messages only and transform them,
# then reuse the fitted vectorizer to transform the test messages.
X_train_feature = feature_extraction.fit_transform(X_train)
X_test_feature = feature_extraction.transform(X_test)

In [172]:
# Cast both label splits to integers
Y_train, Y_test = (labels.astype('int') for labels in (Y_train, Y_test))

In [173]:
# Display the training messages (raw text, before vectorization)
X_train

1457    CLAIRE here am havin borin time & am now alone...
472     Nothing. I meant that once the money enters yo...
2481                    K.:)do it at evening da:)urgent:)
243     Although i told u dat i'm into baig face watch...
1413    Dear U've been invited to XCHAT. This is our f...
                              ...                        
3671     came to look at the flat, seems ok, in his 50...
709     4mths half price Orange line rental & latest c...
2487    K ill drink.pa then what doing. I need srs mod...
174     Well, i'm gonna finish my bath now. Have a goo...
1146                            Babe ? I lost you ... :-(
Name: Message, Length: 4457, dtype: object

In [174]:
# Print the training feature matrix produced by the vectorizer
print(X_train_feature)

  (0, 4342)	0.2752660426445784
  (0, 1579)	0.25228874452047967
  (0, 7332)	0.22665809194518172
  (0, 4120)	0.20242973085540436
  (0, 3385)	0.16326391252550673
  (0, 245)	0.28870690041506675
  (0, 1706)	0.19110100352911208
  (0, 407)	0.24275230416686927
  (0, 2076)	0.22931144639638096
  (0, 7085)	0.2012241713066945
  (0, 6637)	0.14222571884812368
  (0, 1393)	0.2752660426445784
  (0, 3266)	0.265729602290968
  (0, 1784)	0.5505320852891568
  (1, 1979)	0.1257583256379714
  (1, 2237)	0.14883487154459094
  (1, 5799)	0.16540907045144923
  (1, 4960)	0.1412201605131965
  (1, 730)	0.19167855419831323
  (1, 6785)	0.17228798451830185
  (1, 6381)	0.1298912590754371
  (1, 1696)	0.18634285391994415
  (1, 2273)	0.41650550621034316
  (1, 1189)	0.19167855419831323
  (1, 5484)	0.19855746826516588
  :	:
  (4453, 3892)	0.21046154167831976
  (4453, 4794)	0.21439075808325483
  (4453, 3215)	0.20272493642163564
  (4453, 2909)	0.2934531865831572
  (4453, 3987)	0.20693263607831452
  (4453, 4985)	0.164634973447087

TRAINING THE MODEL

In [175]:
# Logistic regression classifier created with no arguments (library defaults)
LR = LogisticRegression()

In [176]:
# Train the classifier on the vectorized training messages and their labels
LR.fit(X_train_feature, Y_train)

In [177]:
# NOTE(review): this builds a separate LogisticRegression instance that is
# never assigned, so it has no effect on LR. It looks like the echoed repr
# of the fit call saved as a cell — confirm and remove if so.
LogisticRegression()

In [178]:
# Predict labels for the training rows themselves (used for the training accuracy)
pred = LR.predict(X_train_feature)

In [179]:
accuracy_score = accuracy_score(Y_train, pred)

In [180]:
print('Accuracy score on Training data :', accuracy_score)

Accuracy score on Training data : 0.9670181736594121


In [181]:
# Classify one example message: vectorize it with the fitted vectorizer,
# predict, then print the raw prediction and a readable label (1 = ham).
input_your_mail = ['Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030']
input_data_feature = feature_extraction.transform(input_your_mail)
prediction = LR.predict(input_data_feature)
print(prediction)
print('Ham mail' if prediction[0] == 1 else 'Spam mail')

[0]
Spam mail


In [182]:
# Classify a second example message using the same steps:
# vectorize, predict, print the raw prediction and a readable label (1 = ham).
input_your_mail = ['I HAVE A DATE ON SUNDAY WITH WILL!!']
input_data_feature = feature_extraction.transform(input_your_mail)
prediction = LR.predict(input_data_feature)
print(prediction)
print('Ham mail' if prediction[0] == 1 else 'Spam mail')

[1]
Ham mail
