In [21]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [22]:
# Location of the mail dataset (CSV) on the local machine.
mail_data_csv_path = r"C:\Users\USER\Documents\Machine Learning Project\sparm mail\mail_data.csv"

# Load the CSV into a DataFrame; later cells derive the cleaned frame from it.
raw_mail_data = pd.read_csv(mail_data_csv_path)

In [23]:
# Preview the first five rows of the freshly loaded data.
raw_mail_data.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [24]:
# (rows, columns) of the raw dataset.
raw_mail_data.shape

(5572, 2)

Replace the null values with an empty string

In [25]:
# Keep every non-null cell as-is and substitute '' wherever a value is missing.
not_missing = raw_mail_data.notna()
mail_data = raw_mail_data.where(not_missing, '')

In [26]:
# Preview the first five rows after the missing-value replacement.
mail_data.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [27]:
# Check the shape of the cleaned frame (not the raw one) to confirm that
# replacing missing values did not add or drop any rows or columns.
mail_data.shape

(5572, 2)

Label Encoding

In [28]:
# Encode the target column as integers: spam -> 0, ham -> 1.
# The result is rebound to `mail_data` instead of using inplace=True, so the
# cell follows the usual "expression produces a new frame" pandas style and
# can be chained or re-run without relying on in-place mutation.
category_codes = {'spam': 0, 'ham': 1}
mail_data = mail_data.replace({'Category': category_codes})

In [29]:
# Preview the first five rows to confirm the Category column now holds 0/1.
mail_data.head(n=5)

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


Separating the data into text and label

In [30]:
# x holds the message text, y holds the encoded category for each message.
x, y = mail_data['Message'], mail_data['Category']

In [31]:
# Show the first five message texts.
print(x.head(n=5))

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: Message, dtype: object


In [32]:
# Show the first five encoded labels.
print(y.head(n=5))

0    1
1    1
2    0
3    1
4    1
Name: Category, dtype: int64


Splitting data into training and test data

In [33]:
# Hold out 20% of the messages for testing; the fixed random_state makes the
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
)

In [34]:
# Sizes of the training texts, test texts and training labels, in that order.
tuple(split.shape for split in (x_train, x_test, y_train))

((4457,), (1115,), (4457,))

Feature Extraction

In [35]:
# TF-IDF settings: keep every term that appears in at least one document,
# drop English stop words, and lowercase the text before tokenizing.
tfidf_options = dict(min_df=1, stop_words='english', lowercase=True)
feature_extraction = TfidfVectorizer(**tfidf_options)

# Learn the vocabulary and IDF weights from the training texts only ...
x_train_features = feature_extraction.fit_transform(x_train)
# ... then project the test texts onto that same fitted vocabulary.
x_test_features = feature_extraction.transform(x_test)

In [36]:
# Convert y_train and y_test to integer dtype

In [37]:
# Cast both label series to an integer dtype in one step.
y_train, y_test = (labels.astype('int') for labels in (y_train, y_test))

In [39]:
# Print the training feature matrix: each line is a (row, column) index followed by its TF-IDF weight.
print(x_train_features)

  (0, 4334)	0.42941702167641554
  (0, 3958)	0.6161071828926097
  (0, 6586)	0.44333254982109394
  (0, 6927)	0.48935591439341625
  (1, 2121)	0.3573617143022146
  (1, 1428)	0.5869421390016223
  (1, 6971)	0.42812434651556874
  (1, 3168)	0.5869421390016223
  (2, 5115)	0.3408491178137899
  (2, 7353)	0.31988118061968496
  (2, 3852)	0.3408491178137899
  (2, 4884)	0.35749230587184955
  (2, 5695)	0.35749230587184955
  (2, 806)	0.26730249393705324
  (2, 5894)	0.35749230587184955
  (2, 1876)	0.28751725124107325
  (2, 6878)	0.35749230587184955
  (3, 197)	0.36522237107066735
  (3, 3723)	0.16297045459835785
  (3, 2435)	0.26698378141852
  (3, 1825)	0.26858331513730566
  (3, 5231)	0.2266831802864503
  (3, 300)	0.2915969875465198
  (3, 7248)	0.23571908490908416
  (3, 5005)	0.3169028431039865
  :	:
  (4454, 2244)	0.2526916142542512
  (4454, 666)	0.28653660324238944
  (4454, 1575)	0.20946314330145205
  (4454, 1094)	0.24862733340971144
  (4454, 5068)	0.22284357632450164
  (4454, 311)	0.19547195974237946
  

Training the model

In [40]:
# Logistic regression classifier created with no arguments, i.e. the library's default settings.
model = LogisticRegression()

In [41]:
# Fit the classifier on the TF-IDF features of the training messages.
model.fit(X=x_train_features, y=y_train)

Prediction on training data

In [42]:
# Score the model on the same data it was trained on (an optimistic baseline).
prediction_on_training_data = model.predict(x_train_features)
# accuracy_score's documented signature is (y_true, y_pred): pass the true
# labels first. Accuracy is symmetric, so the value is unchanged, but the
# correct order matters as soon as another metric is used in its place.
accuracy_on_training_data = accuracy_score(y_train, prediction_on_training_data)
print(f'The accuracy score on training data:{accuracy_on_training_data}')

The accuracy score on training data:0.9683643706529056


In [43]:
# Score the model on the held-out test split.
prediction_on_test_data = model.predict(x_test_features)
# accuracy_score's documented signature is (y_true, y_pred): pass the true
# labels first. Accuracy is symmetric, so the value is unchanged, but the
# correct order matters as soon as another metric is used in its place.
accuracy_on_test_data = accuracy_score(y_test, prediction_on_test_data)
print(f'The accuracy score on test data:{accuracy_on_test_data}')

The accuracy score on test data:0.9524663677130045


Predictive System

In [44]:
# One example message, wrapped in a list so it is treated as a single document.
input_mail = ["I HAVE A DATE ON SUNDAY WITH WILL!!"]

# Reuse the vectorizer fitted on the training data to turn the text into features.
input_mail_features = feature_extraction.transform(input_mail)

# Predicted label for the message (1 = ham, 0 = spam, per the label encoding).
prediction = model.predict(input_mail_features)
print(prediction)

[1]


In [45]:
# Translate the numeric prediction into a readable verdict: 1 means ham, anything else spam.
print('Ham mail' if prediction[0] == 1 else 'Spam mail')

Ham mail
