Importing the dependencies

In [33]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [34]:
# Load the mail dataset from a CSV file (relative path) into a pandas DataFrame.
mail_data = pd.read_csv('../../ML/Datasets/mail_data.csv')

In [35]:
# Optional cleaning step (disabled): replace null values with empty strings.
# mail_data = mail_data.where(pd.notnull(mail_data), '')

In [36]:
# Preview the first 10 rows of mail_data.
mail_data.head(10)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [37]:
# Dimensions of the data as (rows, columns).
mail_data.shape

(5572, 2)

In [38]:
# Column names, non-null counts, dtypes and memory usage.
mail_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [39]:
# Count missing values per column.
# Note: DataFrame.isna() is an alias of DataFrame.isnull(), so both prints
# report the same counts.
print(mail_data.isnull().sum())
print(mail_data.isna().sum())

Category    0
Message     0
dtype: int64
Category    0
Message     0
dtype: int64


In [40]:
# Class balance: number of ham vs. spam messages.
mail_data['Category'].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

Label Encoding

0 --> Ham

1 --> Spam

In [41]:
# Encode the labels in place: ham -> 0, spam -> 1.
# Re-running this cell is harmless because no row matches the text labels any more.
for label_text, label_code in (('ham', 0), ('spam', 1)):
    is_label = mail_data['Category'] == label_text
    mail_data.loc[is_label, 'Category'] = label_code

In [42]:
# Preview the first rows to confirm that Category now holds the 0/1 labels.
mail_data.head()

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


Separating features and labels

In [43]:
# x: raw message text (features); y: encoded labels (0 = ham, 1 = spam).
x = mail_data['Message']
y = mail_data['Category']

In [44]:
# Show the message texts that serve as model input.
print(x)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


In [45]:
# Show the encoded labels (0 = ham, 1 = spam).
print(y)

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: Category, Length: 5572, dtype: object


Splitting the data into training and test sets

In [46]:
# Hold out 20% of the messages for testing. stratify=y keeps the ham/spam ratio
# the same in both splits, and the fixed random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=3
)

In [47]:
# Sizes of the full data, the training split and the test split.
print(x.shape, x_train.shape, x_test.shape)

(5572,) (4457,) (1115,)


In [48]:
# Inspect the raw text of each split (still unvectorized at this point).
print(x_train)
print(x_test)

2020               From tomorrow onwards eve 6 to 3 work.
3453                             Ugh just got outta class
3364                                   Can... I'm free...
2597    No i'm not gonna be able to. || too late notic...
5491    U studying in sch or going home? Anyway i'll b...
                              ...                        
3775                                Ok... But bag again..
5519    Can you pls send me that company name. In saib...
2832                           Thanx 4 sending me home...
2724    Tunde, how are you doing. This is just wishing...
4714    Big brother‘s really scraped the barrel with t...
Name: Message, Length: 4457, dtype: object
2297    <Forwarded from 21870000>Hi - this is your Mai...
1613    RT-KIng Pro Video Club>> Need help? info@ringt...
4531    Don't forget though that I love you .... And I...
920                 Dont talk to him ever ok its my word.
2328    URGENT! Your mobile number *************** WON...
                             

Converting text to numbers by vectorization

In [49]:
# TF-IDF vectorizer: lowercases the text, drops English stop words, and keeps
# every term that appears in at least one document (min_df=1).
vectorizer = TfidfVectorizer(
    min_df=1,
    stop_words="english",
    lowercase=True,
)

In [50]:
# Learn the vocabulary and IDF weights from the training messages only, then
# reuse the fitted vectorizer to transform the test messages.
text_to_numbers_train = vectorizer.fit_transform(x_train)
text_to_numbers_test = vectorizer.transform(x_test)

# The label column has object dtype; cast the labels to integers for the classifier.
y_train = y_train.astype("int")
y_test = y_test.astype("int")

In [51]:
# Rebind x_train / x_test to the TF-IDF feature matrices used by the model cells.
# NOTE: the raw text is no longer reachable under these names afterwards, so
# re-running the vectorization cell requires re-running the split cell first.
x_train, x_test = text_to_numbers_train, text_to_numbers_test

In [52]:
# Sparse TF-IDF training features, printed as "(row, column)  value" entries.
print(x_train)

  (0, 7329)	0.39151450331197035
  (0, 2596)	0.5157331716075019
  (0, 4795)	0.6459507464707183
  (0, 6736)	0.40433070936297943
  (1, 1793)	0.43486660333673016
  (1, 4861)	0.596185515774092
  (1, 3112)	0.31103507183699425
  (1, 3758)	0.2826422333927384
  (1, 6887)	0.528038275197618
  (2, 2903)	1.0
  (3, 5081)	0.4169087023760639
  (3, 7198)	0.3971508483254661
  (3, 3373)	0.26859638268284747
  (3, 4040)	0.24099748417300504
  (3, 4692)	0.43001182720880177
  (3, 3909)	0.3260348921371232
  (3, 758)	0.37620667903348365
  (3, 3092)	0.32479862316475455
  (4, 3911)	0.2511783165875194
  (4, 3082)	0.4766800108257892
  (4, 5766)	0.6833422922401592
  (4, 6339)	0.37251069778964124
  (4, 3373)	0.23999265394731062
  (4, 4040)	0.21533285461106833
  (5, 4413)	0.4460096390714086
  :	:
  (4452, 1180)	0.8777703340143531
  (4452, 4770)	0.4790816639408472
  (4453, 1853)	0.5659242420057378
  (4453, 5704)	0.5659242420057378
  (4453, 1891)	0.4268643677817285
  (4453, 5098)	0.31370317391845537
  (4453, 5831)	0.280

In [53]:
# Sparse TF-IDF test features, printed as "(row, column)  value" entries.
print(x_test)

  (0, 6069)	0.17898756886133724
  (0, 5592)	0.27040646184211636
  (0, 4334)	0.27040646184211636
  (0, 4333)	0.4043741386311273
  (0, 4258)	0.48391496454894295
  (0, 4199)	0.27040646184211636
  (0, 3326)	0.15053858929369243
  (0, 2885)	0.2315106978077796
  (0, 900)	0.25781758783662617
  (0, 356)	0.27040646184211636
  (0, 355)	0.2488856395843301
  (0, 170)	0.27040646184211636
  (1, 7381)	0.195283181583766
  (1, 7046)	0.24076699187385958
  (1, 6893)	0.41865401615879083
  (1, 4589)	0.17770109251757912
  (1, 3826)	0.2982254780251209
  (1, 3563)	0.26502407200930533
  (1, 3306)	0.22105285481388076
  (1, 2541)	0.23025505509842506
  (1, 2034)	0.2921850764758712
  (1, 1822)	0.5489480368150532
  (1, 312)	0.22197489844787852
  (2, 7156)	0.33671672463654934
  (2, 7140)	0.41087739018331193
  :	:
  (1112, 4597)	0.23602727834779066
  (1112, 4129)	0.5588600780072547
  (1112, 3173)	0.5581704944150844
  (1112, 2167)	0.173625894465186
  (1112, 2033)	0.2250389400930859
  (1112, 1292)	0.23602727834779066
  

Using Logistic Regression to train the model

In [54]:
# Logistic regression classifier with scikit-learn's default hyperparameters.
model = LogisticRegression()

In [55]:
# Fit the classifier on the TF-IDF training features and the integer labels.
model.fit(x_train,y_train)

In [56]:
# Predict on the training features and score against the true labels.
# accuracy_score's signature is (y_true, y_pred), so the ground truth goes first.
x_train_prediction = model.predict(x_train)
train_data_accuracy = accuracy_score(y_train, x_train_prediction)

In [57]:
# Report the training accuracy as a percentage.
print("Accuracy for training data: ",train_data_accuracy*100,"%")

Accuracy for training data:  96.7915638321741 %


In [58]:
# Predict on the held-out test features and score against the true labels.
# accuracy_score's signature is (y_true, y_pred), so the ground truth goes first.
x_test_prediction = model.predict(x_test)
test_data_accuracy = accuracy_score(y_test, x_test_prediction)

In [59]:
# Report the test accuracy as a percentage.
print("Accuracy for test data: ",test_data_accuracy*100,"%")

Accuracy for test data:  96.32286995515696 %


Creating a prediction application

In [93]:
# Sample message to classify; it is wrapped in a list because the vectorizer
# is called with a collection of documents.
input_text = [
    "WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only."
]

# Convert the text to TF-IDF features with the vectorizer fitted on the training data.
input_data = vectorizer.transform(input_text)


# Predict the label for the sample message (0 = ham, 1 = spam).
prediction = model.predict(input_data)

In [94]:
# Show the raw prediction, then the readable verdict (0 = ham, anything else = spam).
verdict = "The mail is ham" if prediction[0] == 0 else "The mail is spam"
print(prediction)
print(verdict)

[1]
The mail is spam
