In [75]:
# Importing necessary libraries
import os

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.model_selection import train_test_split

Data Collection and Pre-processing

In [76]:
# Load the SMS spam dataset from a CSV file into a pandas DataFrame.
# The location can be overridden with the SPAM_CSV_PATH environment variable so the
# notebook is runnable on other machines; the default is the original local path.
SPAM_CSV_PATH = os.environ.get(
    "SPAM_CSV_PATH",
    r"C:\Users\WAGHMARE\Downloads\archive (5)\spam.csv",
)
# header=None gives the columns integer labels, which the later cells refer to.
df = pd.read_csv(SPAM_CSV_PATH, encoding='ISO-8859-1', header=None)
df.head()

Unnamed: 0,0,1,2,3,4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [77]:
# Inspect the column labels. In the preview above, columns 2, 3 and 4 show no
# values, so those columns are dropped in the next step.
df.columns

Index([0, 1, 2, 3, 4], dtype='int64')

In [78]:
# Drop the extra columns (labels 2, 3 and 4), keeping only columns 0 and 1
df = df.drop(columns=[2, 3, 4])

In [104]:
# Show the first 5 rows of the data
df.head(n=5)

Unnamed: 0,category,message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [100]:
# Name the two remaining columns: the label column becomes 'category'
# and the text column becomes 'message'
df.columns = ['category', 'message']
df.head(n=5)

Unnamed: 0,category,message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [81]:
# Summarise the DataFrame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  5572 non-null   object
 1   message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [105]:
# Check the number of rows and columns in the DataFrame
df.shape

(5572, 2)

In [103]:
# Count the missing values in each column
df.isna().sum()

category    0
message     0
dtype: int64

Label Encoding

In [101]:
# Encode the labels numerically: spam -> 0, ham -> 1
is_spam = df['category'] == 'spam'
is_ham = df['category'] == 'ham'
df.loc[is_spam, 'category'] = 0
df.loc[is_ham, 'category'] = 1

* Spam : 0 
* Ham : 1

In [106]:
# Separate the data into texts (x) and labels (y)
x, y = df['message'], df['category']

In [85]:
# Preview the message texts that will be used as model input
print(x)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: message, Length: 5572, dtype: object


In [86]:
# Preview the encoded labels (0 = spam, 1 = ham)
print(y)

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: category, Length: 5572, dtype: object


In [87]:
# Hold out 20% of the data for testing and train on the remaining 80%;
# random_state pins the shuffle so the split is reproducible
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=3,
)

In [88]:
# Compare the sizes of the full data set, the training split and the test split
for split in (x, x_train, x_test):
    print(split.shape)

(5572,)
(4457,)
(1115,)


In [107]:
# Turn the raw messages into TF-IDF feature vectors that the logistic regression can consume
fea_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

# Fit the vectorizer on the training messages only, then apply it unchanged to the test messages
x_train_fea = fea_extraction.fit_transform(x_train)
x_test_fea = fea_extraction.transform(x_test)

# Cast the labels to integers (after the encoding step they still have dtype object)
y_train, y_test = y_train.astype('int'), y_test.astype('int')


In [108]:
# Preview the raw training messages (before vectorisation)
print(x_train)

3075    Mum, hope you are having a great day. Hoping t...
1787                           Yes:)sura in sun tv.:)lol.
1614    Me sef dey laugh you. Meanwhile how's my darli...
4304                Yo come over carlos will be here soon
3266                    Ok then i come n pick u at engin?
                              ...                        
789                          Gud mrng dear hav a nice day
968             Are you willing to go for aptitude class.
1667    So now my dad is gonna call after he gets out ...
3321    Ok darlin i supose it was ok i just worry too ...
1688                     Nan sonathaya soladha. Why boss?
Name: message, Length: 4457, dtype: object


In [109]:
# Preview the TF-IDF features of the training messages; the output below lists
# (row, column) index pairs followed by a weight
print(x_train_fea)

  (0, 741)	0.3219352588930141
  (0, 3979)	0.2410582143632299
  (0, 4296)	0.3891385935794867
  (0, 6599)	0.20296878731699391
  (0, 3386)	0.3219352588930141
  (0, 2122)	0.38613577623520473
  (0, 3136)	0.440116181574609
  (0, 3262)	0.25877035357606315
  (0, 3380)	0.21807195185332803
  (0, 4513)	0.2909649098524696
  (1, 4061)	0.380431198316959
  (1, 6872)	0.4306015894277422
  (1, 6417)	0.4769136859540388
  (1, 6442)	0.5652509076654626
  (1, 7443)	0.35056971070320353
  (2, 933)	0.4917598465723273
  (2, 2109)	0.42972812260098503
  (2, 3917)	0.40088501350982736
  (2, 2226)	0.413484525934624
  (2, 5825)	0.4917598465723273
  (3, 6140)	0.4903863168693604
  (3, 1599)	0.5927091854194291
  (3, 1842)	0.3708680641487708
  (3, 7453)	0.5202633571003087
  (4, 2531)	0.7419319091456392
  :	:
  (4452, 2122)	0.31002103760284144
  (4453, 999)	0.6760129013031282
  (4453, 7273)	0.5787739591782677
  (4453, 1762)	0.45610005640082985
  (4454, 3029)	0.42618909997886
  (4454, 2086)	0.3809693742808703
  (4454, 3088)

Training the model 

Logistic Regression

In [111]:
 model = LogisticRegression()

In [112]:
# Fit the logistic regression model on the TF-IDF features of the training data
model.fit(X=x_train_fea, y=y_train)

In [113]:
# Evaluate the fitted model on the data it was trained on:
# predict the training labels, then compare them with the true labels
pred_on_train = model.predict(X=x_train_fea)
accuracy_on_train = accuracy_score(y_true=y_train, y_pred=pred_on_train)

In [95]:
# Report the accuracy measured on the training data
print('Accuracy Of Training data : ', accuracy_on_train) 

Accuracy Of Training data :  0.9661207089970832


In [114]:
# Evaluate the fitted model on the held-out test data
pred_on_test = model.predict(X=x_test_fea)
accuracy_on_test = accuracy_score(y_true=y_test, y_pred=pred_on_test)

In [115]:
# Report the accuracy measured on the held-out test data
print('Accuracy Of Test data : ', accuracy_on_test) 

Accuracy Of Test data :  0.9623318385650225


Building a Predictive System

In [119]:
# Classify a new message with the already-fitted vectorizer and model
input_mail=['Thanks for your subscription to Ringtone UK your mobile will be charged å£5/month Please confirm by replying YES or NO. If you reply NO you will not be charged,,,']

# Only transform here: the vectorizer must keep the vocabulary learned from the training data
input_data_feactures = fea_extraction.transform(input_mail)

prediction = model.predict(input_data_feactures)

# model.predict returns one label per input message, so report each element
# individually instead of truth-testing the whole array (which raises a
# ValueError as soon as input_mail holds more than one message)
for predicted_label in prediction:
    # Labels follow the encoding used earlier: 0 = spam, 1 = ham
    print("spam" if predicted_label == 0 else "ham")

spam


In [120]:
# Classify another message with the already-fitted vectorizer and model
input_mail=['Fair enough, anything going on?",,,']

# Only transform here: the vectorizer must keep the vocabulary learned from the training data
input_data_feactures = fea_extraction.transform(input_mail)

prediction = model.predict(input_data_feactures)

# model.predict returns one label per input message, so report each element
# individually instead of truth-testing the whole array (which raises a
# ValueError as soon as input_mail holds more than one message)
for predicted_label in prediction:
    # Labels follow the encoding used earlier: 0 = spam, 1 = ham
    print("spam" if predicted_label == 0 else "ham")

ham
