In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
# Load the mail data set from the CSV file in the working directory
raw_data = pd.read_csv(
    "mail_data.csv",
)
# Bare last expression -> rich preview of the loaded frame
raw_data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
# Keep every non-null cell as it is and put an empty string wherever a value is missing.
# The result is a new frame; raw_data itself is left untouched.
mail_dataset = raw_data.where(raw_data.notna(), other='')

In [5]:
# Preview the frame after missing values were replaced with empty strings
mail_dataset

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [6]:
# The 'Category' column holds two classes of mail: "spam" and "ham".
# Label-encode them in place, in this order: spam -> 0, then ham -> 1.
for label, code in (('spam', 0), ('ham', 1)):
    mail_dataset.loc[mail_dataset['Category'] == label, 'Category'] = code

In [7]:
# Split the frame into model input and target for Logistic Regression:
#   X -> the message text ("Message" column)
#   Y -> the encoded label ("Category" column)
X, Y = mail_dataset['Message'], mail_dataset['Category']

In [8]:
# Preview the "Message" column that serves as model input
X

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object

In [9]:
# Preview the encoded "Category" labels (0 = spam, 1 = ham)
Y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object

In [10]:
# Hold out 20% of the samples as test data; the fixed random_state keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [11]:
# Number of messages in the full data set
X.shape

(5572,)

In [12]:
# Number of messages in the training split
X_train.shape

(4457,)

In [13]:
# Number of messages in the test split
X_test.shape

(1115,)

In [14]:
# Converting the text values of "Message" into numerical feature vectors
# (TF-IDF weights) so they can be used as input for Logistic Regression.
# NOTE: `lowercase` must be the boolean True, not the string 'True'. The string
# only worked because it is truthy; scikit-learn releases that validate
# parameter types reject it when the vectorizer is fitted.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

In [15]:
# Learn the vocabulary and IDF weights from the training messages only,
# then vectorize the training messages with them ...
X_train_features = feature_extraction.fit_transform(
    X_train,
)
# ... and reuse the already-fitted vectorizer for the test messages (no re-fitting)
X_test_features = feature_extraction.transform(
    X_test,
)

In [16]:
# Converting the Y labels to integers: after label encoding the column still has
# dtype object, so cast both splits to an integer dtype before training
Y_train = Y_train.astype(int)
Y_test = Y_test.astype(int)

In [18]:
# Print the vectorized training data: each entry is shown as (row, column) followed by its value
print(X_train_features)

  (0, 5413)	0.6198254967574347
  (0, 4456)	0.4168658090846482
  (0, 2224)	0.413103377943378
  (0, 3811)	0.34780165336891333
  (0, 2329)	0.38783870336935383
  (1, 4080)	0.18880584110891163
  (1, 3185)	0.29694482957694585
  (1, 3325)	0.31610586766078863
  (1, 2957)	0.3398297002864083
  (1, 2746)	0.3398297002864083
  (1, 918)	0.22871581159877646
  (1, 1839)	0.2784903590561455
  (1, 2758)	0.3226407885943799
  (1, 2956)	0.33036995955537024
  (1, 1991)	0.33036995955537024
  (1, 3046)	0.2503712792613518
  (1, 3811)	0.17419952275504033
  (2, 407)	0.509272536051008
  (2, 3156)	0.4107239318312698
  (2, 2404)	0.45287711070606745
  (2, 6601)	0.6056811524587518
  (3, 2870)	0.5864269879324768
  (3, 7414)	0.8100020912469564
  (4, 50)	0.23633754072626942
  (4, 5497)	0.15743785051118356
  :	:
  (4454, 4602)	0.2669765732445391
  (4454, 3142)	0.32014451677763156
  (4455, 2247)	0.37052851863170466
  (4455, 2469)	0.35441545511837946
  (4455, 5646)	0.33545678464631296
  (4455, 6810)	0.29731757715898277
  (4

In [19]:
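# X_train_features is now a sparse matrix of TF-IDF weights: each printed entry is (message index, vocabulary term index) followed by that term's weight. This is the result of text vectorization, not of feature scaling.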

In [21]:
# Create the classifier; no arguments are passed, so scikit-learn's defaults are used
model=LogisticRegression()

In [22]:
# Fit the Logistic Regression model on the TF-IDF features and integer labels of the training split
model.fit(
    X_train_features,
    Y_train,
)

LogisticRegression()

In [23]:
# Evaluating the trained model (checking how accurately it predicts on the training and test data)

In [25]:
# Predict labels for the data the model was trained on ...
predict_on_trainingdata = model.predict(X_train_features)
# ... and measure the fraction of those predictions that match the true labels
accuracy_on_trainingdata = accuracy_score(
    y_true=Y_train,
    y_pred=predict_on_trainingdata,
)

In [28]:
# Report the training accuracy as a percentage
print("The accuracy on the training data is ", accuracy_on_trainingdata * 100, "%")

The accuracy on the training data is  96.70181736594121 %


In [29]:
# Predict labels for the held-out test messages ...
predict_on_testdata = model.predict(X_test_features)
# ... and measure the fraction of those predictions that match the true labels
accuracy_on_testdata = accuracy_score(
    y_true=Y_test,
    y_pred=predict_on_testdata,
)

In [30]:
# Report the test accuracy as a percentage
print("The accuracy on the test data is ", accuracy_on_testdata * 100, "%")

The accuracy on the test data is  96.59192825112108 %


In [31]:
# Classify a hand-written example mail as "spam" (0) or "ham" (1).
# The message is wrapped in a list because the vectorizer expects an iterable of documents.
input_mail = [
    "India vs England!!don't miss the score.For live updates stay tuned.",
]

# Vectorize the mail with the vectorizer fitted on the training data
input_datafeatured = feature_extraction.transform(input_mail)

# Ask the trained model for the predicted label
predict_data = model.predict(input_datafeatured)

In [36]:
# Show the predicted label array (0 = spam, 1 = ham)
print("The above mail is", predict_data)

The above mail is [1]


In [40]:
# Second example: a message deliberately chosen because it is spam
# (per the author, taken from the training set)
input_mail1 = [
    "Fancy a shag? I do.Interested? sextextuk.com txt XXUK SUZY to 69876. Txts cost 1.50 per msg. TnCs on website. X",
]
# Vectorize with the fitted vectorizer, then predict with the trained model
input_datafeatured1 = feature_extraction.transform(input_mail1)
predict_data1 = model.predict(input_datafeatured1)

In [41]:
# Show the predicted label array (0 = spam, 1 = ham)
print("The above mail is", predict_data1)

The above mail is [0]
