In [27]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [28]:
# Read the labelled message workbook (path is relative to the kernel's working directory).
df = pd.read_excel(io='Spam Email Detection.xlsx')

In [29]:
# Plain-text dump of the raw frame (pandas truncates the middle rows).
print(str(df))

        v1                                                 v2 Unnamed: 2  \
0      ham  Go until jurong point, crazy.. Available only ...        NaN   
1      ham                      Ok lar... Joking wif u oni...        NaN   
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3      ham  U dun say so early hor... U c already then say...        NaN   
4      ham  Nah I don't think he goes to usf, he lives aro...        NaN   
...    ...                                                ...        ...   
5567  spam  This is the 2nd time we have tried 2 contact u...        NaN   
5568   ham              Will ÔøΩ_ b going to esplanade fr home?        NaN   
5569   ham  Pity, * was in mood for that. So...any other s...        NaN   
5570   ham  The guy did some bitching but I acted like i'd...        NaN   
5571   ham                         Rofl. Its true to its name        NaN   

     Unnamed: 3 Unnamed: 4  
0           NaN        NaN  
1           NaN        NaN 

In [30]:
# Keep every non-missing cell and substitute an empty string for each missing one.
data = df.where(df.notna(), other='')

In [31]:

# Preview the first five rows of the null-free frame.
data.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [32]:
# Encode the label column in place: 'spam' -> 0, 'ham' -> 1.
# Rows whose v1 is neither value are left as they are.
data.loc[data['v1'].eq('spam'), 'v1'] = 0
data.loc[data['v1'].eq('ham'), 'v1'] = 1

In [33]:
# Target (encoded label column) and input (message text column).
y, x = data['v1'], data['v2']



In [34]:
# Show the message-text series.
print(str(x))


0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will ÔøΩ_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: v2, Length: 5572, dtype: object


In [35]:
# Show the encoded label series (0 = spam, 1 = ham).
print(str(y))


0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: v1, Length: 5572, dtype: object


In [36]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=3,
)

In [37]:
# Sizes of the full, training and test label sets, one per line.
print(y.shape, y_train.shape, y_test.shape, sep='\n')

(5572,)
(4457,)
(1115,)


In [38]:
# Sizes of the full, training and test message sets, one per line.
print('\n'.join(str(part.shape) for part in (x, x_train, x_test)))

(5572,)
(4457,)
(1115,)


In [39]:
# Coerce every document to str before vectorising
# (presumably guards against non-text cells in the workbook -- TODO confirm).
x_train = list(map(str, x_train))
x_test = list(map(str, x_test))

# TF-IDF over lower-cased text with English stop words removed.
# The vocabulary is fitted on the training documents only and then reused for the test set.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)
x_train_features = feature_extraction.fit_transform(x_train)
x_test_features = feature_extraction.transform(x_test)

# The label column holds 0/1 values stored as objects; cast them to integers for the classifier.
y_train = y_train.astype(int)
y_test = y_test.astype(int)

In [40]:
# Inspect the TF-IDF matrix as (row, column) -> weight entries.
print(str(x_train_features))

  (0, 742)	0.32207229533730536
  (0, 3962)	0.2411608243124387
  (0, 4279)	0.3893042361045832
  (0, 6580)	0.20305518394534605
  (0, 3375)	0.32207229533730536
  (0, 2116)	0.38519642807943744
  (0, 3126)	0.4403035234544808
  (0, 3251)	0.258880502955985
  (0, 3369)	0.21816477736422235
  (0, 4497)	0.2910887633154199
  (1, 4045)	0.380431198316959
  (1, 6850)	0.4306015894277422
  (1, 6397)	0.4769136859540388
  (1, 6422)	0.5652509076654626
  (1, 7420)	0.35056971070320353
  (2, 934)	0.4917598465723273
  (2, 2103)	0.42972812260098503
  (2, 3899)	0.40088501350982736
  (2, 2220)	0.413484525934624
  (2, 5806)	0.4917598465723273
  (3, 6121)	0.4903863168693604
  (3, 1595)	0.5927091854194291
  (3, 1838)	0.3708680641487708
  (3, 7430)	0.5202633571003087
  (4, 2523)	0.7419319091456392
  :	:
  (4452, 2116)	0.3092200696489299
  (4453, 1000)	0.6760129013031282
  (4453, 7250)	0.5787739591782677
  (4453, 1758)	0.45610005640082985
  (4454, 3019)	0.42618909997886
  (4454, 2080)	0.3809693742808703
  (4454, 3078

In [41]:
# Logistic-regression classifier created with the library's default hyperparameters.
model = LogisticRegression()

In [42]:
# Fit the classifier on the TF-IDF training matrix and the integer labels.
model.fit(X=x_train_features, y=y_train)

In [43]:
# Score the model on the same data it was fitted on.
prediction_on_training_data = model.predict(x_train_features)
training_data_accuracy = accuracy_score(
    y_true=y_train,
    y_pred=prediction_on_training_data,
)

In [44]:
# Report the fraction of training messages labelled correctly.
print('Accuracy of training data: ', training_data_accuracy, sep=' ')

Accuracy of training data:  0.9661207089970832


In [45]:
# Score the model on the held-out split.
prediction_on_test_data = model.predict(x_test_features)
test_data_accuracy = accuracy_score(
    y_true=y_test,
    y_pred=prediction_on_test_data,
)


In [46]:
# Report the fraction of held-out messages labelled correctly.
print('Accuracy of test data: ', test_data_accuracy, sep=' ')

Accuracy of test data:  0.9623318385650225


In [47]:
# Classify raw e-mail text with the fitted vectoriser and model.
# Labels follow the encoding applied to the v1 column: 0 = spam, 1 = ham.
mail_input = ["Dear Akash Nivaria,Here's a confirmation that you've chosen to delete your Snapchat account akashnivaria.For now, your Snapchat account has been deactivated. In 30 days, your account will be deleted. If you'd like to reactivate your account before that happens, just log in to Snapchat üòâHope to see you again soon! üëèTeam Snapchat"]

# transform (not fit_transform): reuse the vocabulary and weights learned from the training split.
input_data_features = feature_extraction.transform(mail_input)

# One predicted label per entry of mail_input.
prediction = model.predict(input_data_features)

# Report each message separately instead of testing the whole array in `if`:
# the truth value of a multi-element NumPy comparison is ambiguous and raises
# ValueError, so the array-level check only worked for exactly one message.
for predicted_label in prediction:
    if predicted_label == 0:
        print("The given mail is a Spam!!")
    else:
        print("The given mail is not a Spam!!")



The given mail is not a Spam!!


In [48]:
# Classify raw e-mail text with the fitted vectoriser and model.
# Labels follow the encoding applied to the v1 column: 0 = spam, 1 = ham.
mail_input = ["Dear Valued Customer,Congratulations! You've been selected as the winner of our exclusive vacation giveaway! You and a guest will enjoy a luxurious, all-expenses-paid vacation to a tropical paradise.But that's not all! As a bonus, you'll also receive a free 3-night stay at a 5-star resort and complimentary airfare. This is a once-in-a-lifetime opportunity that you won't want to miss!To claim your prize, simply click on the link below and provide your contact information:[Link to Claim Your Prize]Hurry, this offer is only available for a limited time! Don't miss out on your chance to experience the vacation of your dreams.Best Regards,The Vacation Giveaway Team"]

# transform (not fit_transform): reuse the vocabulary and weights learned from the training split.
input_data_features = feature_extraction.transform(mail_input)

# One predicted label per entry of mail_input.
prediction = model.predict(input_data_features)

# Report each message separately instead of testing the whole array in `if`:
# the truth value of a multi-element NumPy comparison is ambiguous and raises
# ValueError, so the array-level check only worked for exactly one message.
for predicted_label in prediction:
    if predicted_label == 0:
        print("The given mail is a Spam!!")
    else:
        print("The given mail is not a Spam!!")

The given mail is a Spam!!
