In [1]:
# Importing required libraries
# numpy to perform mathematical operations
# pandas to perform data manipulation
# train_test_split to split test and train data
# TfidfVectorizer to transform text into a meaningful numerical representation that can be used to fit a machine learning algorithm for prediction.


import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import accuracy_score

In [2]:
# Load the raw spam/ham dataset from the Excel workbook next to the notebook.
dataset_path = 'Spam Email Detection.xlsx'
df = pd.read_excel(dataset_path)

In [3]:
# Print the plain-text representation of the loaded frame for a first look at its columns.
print(df)

        v1                                                 v2 Unnamed: 2  \
0      ham  Go until jurong point, crazy.. Available only ...        NaN   
1      ham                      Ok lar... Joking wif u oni...        NaN   
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3      ham  U dun say so early hor... U c already then say...        NaN   
4      ham  Nah I don't think he goes to usf, he lives aro...        NaN   
...    ...                                                ...        ...   
5567  spam  This is the 2nd time we have tried 2 contact u...        NaN   
5568   ham              Will ÔøΩ_ b going to esplanade fr home?        NaN   
5569   ham  Pity, * was in mood for that. So...any other s...        NaN   
5570   ham  The guy did some bitching but I acted like i'd...        NaN   
5571   ham                         Rofl. Its true to its name        NaN   

     Unnamed: 3 Unnamed: 4  
0           NaN        NaN  
1           NaN        NaN 

In [4]:
# Keep every cell that holds a value and substitute an empty string for each missing one.
present_mask = df.notna()
data = df.where(present_mask, '')

In [5]:

# Preview the first five rows of the cleaned frame (head() defaults to n=5).
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [6]:
# Encode the categorical label column v1 as numbers: spam -> 0, ham -> 1.
# Both masks are taken from the original string labels before anything is
# overwritten; rows matching neither label are left unchanged.
is_spam = data['v1'] == 'spam'
is_ham = data['v1'] == 'ham'

data.loc[is_spam, 'v1'] = 0
data.loc[is_ham, 'v1'] = 1

In [7]:
# Separate the message text (model input) from the encoded label (target).
x, y = data['v2'], data['v1']



In [8]:
# Show the message-text Series (column v2) that will be used as model input.
print(x)


0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will ÔøΩ_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: v2, Length: 5572, dtype: object


In [38]:
# Show the encoded label Series (column v1): 0 = spam, 1 = ham.
print(y)


0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: v1, Length: 5572, dtype: object


In [39]:
# Split into train and test sets: 80% train, 20% test.
# random_state pins the shuffle so that Restart & Run All reproduces the same
# partition (and therefore the same accuracy figures); without it every run
# draws a different random split.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [40]:
# TfidfVectorizer(min_df=1, stop_words='english', lowercase=True): the next cell initializes a TF-IDF vectorizer with the following parameters:
# min_df=1: This parameter specifies the minimum number of documents a word must appear in to be included in the vocabulary. In this case, it's set to 1, meaning that even words appearing in just one document will be considered.
# stop_words='english': This parameter tells the vectorizer to remove English stop words (common words like "the", "is", "and", etc.) from the documents, as they often don't contribute much to the meaning.
# lowercase=True: This parameter specifies whether to convert all text to lowercase before processing. This helps in ensuring that words with different cases (e.g., "hello" and "Hello") are treated as the same word.

In [41]:
# Force every document to a plain string so the vectorizer never sees a
# non-text cell value.
x_train = list(map(str, x_train))
x_test = list(map(str, x_test))

# Learn the vocabulary and IDF weights from the training text only, then apply
# that same fitted transformation to the test text.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)
x_train_features = feature_extraction.fit_transform(x_train)
x_test_features = feature_extraction.transform(x_test)

# The label Series is of object dtype after encoding; cast it to integers for
# the classifier.
y_train = y_train.astype(int)
y_test = y_test.astype(int)

In [42]:
# Print the TF-IDF training matrix; each output line reads
# (document index, vocabulary index) followed by the term's weight.
print(x_train_features)

  (0, 1999)	0.31664107671486014
  (0, 1148)	0.23599474522296268
  (0, 4511)	0.20399825508486763
  (0, 2965)	0.3394229538383472
  (0, 3659)	0.2582918550878144
  (0, 2229)	0.21530846863003025
  (0, 1548)	0.33254811889871355
  (0, 4524)	0.285727995692045
  (0, 2459)	0.3049401162371403
  (0, 4935)	0.21093259390082705
  (0, 981)	0.33254811889871355
  (0, 7074)	0.22560642897889022
  (0, 3211)	0.2880155816619519
  (1, 1858)	0.3246554218431271
  (1, 4801)	0.3489204238016836
  (1, 5136)	0.46548739053612515
  (1, 4949)	0.2939382140995421
  (1, 2820)	0.40390263350382505
  (1, 6408)	0.3716495252558365
  (1, 4163)	0.4105051808339837
  (2, 3251)	0.7788686640166792
  (2, 6047)	0.6271870567963542
  (3, 4339)	0.12918564009574465
  (3, 7098)	0.13301588261694494
  (3, 535)	0.15224388589945037
  :	:
  (4454, 2496)	0.2261282533680302
  (4454, 3174)	0.2487865915084544
  (4454, 1678)	0.21883389773716977
  (4454, 6632)	0.14652793486281177
  (4454, 4665)	0.19961969708641403
  (4454, 2855)	0.13706427763233958
 

In [43]:
#Importing Logistic Regression

from sklearn.linear_model import LogisticRegression


In [None]:
# Create the classifier: a scikit-learn LogisticRegression left at its default hyperparameters.

model = LogisticRegression()

In [44]:
# Fit the model: the TF-IDF training features are the input and y_train
# (0 = spam, 1 = ham) is the expected output.

model.fit(x_train_features, y_train)

In [45]:
# Evaluate the fitted model on the data it was trained on:
#   x_train_features            -> model input (TF-IDF matrix)
#   prediction_on_training_data -> labels predicted by the model
#   y_train                     -> expected labels
#   training_data_accuracy      -> accuracy score of the predictions against y_train
prediction_on_training_data = model.predict(x_train_features)
training_data_accuracy = accuracy_score(
    y_true=y_train,
    y_pred=prediction_on_training_data,
)

In [46]:
# Report the accuracy measured on the training split.

print('Accuracy of training data: ',training_data_accuracy)

Accuracy of training data:  0.9670181736594121


In [47]:
# Evaluate the fitted model on the held-out test split:
#   x_test_features         -> model input (TF-IDF matrix)
#   prediction_on_test_data -> labels predicted by the model
#   y_test                  -> expected labels
#   test_data_accuracy      -> accuracy score of the predictions against y_test
prediction_on_test_data = model.predict(x_test_features)
test_data_accuracy = accuracy_score(
    y_true=y_test,
    y_pred=prediction_on_test_data,
)


In [48]:
# Report the accuracy measured on the held-out test split.

print('Accuracy of test data: ',test_data_accuracy)

Accuracy of test data:  0.9587443946188341


In [49]:
# Classify a hand-written e-mail with the fitted vectorizer + model.
mail_input = ["Dear Akash Nivaria,Here's a confirmation that you've chosen to delete your Snapchat account akashnivaria.For now, your Snapchat account has been deactivated. In 30 days, your account will be deleted. If you'd like to reactivate your account before that happens, just log in to Snapchat üòâHope to see you again soon! üëèTeam Snapchat"]

# Reuse the vectorizer that was fitted on the training text so the feature
# columns line up with what the model was trained on.
input_data_features = feature_extraction.transform(mail_input)

# One predicted label per entry of mail_input (0 = spam, 1 = ham, following
# the encoding applied to column v1).
prediction = model.predict(input_data_features)

# Walk over the predicted labels instead of testing the whole array at once:
# `if (prediction==0)` only works while the array holds exactly one element
# and raises ValueError (ambiguous truth value) as soon as mail_input contains
# more than one message.
for label in prediction:
    if label == 0:
        print("The given mail is a Spam!!")
    else:
        print("The given mail is not a Spam!!")



The given mail is not a Spam!!


In [50]:
# Classify a second hand-written e-mail with the fitted vectorizer + model.
mail_input = ["Dear Valued Customer,Congratulations! You've been selected as the winner of our exclusive vacation giveaway! You and a guest will enjoy a luxurious, all-expenses-paid vacation to a tropical paradise.But that's not all! As a bonus, you'll also receive a free 3-night stay at a 5-star resort and complimentary airfare. This is a once-in-a-lifetime opportunity that you won't want to miss!To claim your prize, simply click on the link below and provide your contact information:[Link to Claim Your Prize]Hurry, this offer is only available for a limited time! Don't miss out on your chance to experience the vacation of your dreams.Best Regards,The Vacation Giveaway Team"]

# Reuse the vectorizer that was fitted on the training text so the feature
# columns line up with what the model was trained on.
input_data_features = feature_extraction.transform(mail_input)

# One predicted label per entry of mail_input (0 = spam, 1 = ham, following
# the encoding applied to column v1).
prediction = model.predict(input_data_features)

# Walk over the predicted labels instead of testing the whole array at once:
# `if (prediction==0)` only works while the array holds exactly one element
# and raises ValueError (ambiguous truth value) as soon as mail_input contains
# more than one message.
for label in prediction:
    if label == 0:
        print("The given mail is a Spam!!")
    else:
        print("The given mail is not a Spam!!")

The given mail is a Spam!!
