<a href="https://colab.research.google.com/github/Bunny825/Spam-mail-prediction/blob/main/spam_mail_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing the dependencies

In [54]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

Data Extraction and preprocessing

In [2]:
mail_data=pd.read_csv("/mail_data.csv")
mail_data.shape

(5572, 2)

In [3]:
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
mail_data["Category"].value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
ham,4825
spam,747


In [42]:
#that's an unbalanced datasat so let's balance it now
#Divide the samples of spam and ham separetely
spam=mail_data[mail_data.Category=="spam"]
ham=mail_data[mail_data.Category=="ham"]
print(spam.shape)
print(ham.shape)

(747, 2)
(4825, 2)


In [43]:
#sample the ham data restricting to the number of sapmples approx equal to spam samples
ham=ham.sample(n=750)
print(ham.shape)
print(spam.shape)
#they look more balanced now so we may get better predictions and training advantage

(750, 2)
(747, 2)


In [12]:
#so now we have two different dataframes one is ham and another is spam with their name signifying values in it
#concat both the dataframes into a single dataframe using pandas dataframe
new_mail_data=pd.concat([spam,ham],axis=0)
print(new_mail_data.shape)

(1497, 2)


In [13]:
print(new_mail_data.head())

   Category                                            Message
2      spam  Free entry in 2 a wkly comp to win FA Cup fina...
5      spam  FreeMsg Hey there darling it's been 3 week's n...
8      spam  WINNER!! As a valued network customer you have...
9      spam  Had your mobile 11 months or more? U R entitle...
11     spam  SIX chances to win CASH! From 100 to 20,000 po...


In [44]:
#changing the textual data of Category column to binary labels
encoder=LabelEncoder()
new_mail_data["Category"]=encoder.fit_transform(new_mail_data["Category"])

In [45]:
new_mail_data["Category"].value_counts()
#spam-->1
#ham-->0

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
0,750
1,747


In [46]:
#converting all the textual data in the body of mail to numeric values based on the Tfidf values of those
#The score is based on the number of times the word appears in the line as well as in the document.
#min_df says that a word's score should be atleast the given value if lesser than that then just ignore it.
#it signfies that the word has repeated very less number of times
#stop_words:Including this attribute says to avoid the english stopwords in the data such as I,you,we,me,has,had etc..
#lowercase:this says to reduce all the text to lowercase so "Rama" and "RAMA" will have same idf score

x=new_mail_data["Message"]
y=new_mail_data["Category"]
extraction=TfidfVectorizer(min_df=1,stop_words="english",lowercase=True)
x=extraction.fit_transform(x)
print(x)
print(y)

  (0, 76)	0.24242874768506065
  (0, 879)	0.1639398359931388
  (0, 3217)	0.16287280356015804
  (0, 4006)	0.10941783560586357
  (0, 3682)	0.19583719831454716
  (0, 3186)	0.19583719831454716
  (0, 3247)	0.1639398359931388
  (0, 697)	0.22526056325257834
  (0, 3850)	0.11901517526945986
  (0, 377)	0.2327601276159283
  (0, 386)	0.24242874768506065
  (0, 3917)	0.22526056325257834
  (0, 1792)	0.1841310636023145
  (0, 1438)	0.2139521692606413
  (0, 1738)	0.4848574953701213
  (0, 4236)	0.140106310657879
  (0, 1340)	0.19876153745961425
  (0, 4260)	0.1883376339511972
  (0, 1686)	0.350931525548715
  (0, 1860)	0.10557843871858724
  (1, 3224)	0.32858142780878835
  (1, 545)	0.1969182613310201
  (1, 3458)	0.176252663134934
  (1, 1265)	0.34705130798386635
  (1, 4317)	0.2523279018562539
  :	:
  (1492, 2305)	0.22737334868509687
  (1492, 3465)	0.3542862085123131
  (1493, 1500)	0.3620775906714282
  (1493, 4154)	0.3620775906714282
  (1493, 4274)	0.29619449330825387
  (1493, 2888)	0.3185312319801384
  (1493, 2

In [52]:
#spliting the data for training and testing purpose

x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=3,test_size=0.2,stratify=y)
print(x.shape,x_train.shape,x_test.shape)

(1497, 4362) (1197, 4362) (300, 4362)


In [53]:
#we are using the Logistic regression as it is a binary classification problem

model=LogisticRegression()
model.fit(x_train,y_train)

train_predict=model.predict(x_train)
train_accuracy=accuracy_score(train_predict,y_train)
print(train_accuracy)

test_predict=model.predict(x_test)
test_accuracy=accuracy_score(test_predict,y_test)
print(test_accuracy)

#we got the best accuracy for both training and testing data so no case of underfit


0.9891395154553049
0.95


In [40]:
#prediction function

def mail_predict(input):
  input=extraction.transform(input)
  output=model.predict(input)
  if output==0:
    print("Ham Mail")
  else:
    print("Spam Mail")

In [41]:
inp1=["WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only."]
inp2=["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times."]
mail_predict(inp1)
mail_predict(inp2)

Spam Mail
Ham Mail
