In [1]:
import numpy as np  # numpy arrays  manipulation
import pandas as pd   #data analysis
import nltk   # python library for nlp tasks
import re  #python module to check and fetch patterns in text
from nltk.corpus import stopwords   # stop words are less important words
from nltk.stem.porter import PorterStemmer     # algorithm to reduce words to root form by removing suffix
from sklearn.feature_extraction.text import TfidfVectorizer   #converting text into numerical vectors
from sklearn.model_selection import train_test_split  #spliting data into train and test parts
from sklearn.linear_model import LogisticRegression  # model for classification
from sklearn.metrics import accuracy_score  #metrics to evaluate training and testing accuracy

In [2]:
nltk.download("stopwords")   # downloading stopwords available in nltk library

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
print(stopwords.words("english"))   # printing downloadded stopwords

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [4]:
df=pd.read_csv('/content/spam_and_ham_classification.csv')  #dataframe to show data set in tabular format

In [5]:
df.head()  # default method to show first 5 rows of data set

Unnamed: 0,label,text
0,ham,into the kingdom of god and those that are ent...
1,spam,there was flow at hpl meter 1505 on april firs...
2,ham,take a look at this one campaign for bvyhprice...
3,spam,somu wrote actually thats what i was looking f...
4,spam,fathi boudra wrote i fixed the issue in the sv...


In [6]:
df.isnull().sum()

Unnamed: 0,0
label,0
text,0


In [7]:
df['label']=df['label'].map({'ham':0,'spam':1})

In [8]:
print(df['label'])

0       0
1       1
2       0
3       1
4       1
       ..
7597    0
7598    0
7599    0
7600    1
7601    1
Name: label, Length: 7602, dtype: int64


In [9]:
print(type(df['text']))

<class 'pandas.core.series.Series'>


In [10]:
port_stem= PorterStemmer()  # creating variable and storing stemming function inside it

def stemming(text):  # creating function stemming on feature=text
  if isinstance(text, list):
        text = ' '.join(text)
  stemmed_text = re.sub('[^a-zA-Z]',' ',text)  # re.sub function will substitute anything other then upper and lower alphabets with a space
  stemmed_text = stemmed_text.lower()  # lower function will lower any upper case letter if present
  stemmed_text = stemmed_text.split()  #split function will split text into words or tokens
  stemmed_text = [port_stem.stem(word) for word in stemmed_text if not word in stopwords.words("english")]  #calling stemming function and a loop that removes stop words
  stemmed_text = ' '.join(stemmed_text)  #joining the stemmed text back into a single string with spaces
  return stemmed_text

In [11]:
df['text']=df['text'].apply(stemming)

In [12]:
print(df['text'])

0       kingdom god enter lord pardon escapenumb us th...
1       flow hpl meter april first deal ticket deal ti...
2       take look one campaign bvyhpric escapenumb esc...
3       somu wrote actual that look l r user enter str...
4       fathi boudra wrote fix issu svn repo rev escap...
                              ...                        
7597    ancient volum sir richard read day die lain ae...
7598    dear valu member buy drug web frequent pleas r...
7599    present us licens onlin pharmescapenumberci st...
7600    parrot bug summari http rt perl org rtescapenu...
7601    want repli left vacat thunderbird crash take s...
Name: text, Length: 7602, dtype: object


In [13]:
x=df.drop(columns='label',axis=1)
y=df['label']

In [14]:
print(x)
print(y)

                                                   text
0     kingdom god enter lord pardon escapenumb us th...
1     flow hpl meter april first deal ticket deal ti...
2     take look one campaign bvyhpric escapenumb esc...
3     somu wrote actual that look l r user enter str...
4     fathi boudra wrote fix issu svn repo rev escap...
...                                                 ...
7597  ancient volum sir richard read day die lain ae...
7598  dear valu member buy drug web frequent pleas r...
7599  present us licens onlin pharmescapenumberci st...
7600  parrot bug summari http rt perl org rtescapenu...
7601  want repli left vacat thunderbird crash take s...

[7602 rows x 1 columns]
0       0
1       1
2       0
3       1
4       1
       ..
7597    0
7598    0
7599    0
7600    1
7601    1
Name: label, Length: 7602, dtype: int64


In [15]:
print(x.shape)
print(y.shape)

(7602, 1)
(7602,)


In [16]:
X=df['text'].values
Y=df['label'].values

In [17]:
print(df['text'])

0       kingdom god enter lord pardon escapenumb us th...
1       flow hpl meter april first deal ticket deal ti...
2       take look one campaign bvyhpric escapenumb esc...
3       somu wrote actual that look l r user enter str...
4       fathi boudra wrote fix issu svn repo rev escap...
                              ...                        
7597    ancient volum sir richard read day die lain ae...
7598    dear valu member buy drug web frequent pleas r...
7599    present us licens onlin pharmescapenumberci st...
7600    parrot bug summari http rt perl org rtescapenu...
7601    want repli left vacat thunderbird crash take s...
Name: text, Length: 7602, dtype: object


In [18]:
vectorizer= TfidfVectorizer()
vectorizer.fit(X)

In [19]:
X=vectorizer.transform(X)

In [20]:
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 664237 stored elements and shape (7602, 59538)>
  Coords	Values
  (0, 1710)	0.07565900171035578
  (0, 4885)	0.062182903589414265
  (0, 5844)	0.09574886332872497
  (0, 7682)	0.13271775787701093
  (0, 7709)	0.06902455085291491
  (0, 9084)	0.0984966221769521
  (0, 9092)	0.09632068715432084
  (0, 9659)	0.06412013107903154
  (0, 10097)	0.05410421569195367
  (0, 10288)	0.06025961404450881
  (0, 13231)	0.06848847267820109
  (0, 13615)	0.0915602344902293
  (0, 16789)	0.07227033746903115
  (0, 17260)	0.38337278601586683
  (0, 17586)	0.14401280390607119
  (0, 17953)	0.11481553347703104
  (0, 18162)	0.05344162669353637
  (0, 18337)	0.10649147316909
  (0, 19787)	0.09952614176802105
  (0, 20205)	0.09200067500169372
  (0, 21706)	0.11309865154000964
  (0, 22146)	0.1611887955539444
  (0, 22519)	0.10100855481246143
  (0, 23091)	0.09364331851011624
  (0, 23493)	0.13568874359813016
  :	:
  (7601, 48570)	0.0421163443831929
  (7601, 48591)	0.036

Spliting the data into train test split

In [21]:
X_train,X_test,Y_train,Y_test= train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=2)

In [22]:
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(6081, 59538)
(1521, 59538)
(6081,)
(1521,)


Training the model

In [23]:
model= LogisticRegression()
model.fit(X_train,Y_train)

Evaluating model

In [24]:
# Evaluating training accuracy

In [25]:
X_train_prediction= model.predict(X_train)
train_accuracy= accuracy_score(X_train_prediction,Y_train)
print(train_accuracy)

0.9843775694787041


In [26]:
# Evaluating test accuracy

In [27]:
X_test_prediction= model.predict(X_test)
test_accuracy= accuracy_score(X_test_prediction,Y_test)
print(test_accuracy)

0.9631821170282708


Building a predictive system

In [28]:
df['text'].iloc[2]

'take look one campaign bvyhpric escapenumb escapenumb escapenumb day target price escapenumb escapenumbermarket hellish sym gain momentum see news theescapenumb call broker'

In [29]:
input_data=["into the kingdom of god and those that are entering in he lord pardon escapenumber us in this thing we pray thee have us excused escapenumbernot therefore o escapenumber believers to look into the grave for to you it the holy ghost and escapenumber therefore being a holy habitation unto forth the words of truth and escapenumber soberness escapenumber but in all probability hath made them mad escapenumber and though escapenumber blessed be god all do that of jesus christ they see the necessity of escapenumber closing with a more than almost christians but is heaven so small a escapenumber trifle in men's esteem as not to be total renovation of the whole man escapenumber by the righteousness of complete though we be delivered from the power escapenumber we are not candle of the lord shines out and your redeemer lifts up the escapenumber inward holiness as indeed sometimes they do though in a worms destroy escapenumber them yet even in their flesh shall they see a mind to see jesus but escapenumber then they cannot come to him jeannine walsh"]
cleaned_input= stemming(input_data)
input_vector= vectorizer.transform([cleaned_input])
prediction= model.predict(input_vector)
print(prediction)
if (prediction[0]==1):
  print("Its a Spam Email")
else:
  print("Its a Ham email")

[0]
Its a Ham email
