In [5]:
import numpy as np
import pandas as pd

In [6]:
# Load the labelled e-mail dataset from the working directory.
df = pd.read_csv(filepath_or_buffer='dataset.csv')

In [7]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [8]:
# (rows, columns) of the loaded frame.
df.shape

(5171, 4)

In [9]:
# Column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


In [10]:
# Count missing values in each column.
df.isnull().sum()

Unnamed: 0    0
label         0
text          0
label_num     0
dtype: int64

In [11]:
# Count rows that are exact duplicates of another row.
df.duplicated().sum()

0

In [12]:
# Class balance: number of ham vs. spam messages (absolute counts, not percentages).
df['label'].value_counts()

ham     3672
spam    1499
Name: label, dtype: int64

# Data preprocessing
##### Lowercase
##### Tokenization
##### Remove special characters
##### Remove stop words and punctuation
##### Stemming

In [13]:
import nltk
import string 



In [17]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
 
# Single Porter stemmer instance; transform_text calls ps.stem() on every kept token.
ps = PorterStemmer()

In [163]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return the NLTK English stop-word list as a frozenset, built once on first use."""
    return frozenset(stopwords.words('english'))


def transform_text(text):
    """Normalise a raw message into a space-separated string of stemmed tokens.

    Pipeline: lowercase -> NLTK word tokenization -> keep alphanumeric tokens
    only -> drop English stop words -> Porter stemming.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces; an empty
        string when no token survives the filters.
    """
    # The previous version evaluated stopwords.words('english') and scanned the
    # resulting list for every single token; a cached set makes the membership
    # test constant-time and removes the repeated lookups.
    stop_words = _english_stop_words()

    tokens = nltk.word_tokenize(text.lower())

    # isalnum() already rejects every token made of punctuation, so the former
    # separate string.punctuation test could never exclude anything further.
    kept = [tok for tok in tokens if tok.isalnum() and tok not in stop_words]

    return " ".join(ps.stem(tok) for tok in kept)


In [164]:
# Run transform_text on every message and store the cleaned text in a new column.
df['transformed_text'] = df['text'].apply(transform_text)

In [165]:
# Preview the frame, now including the transformed_text column.
df.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num,transformed_text
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0,subject enron methanol meter 988291 follow not...
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0,subject hpl nom januari 9 2001 see attach file...
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0,subject neon retreat ho ho ho around wonder ti...
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1,subject photoshop window offic cheap main tren...
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0,subject indian spring deal book teco pvr reven...


# Model Building 

In [21]:
# We start with Naive Bayes models because they work well on text data.

In [128]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Bag-of-words and TF-IDF vectorizers, both with default settings.
cv, tfidf = CountVectorizer(), TfidfVectorizer()

In [129]:
# Learn the TF-IDF vocabulary from the cleaned text and build the feature matrix.
# .toarray() converts the result to a dense array; at the (5171, 42592) shape
# reported below that is roughly 220 million cells, so expect a large memory footprint.
# NOTE(review): the vectorizer is fitted on the full corpus before the train/test
# split further down, so its statistics include the test documents — consider
# fitting on the training split only.
x= tfidf.fit_transform(df['transformed_text']).toarray()

In [130]:
# (documents, vocabulary terms) of the feature matrix.
x.shape

(5171, 42592)

In [131]:
# Target vector taken from the numeric label column (the preview above shows ham as 0, spam as 1).
y= df['label_num'].values

In [132]:
# Peek at the label vector.
y

array([0, 0, 0, ..., 0, 0, 1], dtype=int64)

In [133]:
# Number of labels; should match the row count of x.
y.shape

(5171,)

In [134]:
from sklearn.model_selection import train_test_split

In [135]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
)

In [136]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score

In [137]:
# One instance of each Naive Bayes variant, all with default hyper-parameters.
gnb, mnb, bnb = GaussianNB(), MultinomialNB(), BernoulliNB()

In [138]:
# Gaussian NB: train, predict the hold-out set, then report accuracy and precision.
y_pred1 = gnb.fit(x_train, y_train).predict(x_test)
print(
    accuracy_score(y_test, y_pred1),
    precision_score(y_test, y_pred1),
    sep="\n",
)

0.9516908212560387
0.9023569023569024


In [139]:
# Multinomial NB: train, predict the hold-out set, then report accuracy,
# the confusion matrix and precision.
y_pred2 = mnb.fit(x_train, y_train).predict(x_test)
print(
    accuracy_score(y_test, y_pred2),
    confusion_matrix(y_test, y_pred2),
    precision_score(y_test, y_pred2),
    sep="\n",
)

0.9227053140096618
[[746   0]
 [ 80 209]]
1.0


In [140]:
# Bernoulli NB: train, predict the hold-out set, then report accuracy and precision.
y_pred3 = bnb.fit(x_train, y_train).predict(x_test)
print(
    accuracy_score(y_test, y_pred3),
    precision_score(y_test, y_pred3),
    sep="\n",
)

0.851207729468599
0.8813559322033898


In [114]:
# Next, we try some additional machine learning models.

In [146]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier


In [147]:
# Two tree ensembles with the same size and seed so their scores are comparable.
rfc = RandomForestClassifier(
    n_estimators=80,
    random_state=2,
)
etc = ExtraTreesClassifier(
    n_estimators=80,
    random_state=2,
)


In [148]:
# Random forest: train, predict the hold-out set, then report accuracy and precision.
y_pred4 = rfc.fit(x_train, y_train).predict(x_test)
print(
    accuracy_score(y_test, y_pred4),
    precision_score(y_test, y_pred4),
    sep="\n",
)

0.9864734299516909
0.959866220735786


In [149]:
# Extra trees: train, predict the hold-out set, then report accuracy and precision.
y_pred5 = etc.fit(x_train, y_train).predict(x_test)
print(
    accuracy_score(y_test, y_pred5),
    precision_score(y_test, y_pred5),
    sep="\n",
)

0.9864734299516909
0.959866220735786


In [150]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier constructed with default arguments.
knc = KNeighborsClassifier()

In [151]:
# KNN: train, predict the hold-out set, then report accuracy and precision.
y_pred6 = knc.fit(x_train, y_train).predict(x_test)
print(
    accuracy_score(y_test, y_pred6),
    precision_score(y_test, y_pred6),
    sep="\n",
)

0.9671497584541063
0.9811320754716981


In [153]:
#voting classifier 
from sklearn.ensemble import VotingClassifier

In [154]:
# Soft-voting ensemble over KNN, multinomial Naive Bayes and extra trees.
voting = VotingClassifier(
    estimators=[
        ('knn', knc),
        ('nb', mnb),
        ('et', etc),
    ],
    voting='soft',
)


In [155]:
# Train the voting ensemble; as the last expression in the cell, the fitted
# estimator's repr is shown as the cell output.
voting.fit(x_train,y_train)

VotingClassifier(estimators=[('knn', KNeighborsClassifier()),
                             ('nb', MultinomialNB()),
                             ('et',
                              ExtraTreesClassifier(n_estimators=80,
                                                   random_state=2))],
                 voting='soft')

In [156]:
# Score the voting ensemble on the hold-out set.
y_pred = voting.predict(x_test)
for label, metric in (("Accuracy", accuracy_score), ("Precision", precision_score)):
    print(label, metric(y_test, y_pred))

Accuracy 0.9816425120772947
Precision 0.9891304347826086


In [157]:
# Applying stacking: the same three base learners feed a random-forest meta-learner.
estimators=[('knn', knc), ('nb', mnb), ('et', etc)]
# Seed the meta-learner with the same random_state=2 used for the other tree
# ensembles; without a seed the stacked scores change on every run.
final_estimator=RandomForestClassifier(random_state=2)

In [158]:
from sklearn.ensemble import StackingClassifier

In [159]:
# Stacked ensemble: base learners' predictions become features for the final estimator.
clf = StackingClassifier(
    estimators=estimators,
    final_estimator=final_estimator,
)

In [161]:
# Train the stacked ensemble and score it on the hold-out set.
y_pred = clf.fit(x_train, y_train).predict(x_test)
for label, metric in (("Accuracy", accuracy_score), ("Precision", precision_score)):
    print(label, metric(y_test, y_pred))

Accuracy 0.9864734299516909
Precision 0.9661016949152542


In [162]:
import pickle
# Persist the fitted TF-IDF vectorizer and the multinomial Naive Bayes model.
# The context managers guarantee each file is flushed and closed even if
# pickle.dump raises; the bare open() calls used before never closed the handles.
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)
with open('model.pkl', 'wb') as model_file:
    pickle.dump(mnb, model_file)