 ## Step 1: Perform imports and load the dataset into a pandas DataFrame


In [132]:
import pandas as pd
import numpy as np

In [133]:
# Load spam.csv into a DataFrame, decoding the file as latin-1.
data=pd.read_csv("spam.csv", encoding="latin-1")

In [134]:
# Preview the first rows of the freshly loaded data.
data.head()

Unnamed: 0,Class,Message,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [135]:
# List the column names to see which ones carry no useful data.
data.columns

Index(['Class', 'Message', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [136]:
# Drop the three trailing "Unnamed" columns, leaving only Class and Message.
# Reassigning (instead of inplace=True) and passing errors='ignore' keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
data = data.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [137]:
# Confirm that only the Class and Message columns remain.
data.head(4)

Unnamed: 0,Class,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...


In [138]:
# Count the missing values in each column.
data.isnull().sum()

Class      0
Message    0
dtype: int64

In [139]:
from sklearn.preprocessing import LabelEncoder
# Encoder used to convert the string labels in 'Class' to integer codes.
encoder = LabelEncoder()

In [140]:
# Replace the text labels with integer codes; the previews before and after
# this cell show ham -> 0 and spam -> 1.
data['Class'] = encoder.fit_transform(data['Class'])

In [141]:
# Check the encoded labels on the first rows.
data.head(5)

Unnamed: 0,Class,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [142]:
# Class distribution; the recorded output shows far more 0 (ham) rows than
# 1 (spam) rows, i.e. the dataset is imbalanced.
data['Class'].value_counts()

0    4825
1     747
Name: Class, dtype: int64

In [143]:
# Count rows that are exact duplicates of an earlier row.
data.duplicated().sum()

403

In [144]:
# Remove duplicate rows, keeping the first occurrence of each.
# The index is not reset here, so the surviving rows keep their original labels.
data = data.drop_duplicates(keep='first')

In [145]:
# Confirm that no duplicate rows remain.
data.duplicated().sum()

0

In [146]:
# Number of rows and columns after removing duplicates.
data.shape

(5169, 2)

## Step 2: Data Preprocessing

In [147]:
#import nltk package
import nltk
import string

In [148]:
!pip install nltk



In [149]:
# Download the Punkt tokenizer models; nltk.word_tokenize is called later
# during message preprocessing.
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\Fatima-Zahra
[nltk_data]     Naciri\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [150]:
# Download the NLTK stop-word corpus.
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# Shared Porter stemmer instance, used by the preprocessing function.
ps=PorterStemmer()


[nltk_data] Downloading package stopwords to C:\Users\Fatima-Zahra
[nltk_data]     Naciri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [151]:
def transform_Message(Message):
    """Normalise a raw message into a space-separated string of stemmed tokens.

    Steps, in order:
      1. lower-case the text;
      2. split it into tokens with ``nltk.word_tokenize``;
      3. keep only purely alphanumeric tokens (``str.isalnum``), which discards
         punctuation tokens and any token containing a non-alphanumeric
         character;
      4. discard English stop words;
      5. stem each remaining token with the module-level Porter stemmer ``ps``.

    Parameters
    ----------
    Message : str
        Raw message text.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces; an empty string
        when no token survives the filters.
    """
    # Fetch the stop-word list once per call (not once per token) and keep it
    # in a set so each membership test is a constant-time lookup.
    english_stopwords = set(stopwords.words('english'))

    tokens = nltk.word_tokenize(Message.lower())

    # A token that passes isalnum() contains no punctuation characters, so no
    # separate punctuation filter is needed.
    return " ".join(
        ps.stem(token)
        for token in tokens
        if token.isalnum() and token not in english_stopwords
    )

In [152]:
# Try the preprocessing function on a sample message.
transform_Message("Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030.")

'mobil 11 month u r entitl updat latest colour mobil camera free call mobil updat co free 08002986030'

In [153]:
# Inspect one raw message. [10] looks up the index label 10, not the 11th
# position; the two can differ because the index was not reset after
# duplicates were dropped.
data['Message'][10]

"I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today."

In [154]:
# Example of stemming a single word.
ps.stem('mobile')

'mobil'

In [155]:
# Run the preprocessing on every message and store the result in a new column.
data['transformed_Message'] = data['Message'].apply(transform_Message)

In [156]:
# Compare raw and preprocessed messages side by side.
data.head(4)

Unnamed: 0,Class,Message,transformed_Message
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say


## Step 3: Model Building

In [157]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
# NOTE(review): cv is not used in any cell visible here — confirm whether it is still needed.
cv = CountVectorizer()
# TF-IDF vectorizer capped at 3000 features.
tfidf = TfidfVectorizer(max_features=3000)

In [158]:
# Fit the TF-IDF vectorizer on all preprocessed messages and convert the
# result to a dense array.
# NOTE(review): the vectorizer is fitted before the train/test split, so the
# test messages contribute to the vocabulary and IDF weights. Consider fitting
# on the training split only and calling transform() on the test split.
X = tfidf.fit_transform(data['transformed_Message']).toarray()

In [159]:
# Feature matrix dimensions: (number of messages, number of TF-IDF features).
X.shape

(5169, 3000)

In [160]:
# Target vector: the integer-encoded class labels as an array.
y = data['Class'].values

In [161]:
from sklearn.model_selection import train_test_split

In [162]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
# NOTE(review): no stratify= argument is passed although the classes are
# imbalanced — confirm that an unstratified split is intended.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=2)

In [163]:
# Number of training rows.
len(X_train)

4135

In [164]:
# Number of test rows.
len(X_test)

1034

In [165]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score

In [166]:
# Multinomial Naive Bayes classifier with default settings.
mnb = MultinomialNB()

In [167]:
# Train on the training split, predict the test split, then report accuracy,
# the confusion matrix and precision.
# NOTE(review): the recorded confusion matrix shows 30 spam messages predicted
# as ham, which precision does not reflect — consider reporting recall as well.
mnb.fit(X_train,y_train)
y_pred2 = mnb.predict(X_test)
print(accuracy_score(y_test,y_pred2))
print(confusion_matrix(y_test,y_pred2))
print(precision_score(y_test,y_pred2))

0.9709864603481625
[[896   0]
 [ 30 108]]
1.0


In [168]:
# Classifiers to evaluate, keyed by the name used in the results table.
clfs = {'NB':mnb}

In [1]:
def train_classifier(clf,X_train,y_train,X_test,y_test):
    """Fit ``clf`` on the training split and score it on the test split.

    The accuracy is printed as a percentage with two decimals, then both
    metrics are returned.

    NOTE(review): the printed French label reads "précision", but the value
    shown is the accuracy score, not the precision score — confirm the wording.

    Parameters
    ----------
    clf : estimator exposing ``fit(X, y)`` and ``predict(X)``
    X_train, y_train : training features and labels
    X_test, y_test : test features and labels

    Returns
    -------
    tuple
        ``(accuracy, precision)`` computed on the test split.
    """
    clf.fit(X_train, y_train)
    predicted = clf.predict(X_test)

    # Compute accuracy first, then precision, and keep them as the result pair.
    scores = (
        accuracy_score(y_test, predicted),
        precision_score(y_test, predicted),
    )

    report_template = "Pourcentage de précision : {:.2f}%"
    print(report_template.format(scores[0] * 100))

    return scores

In [170]:
# Evaluate the Naive Bayes model with the helper; the cell displays (accuracy, precision).
train_classifier(mnb,X_train,y_train,X_test,y_test)

(0.9709864603481625, 1.0)

In [171]:
# Scores are collected in the iteration order of clfs, so position i in each
# list belongs to the i-th classifier name.
accuracy_scores = []
precision_scores = []

for name,clf in clfs.items():
    
    # Fits the classifier and returns its test-set accuracy and precision.
    current_accuracy,current_precision = train_classifier(clf, X_train,y_train,X_test,y_test)
    
    print("For ",name)
    print("Accuracy - ",current_accuracy)
    print("Precision - ",current_precision)
    
    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)

For  NB
Accuracy -  0.9709864603481625
Precision -  1.0


In [172]:
# Summary table with one row per classifier, sorted by precision (highest first).
performance_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy':accuracy_scores,'Precision':precision_scores}).sort_values('Precision',ascending=False)

In [173]:
# Display the summary table.
performance_df

Unnamed: 0,Algorithm,Accuracy,Precision
0,NB,0.970986,1.0


In [174]:
# Reshape to long format: one row per (Algorithm, metric) pair, with the metric
# name in 'variable' and its score in 'value'.
performance_df1 = pd.melt(performance_df, id_vars = "Algorithm")

In [175]:
# Display the long-format table.
performance_df1

Unnamed: 0,Algorithm,variable,value
0,NB,Accuracy,0.970986
1,NB,Precision,1.0


In [176]:
import pickle

# Persist the fitted vectorizer and the trained model.
# The context managers flush and close each file once the dump finishes,
# including when pickling raises.
with open('vectorizer.pkl','wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)

with open('model.pkl','wb') as model_file:
    pickle.dump(mnb, model_file)