In [1]:
import nltk
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from sklearn import preprocessing
from sklearn.model_selection import train_test_split as tts

Load Data and pick through the data


In [18]:
email_data = pd.read_csv('data/spam.csv',encoding = 'latin1',usecols = ['class','emails'],names = ['class','emails'] ,skiprows = [0])
email_data.head()

Unnamed: 0,class,emails
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Check for null values if any

In [19]:
email_data.isna().sum()

class     0
emails    0
dtype: int64

convert class column from categorial to numerical and put it in Y

In [20]:
Y = email_data['class'].map({'ham':0,'spam':1})
X = email_data['emails']
Y.head()

0    0
1    0
2    1
3    0
4    0
Name: class, dtype: int64

split data into training and testing

In [21]:
x_train,x_test,y_train,y_test =tts(X,Y,test_size = 0.3) 
x_train[1]

'Ok lar... Joking wif u oni...'

A Function that prepares the data using natural language techniques

In [22]:
def dataProcessing(raw_data):
    stop_words = set(stopwords.words('english'))
    clean_data = []
    for row in raw_data:
        tokens = nltk.word_tokenize(row)
        prt = nltk.PorterStemmer()
        stems = []
        for token in tokens:
            if token not in stop_words:
                if token.isalnum() == True:
                    stems.append(token)
        sent = ' '.join(stems)
        clean_data.append(sent)
    return clean_data

Prepare the data using dataProcessing function

In [23]:
x_train = dataProcessing(x_train)
x_test = dataProcessing(x_test)

A function that extracts topics and returns tfidf structure for model use

In [24]:
vectorizer = CountVectorizer()
vectorizer.fit(x_train)
count_data = vectorizer.transform(x_train)
tfidf_model = TfidfTransformer()
tfidf_model.fit(count_data)
def topicExtracter(text_data):
    count_data = vectorizer.transform(text_data)
    return tfidf_model.transform(count_data)

In [25]:
x_train = topicExtracter(x_train)
x_test =topicExtracter(x_test)
print(x_train.shape)
print(x_test.shape)

(3900, 6771)
(1672, 6771)


Build and fit model

In [26]:
model = MultinomialNB()
model.fit(x_train,y_train)

MultinomialNB()

Test model Perfomance on test data

In [27]:
y_pred = model.predict(x_test)
print('Accuracy :{:.3f}%'.format(accuracy_score(y_test,y_pred) * 100))


Accuracy :95.993%


Program to Predict the Email to spam or ham

In [28]:
def classifyEmail(email_list):
    classes = ['ham','spam']
    x_pred = dataProcessing(email_list)
    x_pred = topicExtracter(x_pred)
    y_pred = model.predict(x_pred)
    return classes[y_pred[0]]


In [30]:
y_pred = classifyEmail(list(["Smile in Pleasure Smile in Pain Smile when trouble pours like Rain Smile when sum1 Hurts U Smile becoz SOMEONE still Loves to see u Smiling!!"]))
y_pred

'ham'