In [41]:
# pip install nltk

In [72]:
# One-time setup: fetch the NLTK stop-word corpus that the stop-word
# removal step reads through nltk.corpus.stopwords.
# joblib is only referenced by the (currently commented-out) export of the
# fitted vectorizer and model.
import joblib
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/ak/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [73]:
import pandas as pd

In [74]:
# Load the labelled training messages; the preview shows the message text in
# `msg_text` and the class label ('ham' / 'spam') in `ham_or_spam`.
training_data = pd.read_csv('training_data.csv')
training_data.head()

Unnamed: 0,msg_text,ham_or_spam
0,"Haha better late than ever, any way I could sw...",ham
1,Thanks a lot for your wishes on my birthday. T...,ham
2,A guy who gets used but is too dumb to realize...,ham
3,Talk sexy!! Make new friends or fall in love i...,spam
4,Nowadays people are notixiquating the laxinorf...,ham


In [75]:
# Convert the frame to a 2-D object array: column 0 is the message text,
# column 1 is the label (see the echoed array).
data = training_data.to_numpy()
data

array([['Haha better late than ever, any way I could swing by?', 'ham'],
       ['Thanks a lot for your wishes on my birthday. Thanks you for making my birthday truly memorable.',
        'ham'],
       ['A guy who gets used but is too dumb to realize it.', 'ham'],
       ...,
       ["Prabha..i'm soryda..realy..frm heart i'm sory", 'ham'],
       ['Nt joking seriously i told', 'ham'],
       ['Did he just say somebody is named tampa', 'ham']], dtype=object)

In [76]:
# Column 0 holds the raw messages (features), column 1 the ham/spam labels.
X, y = data[:, 0], data[:, 1]
(X.shape, y.shape)

((5014,), (5014,))

## Tokenization

In [77]:
from nltk.tokenize import RegexpTokenizer
# Raw string: '\w' inside a normal string literal is an invalid escape
# sequence (DeprecationWarning, SyntaxWarning from Python 3.12 onwards).
# The pattern itself is unchanged: every token is a maximal run of word
# characters, so punctuation and whitespace are discarded.
tokenizer = RegexpTokenizer(r'\w+')

## Stopwords

In [78]:
from nltk.corpus import stopwords
# Keep the English stop words in a set so the per-token membership test in
# the cleaning step is a fast hash lookup.
sw = set(stopwords.words('english'))

## Tokenizing -> Stop-word Removal -> Stemming

In [79]:
from nltk.stem import PorterStemmer
# Single shared stemmer instance, reused for every token in stemming().
ps = PorterStemmer()

def stemming(sentence):
    """Normalise one message into a space-separated string of stems.

    Despite the name, this runs the whole cleaning pipeline: lower-case the
    text, split it with the module-level ``tokenizer``, drop tokens that are
    in the stop-word set ``sw``, and stem what remains with ``ps``.

    Args:
        sentence: Raw message text (must support ``.lower()``).

    Returns:
        str: The surviving stems joined by single spaces; an empty string
        when no token survives.
    """
    words = tokenizer.tokenize(sentence.lower())
    # Stop words are filtered on the un-stemmed token, before stemming.
    kept = filter(lambda word: word not in sw, words)
    return ' '.join(map(ps.stem, kept))

In [80]:
# GET A CLEAN DATASET
def getClean(document):
    """Clean every message in ``document`` with ``stemming``.

    Args:
        document: Iterable of raw message strings.

    Returns:
        list: The cleaned messages, in the same order as the input.
    """
    return [stemming(message) for message in document]

In [81]:
# Clean every raw training message (X still holds the message strings here).
stemmed_doc = getClean(X)

## Count Vectorization

In [82]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer with default settings; it is fitted on the cleaned
# training corpus and later reused, unchanged, on the unlabelled messages.
cv = CountVectorizer()

In [83]:
# Learn the vocabulary from the cleaned messages and build the
# document-term count matrix in one step.
# NOTE(review): the vocabulary is fitted on all labelled rows before the
# train/test split further down, so the held-out rows also shape it.
vc = cv.fit_transform(stemmed_doc)

In [84]:
# Keep the document-term matrix sparse. `.todense()` produced an `np.matrix`,
# which recent scikit-learn releases refuse as estimator input, and it
# materialised every zero count in memory. train_test_split and MultinomialNB
# both accept SciPy sparse matrices directly, so no conversion is needed.
X = vc

## Splitting into training and testing data to calculate model accuracy

In [85]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled rows for evaluation; the fixed random_state
# makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Implementing the Naive Bayes Algorithm

In [86]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes classifier (default hyper-parameters) on the
# training portion of the count features.
model = MultinomialNB()
model.fit(X_train, y_train)
# Echo the training labels (sanity check of what the model was fitted on).
print(y_train)

['ham' 'ham' 'ham' ... 'ham' 'ham' 'spam']


## Model Accuracy

In [87]:
# Mean accuracy of the fitted model on the held-out split.
model.score(X_test, y_test)

0.9840478564307079

In [89]:
# joblib.dump(cv, 'cv.pkl')
# joblib.dump(model, 'model.pkl')

## Prediction on Test Data

In [90]:
# Load the unlabelled messages that need a ham/spam prediction.
test_data = pd.read_csv('test_data.csv')

# Column 0 holds the message text, mirroring the training file layout.
test = test_data.to_numpy()
test_messages = test[:, 0]
# Only the last expression of a cell is echoed, so the bare `type(test)`
# statement that preceded this line did nothing and has been dropped.
test_messages.shape

(558,)

In [91]:
def prepare(messages):
    """Turn raw messages into count features for the trained model.

    The messages are cleaned with ``getClean`` and then encoded with the
    already-fitted ``cv`` via ``transform`` (not ``fit_transform``), so the
    vocabulary learned from the training corpus is reused as-is.

    Args:
        messages: Iterable of raw message strings.

    Returns:
        The document-term matrix produced by ``cv.transform``.
    """
    return cv.transform(getClean(messages))

In [92]:
# Vectorize the unlabelled messages with the fitted vocabulary, then predict
# a ham/spam label for each one.
messages = prepare(test_messages)
y_pred = model.predict(messages)

In [93]:
# Attach the predictions as a `ham_or_spam` column (added to, or overwritten
# on, test_data in place) and display the resulting frame.
test_data['ham_or_spam'] = y_pred
test_data

Unnamed: 0,msg_text,ham_or_spam
0,Squeeeeeze!! This is christmas hug.. If u lik ...,ham
1,And also I've sorta blown him off a couple tim...,ham
2,Mmm thats better now i got a roast down me! i...,ham
3,Mm have some kanji dont eat anything heavy ok,ham
4,So there's a ring that comes with the guys cos...,ham
...,...,...
553,Heart is empty without love.. Mind is empty wi...,ham
554,"Alright we'll bring it to you, see you in like...",ham
555,If You mean the website. Yes.,ham
556,"Ya they are well and fine., BBD(pooja) full pi...",ham


## Exporting to csv

In [94]:
# index=False: the input CSVs carry no row-index column, so do not write the
# DataFrame's positional index as an extra unnamed first column; the output
# then has the same layout as the input plus the predicted label.
test_data.to_csv('Predicted.csv', index=False)