In [160]:
import string
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import joblib
import pickle

In [179]:
cols = ['message', 'category']
data = pd.read_csv("EnglishDataSet.csv", usecols=cols)

In [180]:
data.shape

(5572, 2)

Text Pre-Processing

In [181]:
def text_process(message):
    
# remove punctuations
    message = [char for char in message if char not in string.punctuation]
# join the message after removing   
    message = ''.join(message)
    
    return [word for word in message.split() if word.lower() not in stopwords.words('english')]
    

In [182]:
# apply the text processing to messages column
data['message'].apply(text_process)


0       [Go, jurong, point, crazy, Available, bugis, n...
1                          [Ok, lar, Joking, wif, u, oni]
2       [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3           [U, dun, say, early, hor, U, c, already, say]
4       [Nah, dont, think, goes, usf, lives, around, t...
                              ...                        
5567    [2nd, time, tried, 2, contact, u, U, Â£750, Po...
5568                   [Ã, b, going, esplanade, fr, home]
5569                     [Pity, mood, Soany, suggestions]
5570    [guy, bitching, acted, like, id, interested, b...
5571                                   [Rofl, true, name]
Name: message, Length: 5572, dtype: object

In [183]:
data.head()

Unnamed: 0,message,category
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham


Vectorization and Term-Frequency

In [184]:
# create a vectorizer
vectorizer = CountVectorizer().fit(data['message'])
vec_file = 'vectorizer'
pickle.dump(vectorizer, open(vec_file, 'wb'))

In [185]:
# message4 = data['message'][3]
# message4 = vectorizer.transform([message4])
# print(message4)

message = vectorizer.transform(data['message'])
print(message)

  (0, 1070)	1
  (0, 1304)	1
  (0, 1752)	1
  (0, 1754)	1
  (0, 2051)	1
  (0, 2330)	1
  (0, 3553)	1
  (0, 3597)	1
  (0, 3637)	1
  (0, 4091)	1
  (0, 4351)	1
  (0, 4477)	1
  (0, 5541)	1
  (0, 5924)	1
  (0, 7652)	1
  (0, 8037)	1
  (0, 8275)	1
  (0, 8498)	1
  (1, 4319)	1
  (1, 4513)	1
  (1, 5508)	1
  (1, 5537)	1
  (1, 8401)	1
  (2, 77)	1
  (2, 402)	1
  :	:
  (5570, 1781)	1
  (5570, 1789)	1
  (5570, 2595)	1
  (5570, 2895)	1
  (5570, 3311)	1
  (5570, 3361)	1
  (5570, 3473)	1
  (5570, 3690)	1
  (5570, 3784)	1
  (5570, 4091)	1
  (5570, 4165)	1
  (5570, 4222)	1
  (5570, 4617)	1
  (5570, 5338)	1
  (5570, 7046)	1
  (5570, 7056)	1
  (5570, 7634)	1
  (5570, 7763)	1
  (5570, 8072)	1
  (5570, 8321)	1
  (5571, 4229)	2
  (5571, 5248)	1
  (5571, 6510)	1
  (5571, 7763)	1
  (5571, 7892)	1


In [186]:
# transform the messages into bag of words using vectorizer
bag_of_words_messages = vectorizer.transform(data['message'])

In [188]:
print('Shape of Sparse Matrix: ', bag_of_words_messages.shape)
print('Amount of Non-Zero occurences: ', bag_of_words_messages.nnz)

Shape of Sparse Matrix:  (5572, 8666)
Amount of Non-Zero occurences:  73912


In [189]:
# Create TF-IDF for weights and normalization of vectors
tf_transformer = TfidfTransformer().fit(bag_of_words_messages)
vec_file = 'transformer'
pickle.dump(tf_transformer, open(vec_file, 'wb'))

In [172]:
tf4 = tf_transformer.transform(message)

  (0, 8498)	0.22080132794235655
  (0, 8275)	0.18238655630689804
  (0, 8037)	0.22998520738984352
  (0, 7652)	0.15566431601878158
  (0, 5924)	0.2553151503985779
  (0, 5541)	0.15618023117358304
  (0, 4477)	0.2757654045621182
  (0, 4351)	0.3264252905795869
  (0, 4091)	0.10720385321563428
  (0, 3637)	0.1803175103691124
  (0, 3597)	0.15318864840197105
  (0, 3553)	0.1481298737377147
  (0, 2330)	0.25279391746019725
  (0, 2051)	0.2757654045621182
  (0, 1754)	0.2757654045621182
  (0, 1752)	0.3116082237740733
  (0, 1304)	0.24415547176756056
  (0, 1070)	0.3264252905795869
  (1, 8401)	0.4316010362639011
  (1, 5537)	0.5465881710238072
  (1, 5508)	0.27211951321382544
  (1, 4513)	0.4082988561907181
  (1, 4319)	0.5236458071582338
  (2, 8456)	0.18669123587240305
  (2, 8414)	0.14511814920515034
  :	:
  (5570, 7056)	0.20534071141898738
  (5570, 7046)	0.18426479853595398
  (5570, 5338)	0.21003407910338884
  (5570, 4617)	0.15965284335787472
  (5570, 4222)	0.12258970642239425
  (5570, 4165)	0.282916228599017

In [190]:
tf_idf_messages = tf_transformer.transform(bag_of_words_messages)

In [191]:
print(tf_idf_messages.shape)

(5572, 8666)


Train Test Split


In [192]:
# create a training and testing set 
msg_train, msg_test, cat_train, cat_test = train_test_split(tf_idf_messages, data['category'], test_size = 0.3)

Fitting into Naive Bayes model

In [193]:
nb_classify = MultinomialNB().fit(msg_train,cat_train)

In [194]:
# print(msg_test)

# nb_classify.predict(msg_test)

predictionsNB = nb_classify.predict(msg_test)

In [195]:
print(classification_report(predictionsNB, cat_test))

              precision    recall  f1-score   support

         ham       1.00      0.96      0.98      1527
        spam       0.70      1.00      0.82       145

    accuracy                           0.96      1672
   macro avg       0.85      0.98      0.90      1672
weighted avg       0.97      0.96      0.97      1672



In [196]:
with open('naive_bayes','wb') as f:
    pickle.dump(nb_classify,f)

Fitting into Logistic Regression model

In [142]:
lr_classify = LogisticRegression().fit(msg_train,cat_train)

In [143]:
predictionsLR = lr_classify.predict(msg_test)

In [144]:
print(classification_report(predictionsLR, cat_test))

              precision    recall  f1-score   support

         ham       1.00      0.94      0.97      1554
        spam       0.54      0.99      0.70       118

    accuracy                           0.94      1672
   macro avg       0.77      0.96      0.83      1672
weighted avg       0.97      0.94      0.95      1672



In [145]:
with open('logistic_regression','wb') as f:
    pickle.dump(lr_classify,f)

Fitting into Decision Tree model

In [146]:
dt_classify = DecisionTreeClassifier().fit(msg_train,cat_train)

In [147]:
predictionsDT = dt_classify.predict(msg_test)

In [148]:
print(classification_report(predictionsDT, cat_test))

              precision    recall  f1-score   support

         ham       0.98      0.97      0.98      1470
        spam       0.82      0.88      0.84       202

    accuracy                           0.96      1672
   macro avg       0.90      0.92      0.91      1672
weighted avg       0.96      0.96      0.96      1672



In [149]:
with open('decision_tree','wb') as f:
    pickle.dump(dt_classify,f)