In [155]:
import pandas as pd

In [156]:
# Load the tab-separated SMS Spam Collection file, naming the two columns
# explicitly via `names`.
# A forward slash is used as the path separator: the original '\S' is an
# invalid escape sequence inside a normal string literal, and a backslash
# separator only resolves on Windows, whereas '/' works on every platform.
messages = pd.read_csv(
    'smsspamcollection/SMSSpamCollection',
    sep='\t',
    names=['label', 'message'],
)
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Data Cleaning & Processing

In [157]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

In [158]:
lemmatizer = WordNetLemmatizer()
# Build the stop-word lookup once, outside the loop: the list is the same for
# every token, and a set makes each membership test constant-time instead of
# a scan over the whole list.
stop_words = set(stopwords.words('english'))

corpus = []
for message in messages['message']:
    # Replace every non-letter character with a space, lowercase the text and
    # split it into whitespace-separated tokens.
    tokens = re.sub('[^a-zA-Z]', ' ', message).lower().split()

    # Drop stop words and lemmatize what remains, then store the cleaned
    # message as a single space-joined string.
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    corpus.append(' '.join(tokens))

### Vectorization

In [159]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words encoder; its vocabulary is learned from the cleaned corpus below.
cv = CountVectorizer()

# Token-count matrix, one row per message. It is densified with .toarray() so
# X_bow remains a NumPy array for the cells that follow.
X_bow = cv.fit_transform(corpus).toarray()

# Target vector: the 'spam' indicator column of the one-hot encoded labels
# (truthy for spam, falsy for ham). The column is selected by name rather than
# by position so the target cannot silently flip if the column order changes.
y_bow = pd.get_dummies(messages['label'])['spam']

### Apply ML Algorithms

In [160]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_bow,
    y_bow,
    test_size=0.20,
    random_state=42,
)

In [161]:
from sklearn.naive_bayes import MultinomialNB
# Baseline classifier: multinomial Naive Bayes with its default settings,
# fitted on the training split.
spam_detect_model = MultinomialNB()
spam_detect_model.fit(X_train, y_train)

# Predictions for the held-out split, consumed by the evaluation cell below.
y_pred = spam_detect_model.predict(X_test)

In [162]:
from sklearn.metrics import classification_report,confusion_matrix

# Evaluate the current predictions against the held-out labels: per-class
# precision/recall/F1 first, then the confusion matrix.
baseline_report = classification_report(y_test, y_pred)
baseline_matrix = confusion_matrix(y_test, y_pred)
print(baseline_report)
print(baseline_matrix)

              precision    recall  f1-score   support

           0       0.99      0.98      0.98       966
           1       0.86      0.96      0.91       149

    accuracy                           0.97      1115
   macro avg       0.93      0.97      0.95      1115
weighted avg       0.98      0.97      0.97      1115

[[943  23]
 [  6 143]]


### Let's see if we can optimize via hyperparameter tuning

In [163]:
import numpy as np
# Search space for the grid search: six evenly spaced smoothing values
# between 0.5 and 1.5 (inclusive), combined with both settings of fit_prior.
alpha_candidates = np.linspace(0.5, 1.5, 6)
grid_params = {'alpha': alpha_candidates, 'fit_prior': [True, False]}

In [164]:
from sklearn.model_selection import GridSearchCV
# Grid search over grid_params for a MultinomialNB estimator, scored by
# accuracy and fitted on the training split only.
clf = MultinomialNB()
clf_gc = GridSearchCV(
    estimator=clf,
    param_grid=grid_params,
    scoring='accuracy',
).fit(X_train, y_train)

In [165]:
# Report the best score found by the grid search and the parameter
# combination that achieved it.
best_score = clf_gc.best_score_
best_params = clf_gc.best_params_
print("Best Score: ", best_score)
print("Best Params: ", best_params)

Best Score:  0.9762173302532048
Best Params:  {'alpha': 1.5, 'fit_prior': True}


In [166]:
# Fit a fresh MultinomialNB with whatever hyperparameters the grid search
# selected. Reading them from clf_gc.best_params_ (instead of hand-copying the
# printed values) keeps this cell in sync if the data or the grid changes.
clf_best = MultinomialNB(**clf_gc.best_params_)
clf_best.fit(X_train, y_train)
# NOTE: this rebinds y_pred, replacing the baseline model's predictions.
y_pred = clf_best.predict(X_test)

In [167]:
from sklearn.metrics import classification_report,confusion_matrix

# Same evaluation as for the baseline, now applied to the predictions
# currently held in y_pred: classification report, then confusion matrix.
tuned_report = classification_report(y_test, y_pred)
tuned_matrix = confusion_matrix(y_test, y_pred)
print(tuned_report)
print(tuned_matrix)

              precision    recall  f1-score   support

           0       0.99      0.98      0.99       966
           1       0.89      0.95      0.92       149

    accuracy                           0.98      1115
   macro avg       0.94      0.97      0.95      1115
weighted avg       0.98      0.98      0.98      1115

[[948  18]
 [  7 142]]


### Let's apply it to RT Data

In [168]:
# Hand-written sample message used to try the trained model on new text.
text_inp = 'FREE Get Free Coupon for 2000 Dollars and travel Mexico'

In [169]:
lemmatizer = WordNetLemmatizer()
# Build the stop-word set once instead of calling stopwords.words('english')
# for every token; set membership is also constant-time.
stop_words = set(stopwords.words('english'))

# Same cleaning steps as for the training messages: keep letters only,
# lowercase, split into tokens, drop stop words, lemmatize the rest.
tokens = re.sub('[^a-zA-Z]', ' ', text_inp).lower().split()
review = ' '.join(lemmatizer.lemmatize(word) for word in tokens if word not in stop_words)

# NOTE: this rebinds `corpus`, so the training corpus built earlier is no
# longer reachable under this name; the next cell reads `corpus` from here.
corpus = [review]
corpus

['free get free coupon dollar travel mexico']

In [170]:
# Encode the cleaned sample with the already-fitted vectorizer `cv`
# (transform only; the vocabulary is not refitted here).
X_test_rt = cv.transform(corpus)

In [171]:
# Display the encoded sample's repr (its shape and number of stored entries).
X_test_rt

<1x7098 sparse matrix of type '<class 'numpy.int64'>'
	with 4 stored elements in Compressed Sparse Row format>

In [172]:
# Classify the encoded sample with the tuned model.
# Presumably 1 means spam, since the target was taken from column 1 of the
# label dummies (ham, spam) - confirm against the label encoding.
clf_best.predict(X_test_rt)

array([1], dtype=uint8)

In [175]:
import pickle
# Persist both artifacts: the tuned classifier and the fitted vectorizer that
# produces its input features.
# Each file is opened in a `with` block so the handle is flushed and closed
# deterministically (the original passed a bare open() and never closed it).
# save the model to disk
filename = 'finalized_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(clf_best, model_file)
# save transformer to disk
filename = 'finalized_transformer.sav'
with open(filename, 'wb') as transformer_file:
    pickle.dump(cv, transformer_file)