# 1.0 Load Packages

In [306]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.svm import SVC
from wordcloud import WordCloud
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# 2.0 Load Data

In [208]:
# data from:
# https://www.kaggle.com/uciml/sms-spam-collection-dataset
# file contains some invalid chars
df = pd.read_csv('spam.csv', encoding='ISO-8859-1')
print(df.shape)
df = df.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'],axis=1)
df.columns = ['labels','text']
df.reset_index(inplace=True)
print(df.info())
print(df.describe())
print(df.head())

(5572, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 3 columns):
index     5572 non-null int64
labels    5572 non-null object
text      5572 non-null object
dtypes: int64(1), object(2)
memory usage: 130.7+ KB
None
             index
count  5572.000000
mean   2785.500000
std    1608.642181
min       0.000000
25%    1392.750000
50%    2785.500000
75%    4178.250000
max    5571.000000
   index labels                                               text
0      0    ham  Go until jurong point, crazy.. Available only ...
1      1    ham                      Ok lar... Joking wif u oni...
2      2   spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      3    ham  U dun say so early hor... U c already then say...
4      4    ham  Nah I don't think he goes to usf, he lives aro...


In [209]:
# mapping label
tags_index = {'ham':0, 'spam':1}
df['label_mapped'] = df['labels'].map(tags_index)
Y = df['label_mapped']

# 3.0 Tokenization

In [216]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import ngrams

## Define Stop Words
stopwords = stopwords.words('english')
# Add additional stopwords:
add_stopwords = ['ha','wa','even','one','doe']
stopwords = stopwords + add_stopwords

## Define Lemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

## Define customized preprocessor
def my_preprocessor(s):
    s = s.lower() # downcase
    tokens = word_tokenize(s) # split string into words (tokens)
    tokens = [t for t in tokens if t.isalnum()] # remove all punctuations
    tokens = [t for t in tokens if len(t) > 2] # remove short words, they're probably not useful
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens] # put words into base form
    tokens = [t for t in tokens if t not in stopwords] # remove stopwords
#     bigrams = ngrams(tokens,2)
#     text = ' '.join(tokens)
    return tokens

tokenized = [my_preprocessor(text) for text in df['text']]

In [219]:
df['tokens'] = tokenized

In [220]:
print(df.head(1))

   index labels                                               text  \
0      0    ham  Go until jurong point, crazy.. Available only ...   

   label_mapped                                             tokens  
0             0  [jurong, point, available, bugis, great, world...  


# 4.0 Train Test Split

In [277]:
X_train_doc, X_test_doc, y_train, y_test = train_test_split(df,Y,test_size=0.33,random_state = 711)

In [278]:
train_documents = [TaggedDocument(words= doc, tags= [i]) for i, doc in enumerate(X_train_doc['tokens'])]
test_documents = [TaggedDocument(words= doc, tags= [i]) for i, doc in enumerate(X_test_doc['tokens'])]

In [279]:
print(train_documents[0])

TaggedDocument(['alright', 'set', 'text', 'man'], [0])


# 5.0 Doc2Vec Embedding

In [280]:
import multiprocessing
cores = multiprocessing.cpu_count()
print(cores)

8


In [281]:
from tqdm import tqdm
model_dbow = Doc2Vec(dm=1, vector_size=100, negative=5, hs=0, min_count=2, sample = 0, workers=cores, alpha=0.025, min_alpha=0.001)
model_dbow.build_vocab([x for x in tqdm(train_documents)])
# train_documents  = utils.shuffle(train_documents)
model_dbow.train(train_documents,total_examples=len(train_documents), epochs=30)

100%|██████████| 3733/3733 [00:00<00:00, 1865301.03it/s]


In [282]:
def vector_for_learning(model, input_docs):
    targets, feature_vectors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in input_docs])
    return targets, feature_vectors

In [283]:
_, X_train = vector_for_learning(model_dbow, train_documents)
_, X_test = vector_for_learning(model_dbow, test_documents)

In [284]:
X_train[0]

array([-0.02697269, -0.0407169 ,  0.03097039,  0.03714674,  0.00742686,
        0.02747871,  0.0303512 , -0.00036291, -0.01287164, -0.04531964,
       -0.01660654,  0.0327225 , -0.01042068, -0.02561682, -0.01445935,
        0.03337326,  0.02266306, -0.0205961 , -0.02181766, -0.03188667,
        0.0102182 , -0.02333212, -0.0165922 ,  0.0302441 ,  0.05187189,
        0.02268223,  0.03847696,  0.02918115,  0.02198733, -0.01692997,
        0.04058788, -0.04199082,  0.01525499, -0.02617353,  0.04757469,
        0.01004474, -0.00023499,  0.04599861,  0.04057302,  0.0257479 ,
       -0.00690177,  0.00860005, -0.01372077, -0.01746124, -0.03105097,
        0.012309  , -0.02027664,  0.07069066, -0.03281707, -0.02304384,
       -0.01463537, -0.00594229,  0.01426768,  0.00177333, -0.0079196 ,
        0.03710762, -0.00591717, -0.00408066,  0.04384585,  0.00175703,
       -0.0236137 ,  0.06762215,  0.0036565 ,  0.04077871, -0.03323013,
        0.01621639,  0.00189614, -0.00909267, -0.02649643,  0.00

# 6.0 Train Model

In [285]:
clf = LogisticRegression(n_jobs=1,C=1e5)
clf.fit(X_train,y_train)



LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=1, penalty='l2', random_state=None,
          solver='warn', tol=0.0001, verbose=0, warm_start=False)

In [286]:
print('training score is:',clf.score(X_train,y_train))
print('testing score is:', clf.score(X_test,y_test))

training score is: 0.9541923386016609
testing score is: 0.9390973355084284


In [287]:
roc=roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])
print(roc)

0.9225075864110724


In [288]:
from sklearn.ensemble import RandomForestClassifier
clf_rf = RandomForestClassifier(
        n_jobs=-1
)
clf_rf.fit(X_train, y_train)
print('training score is:',clf_rf.score(X_train,y_train))
print('testing score is:', clf_rf.score(X_test,y_test))
print('testing AUC is:',roc_auc_score(y_test, clf.predict_proba(X_test)[:,1]))



training score is: 0.995981784087865
testing score is: 0.9439912996193583
testing AUC is: 0.9225075864110724


# 6.1 Cross Validation

In [289]:
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_validate
param_grid = {
    'max_depth': [1,2,]
    ,'max_features': [5,10]
    ,'n_estimators': [100]
}
grid_search = GridSearchCV(clf_rf,param_grid,cv=5,scoring = 'roc_auc',verbose=3)
grid_search.fit(X_train,y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] max_depth=1, max_features=5, n_estimators=100 ...................
[CV]  max_depth=1, max_features=5, n_estimators=100, score=0.953097012117107, total=   0.2s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV] max_depth=1, max_features=5, n_estimators=100 ...................
[CV]  max_depth=1, max_features=5, n_estimators=100, score=0.9337057303541572, total=   0.2s


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.7s remaining:    0.0s


[CV] max_depth=1, max_features=5, n_estimators=100 ...................
[CV]  max_depth=1, max_features=5, n_estimators=100, score=0.9418234685178419, total=   0.2s
[CV] max_depth=1, max_features=5, n_estimators=100 ...................
[CV]  max_depth=1, max_features=5, n_estimators=100, score=0.9497777371818292, total=   0.2s
[CV] max_depth=1, max_features=5, n_estimators=100 ...................
[CV]  max_depth=1, max_features=5, n_estimators=100, score=0.9483162830349532, total=   0.2s
[CV] max_depth=1, max_features=10, n_estimators=100 ..................
[CV]  max_depth=1, max_features=10, n_estimators=100, score=0.9560848950101604, total=   0.2s
[CV] max_depth=1, max_features=10, n_estimators=100 ..................
[CV]  max_depth=1, max_features=10, n_estimators=100, score=0.9239778081775345, total=   0.2s
[CV] max_depth=1, max_features=10, n_estimators=100 ..................
[CV]  max_depth=1, max_features=10, n_estimators=100, score=0.93555900621118, total=   0.2s
[CV] max_depth=

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    8.6s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': [1, 2], 'max_features': [5, 10], 'n_estimators': [100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=3)

In [290]:
print(grid_search.best_params_)
print(grid_search.best_score_)
print(grid_search.scorer_)

{'max_depth': 2, 'max_features': 5, 'n_estimators': 100}
0.9495459790352425
make_scorer(roc_auc_score, needs_threshold=True)


In [307]:
best_clf = grid_search.best_estimator_
print(best_clf.score(X_test,y_test,))
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,best_clf.predict(X_test)))
print(classification_report(y_test,best_clf.predict(X_test)))

0.9059271343121261
0.9059271343121261
              precision    recall  f1-score   support

           0       0.90      1.00      0.95      1603
           1       0.98      0.27      0.43       236

   micro avg       0.91      0.91      0.91      1839
   macro avg       0.94      0.64      0.69      1839
weighted avg       0.91      0.91      0.88      1839



In [297]:
X_test_doc['y_true'] = y_test
X_test_doc['y_pred'] = best_clf.predict(X_test)
X_test_doc['incorrect'] = y_test != best_clf.predict(X_test)
X_test_doc.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


(1839, 8)

In [305]:
pd.set_option('max_colwidth',1000)
X_test_doc.loc[X_test_doc['incorrect']==True,['text','y_true','y_pred']]

Unnamed: 0,text,y_true,y_pred
3227,"SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info",1,0
3090,LORD OF THE RINGS:RETURN OF THE KING in store NOW!REPLY LOTR by 2 June 4 Chance 2 WIN LOTR soundtrack CDs StdTxtRate. Reply STOP to end txts,1,0
1779,"Loan for any purpose å£500 - å£75,000. Homeowners + Tenants welcome. Have you been previously refused? We can still help. Call Free 0800 1956669 or text back 'help'",1,0
4371,"Ur balance is now å£600. Next question: Complete the landmark, Big, A. Bob, B. Barry or C. Ben ?. Text A, B or C to 83738. Good luck!",1,0
4947,"Hi this is Amy, we will be sending you a free phone number in a couple of days, which will give you an access to all the adult parties...",1,0
5370,dating:i have had two of these. Only started after i sent a text to talk sport radio last week. Any connection do you think or coincidence?,1,0
4901,* FREE* POLYPHONIC RINGTONE Text SUPER to 87131 to get your FREE POLY TONE of the week now! 16 SN PoBox202 NR31 7ZS subscription 450pw,1,0
1886,Dear 0776xxxxxxx U've been invited to XCHAT. This is our final attempt to contact u! Txt CHAT to 86688 150p/MsgrcvdHG/Suite342/2Lands/Row/W1J6HL LDN 18yrs,1,0
1273,network operator. The service is free. For T & C's visit 80488.biz,1,0
528,You will recieve your tone within the next 24hrs. For Terms and conditions please see Channel U Teletext Pg 750,1,0
