In [1]:
import re
import os
import time
import joblib 

import numpy as np
import pandas as pd
import scipy.sparse as sp

from datetime import datetime

dt_object = datetime.fromtimestamp(time.time())
day, T = str(dt_object).split('.')[0].split(' ')
print('Revised on: ' + day)

Revised on: 2021-01-21


In [2]:
import json
import urlextract
from nltk.stem import WordNetLemmatizer

def load_data(data):
    raw_path = os.path.join("data","1_raw")
    filename = ''.join([data, ".csv"])
    out_dfm = pd.read_csv(os.path.join(raw_path, filename))
    out_arr = np.array(out_dfm.iloc[:,0].ravel())
    return out_arr

X_train = load_data("X_train")
y_train = load_data("y_train")

y = y_train.copy()

# transform y_array into int type
y[y=='ham'] = 0
y[y=='spam'] = 1
y = y.astype('int')

# load contractions map for custom cleanup
with open("contractions_map.json") as f:
    contractions_map = json.load(f)

In [3]:
import custom.clean_preprocess as cp
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

pipe = Pipeline([('counter', cp.DocumentToNgramCounterTransformer(n_grams=3)),
                 ('bot', cp.WordCounterToVectorTransformer(vocabulary_size=2000)),
                 ('tfidf', TfidfTransformer(sublinear_tf=True))])

X_tfidf = pipe.fit_transform(X_train)

In [4]:
from scipy.sparse.linalg import svds
from sklearn.utils.extmath import svd_flip
from sklearn.preprocessing import MaxAbsScaler

def perform_SVD(X, n_components=300): 
    
    X_array = X.asfptype()
    U, Sigma, VT = svds(X_array.T, # term-document matrix
                        k=n_components)
    # reverse outputs
    Sigma = Sigma[::-1]
    U, VT = svd_flip(U[:, ::-1], VT[::-1])
    
    # return V 
    V = VT.T
    scaler = MaxAbsScaler()
    V_scaled = scaler.fit_transform(V)
    return V_scaled # scaled for Logistic Regression

X_tfidf_svd = perform_SVD(X_tfidf, n_components=800)

In [5]:
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

X_tfidf_svd_allcos = cosine_similarity(X_tfidf_svd)

train_df = pd.DataFrame({'sms':X_train, 'target':y_train})

# get spam indexes
spam_ix = train_df.loc[train_df['target']=='spam'].index

# calculate average spam similarity on SVD
mean_spam_sims = []

for ix in range(X_tfidf_svd_allcos.shape[0]):
    mean_spam_sims.append(np.mean(X_tfidf_svd_allcos[ix, spam_ix]))

X_tfidf_svd_spamcos = sp.hstack((csr_matrix(mean_spam_sims).T, X_tfidf_svd)) 

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, make_scorer, accuracy_score, recall_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [7]:
X_train, X_val, y_train, y_val = train_test_split(X_tfidf_svd_spamcos,
                                                  y, 
                                                  stratify=y,
                                                  random_state=42)

In [26]:
# instantiate estimators
log_clf = LogisticRegression(
    solver="liblinear"
    , random_state=42
)

rnd_clf = RandomForestClassifier(
    n_jobs=-1
    , random_state=42
    , max_depth=8
    , max_features=150
    , min_samples_split=3
    , n_estimators=100
)

svm_clf = SVC(
    random_state=42
)

svm_clf_prob = SVC(
    random_state=42
    , probability=True
)

vot_clf_hard = VotingClassifier(
    estimators=[('log', log_clf), ('rnd', rnd_clf), ('svm', svm_clf)]
    , voting='hard'
)

vot_clf_soft = VotingClassifier(
    estimators=[('log', log_clf), ('rnd', rnd_clf), ('svm', svm_clf_prob)]
    , voting='soft'
)

In [27]:
def quick_eval(classifiers):
    for clf in classifiers:
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_val)
        print(clf.__class__, __name__, 'acc', round(accuracy_score(y_val, y_pred), 4))
        print(clf.__class__, __name__, 'tpr', round(recall_score(y_val, y_pred, pos_label=1) , 4))
        print(clf.__class__, __name__, 'tnr', round(recall_score(y_val, y_pred, pos_label=0), 4))

In [28]:
quick_eval([log_clf, rnd_clf, svm_clf, vot_clf_hard])

<class 'sklearn.linear_model._logistic.LogisticRegression'> __main__ acc 0.9723
<class 'sklearn.linear_model._logistic.LogisticRegression'> __main__ tpr 0.8062
<class 'sklearn.linear_model._logistic.LogisticRegression'> __main__ tnr 0.9976
<class 'sklearn.ensemble._forest.RandomForestClassifier'> __main__ acc 0.9887
<class 'sklearn.ensemble._forest.RandomForestClassifier'> __main__ tpr 0.938
<class 'sklearn.ensemble._forest.RandomForestClassifier'> __main__ tnr 0.9965
<class 'sklearn.svm._classes.SVC'> __main__ acc 0.959
<class 'sklearn.svm._classes.SVC'> __main__ tpr 0.6899
<class 'sklearn.svm._classes.SVC'> __main__ tnr 1.0
<class 'sklearn.ensemble._voting.VotingClassifier'> __main__ acc 0.9744
<class 'sklearn.ensemble._voting.VotingClassifier'> __main__ tpr 0.8062
<class 'sklearn.ensemble._voting.VotingClassifier'> __main__ tnr 1.0


In [30]:
quick_eval([log_clf, rnd_clf, svm_clf_prob, vot_clf_soft])

<class 'sklearn.linear_model._logistic.LogisticRegression'> __main__ acc 0.9723
<class 'sklearn.linear_model._logistic.LogisticRegression'> __main__ tpr 0.8062
<class 'sklearn.linear_model._logistic.LogisticRegression'> __main__ tnr 0.9976
<class 'sklearn.ensemble._forest.RandomForestClassifier'> __main__ acc 0.9887
<class 'sklearn.ensemble._forest.RandomForestClassifier'> __main__ tpr 0.938
<class 'sklearn.ensemble._forest.RandomForestClassifier'> __main__ tnr 0.9965
<class 'sklearn.svm._classes.SVC'> __main__ acc 0.959
<class 'sklearn.svm._classes.SVC'> __main__ tpr 0.6899
<class 'sklearn.svm._classes.SVC'> __main__ tnr 1.0
<class 'sklearn.ensemble._voting.VotingClassifier'> __main__ acc 0.9836
<class 'sklearn.ensemble._voting.VotingClassifier'> __main__ tpr 0.8837
<class 'sklearn.ensemble._voting.VotingClassifier'> __main__ tnr 0.9988


Even after scaling to help out the logistic classifier, this ensemble still does more poorly than the highly optimized random forest model, which isn't performing its best because of the scaled SVD. A second attempt without the simplistic SVM classifier doesn't improve results.

It appears as if the *wisdom of the crowds* only works when everyone in the crowd is more or less clueless and makes mistakes. When we have an "expert" in the crowd, we should probably follow that expert, and so an ensemble will not necessarily perform better.

In [32]:
svm_clf_prob.fit(X_train, y_train)

SVC(probability=True, random_state=42)

In [55]:
y_prob = svm_clf_prob.predict_proba(X_val)
y_pred = svm_clf_prob.predict(X_val)

In [60]:
np.set_printoptions(precision=3, suppress=True) 
y_prob[0:10], y_pred[:10]

(array([[0.989, 0.011],
        [0.989, 0.011],
        [0.998, 0.002],
        [0.   , 1.   ],
        [0.991, 0.009],
        [0.992, 0.008],
        [0.   , 1.   ],
        [0.993, 0.007],
        [0.989, 0.011],
        [0.   , 1.   ]]),
 array([0, 0, 0, 1, 0, 0, 1, 0, 0, 1]))

In [79]:
y_pred2 = []
for n,p in y_prob:
    if n > .5:
        y_pred2.append(0)
    else:
        y_pred2.append(1)

In [84]:
sum(y_pred), sum(y_pred2)

(89, 117)

In [85]:
print(round(accuracy_score(y_pred, y_pred), 5))
print(round(recall_score(y_val, y_pred, pos_label=1) , 5))
print(round(recall_score(y_val, y_pred, pos_label=0), 5))

1.0
0.68992
1.0


In [86]:
print(round(accuracy_score(y_pred2, y_pred), 5))
print(round(recall_score(y_val, y_pred2, pos_label=1) , 5))
print(round(recall_score(y_val, y_pred2, pos_label=0), 5))

0.97128
0.86822
0.99409



__Setting C__: `C=1.0`. If you have a lot of noisy observations you should decrease it: decreasing C corresponds to more regularization.
