# Voting Classifier

---

__This Notebook__

- Tries to implement a quick voting classifier using a baseline logistic classifier, the optimized random forest classifier, along with a quick SVM classifier, in the hopes that these estimators make different enough kinds of mistakes that the voting classifier outperforms them

__Results__ 

- It appears as if the *wisdom of the crowds* only works when everyone in the crowd is more or less clueless. When we have an "expert" in the crowd (the random forest model), we should probably follow that expert rather than vote: a voting classifier neither recognizes the expert nor necessarily outperforms it.


## Setup

In [1]:
import re
import os
import time
import joblib 

import numpy as np
import pandas as pd
import scipy.sparse as sp

from datetime import datetime

# Stamp the notebook with the date of the current run.
dt_object = datetime.now()
# Render as 'YYYY-MM-DD HH:MM:SS' (no fractional seconds) and separate date from time.
day, T = dt_object.isoformat(sep=' ', timespec='seconds').split(' ')
print('Revised on: ' + day)

Revised on: 2021-01-22


## Load

The pipeline needs to be rerun and the SVD output scaled; otherwise the logistic classifier will perform badly.

In [2]:
import json
import urlextract
from nltk.stem import WordNetLemmatizer

def load_data(data):
    """Load the first column of ``data/1_raw/<data>.csv`` as a 1-D array.

    Parameters
    ----------
    data : str
        File name without the ``.csv`` extension.

    Returns
    -------
    numpy.ndarray
        A fresh one-dimensional array holding the values of the first
        column of the CSV file.
    """
    csv_path = os.path.join("data", "1_raw", f"{data}.csv")
    out_dfm = pd.read_csv(csv_path)
    # Series.ravel() is deprecated in recent pandas; to_numpy(copy=True)
    # yields the same 1-D array and keeps the result independent of the frame.
    return out_dfm.iloc[:, 0].to_numpy(copy=True)

X_train = load_data("X_train")
y_train = load_data("y_train")

# Work on a copy so that y_train keeps its original string labels.
y = y_train.copy()

# transform y_array into int type
y[y=='ham'] = 0
y[y=='spam'] = 1
y = y.astype('int')

# load contractions map for custom cleanup
# (explicit UTF-8 so the JSON decodes the same way on every platform,
# instead of depending on the locale's default encoding)
with open("contractions_map.json", encoding="utf-8") as f:
    contractions_map = json.load(f)

In [3]:
import custom.clean_preprocess as cp
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

# Text pipeline: n-gram counting -> fixed-size vocabulary vectors -> TF-IDF weighting.
pipe = Pipeline(steps=[
    ('counter', cp.DocumentToNgramCounterTransformer(n_grams=3)),
    ('bot', cp.WordCounterToVectorTransformer(vocabulary_size=2000)),
    ('tfidf', TfidfTransformer(sublinear_tf=True)),
])

# Fit the pipeline on the raw messages and keep the transformed matrix.
X_tfidf = pipe.fit_transform(X_train)

In [4]:
from scipy.sparse.linalg import svds
from sklearn.utils.extmath import svd_flip
from sklearn.preprocessing import MaxAbsScaler

def perform_SVD(X, n_components=300):
    """Project documents onto a truncated SVD basis and max-abs scale the result.

    The truncated SVD is computed on ``X.T`` (the term-document matrix), so
    the right singular vectors ``V`` have one row per row of ``X``. ``V`` is
    then passed through a freshly fitted ``MaxAbsScaler``.

    Parameters
    ----------
    X : scipy sparse matrix
        Matrix to decompose; the code treats its transpose as the
        term-document matrix.
    n_components : int, default=300
        Number of singular triplets requested from ``svds``.

    Returns
    -------
    numpy.ndarray
        Scaled ``V`` with ``n_components`` columns (scaled for Logistic
        Regression).
    """
    # svds needs floating-point input. The sparse `asfptype()` helper that
    # used to perform this upcast is deprecated in SciPy, so the conversion
    # is done explicitly and only when the dtype is not already inexact.
    X_array = X if np.issubdtype(X.dtype, np.inexact) else X.astype(np.float64)

    U, _, VT = svds(X_array.T,  # term-document matrix
                    k=n_components)

    # Reverse the component order returned by svds and make the signs
    # deterministic. The singular values themselves are not used.
    U, VT = svd_flip(U[:, ::-1], VT[::-1])

    # return V
    V = VT.T
    scaler = MaxAbsScaler()
    V_scaled = scaler.fit_transform(V)
    return V_scaled  # scaled for Logistic Regression

X_tfidf_svd = perform_SVD(X_tfidf, n_components=800)

In [5]:
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between every pair of document vectors.
X_tfidf_svd_allcos = cosine_similarity(X_tfidf_svd)

train_df = pd.DataFrame({'sms': X_train, 'target': y_train})

# row labels of the spam messages (default integer index, so these are positions)
spam_ix = train_df.loc[train_df['target'] == 'spam'].index

# For every message, average its similarity to all spam messages.
# NOTE(review): spam_ix is built from the labels of every loaded row, and the
# train/validation split only happens in a later cell, so this feature is
# computed with the labels of rows that end up in the validation set. Each
# spam row's mean also includes its similarity to itself. Confirm this is
# intended before trusting the validation scores.
mean_spam_sims = [
    np.mean(X_tfidf_svd_allcos[row, spam_ix])
    for row in range(X_tfidf_svd_allcos.shape[0])
]

# Prepend the mean-spam-similarity column to the SVD features.
X_tfidf_svd800_spamcos_scaled = sp.hstack(
    (csr_matrix(mean_spam_sims).T, X_tfidf_svd)
)

__Persist__

In [7]:
# Write the scaled feature matrix to the processed-data folder.
proc_dir = os.path.join("data", "2_processed")
filename = 'X_tfidf_svd800_spamcos_scaled.npz'
sp.save_npz(
    file=os.path.join(proc_dir, filename),
    matrix=X_tfidf_svd800_spamcos_scaled,
)

## Train Test Split

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, recall_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Stratified hold-out split of the scaled features.
# Note: this rebinds X_train / y_train, which until now held the raw
# messages and their string labels.
X_train, X_val, y_train, y_val = train_test_split(
    X_tfidf_svd800_spamcos_scaled, y, stratify=y, random_state=42
)

In [9]:
# instantiate estimators

# logistic regression
log_clf = LogisticRegression(solver="liblinear", random_state=42)

# random forest with fixed hyper-parameters
rnd_clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=8,
    max_features=150,
    min_samples_split=3,
    n_jobs=-1,
    random_state=42,
)

# SVM with default settings (used for hard voting)
svm_clf = SVC(random_state=42)

# same SVM with probability estimates enabled (used for soft voting)
svm_clf_prob = SVC(probability=True, random_state=42)

# ensemble with voting='hard'
vot_clf_hard = VotingClassifier(
    estimators=[('log', log_clf), ('rnd', rnd_clf), ('svm', svm_clf)],
    voting='hard',
)

# ensemble with voting='soft'
vot_clf_soft = VotingClassifier(
    estimators=[('log', log_clf), ('rnd', rnd_clf), ('svm', svm_clf_prob)],
    voting='soft',
)

In [10]:
def quick_eval(classifiers):
    """Fit each classifier and print validation accuracy, TPR, TNR and fit time.

    Uses the notebook-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``
    that are in scope when the function is called, so the scores always refer
    to whichever split was assigned last.

    Parameters
    ----------
    classifiers : iterable
        Estimators exposing ``fit(X, y)`` and ``predict(X)``. Each one is
        fitted in place.
    """
    for clf in classifiers:
        T1 = time.time()
        clf.fit(X_train, y_train)
        # Stop the clock before predicting so that the reported
        # "train time" covers fitting only.
        mins, secs = divmod(time.time() - T1, 60)
        y_pred = clf.predict(X_val)
        print(clf.__class__, 'acc', round(accuracy_score(y_val, y_pred), 4))
        # recall on the positive class (1) is the true-positive rate
        print(clf.__class__, 'tpr', round(recall_score(y_val, y_pred, pos_label=1) , 4))
        # recall on the negative class (0) is the true-negative rate
        print(clf.__class__, 'tnr', round(recall_score(y_val, y_pred, pos_label=0), 4))
        print(f'{clf.__class__} - train time: {mins:0.0f}m {secs:0.0f}s')

In [11]:
quick_eval([log_clf, rnd_clf, svm_clf, vot_clf_hard])

<class 'sklearn.linear_model._logistic.LogisticRegression'> acc 0.9723
<class 'sklearn.linear_model._logistic.LogisticRegression'> tpr 0.8062
<class 'sklearn.linear_model._logistic.LogisticRegression'> tnr 0.9976
<class 'sklearn.linear_model._logistic.LogisticRegression'> - train time: 0m 0s
<class 'sklearn.ensemble._forest.RandomForestClassifier'> acc 0.9887
<class 'sklearn.ensemble._forest.RandomForestClassifier'> tpr 0.938
<class 'sklearn.ensemble._forest.RandomForestClassifier'> tnr 0.9965
<class 'sklearn.ensemble._forest.RandomForestClassifier'> - train time: 0m 9s
<class 'sklearn.svm._classes.SVC'> acc 0.959
<class 'sklearn.svm._classes.SVC'> tpr 0.6899
<class 'sklearn.svm._classes.SVC'> tnr 1.0
<class 'sklearn.svm._classes.SVC'> - train time: 0m 24s
<class 'sklearn.ensemble._voting.VotingClassifier'> acc 0.9744
<class 'sklearn.ensemble._voting.VotingClassifier'> tpr 0.8062
<class 'sklearn.ensemble._voting.VotingClassifier'> tnr 1.0
<class 'sklearn.ensemble._voting.VotingClassifi

In [12]:
quick_eval([svm_clf_prob, vot_clf_soft])

<class 'sklearn.svm._classes.SVC'> acc 0.959
<class 'sklearn.svm._classes.SVC'> tpr 0.6899
<class 'sklearn.svm._classes.SVC'> tnr 1.0
<class 'sklearn.svm._classes.SVC'> - train time: 1m 37s
<class 'sklearn.ensemble._voting.VotingClassifier'> acc 0.9836
<class 'sklearn.ensemble._voting.VotingClassifier'> tpr 0.8837
<class 'sklearn.ensemble._voting.VotingClassifier'> tnr 0.9988
<class 'sklearn.ensemble._voting.VotingClassifier'> - train time: 1m 48s


Calling `.predict` on the SVM classifier trained with `probability=True` gives exactly the same scores as the plain SVM, so the extra training time is wasted unless `.predict_proba` is actually needed.

Even after scaling to help out the logistic classifier, this ensemble still performs worse than the highly optimized random forest model, which itself isn't performing at its best because of the scaled SVD.

### Unscaled

In [13]:
# Read the unscaled SVD feature matrix that was saved earlier in the processed-data folder.
filename = 'X_tfidf_svd800_spamcos.npz'
X_tfidf_svd800_spamcos = sp.load_npz(file=os.path.join(proc_dir, filename))

In [14]:
# Re-split using the unscaled features, with the same seed and stratification as the scaled split.
X_train, X_val, y_train, y_val = train_test_split(
    X_tfidf_svd800_spamcos, y, stratify=y, random_state=42
)

In [15]:
# Soft-voting ensemble of the random forest and the probabilistic SVM only.
vot_clf_soft_noLR = VotingClassifier(
    estimators=[('rnd', rnd_clf), ('svm', svm_clf_prob)],
    voting='soft',
)

In [16]:
# only need to test random forest and new soft voting without logistic classifier
quick_eval([rnd_clf, vot_clf_soft_noLR])

<class 'sklearn.ensemble._forest.RandomForestClassifier'> acc 0.9928
<class 'sklearn.ensemble._forest.RandomForestClassifier'> tpr 0.969
<class 'sklearn.ensemble._forest.RandomForestClassifier'> tnr 0.9965
<class 'sklearn.ensemble._forest.RandomForestClassifier'> - train time: 0m 8s
<class 'sklearn.ensemble._voting.VotingClassifier'> acc 0.9867
<class 'sklearn.ensemble._voting.VotingClassifier'> tpr 0.9147
<class 'sklearn.ensemble._voting.VotingClassifier'> tnr 0.9976
<class 'sklearn.ensemble._voting.VotingClassifier'> - train time: 1m 48s


The unscaled SVD performs better; the logistic classifier was just pulling the vote down.

### Decision Thresholds

Briefly comparing the `.predict_proba` and `.predict` methods as `.predict_proba` allows for changing decision thresholds.

In [17]:
svm_clf_prob.fit(X_train, y_train)

SVC(probability=True, random_state=42)

In [18]:
y_prob = svm_clf_prob.predict_proba(X_val)
y_pred = svm_clf_prob.predict(X_val)

In [19]:
np.set_printoptions(precision=3, suppress=True) 
y_prob[0:10], y_pred[:10]

(array([[0.991, 0.009],
        [0.988, 0.012],
        [0.996, 0.004],
        [0.   , 1.   ],
        [0.992, 0.008],
        [0.989, 0.011],
        [0.   , 1.   ],
        [0.989, 0.011],
        [0.993, 0.007],
        [0.   , 1.   ]]),
 array([0, 0, 0, 1, 0, 0, 1, 0, 0, 1]))

In [20]:
def change_thresh(y_prob, tresh, y_true=None):
    """Print accuracy, TPR and TNR after re-thresholding class probabilities.

    A sample is labelled 0 when the value in the first column of ``y_prob``
    is strictly greater than ``tresh`` and 1 otherwise, so raising ``tresh``
    assigns more samples to class 1.

    Parameters
    ----------
    y_prob : array-like of shape (n_samples, 2)
        Per-sample probability pairs, e.g. the output of ``predict_proba``.
        Only the first column is compared against the threshold.
    tresh : float
        Threshold applied to the first column.
    y_true : array-like, optional
        Reference labels to score against. Defaults to the notebook-level
        ``y_val`` that is in scope when the function is called.

    Raises
    ------
    ValueError
        If ``y_prob`` is not a two-column, two-dimensional array.
    """
    if y_true is None:
        # Backward-compatible default: score against the current validation labels.
        y_true = y_val

    probs = np.asarray(y_prob)
    if probs.ndim != 2 or probs.shape[1] != 2:
        raise ValueError(
            f"y_prob must have shape (n_samples, 2), got {probs.shape}"
        )

    # Vectorised form of the per-row comparison on the first column.
    y_pred = np.where(probs[:, 0] > tresh, 0, 1)

    print(f'acc: {accuracy_score(y_true, y_pred):0.4f}')
    print(f'tpr: {recall_score(y_true, y_pred, pos_label=1):0.4f}')
    print(f'tnr: {recall_score(y_true, y_pred, pos_label=0):0.4f}')

In [21]:
change_thresh(y_prob, .5)

acc: 0.9764
tpr: 0.8605
tnr: 0.9941


In [22]:
change_thresh(y_prob, .6)

acc: 0.9774
tpr: 0.8760
tnr: 0.9929


In [23]:
change_thresh(y_prob, .7)

acc: 0.9754
tpr: 0.8915
tnr: 0.9882


In [24]:
change_thresh(y_prob, .8)

acc: 0.9764
tpr: 0.9147
tnr: 0.9858


In [25]:
change_thresh(y_prob, .9)

acc: 0.9713
tpr: 0.9302
tnr: 0.9775


### Adjusting the class weight according to the data

- Alternatively, we could pass `class_weight={0:1, 1:7}` to balance out the classes...
- However, on testing `{0:1, 1:7}` and `{0:7, 1:1}` I got similar and confusing results

In [26]:
# Probabilistic SVM with class_weight='balanced'.
svm_clf_prob_bal = SVC(
    class_weight='balanced',
    probability=True,
    random_state=42,
)

In [27]:
svm_clf_prob_bal.fit(X_train, y_train)

SVC(class_weight='balanced', probability=True, random_state=42)

In [28]:
y_prob = svm_clf_prob_bal.predict_proba(X_val)

In [29]:
change_thresh(y_prob, .5)

acc: 0.9764
tpr: 0.8682
tnr: 0.9929


In [30]:
change_thresh(y_prob, .6)

acc: 0.9764
tpr: 0.8760
tnr: 0.9917


In [31]:
change_thresh(y_prob, .7)

acc: 0.9764
tpr: 0.8915
tnr: 0.9894


In [32]:
change_thresh(y_prob, .8)

acc: 0.9744
tpr: 0.9070
tnr: 0.9846


In [33]:
change_thresh(y_prob, .9)

acc: 0.9692
tpr: 0.9225
tnr: 0.9764


### Regularization


__Setting C__: the default is `C=1.0`. If you have a lot of noisy observations you should decrease it: decreasing `C` corresponds to more regularization.


In [34]:
# Probabilistic, class-balanced SVM with C lowered to 0.7.
svm_clf_prob_reg = SVC(
    C=.7,
    class_weight='balanced',
    probability=True,
    random_state=42,
)

In [35]:
svm_clf_prob_reg.fit(X_train, y_train)

SVC(C=0.7, class_weight='balanced', probability=True, random_state=42)

In [36]:
y_prob = svm_clf_prob_reg.predict_proba(X_val)

In [37]:
change_thresh(y_prob, .5)

acc: 0.9764
tpr: 0.8682
tnr: 0.9929


In [38]:
change_thresh(y_prob, .6)

acc: 0.9764
tpr: 0.8760
tnr: 0.9917


In [39]:
change_thresh(y_prob, .7)

acc: 0.9754
tpr: 0.8915
tnr: 0.9882


In [40]:
change_thresh(y_prob, .8)

acc: 0.9723
tpr: 0.9070
tnr: 0.9823


In [41]:
change_thresh(y_prob, .9)

acc: 0.9672
tpr: 0.9225
tnr: 0.9740


---