# Model Evaluation 1

---

__This Notebook__

Looks at what kinds of text messages trip up the classifiers in the training and test sets; comparing them might help in understanding what needed to be done. This should have been done during training and modeling, but I only just thought of it.


## Setup

In [1]:
import re
import os
import sys
import time
import joblib 

import numpy as np
import pandas as pd
import scipy.sparse as sp
import matplotlib.pyplot as plt

from datetime import datetime

from sklearn.ensemble import AdaBoostClassifier, \
    RandomForestClassifier, GradientBoostingClassifier

import custom.evaluate_models as E

# Display configuration: never truncate printed arrays, and let text
# columns show up to 999 characters so the messages are readable.
np.set_printoptions(threshold=sys.maxsize)
pd.set_option('display.max_colwidth', 999)

# Stamp the notebook with the date (day) and time (T) of this run.
dt_object = datetime.fromtimestamp(time.time())
day, T = dt_object.isoformat(sep=' ', timespec='seconds').split(' ')
print(f'Revised on: {day}')

Revised on: 2021-02-23


## Load Raw Data

In [2]:
def load_raw(data):
    """Load the first column of ``data/1_raw/<data>.csv`` as a 1-D array.

    Parameters
    ----------
    data : str
        File stem of the CSV to read (e.g. ``"X_train"``); ``".csv"`` is
        appended.

    Returns
    -------
    numpy.ndarray
        A fresh 1-D array holding the values of the file's first column.
    """
    raw_path = os.path.join("data", "1_raw")
    out_dfm = pd.read_csv(os.path.join(raw_path, f"{data}.csv"))
    # Series.ravel() is deprecated in recent pandas; to_numpy() is the
    # supported conversion. copy=True keeps the returned array independent
    # of the DataFrame, as the original np.array(...) call did.
    return out_dfm.iloc[:, 0].to_numpy(copy=True)

# Raw messages and their string labels for both splits, loaded in the
# same order as before.
X_train_raw, X_test_raw, y_train_array, y_test_array = (
    load_raw(split) for split in ("X_train", "X_test", "y_train", "y_test")
)

def make_int(y_array):
    """Return an integer copy of a label array: 'ham' -> 0, 'spam' -> 1.

    The input array is not modified; the replacement happens on a copy,
    which is then cast with ``astype('int')``.
    """
    encoded = y_array.copy()
    for label, code in (('ham', 0), ('spam', 1)):
        encoded[encoded == label] = code
    return encoded.astype('int')

# Integer-encoded targets (0 = ham, 1 = spam) for both splits.
y_train, y_test = (make_int(labels) for labels in (y_train_array, y_test_array))

## Load Preprocessed Data

In [3]:
def load_X(filename):
    """Read ``data/2_processed/<filename>.npz`` and return the sparse matrix stored in it."""
    npz_path = os.path.join("data", "2_processed", filename + '.npz')
    return sp.load_npz(npz_path)

# Sparse feature matrices for the train and test splits.
X_train_processed, X_test_processed = (
    load_X(stem) for stem in ('X_train_processed', 'X_test_processed')
)

## Instantiate Candidate Models

In [4]:
# remember to use warm_start=True for learning curves

# Settings shared by both random forests; they differ only in max_features.
_rf_common = dict(
    random_state=42,
    n_estimators=100,
    max_depth=8,
    min_samples_split=3,
    n_jobs=1,
)
rnd_clf1 = RandomForestClassifier(max_features=150, **_rf_common)
rnd_clf2 = RandomForestClassifier(max_features=300, **_rf_common)

ada_clf = AdaBoostClassifier(
    n_estimators=10,
    learning_rate=0.001,
    random_state=42,
)

# Gradient boosting variants: 50 depth-1 trees over all features (1a),
# 100 depth-8 trees (2a) and 50 depth-3 trees (2c), the latter two
# limited to 300 features per split.
gbc1a = GradientBoostingClassifier(
    n_estimators=50,
    max_depth=1,
    max_features=None,
    min_samples_split=2,
    random_state=42,
)

gbc2a = GradientBoostingClassifier(
    n_estimators=100,
    max_depth=8,
    max_features=300,
    min_samples_split=5,
    random_state=42,
)

gbc2c = GradientBoostingClassifier(
    n_estimators=50,
    max_depth=3,
    max_features=300,
    min_samples_split=5,
    random_state=42,
)

## AdaBoost

In [5]:
# AdaBoost: fit on the processed training set, predict the processed test
# set, then report the results (the recorded output shows elapsed time, a
# confusion matrix, and acc/tpr/tnr).
# y_pred is deliberately a notebook-level name: get_mistakes() reads it.
E.fit_clf(ada_clf, X_train_processed, y_train)
y_pred = ada_clf.predict(X_test_processed)
E.eval_clf(y_test, y_pred)

Elapsed: 0m 4s
          pred_neg  pred_pos
cond_neg      1432        10
cond_pos         8       222
acc: 0.9892
tpr: 0.9652
tnr: 0.9931


In [6]:
def get_mistakes(y_pred=None, y_true=None, X_raw=None):
    """Collect the messages a classifier got wrong.

    Parameters
    ----------
    y_pred : array-like, optional
        Predicted labels. Defaults to the notebook-level ``y_pred``.
    y_true : array-like, optional
        True labels. Defaults to the notebook-level ``y_test``.
    X_raw : array-like, optional
        Raw messages aligned with the labels. Defaults to the
        notebook-level ``X_test_raw``.

    Returns
    -------
    pandas.DataFrame
        One row per misclassified message, in original order, with columns
        ``y_pred`` (the wrong prediction) and ``X_raw`` (the message text).

    Raises
    ------
    ValueError
        If the three inputs do not all have the same length.
    """
    # Calling with no arguments keeps the original behaviour of reading the
    # notebook globals; passing the arrays explicitly avoids depending on
    # whichever model happened to set y_pred last.
    scope = globals()
    y_pred = np.asarray(scope['y_pred'] if y_pred is None else y_pred)
    y_true = np.asarray(scope['y_test'] if y_true is None else y_true)
    X_raw = np.asarray(scope['X_test_raw'] if X_raw is None else X_raw)

    if not (len(y_pred) == len(y_true) == len(X_raw)):
        raise ValueError(
            'y_pred, y_true and X_raw must have the same length, got '
            f'{len(y_pred)}, {len(y_true)} and {len(X_raw)}'
        )

    # Positions where prediction and truth disagree.
    indices = np.flatnonzero(y_pred != y_true)
    return pd.DataFrame({'y_pred': y_pred[indices],
                         'X_raw': X_raw[indices]})

df = get_mistakes()

# false positives: predicted spam but was legit
df.loc[df['y_pred'].eq(1)]

Unnamed: 0,y_pred,X_raw
2,1,"Funny fact Nobody teaches volcanoes 2 erupt, tsunamis 2 arise, hurricanes 2 sway aroundn no 1 teaches hw 2 choose a wife Natural disasters just happens"
3,1,"4 tacos + 1 rajas burrito, right?"
4,1,Your daily text from me ‰ÛÒ a favour this time
5,1,I (Career Tel) have added u as a contact on INDYAROCKS.COM to send FREE SMS. To remove from phonebook - sms NO to &lt;#&gt;
6,1,645
7,1,"Sometimes we put walls around our hearts,not just to be safe from getting hurt.. But to find out who cares enough to break the walls &amp; get closer.. GOODNOON:)"
8,1,Ok... Ur typical reply...
10,1,Your bill at 3 is å£33.65 so thats not bad!
11,1,2marrow only. Wed at &lt;#&gt; to 2 aha.
17,1,"Hi, Mobile no. &lt;#&gt; has added you in their contact list on www.fullonsms.com It s a great place to send free sms to people For more visit fullonsms.com"


In [7]:
# false negatives: predicted ham but was spam
df.loc[df['y_pred'].eq(0)]

Unnamed: 0,y_pred,X_raw
0,0,"Hi this is Amy, we will be sending you a free phone number in a couple of days, which will give you an access to all the adult parties..."
1,0,"Goal! Arsenal 4 (Henry, 7 v Liverpool 2 Henry scores with a simple shot from 6 yards from a pass by Bergkamp to give Arsenal a 2 goal margin after 78 mins."
9,0,"Latest News! Police station toilet stolen, cops have nothing to go on!"
12,0,"Xmas & New Years Eve tickets are now on sale from the club, during the day from 10am till 8pm, and on Thurs, Fri & Sat night this week. They're selling fast!"
13,0,"Hello darling how are you today? I would love to have a chat, why dont you tell me what you look like and what you are in to sexy?"
14,0,TBS/PERSOLVO. been chasing us since Sept forå£38 definitely not paying now thanks to your information. We will ignore them. Kath. Manchester.
15,0,Money i have won wining number 946 wot do i do next
16,0,For sale - arsenal dartboard. Good condition but no doubles or trebles!


## Random Forest 1

In [8]:
# Random Forest 1: fit on the processed training set, predict the processed
# test set, then report the results.
# This overwrites the notebook-level y_pred, so later get_mistakes() calls
# describe this model rather than the previous one.
E.fit_clf(rnd_clf1, X_train_processed, y_train)
y_pred = rnd_clf1.predict(X_test_processed)
E.eval_clf(y_test, y_pred)

Elapsed: 1m 1s
          pred_neg  pred_pos
cond_neg      1433         9
cond_pos         7       223
acc: 0.9904
tpr: 0.9696
tnr: 0.9938


In [9]:
df = get_mistakes()

# false positives: predicted spam but was legit
df.loc[df['y_pred'].eq(1)]

Unnamed: 0,y_pred,X_raw
2,1,"Funny fact Nobody teaches volcanoes 2 erupt, tsunamis 2 arise, hurricanes 2 sway aroundn no 1 teaches hw 2 choose a wife Natural disasters just happens"
3,1,"4 tacos + 1 rajas burrito, right?"
4,1,Your daily text from me ‰ÛÒ a favour this time
5,1,I (Career Tel) have added u as a contact on INDYAROCKS.COM to send FREE SMS. To remove from phonebook - sms NO to &lt;#&gt;
6,1,645
7,1,"Sometimes we put walls around our hearts,not just to be safe from getting hurt.. But to find out who cares enough to break the walls &amp; get closer.. GOODNOON:)"
8,1,Ok... Ur typical reply...
10,1,Your bill at 3 is å£33.65 so thats not bad!
11,1,2marrow only. Wed at &lt;#&gt; to 2 aha.


In [10]:
# false negatives: predicted ham but was spam
df.loc[df['y_pred'].eq(0)]

Unnamed: 0,y_pred,X_raw
0,0,"Hi this is Amy, we will be sending you a free phone number in a couple of days, which will give you an access to all the adult parties..."
1,0,"Goal! Arsenal 4 (Henry, 7 v Liverpool 2 Henry scores with a simple shot from 6 yards from a pass by Bergkamp to give Arsenal a 2 goal margin after 78 mins."
9,0,"Latest News! Police station toilet stolen, cops have nothing to go on!"
12,0,"Xmas & New Years Eve tickets are now on sale from the club, during the day from 10am till 8pm, and on Thurs, Fri & Sat night this week. They're selling fast!"
13,0,"Hello darling how are you today? I would love to have a chat, why dont you tell me what you look like and what you are in to sexy?"
14,0,Money i have won wining number 946 wot do i do next
15,0,For sale - arsenal dartboard. Good condition but no doubles or trebles!


In [11]:
import custom.clean_preprocess as cp
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

# Preprocessing steps: n-gram counting, vectorisation, then TF-IDF weighting.
# NOTE(review): the "careful" markers presumably mean these settings must
# match the ones used to build the processed matrices - confirm.
pipeline_steps = [
    ('counter', cp.DocumentToNgramCounterTransformer(n_grams=3)),
    ('bot', cp.WordCounterToVectorTransformer(vocabulary_size=2000)),  # careful
    ('tfidf', TfidfTransformer(sublinear_tf=True)),  # careful
]
pipe = Pipeline(pipeline_steps)

In [12]:
# Run only the n-gram counting step on the raw test messages.
X_test_counter = pipe.named_steps['counter'].fit_transform(X_test_raw)

In [14]:
# Test-set positions where the current y_pred disagrees with y_test.
indices = [ix for ix, pred in enumerate(y_pred) if pred != y_test[ix]]
# Token / n-gram counts of the first misclassified message.
print(X_test_counter[indices][0])

Counter({'you': 2, 'hi': 1, 'this': 1, 'amy': 1, 'we': 1, 'sending': 1, 'free': 1, 'phone': 1, 'number': 1, 'couple': 1, 'day': 1, 'which': 1, 'give': 1, 'access': 1, 'all': 1, 'adult': 1, 'party': 1, 'hi_this': 1, 'this_is': 1, 'is_amy': 1, 'amy_we': 1, 'we_will': 1, 'will_be': 1, 'be_sending': 1, 'sending_you': 1, 'you_a': 1, 'a_free': 1, 'free_phone': 1, 'phone_number': 1, 'number_in': 1, 'in_a': 1, 'a_couple': 1, 'couple_of': 1, 'of_days': 1, 'days_which': 1, 'which_will': 1, 'will_give': 1, 'give_you': 1, 'you_an': 1, 'an_access': 1, 'access_to': 1, 'to_all': 1, 'all_the': 1, 'the_adult': 1, 'adult_parties': 1, 'hi_this_is': 1, 'this_is_amy': 1, 'is_amy_we': 1, 'amy_we_will': 1, 'we_will_be': 1, 'will_be_sending': 1, 'be_sending_you': 1, 'sending_you_a': 1, 'you_a_free': 1, 'a_free_phone': 1, 'free_phone_number': 1, 'phone_number_in': 1, 'number_in_a': 1, 'in_a_couple': 1, 'a_couple_of': 1, 'couple_of_days': 1, 'of_days_which': 1, 'days_which_will': 1, 'which_will_give': 1, 'will_

In [15]:
# Raw text of the test message at position 668 (hard-coded).
# NOTE(review): presumably 668 == indices[10], i.e. the same message whose
# token counts are printed in the next cell - confirm.
X_test_raw[668]

'Your bill at 3 is å£33.65 so thats not bad!'

In [16]:
# Token / n-gram counts for the 11th misclassified message (position 10 in
# `indices`). In the recorded output the mis-encoded 'å£' in the raw text
# appears to be counted as an EMOJI token, next to three NUM tokens.
print(X_test_counter[indices][10])

Counter({'NUM': 3, 'your': 1, 'bill': 1, 'EMOJI': 1, 'so': 1, 'thats': 1, 'not': 1, 'bad': 1, 'your_bill': 1, 'bill_at': 1, 'at_NUM': 1, 'NUM_is': 1, 'is_EMOJI': 1, 'EMOJI_NUM': 1, 'NUM_NUM': 1, 'NUM_so': 1, 'so_thats': 1, 'thats_not': 1, 'not_bad': 1, 'your_bill_at': 1, 'bill_at_NUM': 1, 'at_NUM_is': 1, 'NUM_is_EMOJI': 1, 'is_EMOJI_NUM': 1, 'EMOJI_NUM_NUM': 1, 'NUM_NUM_so': 1, 'NUM_so_thats': 1, 'so_thats_not': 1, 'thats_not_bad': 1})


---