In [16]:
import numpy as np
import pandas as pd
import os
import sklearn
import time

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Custom functions 
def print_gridsearch_results(grid_searcher, unique_params):
    # For a given gridsearcher and the relevant params used in grid_search, print the results of the runs
    # Get the data as a pandas DF
    gsearch_results_df = pd.DataFrame(grid_searcher.cv_results_).copy()
    print("Dataframe has shape: %s" % (str(gsearch_results_df.shape)))
    n_trials_grid_search = gsearch_results_df.shape[0]
    print("Number of trials used in grid search: ", n_trials_grid_search)

    # Rearrange row order so it is easy to skim
    gsearch_results_df.sort_values('rank_test_score', inplace=True)
    # Transform param-text to match up with cv_results_ representation
    param_keys = [f"param_{key}" for key in unique_params]
    return(gsearch_results_df[param_keys + ['mean_train_score', 'mean_test_score', 'mean_fit_time', 'rank_test_score']])



RANDOM_STATE = 123

In [2]:
# Plotting utils
import matplotlib
import matplotlib.pyplot as plt

%matplotlib inline
plt.style.use('seaborn') # pretty matplotlib plots

import seaborn as sns
sns.set('notebook', font_scale=1.25, style='whitegrid')

# Setup: Importing the Text


In [3]:
data_dir = '../data_reviews'
x_train_df = pd.read_csv(os.path.join(data_dir, 'x_train.csv'))
y_train_df = pd.read_csv(os.path.join(data_dir, 'y_train.csv'))

print("Shape of data\n---------------")
print(f"x_train_df shape: {x_train_df.shape} ")
print(f"y_train_df shape: {y_train_df.shape} ")

# Get the text as a list of strings
x_train_text = x_train_df['text'].values
y_train = y_train_df['is_positive_sentiment'].values

Shape of data
---------------
x_train_df shape: (2400, 2) 
y_train_df shape: (2400, 1) 


## Basic comparison of two vectorizers - one with counts and one using tfidf

In [4]:
count_vectorizer = CountVectorizer(stop_words='english')
count_vectorizer.fit_transform(x_train_text)
x_train_text_count = count_vectorizer.transform(x_train_text).toarray()


In [5]:
len(count_vectorizer.vocabulary_)

4255

In [6]:
tf_vectorizer = TfidfVectorizer(stop_words='english')
tf_vectorizer.fit_transform(x_train_text)
x_train_text_tf = tf_vectorizer.transform(x_train_text).toarray()

In [7]:
len(tf_vectorizer.vocabulary_)

4255

## Train two Random Forests to compare performance

This manual approach to training models is no longer neceesary, as the pipeline enables a much speedier and more effective way of performing this comparison while also tuning hyperparameters.


### Using pipeline to try many tf-idf hyperparameters

In [47]:
tfidf_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', RandomForestClassifier(random_state=RANDOM_STATE)),
 ])
tfidf_parameters = {
    'tfidf__min_df': np.arange(1,6),
    'tfidf__max_df': (0.05, 0.1, 0.5, 1.0),
    'tfidf__ngram_range': [(1, 1), (1, 2), (2,2)],
}

tfidf_grid_searcher = GridSearchCV(
    tfidf_pipeline, 
    tfidf_parameters, 
    cv=7, 
    n_jobs=3, 
    verbose=3,
    scoring='balanced_accuracy',
    return_train_score=True
)
tfidf_grid_searcher.fit(x_train_text, y_train)

Fitting 7 folds for each of 60 candidates, totalling 420 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:   17.1s
[Parallel(n_jobs=3)]: Done 122 tasks      | elapsed:   50.7s
[Parallel(n_jobs=3)]: Done 282 tasks      | elapsed:  1.8min
[Parallel(n_jobs=3)]: Done 420 out of 420 | elapsed:  2.6min finished


GridSearchCV(cv=7,
             estimator=Pipeline(steps=[('tfidf',
                                        TfidfVectorizer(stop_words='english')),
                                       ('clf',
                                        RandomForestClassifier(random_state=123))]),
             n_jobs=3,
             param_grid={'tfidf__max_df': (0.05, 0.1, 0.5, 1.0),
                         'tfidf__min_df': array([1, 2, 3, 4, 5]),
                         'tfidf__ngram_range': [(1, 1), (1, 2), (2, 2)]},
             return_train_score=True, scoring='balanced_accuracy', verbose=3)

In [48]:
print_gridsearch_results(tfidf_grid_searcher, list(tfidf_parameters.keys()))

Dataframe has shape: (60, 27)
Number of trials used in grid search:  60


Unnamed: 0,param_tfidf__min_df,param_tfidf__max_df,param_tfidf__ngram_range,mean_train_score,mean_test_score,mean_fit_time,rank_test_score
34,2,0.5,"(1, 2)",0.99243,0.786251,0.995637,1
19,2,0.1,"(1, 2)",0.99243,0.786251,0.924576,1
49,2,1.0,"(1, 2)",0.99243,0.786251,0.934312,1
18,2,0.1,"(1, 1)",0.99243,0.780403,0.840764,4
48,2,1.0,"(1, 1)",0.99243,0.780403,0.863084,4
33,2,0.5,"(1, 1)",0.99243,0.780403,0.977097,4
45,1,1.0,"(1, 1)",0.997083,0.777073,1.043903,7
15,1,0.1,"(1, 1)",0.997083,0.777073,1.12492,7
30,1,0.5,"(1, 1)",0.997083,0.777073,1.120552,7
46,1,1.0,"(1, 2)",0.997153,0.775427,1.87498,10


In [49]:
print("Best parameters set:")
print(tfidf_grid_searcher.best_estimator_[0])
print(tfidf_grid_searcher.best_params_)
cv_results = tfidf_grid_searcher.cv_results_
cv_results

Best parameters set:
TfidfVectorizer(max_df=0.1, min_df=2, ngram_range=(1, 2), stop_words='english')
{'tfidf__max_df': 0.1, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}


{'mean_fit_time': array([1.00380949, 1.76533845, 1.68783314, 0.8711585 , 0.96136699,
        0.54716791, 0.84429421, 0.85213382, 0.3358749 , 0.7433861 ,
        0.78840739, 0.26329   , 0.74374199, 0.77351608, 0.2488749 ,
        1.12491958, 1.70673708, 1.73054934, 0.84076398, 0.92457587,
        0.46183705, 0.73825492, 0.87356343, 0.32809462, 0.73607857,
        0.81073628, 0.28995964, 0.7555416 , 0.77596477, 0.25302533,
        1.12055159, 1.88589304, 1.81046646, 0.97709686, 0.99563691,
        0.46864809, 0.82107118, 0.91144436, 0.34188598, 0.73033026,
        0.78173634, 0.27642713, 0.73009855, 0.76489541, 0.2278364 ,
        1.04390335, 1.8749796 , 1.81763288, 0.86308353, 0.934312  ,
        0.5103083 , 0.855807  , 0.90996436, 0.35579603, 0.77840706,
        0.87090118, 0.32290891, 0.75712017, 0.70953366, 0.24616344]),
 'std_fit_time': array([0.04221072, 0.05026874, 0.0870811 , 0.06279129, 0.04165447,
        0.03546871, 0.04061509, 0.03053882, 0.0091335 , 0.02643231,
        0.040

### Characterize the vocabulary for this optimal vectorizer

In [45]:
best_tfidf_vectorizer = tfidf_grid_searcher.best_estimator_[0]
best_tfidf_vectorizer.fit_transform(x_train_text)
x_train_text_tf_best = best_tfidf_vectorizer.transform(x_train_text).toarray()
len(best_tfidf_vectorizer.vocabulary_)

4255

In [46]:
best_tfidf_vectorizer.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.float64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 0.1,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 1),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': 'english',
 'strip_accents': None,
 'sublinear_tf': False,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'use_idf': True,
 'vocabulary': None}

# Using pipeline to try many CountVectorizer hyperparameters

In [56]:
count_pipeline = Pipeline([
    ('count', CountVectorizer(stop_words='english')),
    ('clf', RandomForestClassifier(random_state=RANDOM_STATE)),
 ])
count_parameters = {
    'count__min_df': np.arange(1, 5),
    'count__max_df': (0.05, 0.1, 0.5),
    'count__ngram_range': [(1, 1), (1, 2), (2,2)],
}

count_grid_searcher = GridSearchCV(
    count_pipeline, 
    count_parameters, 
    cv=7, 
    n_jobs=3, 
    verbose=3,
    scoring='balanced_accuracy',
    return_train_score=True
)
count_grid_searcher.fit(x_train_text, y_train)

print("Best parameters set:")
print(count_grid_searcher.best_estimator_.steps)


Fitting 7 folds for each of 36 candidates, totalling 252 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:   16.5s
[Parallel(n_jobs=3)]: Done 122 tasks      | elapsed:  1.1min
[Parallel(n_jobs=3)]: Done 252 out of 252 | elapsed:  2.4min finished


Best parameters set:
[('count', CountVectorizer(max_df=0.1, min_df=2, ngram_range=(1, 2), stop_words='english')), ('clf', RandomForestClassifier(random_state=123))]


In [57]:
print_gridsearch_results(count_grid_searcher, list(count_parameters.keys()))

Dataframe has shape: (36, 27)
Number of trials used in grid search:  36


Unnamed: 0,param_count__min_df,param_count__max_df,param_count__ngram_range,mean_train_score,mean_test_score,mean_fit_time,rank_test_score
28,2,0.5,"(1, 2)",0.992361,0.787487,1.303181,1
16,2,0.1,"(1, 2)",0.992361,0.787487,1.731245,1
27,2,0.5,"(1, 1)",0.992361,0.784983,1.483919,3
15,2,0.1,"(1, 1)",0.992361,0.784983,1.627279,3
24,1,0.5,"(1, 1)",0.997014,0.776243,2.918857,5
12,1,0.1,"(1, 1)",0.997014,0.776243,1.184831,5
25,1,0.5,"(1, 2)",0.997153,0.772493,4.067247,7
13,1,0.1,"(1, 2)",0.997153,0.772493,2.048632,7
31,3,0.5,"(1, 2)",0.980972,0.769149,1.060998,9
19,3,0.1,"(1, 2)",0.980972,0.769149,1.538422,9


In [58]:
best_count_vectorizer = count_grid_searcher.best_estimator_[0]
best_count_vectorizer.fit_transform(x_train_text)
x_train_text_count_best = best_count_vectorizer.transform(x_train_text).toarray()
len(best_count_vectorizer.vocabulary_)

2131

In [51]:
len(x_train_text_count_best[0])

2131

### Archive 

In [58]:
# skf = sklearn.model_selection.StratifiedKFold(n_splits=5)
# count_tf_scores = [[],[]]
# count_tf_train_time = [[],[]]

# for train_index, test_index in skf.split(x_train_text, y_train): 
#     ## Count
#     #
#     count_randforest = RandomForestClassifier(random_state=RANDOM_STATE)
#     # Time the training process
#     start_time_sec = time.time()
#     count_randforest.fit(x_train_text_count[train_index], y_train[train_index])
#     elapsed_time_sec = time.time() - start_time_sec
#     # Get accuracy scores for this split
#     count_y_test_pred = count_randforest.predict(x_train_text_count[test_index])
#     count_balanced_acc = sklearn.metrics.balanced_accuracy_score(
#         y_true=y_train[test_index], 
#         y_pred=count_y_test_pred
#     )
#     print("Performance of the count_vectorized random forest")
#     print(count_balanced_acc)
#     count_tf_scores[0].append(count_balanced_acc)
#     count_tf_train_time[0].append(elapsed_time_sec)
    
#     # TFIDF
#     # 
#     tf_randforest = RandomForestClassifier(random_state=RANDOM_STATE)
#     # Time the training process
#     start_time_sec = time.time()    
#     tf_randforest.fit(x_train_text_tf[train_index], y_train[train_index])
#     elapsed_time_sec = time.time() - start_time_sec
#     # Get accuracy scores for this split
#     tf_y_test_pred=tf_randforest.predict(x_train_text_tf[test_index])
    
#     tf_balanced_acc = sklearn.metrics.balanced_accuracy_score(
#         y_true=y_train[test_index], 
#         y_pred=tf_y_test_pred
#     )
#     print("Performance of the tfidf_vectorized random forest")    
#     print(tf_balanced_acc)
#     count_tf_scores[1].append(tf_balanced_acc)
#     count_tf_train_time[1].append(elapsed_time_sec)
    
# print("Best overall")
# print(f"type:  | score | time ")
# print(f"count: | %5.3f | %4.3f" % (np.mean(count_tf_scores[0]), np.mean(count_tf_train_time[0])))
# print(f"tfidf: | %5.3f | %4.3f" % (np.mean(count_tf_scores[1]), np.mean(count_tf_train_time[1])))
