In [33]:
import numpy as np
import pandas as pd
import os
import sklearn
import time

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV


RANDOM_STATE = 123

# Setup: Importing the Text


In [34]:
data_dir = '../data_reviews'
x_train_df = pd.read_csv(os.path.join(data_dir, 'x_train.csv'))
y_train_df = pd.read_csv(os.path.join(data_dir, 'y_train.csv'))

print("Shape of data\n---------------")
print(f"x_train_df shape: {x_train_df.shape} ")
print(f"y_train_df shape: {y_train_df.shape} ")

# Get the text as a list of strings
x_train_text = x_train_df['text'].values
y_train = y_train_df['is_positive_sentiment'].values

Shape of data
---------------
x_train_df shape: (2400, 2) 
y_train_df shape: (2400, 1) 


## Basic comparison of two vectorizers - one with counts and one using tfidf

In [35]:
count_vectorizer = CountVectorizer(stop_words='english')
count_vectorizer.fit_transform(x_train_text)
x_train_text_count = count_vectorizer.transform(x_train_text).toarray()


In [36]:
len(count_vectorizer.vocabulary_)

4255

In [37]:
tf_vectorizer = TfidfVectorizer(stop_words='english')
tf_vectorizer.fit_transform(x_train_text)
x_train_text_tf = tf_vectorizer.transform(x_train_text).toarray()

In [38]:
len(tf_vectorizer.vocabulary_)

4255

## Train two Random Forests to compare performance

This manual approach to training models is no longer neceesary, as the pipeline enables a much speedier and more effective way of performing this comparison while also tuning hyperparameters.


In [39]:
# skf = sklearn.model_selection.StratifiedKFold(n_splits=5)
# count_tf_scores = [[],[]]
# count_tf_train_time = [[],[]]

# for train_index, test_index in skf.split(x_train_text, y_train): 
#     ## Count
#     #
#     count_randforest = RandomForestClassifier(random_state=RANDOM_STATE)
#     # Time the training process
#     start_time_sec = time.time()
#     count_randforest.fit(x_train_text_count[train_index], y_train[train_index])
#     elapsed_time_sec = time.time() - start_time_sec
#     # Get accuracy scores for this split
#     count_y_test_pred = count_randforest.predict(x_train_text_count[test_index])
#     count_balanced_acc = sklearn.metrics.balanced_accuracy_score(
#         y_true=y_train[test_index], 
#         y_pred=count_y_test_pred
#     )
#     print("Performance of the count_vectorized random forest")
#     print(count_balanced_acc)
#     count_tf_scores[0].append(count_balanced_acc)
#     count_tf_train_time[0].append(elapsed_time_sec)
    
#     # TFIDF
#     # 
#     tf_randforest = RandomForestClassifier(random_state=RANDOM_STATE)
#     # Time the training process
#     start_time_sec = time.time()    
#     tf_randforest.fit(x_train_text_tf[train_index], y_train[train_index])
#     elapsed_time_sec = time.time() - start_time_sec
#     # Get accuracy scores for this split
#     tf_y_test_pred=tf_randforest.predict(x_train_text_tf[test_index])
    
#     tf_balanced_acc = sklearn.metrics.balanced_accuracy_score(
#         y_true=y_train[test_index], 
#         y_pred=tf_y_test_pred
#     )
#     print("Performance of the tfidf_vectorized random forest")    
#     print(tf_balanced_acc)
#     count_tf_scores[1].append(tf_balanced_acc)
#     count_tf_train_time[1].append(elapsed_time_sec)
    
# print("Best overall")
# print(f"type:  | score | time ")
# print(f"count: | %5.3f | %4.3f" % (np.mean(count_tf_scores[0]), np.mean(count_tf_train_time[0])))
# print(f"tfidf: | %5.3f | %4.3f" % (np.mean(count_tf_scores[1]), np.mean(count_tf_train_time[1])))


### Using pipeline to try many tf-idf hyperparameters

In [40]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', RandomForestClassifier(random_state=RANDOM_STATE)),
 ])
parameters = {
    'tfidf__min_df': np.arange(1,5),
    'tfidf__max_df': (0.05, 0.1, 0.5),
    'tfidf__ngram_range': [(1, 1), (1, 2)],
}

tfidf_grid_searcher = GridSearchCV(
    pipeline, 
    parameters, 
    cv=5, 
    n_jobs=3, 
    verbose=3,
    scoring='balanced_accuracy'
)
tfidf_grid_searcher.fit(x_train_text, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:   10.7s
[Parallel(n_jobs=3)]: Done 120 out of 120 | elapsed:   39.5s finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tfidf',
                                        TfidfVectorizer(stop_words='english')),
                                       ('clf',
                                        RandomForestClassifier(random_state=123))]),
             n_jobs=3,
             param_grid={'tfidf__max_df': (0.05, 0.1, 0.5),
                         'tfidf__min_df': array([1, 2, 3, 4]),
                         'tfidf__ngram_range': [(1, 1), (1, 2)]},
             scoring='balanced_accuracy', verbose=3)

In [41]:
tfidf_gsearch_results_df = pd.DataFrame(tfidf_grid_searcher.cv_results_).copy()
print("Dataframe has shape: %s" % (str(tfidf_gsearch_results_df.shape)))
tfidf_n_trials_grid_search = tfidf_gsearch_results_df.shape[0]

Dataframe has shape: (24, 16)


In [42]:
# Uncomment for exploring the columns in this grid search
# print("Dataframe has columns:")
# for c in gsearch_results_df.columns:
#     print("-- %s" % c)

In [55]:
tfidf_param_keys = ['param_tfidf__max_df', 'param_tfidf__min_df', 'param_tfidf__ngram_range']

# Rearrange row order so it is easy to skim
tfidf_gsearch_results_df.sort_values('rank_test_score', inplace=True)

tfidf_gsearch_results_df[tfidf_param_keys + ['mean_test_score', 'mean_fit_time', 'rank_test_score']]

Unnamed: 0,param_tfidf__max_df,param_tfidf__min_df,param_tfidf__ngram_range,mean_test_score,mean_fit_time,rank_test_score
16,0.5,1,"(1, 1)",0.77125,0.922695,1
8,0.1,1,"(1, 1)",0.77125,0.960489,1
18,0.5,2,"(1, 1)",0.762917,0.71905,3
10,0.1,2,"(1, 1)",0.762917,0.912067,3
11,0.1,2,"(1, 2)",0.762083,0.908493,5
19,0.5,2,"(1, 2)",0.762083,0.804433,5
21,0.5,3,"(1, 2)",0.75875,0.689086,7
13,0.1,3,"(1, 2)",0.75875,0.799291,7
20,0.5,3,"(1, 1)",0.757083,0.685579,9
12,0.1,3,"(1, 1)",0.757083,0.845756,9


In [56]:
print("Best parameters set:")
print(tfidf_grid_searcher.best_estimator_[0])
print(tfidf_grid_searcher.best_params_)
tfidf_grid_searcher.cv_results_


Best parameters set:
TfidfVectorizer(max_df=0.1, stop_words='english')
{'tfidf__max_df': 0.1, 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 1)}


{'mean_fit_time': array([0.93429585, 1.50092688, 0.77387552, 0.81365523, 0.77253776,
        0.80608625, 0.75610695, 0.70665698, 0.96048884, 1.88693528,
        0.91206698, 0.90849266, 0.84575615, 0.79929099, 0.64932785,
        0.69551797, 0.92269478, 1.50541005, 0.7190496 , 0.80443296,
        0.68557892, 0.68908577, 0.63044372, 0.65674491]),
 'std_fit_time': array([0.03668885, 0.07981282, 0.01024726, 0.01711552, 0.05640721,
        0.02997234, 0.03324778, 0.00783778, 0.03053296, 0.09449123,
        0.05375333, 0.03974249, 0.04588784, 0.05134829, 0.02083435,
        0.02360669, 0.02607113, 0.04707377, 0.01442488, 0.02895094,
        0.01835163, 0.01100202, 0.01158392, 0.01193786]),
 'mean_score_time': array([0.05987964, 0.07396998, 0.05827165, 0.05812635, 0.06669502,
        0.06472459, 0.06142225, 0.05270514, 0.05801921, 0.08298621,
        0.06065912, 0.05902157, 0.06272607, 0.052981  , 0.04719939,
        0.0532908 , 0.05791192, 0.06917262, 0.05270562, 0.06064548,
        0.049801

### Characterize the vocabulary for this optimal vectorizer

In [45]:
best_tfidf_vectorizer = tfidf_grid_searcher.best_estimator_[0]
best_tfidf_vectorizer.fit_transform(x_train_text)
x_train_text_tf_best = best_tfidf_vectorizer.transform(x_train_text).toarray()
len(best_tfidf_vectorizer.vocabulary_)

4255

In [46]:
best_tfidf_vectorizer.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.float64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 0.1,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 1),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': 'english',
 'strip_accents': None,
 'sublinear_tf': False,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'use_idf': True,
 'vocabulary': None}

# Using pipeline to try many CountVectorizer hyperparameters

In [47]:
pipeline = Pipeline([
    ('count', CountVectorizer(stop_words='english')),
    ('clf', RandomForestClassifier(random_state=RANDOM_STATE)),
 ])
parameters = {
    'count__min_df': np.arange(1, 5),
    'count__max_df': (0.05, 0.1, 0.5),
    'count__ngram_range': [(1, 1), (1, 2)],
}

count_grid_searcher = GridSearchCV(
    pipeline, 
    parameters, 
    cv=5, 
    n_jobs=3, 
    verbose=3,
    scoring='balanced_accuracy'
)
count_grid_searcher.fit(x_train_text, y_train)

print("Best parameters set:")
print(count_grid_searcher.best_estimator_.steps)


Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:    9.9s
[Parallel(n_jobs=3)]: Done 120 out of 120 | elapsed:   38.8s finished


Best parameters set:
[('count', CountVectorizer(max_df=0.1, min_df=2, ngram_range=(1, 2), stop_words='english')), ('clf', RandomForestClassifier(random_state=123))]


In [48]:
count_gsearch_results_df = pd.DataFrame(count_grid_searcher.cv_results_).copy()
print("Dataframe has shape: %s" % (str(count_gsearch_results_df.shape)))
count_n_trials_grid_search = count_gsearch_results_df.shape[0]

Dataframe has shape: (24, 16)


In [53]:
# count_param_keys = ['param_count__min_df', 'param_count__ngram_range']
count_param_keys = ['param_count__max_df', 'param_count__min_df', 'param_count__ngram_range']

# Rearrange row order so it is easy to skim
count_gsearch_results_df.sort_values('rank_test_score', inplace=True)

count_gsearch_results_df[count_param_keys + ['mean_test_score', 'mean_fit_time', 'rank_test_score']]

Unnamed: 0,param_count__max_df,param_count__min_df,param_count__ngram_range,mean_test_score,mean_fit_time,rank_test_score
11,0.1,2,"(1, 2)",0.761667,0.879845,1
19,0.5,2,"(1, 2)",0.761667,0.921846,1
16,0.5,1,"(1, 1)",0.759583,1.011387,3
8,0.1,1,"(1, 1)",0.759583,1.051694,3
18,0.5,2,"(1, 1)",0.758333,0.809431,5
10,0.1,2,"(1, 1)",0.758333,0.841102,5
17,0.5,1,"(1, 2)",0.753333,1.699623,7
9,0.1,1,"(1, 2)",0.753333,1.627155,7
22,0.5,4,"(1, 1)",0.7475,0.657698,9
14,0.1,4,"(1, 1)",0.7475,0.718839,9


In [54]:
best_count_vectorizer = count_grid_searcher.best_estimator_[0]
best_count_vectorizer.fit_transform(x_train_text)
x_train_text_count_best = best_count_vectorizer.transform(x_train_text).toarray()
len(best_count_vectorizer.vocabulary_)

2131

In [51]:
len(x_train_text_count_best[0])

2131