In [29]:
import os
import time

import numpy as np
import pandas as pd
import sklearn

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline


# Seed passed as `random_state` to the RandomForestClassifier instances below,
# so that repeated runs build the same forests.
RANDOM_STATE = 123

# Setup: Importing the Text


In [2]:
# Folder holding the review CSVs (relative path).
data_dir = 'data_reviews'
x_train_path = os.path.join(data_dir, 'x_train.csv')
y_train_path = os.path.join(data_dir, 'y_train.csv')

# Features and labels live in separate files; load each into its own frame.
x_train_df = pd.read_csv(x_train_path)
y_train_df = pd.read_csv(y_train_path)

# Quick sanity check on what was loaded.
print("Shape of data\n---------------")
for shape_line in (
    f"x_train_df shape: {x_train_df.shape} ",
    f"y_train_df shape: {y_train_df.shape} ",
):
    print(shape_line)

# Pull the review text and the sentiment labels out of the frames as arrays
# so they can be indexed by the cross-validation splits later on.
y_train = y_train_df['is_positive_sentiment'].values
x_train_text = x_train_df['text'].values

Shape of data
---------------
x_train_df shape: (2400, 2) 
y_train_df shape: (2400, 1) 


## Basic comparison of two vectorizers - one with counts and one using tfidf

In [7]:
# Learn the vocabulary (English stop words removed) and build the dense
# document-term count matrix in one pass. fit_transform already returns the
# transformed training text, so a separate transform() over the same data
# would only vectorize the corpus a second time.
count_vectorizer = CountVectorizer(stop_words='english')
x_train_text_count = count_vectorizer.fit_transform(x_train_text).toarray()


In [4]:
# count_vectorizer.vocabulary_

In [17]:
# Learn the vocabulary and IDF weights (English stop words removed) and build
# the dense TF-IDF matrix in one pass. fit_transform already returns the
# transformed training text, so a separate transform() over the same data
# would only vectorize the corpus a second time.
tf_vectorizer = TfidfVectorizer(stop_words='english')
x_train_text_tf = tf_vectorizer.fit_transform(x_train_text).toarray()

In [18]:
# tf_vectorizer.vocabulary_

## Train two Random Forests to compare performance


In [24]:
# Compare the count and TF-IDF representations by training an identically
# configured random forest on each, over the same 5 stratified folds.
#
# NOTE(review): x_train_text_count and x_train_text_tf come from vectorizers
# that were fit on *all* of x_train_text, so every fold's held-out rows have
# already influenced the vocabulary / IDF statistics. For a leak-free estimate,
# fit the vectorizer inside each fold (e.g. a Pipeline, as the grid-search
# cells below do).
skf = StratifiedKFold(n_splits=5)

# Index 0 holds the count-vectorizer results, index 1 the TF-IDF results;
# each inner list receives one entry per fold.
count_tf_scores = [[], []]
count_tf_train_time = [[], []]

# (slot in the result lists, label used in the printed message, feature matrix)
feature_sets = [
    (0, "count_vectorized", x_train_text_count),
    (1, "tfidf_vectorized", x_train_text_tf),
]

for train_index, test_index in skf.split(x_train_text, y_train):
    for slot, label, features in feature_sets:
        forest = RandomForestClassifier(random_state=RANDOM_STATE)

        # Time only the training step. perf_counter is a monotonic,
        # high-resolution clock meant for measuring intervals.
        start_time_sec = time.perf_counter()
        forest.fit(features[train_index], y_train[train_index])
        elapsed_time_sec = time.perf_counter() - start_time_sec

        # Score this fold on the rows the forest was not trained on.
        balanced_acc = balanced_accuracy_score(
            y_true=y_train[test_index],
            y_pred=forest.predict(features[test_index]),
        )
        print(f"Performance of the {label} random forest")
        print(balanced_acc)

        count_tf_scores[slot].append(balanced_acc)
        count_tf_train_time[slot].append(elapsed_time_sec)

# Summary table: each figure is the mean over the folds.
print("Best overall")
print("type:  | score | time ")
print("count: | %5.3f | %4.3f" % (np.mean(count_tf_scores[0]), np.mean(count_tf_train_time[0])))
print("tfidf: | %5.3f | %4.3f" % (np.mean(count_tf_scores[1]), np.mean(count_tf_train_time[1])))


Performance of the count_vectorized random forest
0.75
Performance of the tfidf_vectorized random forest
0.7541666666666667
Performance of the count_vectorized random forest
0.7770833333333333
Performance of the tfidf_vectorized random forest
0.7729166666666667
Performance of the count_vectorized random forest
0.7645833333333333
Performance of the tfidf_vectorized random forest
0.7395833333333333
Performance of the count_vectorized random forest
0.7666666666666666
Performance of the tfidf_vectorized random forest
0.76875
Performance of the count_vectorized random forest
0.7541666666666667
Performance of the tfidf_vectorized random forest
0.7770833333333333
Best overall
type:  | score | time 
count: | 0.762 | 6.820
tfidf: | 0.762 | 7.059


### Using pipeline to try many tf-idf hyperparameters

In [102]:
# TF-IDF features feeding a random forest, searched as a single estimator.
tfidf_step = ('tfidf', TfidfVectorizer(stop_words='english'))
forest_step = ('clf', RandomForestClassifier(random_state=RANDOM_STATE))
pipeline = Pipeline([tfidf_step, forest_step])

# Only the vectorizer settings are varied: 4 x 3 x 2 = 24 combinations.
parameters = dict(
    tfidf__min_df=np.arange(1, 5),
    tfidf__max_df=(0.05, 0.1, 0.5),
    tfidf__ngram_range=[(1, 1), (1, 2)],
)

# 5-fold search scored on balanced accuracy, run on 3 workers with verbose logging.
grid_searcher = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring='balanced_accuracy',
    n_jobs=3,
    cv=5,
    verbose=3,
)
grid_searcher.fit(x_train_text, y_train)

# Show the steps of the best pipeline found.
print("Best parameters set:")
print(grid_searcher.best_estimator_.steps)


Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:    8.0s
[Parallel(n_jobs=3)]: Done 120 out of 120 | elapsed:   34.6s finished


Best parameters set:
[('tfidf', TfidfVectorizer(max_df=0.1, stop_words='english')), ('clf', RandomForestClassifier(random_state=123))]


In [103]:
# Tabulate the TF-IDF grid-search results and record how many rows (trials) it has.
gsearch_results_df = pd.DataFrame(grid_searcher.cv_results_).copy()
results_shape = gsearch_results_df.shape
print("Dataframe has shape: %s" % (str(results_shape)))
n_trials_grid_search = results_shape[0]

Dataframe has shape: (24, 16)


In [104]:
# Uncomment for exploring the columns in this grid search
# print("Dataframe has columns:")
# for c in gsearch_results_df.columns:
#     print("-- %s" % c)
    

In [105]:
# Hyperparameter columns to show next to the headline metrics.
param_keys = [
    'param_tfidf__max_df',
    'param_tfidf__min_df',
    'param_tfidf__ngram_range',
]
metric_keys = ['mean_test_score', 'mean_fit_time', 'rank_test_score']

# Rows are left in the order the grid search produced them (no re-sorting).
gsearch_results_df[param_keys + metric_keys]

Unnamed: 0,param_tfidf__max_df,param_tfidf__min_df,param_tfidf__ngram_range,mean_test_score,mean_fit_time,rank_test_score
0,0.05,1,"(1, 1)",0.715833,0.831436,21
1,0.05,1,"(1, 2)",0.722917,1.33239,17
2,0.05,2,"(1, 1)",0.719583,0.664433,18
3,0.05,2,"(1, 2)",0.71375,0.682512,23
4,0.05,3,"(1, 1)",0.718333,0.59187,20
5,0.05,3,"(1, 2)",0.71875,0.627004,19
6,0.05,4,"(1, 1)",0.714167,0.592768,22
7,0.05,4,"(1, 2)",0.70625,0.690791,24
8,0.1,1,"(1, 1)",0.77125,0.977652,1
9,0.1,1,"(1, 2)",0.75625,1.482453,11


### Using pipeline to try many CountVectorizer hyperparameters

In [98]:
# Raw term-count features feeding a random forest, searched as a single estimator.
count_step = ('count', CountVectorizer(stop_words='english'))
forest_step = ('clf', RandomForestClassifier(random_state=RANDOM_STATE))
pipeline = Pipeline([count_step, forest_step])

# Only the vectorizer settings are varied: 4 x 3 x 2 = 24 combinations.
parameters = dict(
    count__min_df=np.arange(1, 5),
    count__max_df=(0.05, 0.1, 0.5),
    count__ngram_range=[(1, 1), (1, 2)],
)

# 5-fold search scored on balanced accuracy, run on 3 workers with verbose logging.
grid_searcher = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring='balanced_accuracy',
    n_jobs=3,
    cv=5,
    verbose=3,
)
grid_searcher.fit(x_train_text, y_train)

# Show the steps of the best pipeline found.
print("Best parameters set:")
print(grid_searcher.best_estimator_.steps)


Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:    8.9s
[Parallel(n_jobs=3)]: Done 120 out of 120 | elapsed:   34.6s finished


Best parameters set:
[('count', CountVectorizer(max_df=0.1, min_df=2, ngram_range=(1, 2), stop_words='english')), ('clf', RandomForestClassifier(random_state=123))]


In [99]:
# Tabulate the CountVectorizer grid-search results and record how many rows (trials) it has.
gsearch_results_df = pd.DataFrame(grid_searcher.cv_results_).copy()
results_shape = gsearch_results_df.shape
print("Dataframe has shape: %s" % (str(results_shape)))
n_trials_grid_search = results_shape[0]

Dataframe has shape: (24, 16)


In [100]:
# Uncomment for exploring the columns in this grid search
# print("Dataframe has columns:")
# for c in gsearch_results_df.columns:
#     print("-- %s" % c)
    

In [101]:
# Hyperparameter columns to show next to the headline metrics.
param_keys = [
    'param_count__max_df',
    'param_count__min_df',
    'param_count__ngram_range',
]
metric_keys = ['mean_test_score', 'mean_fit_time', 'rank_test_score']

# Rows are left in the order the grid search produced them (no re-sorting).
gsearch_results_df[param_keys + metric_keys]

Unnamed: 0,param_count__max_df,param_count__min_df,param_count__ngram_range,mean_test_score,mean_fit_time,rank_test_score
0,0.05,1,"(1, 1)",0.72125,0.949852,18
1,0.05,1,"(1, 2)",0.731667,1.51631,17
2,0.05,2,"(1, 1)",0.7175,0.706207,20
3,0.05,2,"(1, 2)",0.72,0.768199,19
4,0.05,3,"(1, 1)",0.708333,0.66003,21
5,0.05,3,"(1, 2)",0.707917,0.713392,22
6,0.05,4,"(1, 1)",0.705,0.637888,24
7,0.05,4,"(1, 2)",0.70625,0.654683,23
8,0.1,1,"(1, 1)",0.759583,0.977225,3
9,0.1,1,"(1, 2)",0.753333,1.479269,7
