In [1]:
import os

import numpy as np
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import StratifiedKFold

# Seed passed as `random_state` to each RandomForestClassifier in this notebook
# so that repeated runs build the same forests.
RANDOM_STATE = 123

# Setup: Importing the Text


In [2]:
# Directory holding the review CSVs (resolved relative to the working directory).
data_dir = 'data_reviews'

# Read the feature file and the label file, in that order.
x_train_df, y_train_df = (
    pd.read_csv(os.path.join(data_dir, csv_name))
    for csv_name in ('x_train.csv', 'y_train.csv')
)

# Report the shape of each frame as loaded.
print(
    "Shape of data\n---------------",
    f"x_train_df shape: {x_train_df.shape} ",
    f"y_train_df shape: {y_train_df.shape} ",
    sep="\n",
)

# Extract the 'is_positive_sentiment' label column and the 'text' column
# via `.values` for use with scikit-learn.
y_train = y_train_df['is_positive_sentiment'].values
x_train_text = x_train_df['text'].values

Shape of data
---------------
x_train_df shape: (2400, 2) 
y_train_df shape: (2400, 1) 


## Basic comparison of two vectorizers: one using raw counts and one using TF-IDF

In [7]:
# Count-based features with scikit-learn's built-in English stop-word list.
count_vectorizer = CountVectorizer(stop_words='english')

# fit_transform() fits the vectorizer and encodes the corpus in one pass, so
# the text is tokenized only once (no separate transform() call is needed).
# The result is densified with .toarray(); later cells index its rows with the
# cross-validation fold index arrays.
x_train_text_count = count_vectorizer.fit_transform(x_train_text).toarray()


In [4]:
# Debugging aid (disabled): uncomment to display the fitted count vectorizer's vocabulary.
# count_vectorizer.vocabulary_

In [17]:
# TF-IDF features with scikit-learn's built-in English stop-word list.
tf_vectorizer = TfidfVectorizer(stop_words='english')

# fit_transform() fits the vectorizer and encodes the corpus in one pass, so
# the text is tokenized only once (no separate transform() call is needed).
# The result is densified with .toarray(); later cells index its rows with the
# cross-validation fold index arrays.
x_train_text_tf = tf_vectorizer.fit_transform(x_train_text).toarray()

In [18]:
# Debugging aid (disabled): uncomment to display the fitted TF-IDF vectorizer's vocabulary.
# tf_vectorizer.vocabulary_

## Train two Random Forests to compare performance


In [19]:
# 5-fold stratified cross-validation comparing a random forest trained on the
# count features against one trained on the TF-IDF features.
#
# StratifiedKFold and balanced_accuracy_score are imported explicitly in the
# imports cell: a bare `import sklearn` does not by itself guarantee that the
# `model_selection` / `metrics` submodules are loaded as attributes.
#
# NOTE(review): x_train_text_count and x_train_text_tf were produced by
# vectorizers fit on ALL of x_train_text, so every held-out fold has already
# contributed to the vocabulary (and to the TF-IDF weights). For leak-free
# scores, fit the vectorizer inside each fold, e.g. by wrapping vectorizer +
# classifier in a sklearn Pipeline and cross-validating on the raw text.
skf = StratifiedKFold(n_splits=5)

for train_index, test_index in skf.split(x_train_text, y_train):
    # Slice the labels once per fold; both models share them.
    y_fold_train = y_train[train_index]
    y_fold_test = y_train[test_index]

    # Both forests use the same seed, so the feature encoding is the only
    # thing that differs between the two models.
    count_randforest = RandomForestClassifier(random_state=RANDOM_STATE)
    count_randforest.fit(x_train_text_count[train_index], y_fold_train)
    count_y_test_pred = count_randforest.predict(x_train_text_count[test_index])

    tf_randforest = RandomForestClassifier(random_state=RANDOM_STATE)
    tf_randforest.fit(x_train_text_tf[train_index], y_fold_train)
    tf_y_test_pred = tf_randforest.predict(x_train_text_tf[test_index])

    # Balanced accuracy on the held-out fold for each encoding.
    print("Performance of the count_vectorized random forest")
    print(balanced_accuracy_score(
        y_true=y_fold_test,
        y_pred=count_y_test_pred
    ))
    print("Performance of the tfidf_vectorized random forest")    
    print(balanced_accuracy_score(
        y_true=y_fold_test,
        y_pred=tf_y_test_pred
    ))
    print()

Performance of the count_vectorized random forest
0.75
Performance of the tfidf_vectorized random forest
0.7541666666666667

Performance of the count_vectorized random forest
0.7770833333333333
Performance of the tfidf_vectorized random forest
0.7729166666666667

Performance of the count_vectorized random forest
0.7645833333333333
Performance of the tfidf_vectorized random forest
0.7395833333333333

Performance of the count_vectorized random forest
0.7666666666666666
Performance of the tfidf_vectorized random forest
0.76875

Performance of the count_vectorized random forest
0.7541666666666667
Performance of the tfidf_vectorized random forest
0.7770833333333333

