### Data Wrangling

In [16]:
with open('train_neg_reviews.txt',encoding='utf-8') as f:
  contents = f.read()
  train_neg_reviews = [review for review in contents.split('\n')]

with open('train_pos_reviews.txt',encoding='utf-8') as f:
  contents = f.read()
  train_pos_reviews = [review for review in contents.split('\n')]

with open('test_neg_reviews.txt',encoding='utf-8') as f:
  contents = f.read()
  test_neg_reviews = [review for review in contents.split('\n')]

with open('train_pos_reviews.txt',encoding='utf-8') as f:
  contents = f.read()
  test_pos_reviews = [review for review in contents.split('\n')]

In [17]:
import pandas as pd
reviews = train_neg_reviews + test_neg_reviews + train_pos_reviews + test_pos_reviews
scores = [int(review.split('\t')[0] or 3) for review in reviews]
reviews_text = [''.join(review.split('\t')[1:]) for review in reviews]
classification = [0]*len(train_neg_reviews + test_neg_reviews) + [1]*len(train_pos_reviews + test_pos_reviews)
df = pd.DataFrame({'review': reviews_text, 'score': scores, 'classification': classification})
df = df.sample(frac=1, random_state=0) # shuffle

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectorizer.fit(df.review)

TfidfVectorizer()

In [19]:
tfidf_embeddings = vectorizer.transform(df.review)

A random forest is a type of ensemble machine learning model that is made up of multiple decision trees. Ensemble models combine the predictions of multiple individual models to make more accurate predictions. In a random forest, each decision tree is trained on a random subset of the data, and the final prediction is made by averaging the predictions of all the individual decision trees.

Here is an example of how to train a random forest using the scikit-learn library in Python:


In [20]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(tfidf_embeddings, df.classification, random_state=0)

## Model Fitting

In [21]:
from sklearn.ensemble import RandomForestClassifier

# Create a random forest classifier with 100 trees
model = RandomForestClassifier()

# Train the model on training data
model.fit(X_train, y_train)

# Score
model.score(X_test, y_test)

0.9282057844893781

In [22]:
from sklearn.metrics import f1_score, classification_report
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.98      0.94      4707
           1       0.97      0.84      0.90      3107

    accuracy                           0.93      7814
   macro avg       0.94      0.91      0.92      7814
weighted avg       0.93      0.93      0.93      7814



## Hyperparameter Searching

In [23]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameter grid for the model
param_grid = {
    'n_estimators': [10, 100, 1000],
    'max_depth': [5, 10, 50, 100],
    'min_impurity_decrease': [0, 0.1, 1],
    'max_features': [1, 10, 100, 1000, None]
}

model_grid = RandomForestClassifier()

# Use GridSearchCV to search for the best hyperparameters
clf = GridSearchCV(model_grid, param_grid, cv=5)


# clf.fit(X, y)

# Print the best hyperparameters
# print(f"Best hyperparameters: {clf.best_params_}. Score: {clf.best_score_:.2f}")

## Contextual Polarity

In [24]:
from sklearn.linear_model import LogisticRegression
import numpy as np
model = LogisticRegression()
model.fit(X_train, y_train)
feature_names = np.array(vectorizer.get_feature_names())
sorted_coef_index = model.coef_[0].argsort()
print("Negative Words", feature_names[sorted_coef_index[:10]])
print("Positive Words", feature_names[sorted_coef_index[-10:]])

Negative Words ['worst' 'bad' 'awful' 'boring' 'poor' 'waste' 'terrible' 'no' 'nothing'
 'dull']
Positive Words ['still' 'loved' 'today' 'fun' 'wonderful' 'amazing' 'perfect' 'excellent'
 'best' 'great']




## More Models!

In [25]:
# Import the necessary libraries
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import make_classification

# Create a gradient boosting classifier
clf = GradientBoostingClassifier()

# Train the classifier on the data
clf.fit(X_train, y_train)

# Make predictions on new data
clf.score(X_test, y_test)

0.8287688763757358

In [26]:
import xgboost as xgb

# Create the XGBoost model
model = xgb.XGBClassifier()

# Train the model on the training data
model.fit(X_train, y_train)

# Evaluate the model on the test data
accuracy = model.score(X_test, y_test)
print("Accuracy: %.2f%%" % (accuracy * 100.0))



Accuracy: 90.41%


## More Advanced Embeddings

In [27]:
# !pip install openai
# import openai
# openai.api_key = # GET THIS FROM JOSIAH IF NEEDED
# from openai.embeddings_utils import cosine_similarity, get_embeddings as _get_embeddings, get_embedding as _get_embedding
# get_embeddings = lambda x: _get_embeddings(x, 'text-embedding-ada-002')
# get_embedding = lambda x: _get_embedding(x, 'text-embedding-ada-002')
# sub = df.iloc[:2000]
# sub['ada_embeddings'] = get_embeddings(sub.review)
# sub.to_csv('embedded_reviews.csv', index=False)

In [28]:
sub = pd.read_csv('embedded_reviews.csv')
sub.ada_embeddings = sub.ada_embeddings.apply(eval)

In [29]:
X = np.array([emb for emb in sub.ada_embeddings.values])
X.shape

(2000, 1536)

In [30]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, sub.classification)
model = RandomForestClassifier(max_depth=5)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.878

In [31]:
from sklearn.metrics import classification_report
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.86      0.88       256
           1       0.86      0.90      0.88       244

    accuracy                           0.88       500
   macro avg       0.88      0.88      0.88       500
weighted avg       0.88      0.88      0.88       500



In [32]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameter grid for the model
param_grid = {
    'n_estimators': [10, 100, 1000],
    'max_depth': [5, 10, 50, 100],
    'min_impurity_decrease': [0, 0.1, 1],
    'max_features': [1, 10, 100, 1000, None]
}

model_grid = RandomForestClassifier()

# Use GridSearchCV to search for the best hyperparameters
clf = GridSearchCV(model_grid, param_grid, cv=5)


clf.fit(X, sub.classification)

# Print the best hyperparameters
print(f"Best hyperparameters: {clf.best_params_}. Score: {clf.best_score_:.2f}")

Best hyperparameters: {'max_depth': 10, 'max_features': 100, 'min_impurity_decrease': 0, 'n_estimators': 1000}. Score: 0.89
