## Data Wrangling

In [None]:
# Mount Google Drive inside the Colab runtime so the review files can be read.
from google.colab import drive

# force_remount=True asks Colab to mount again even if the drive is already mounted.
drive.mount('/content/gdrive', force_remount=True)

# Drive folder that the later cells read the review text files from.
root_dir = "/content/gdrive/My Drive/NLP Sentiment Analysis"

Mounted at /content/gdrive


In [None]:
def load_reviews(filename):
  """Read one review file from ``root_dir`` and return its lines.

  Each non-blank line of the file is returned as one list element. Blank
  lines are dropped: a file that ends with a newline would otherwise
  contribute an empty "review" that later receives a score and a class label.

  Args:
    filename: Name of the file inside ``root_dir``.

  Returns:
    list[str]: The non-blank lines of the file, in file order.
  """
  # Explicit encoding so the result does not depend on the platform default.
  with open(f'{root_dir}/{filename}', encoding='utf-8') as f:
    # split('\n') rather than splitlines(): only real newlines separate reviews.
    return [line for line in f.read().split('\n') if line.strip()]


train_neg_reviews = load_reviews('train_neg_reviews.txt')
train_pos_reviews = load_reviews('train_pos_reviews.txt')
test_neg_reviews = load_reviews('test_neg_reviews.txt')
test_pos_reviews = load_reviews('test_pos_reviews.txt')

In [None]:
import pandas as pd

# Negative reviews first, then positive, so the label vector built below
# lines up with the concatenated review list.
neg_reviews = train_neg_reviews + test_neg_reviews
pos_reviews = train_pos_reviews + test_pos_reviews
reviews = neg_reviews + pos_reviews

# Each line is "<score>\t<text>". Split on the FIRST tab only, so tabs inside
# the review text are kept as whitespace instead of being deleted (deleting
# them glued the neighbouring words together).
parsed = [review.partition('\t') for review in reviews]

# A line with nothing before the first tab falls back to a score of 3.
scores = [int(score or 3) for score, _, _ in parsed]
reviews_text = [text for _, _, text in parsed]

# 0 = negative, 1 = positive.
classification = [0] * len(neg_reviews) + [1] * len(pos_reviews)

df = pd.DataFrame({'review': reviews_text, 'score': scores, 'classification': classification})
df = df.sample(frac=1, random_state=0) # shuffle

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit a TF-IDF vectorizer with default settings on every review in df.
# NOTE(review): this fit happens before the train/test split further down, so
# the held-out reviews also shape what the vectorizer learns — consider fitting
# on the training split only.
vectorizer = TfidfVectorizer()
vectorizer.fit(df.review)

TfidfVectorizer()

In [None]:
# Encode every review in df with the fitted vectorizer (one row per review).
tfidf_embeddings = vectorizer.transform(df.review)

A random forest is an ensemble machine learning model made up of many decision trees. Ensemble models combine the predictions of multiple individual models to make more accurate predictions than any single model would. In a random forest, each decision tree is trained on a random (bootstrap) sample of the training data and considers a random subset of the features at each split; the final prediction is made by aggregating the predictions of all the individual trees (a majority vote or averaged class probabilities for classification, an average for regression).

Here is an example of how to train a random forest using the scikit-learn library in Python. The data is first split into training and test sets:


In [None]:
from sklearn.model_selection import train_test_split
# Split the TF-IDF rows and their 0/1 labels into training and held-out sets;
# random_state=0 fixes the split so re-runs use the same rows.
X_train, X_test, y_train, y_test = train_test_split(tfidf_embeddings, df.classification, random_state=0)

## Model Fitting

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Create a random forest classifier with 100 trees.
# random_state is fixed (as for the shuffle and the train/test split) so that
# the score below is reproducible across kernel restarts.
model = RandomForestClassifier(n_estimators=100, random_state=0)

# Train the model on training data
model.fit(X_train, y_train)

# Mean accuracy on the held-out split (displayed as the cell output)
model.score(X_test, y_test)

0.8398037542662116

In [None]:
from sklearn.metrics import f1_score, classification_report
# Predict labels for the held-out rows with the current `model` and print
# per-class precision / recall / F1 alongside the overall accuracy.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.85      0.84      4691
           1       0.84      0.83      0.84      4685

    accuracy                           0.84      9376
   macro avg       0.84      0.84      0.84      9376
weighted avg       0.84      0.84      0.84      9376



## Hyperparameter Searching

In [None]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameter grid for the model.
# GridSearchCV tries every combination: 3 * 4 * 3 * 5 = 180 settings,
# each fitted once per fold (cv=5 below).
param_grid = {
    'n_estimators': [10, 100, 1000],
    'max_depth': [5, 10, 50, 100],
    'min_impurity_decrease': [0, 0.1, 1],
    'max_features': [1, 10, 100, 1000, None]
}

model_grid = RandomForestClassifier()

# Use GridSearchCV to search for the best hyperparameters
# (this only configures the search; nothing is fitted in this cell).
clf = GridSearchCV(model_grid, param_grid, cv=5)


# NOTE(review): the fit is left disabled, and `X` / `y` are not defined in the
# visible notebook — presumably X_train / y_train were intended; confirm before enabling.
# clf.fit(X, y)

# Print the best hyperparameters
# print(f"Best hyperparameters: {clf.best_params_}. Score: {clf.best_score_:.2f}")

## Contextual Polarity

In [None]:
from sklearn.linear_model import LogisticRegression
import numpy as np

# Fit a linear model on the TF-IDF features: each coefficient is tied to one
# vocabulary term, so its sign and size indicate that term's polarity.
model = LogisticRegression()
model.fit(X_train, y_train)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older versions.
_get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
feature_names = np.asarray(_get_names())

# Ascending sort: most negative coefficients first, most positive last.
sorted_coef_index = model.coef_[0].argsort()
print("Negative Words", feature_names[sorted_coef_index[:10]])
print("Positive Words", feature_names[sorted_coef_index[-10:]])

## More Models!

In [None]:
# Import the necessary libraries
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import make_classification

# Create a gradient boosting classifier
clf = GradientBoostingClassifier()

# Train the classifier on the data
clf.fit(X_train, y_train)

# Make predictions on new data
clf.score(X_test, y_test)

0.8095352371810255

In [None]:
import xgboost as xgb

# Build an XGBoost classifier with default settings and fit it on the
# training split in one step (fit returns the estimator itself).
model = xgb.XGBClassifier().fit(X_train, y_train)

# Held-out accuracy, reported as a percentage
accuracy = model.score(X_test, y_test)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

## More Advanced Embeddings

In [None]:
!pip install openai

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openai
  Downloading openai-0.25.0.tar.gz (44 kB)
[K     |████████████████████████████████| 44 kB 1.7 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting pandas-stubs>=1.1.0.11
  Downloading pandas_stubs-1.5.2.221213-py3-none-any.whl (147 kB)
[K     |████████████████████████████████| 147 kB 9.0 MB/s 
Collecting types-pytz>=2022.1.1
  Downloading types_pytz-2022.6.0.1-py3-none-any.whl (4.7 kB)
Building wheels for collected packages: openai
  Building wheel for openai (PEP 517) ... [?25l[?25hdone
  Created wheel for openai: filename=openai-0.25.0-py3-none-any.whl size=55880 sha256=66035a4fd7ab0e466943ebfcba64b51cc647fb9ad3bf4dcd04e6c5f411eba221
  Stored in directory: /root/.cache/pip/wheels/4b/92/33/6f57c7aae0b16875267999a50570e81f15eecec577ebe0

In [None]:
import os
from getpass import getpass

import openai
from openai.embeddings_utils import cosine_similarity, get_embeddings as _get_embeddings, get_embedding as _get_embedding

# SECURITY: never commit an API key to a notebook. The key is read from the
# OPENAI_API_KEY environment variable, with an interactive prompt as fallback.
# The key that was previously hard-coded here must be treated as leaked and revoked.
openai.api_key = os.environ.get('OPENAI_API_KEY') or getpass('OpenAI API key: ')

EMBEDDING_MODEL = 'text-embedding-ada-002'
# Number of reviews sent per request (same chunk size the original loop used).
EMBEDDING_BATCH_SIZE = 2048


def get_embeddings(x):
  """Return one embedding per text in ``x`` using ``EMBEDDING_MODEL``."""
  return _get_embeddings(x, EMBEDDING_MODEL)


def get_embedding(x):
  """Return the embedding of the single text ``x`` using ``EMBEDDING_MODEL``."""
  return _get_embedding(x, EMBEDDING_MODEL)


# .copy() so that adding a column below modifies an independent frame rather
# than a slice of df.
sub = df.iloc[:10000].copy()

# Embed every row of `sub`, one batch at a time. The previous
# range(0, 2048, 2048) ran a single iteration, so only the first 2048 rows were
# embedded and the column assignment below failed with a length-mismatch ValueError.
ada_embeddings = []
for i in range(0, len(sub), EMBEDDING_BATCH_SIZE):
  batch = sub.review.iloc[i:i + EMBEDDING_BATCH_SIZE].tolist()
  ada_embeddings.extend(get_embeddings(batch))

assert len(ada_embeddings) == len(sub), "expected one embedding per review"
sub['ada_embeddings'] = ada_embeddings
sub.to_csv('embedded_reviews.csv', index=False)

ValueError: ignored

In [None]:
# Display `sub` (the slice of df built in the embedding cell) to inspect it.
sub

In [None]:
# Reload the CSV written by the embedding cell to check what was saved.
# NOTE(review): in the recorded output the ada_embeddings values appear as quoted
# "[...]" text, so they presumably load back as strings and need parsing before
# numeric use — confirm.
pd.read_csv('embedded_reviews.csv')

Unnamed: 0,review,score,classification,ada_embeddings
0,Nothing to say but Wow! Has anyone actually ha...,4,0,"[-0.01144568994641304, -0.029017241671681404, ..."
1,Rated TV-14 for Sexual Content and Language.<b...,9,1,"[0.019637495279312134, -0.03837515786290169, -..."


In [None]:
# Rough cost estimate for embedding every review: total character count
# (reviews joined by single spaces), per thousand, times 0.0004.
# NOTE(review): 0.0004 is presumably a price per 1K *tokens*; since this counts
# characters, the figure is likely an overestimate — confirm against current pricing.
total_chars = len(' '.join(df.review.values))
(total_chars / 1000) * 0.0004

19.582152

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# One row per embedded review.
ada_features = np.array(ada_embeddings)

# Embeddings were appended in the row order of `sub`, starting at its first
# row, so the first len(ada_features) labels belong to them. Slicing keeps
# features and labels the same length even when only part of `sub` was embedded.
ada_labels = sub.classification.iloc[:len(ada_features)]

# Seed both the split and the forest so the score below is reproducible.
X_train, X_test, y_train, y_test = train_test_split(ada_features, ada_labels, random_state=0)
model = RandomForestClassifier(max_depth=10, random_state=0)
model.fit(X_train, y_train)

# Mean accuracy on the held-out split (displayed as the cell output)
model.score(X_test, y_test)

0.904

In [None]:
from sklearn.metrics import classification_report
# Predict labels for the held-out rows with the current `model` and print
# per-class precision / recall / F1 alongside the overall accuracy.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.89      0.90       255
           1       0.89      0.90      0.90       245

    accuracy                           0.90       500
   macro avg       0.90      0.90      0.90       500
weighted avg       0.90      0.90      0.90       500

