In [29]:
# import libraries

import joblib

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import sparse
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import Perceptron
from xgboost import XGBClassifier
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV


In [30]:
# load review sentiment data

review_df = pd.read_csv('data/review_sentiment.csv')

review_df

Unnamed: 0,review_id,text,sentiment
0,LLzom-2TITa4gasV7_fCCA,Great experience purchasing a washer and dryer...,1
1,a5JHzBrWxRd_OmIvV7znDA,Went here based on the high ratings and raves ...,-1
2,X-o--dwf0HuFMittYi4wCA,"oh Millers, how i wanted to like you. You are...",-1
3,INGNbsyo-MouZZzcxnCSGQ,This place gets two stars from me only because...,-1
4,k7VatXVLism-cTDJE8TTUw,"This place was awesome. Clean, beautiful and t...",1
...,...,...,...
11681,IlU-MQzMKc7jAHWwK5VFGQ,"To be fair, I tried them in their first week. ...",0
11682,Qt3BsRvQuJccDQfFWM1XPw,Awful place. It's dirty. Had two birthday part...,-1
11683,3CQQ8Im_UX6QqDECuXYK8A,A truly vegetarian delight! I took a Jewish f...,1
11684,ery1nBM7zKweFLBe-bT5ag,I have a 2011 Toyota Sienna Limited. During th...,-1


In [31]:
feature_set = {
    0: 'bag_of_words',
    1: 'one_hot',
    2: 'n_grams',
    3: 'tf_idf',
    4: 'word2vec',
    5: 'combined_features'
}
# Load all feature sets
features = {}
for key, feature_name in feature_set.items():
    if key == 4:
        features[key] = np.load('features/' + feature_name + '.npy')
    else:
        features[key] = sparse.load_npz('features/' + feature_name + '.npz')

In [32]:
# target labels

y = review_df['sentiment'].to_numpy()

y.shape

(11686,)

In [33]:
# classifiers

classifiers = {
    # 'gaussian_nb': GaussianNB(),
    # 'decision_tree': DecisionTreeClassifier(),
    # 'random_forest': RandomForestClassifier(),
    # 'svm': SVC(),
    # 'perceptron': Perceptron(tol=1e-3, random_state=0),
    'xgb': XGBClassifier(),
    'logistic_regression': LogisticRegression(max_iter=1000, random_state=0)
}

In [34]:
param_grids = {
    # 'gaussian_nb': {},
    # 'decision_tree': {
    #     'max_depth': [None, 10, 20]
    # },
    # 'random_forest': {
    #     'n_estimators': [100, 200],
    #     'max_depth': [None, 10]
    # },
    # 'svm': {
    #     'C': [0.1, 1.0],
    #     'kernel': ['linear', 'rbf']
    # },
    # 'perceptron': {
    #     'alpha': [0.0001, 0.001],
    #     'penalty': [None, 'l2']
    # },
    'xgb': {
        'max_depth': [3, 5],
        'learning_rate': [0.01, 0.1],
        'n_estimators': [100, 200]
    },
    'logistic_regression': {
        'C': [0.1, 1.0],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear']
    },
}


In [35]:
# Load word2vec features for grid search
word2vec_features = features[4]

In [36]:
# Split data into training and test sets using word2vec features
X_train_w2v, X_test_w2v, y_train, y_test = train_test_split(word2vec_features, y, test_size=0.20, random_state=42)

# Oversample to balance the classes
oversampler = RandomOverSampler(random_state=42)
X_train_w2v_resampled, y_train_resampled = oversampler.fit_resample(X_train_w2v, y_train)



In [37]:

# Convert numpy arrays to pandas Series
y_train_series = pd.Series(y_train)
y_train_resampled_series = pd.Series(y_train_resampled)
y_test_series = pd.Series(y_test)

# Remap labels: -1 -> 0, 0 -> 1, 1 -> 2
y_train_mapped = y_train_series.map({-1: 0, 0: 1, 1: 2})
y_train_resampled_mapped = y_train_resampled_series.map({-1: 0, 0: 1, 1: 2})
y_test_mapped = y_test_series.map({-1: 0, 0: 1, 1: 2})


In [38]:
best_estimators = {}

# Apply grid search to each classifier using word2vec features
for clf_name, clf in classifiers.items():
    print(f"Grid search for {clf_name}")
    param_grid = param_grids[clf_name]
    if clf_name == 'xgb':
        grid_search = GridSearchCV(clf, param_grid, scoring='accuracy', cv=3)
        grid_search.fit(X_train_w2v, y_train_mapped)
    else:
        grid_search = GridSearchCV(clf, param_grid, scoring='accuracy', cv=3)
        grid_search.fit(X_train_w2v, y_train)
    best_estimators[clf_name] = grid_search.best_estimator_
    print(f"Best parameters for {clf_name}: {grid_search.best_params_}")
    print(f"Best accuracy for {clf_name}: {grid_search.best_score_}")
    print("---------------------------------------------")


Grid search for xgb
