### Libraries

# Modelling

In [27]:
# import libraries

import joblib

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import sparse
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import Perceptron
from xgboost import XGBClassifier
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

### Data

In [28]:
# Read the labelled reviews (review_id, text, sentiment) from disk.
# NOTE(review): the 'Unnamed: 0' column in the displayed frame suggests the CSV
# was written together with its index; index_col=0 may be appropriate — confirm.
review_df = pd.read_csv(filepath_or_buffer='data/review_sentiment.csv')

# Bare last expression so the notebook renders the frame.
review_df

Unnamed: 0,review_id,text,sentiment
0,LLzom-2TITa4gasV7_fCCA,Great experience purchasing a washer and dryer...,1
1,a5JHzBrWxRd_OmIvV7znDA,Went here based on the high ratings and raves ...,-1
2,X-o--dwf0HuFMittYi4wCA,"oh Millers, how i wanted to like you. You are...",-1
3,INGNbsyo-MouZZzcxnCSGQ,This place gets two stars from me only because...,-1
4,k7VatXVLism-cTDJE8TTUw,"This place was awesome. Clean, beautiful and t...",1
...,...,...,...
11681,IlU-MQzMKc7jAHWwK5VFGQ,"To be fair, I tried them in their first week. ...",0
11682,Qt3BsRvQuJccDQfFWM1XPw,Awful place. It's dirty. Had two birthday part...,-1
11683,3CQQ8Im_UX6QqDECuXYK8A,A truly vegetarian delight! I took a Jewish f...,1
11684,ery1nBM7zKweFLBe-bT5ag,I have a 2011 Toyota Sienna Limited. During th...,-1


Load feature matrices

In [29]:
# Registry of the pre-computed feature sets: integer key -> file stem under
# features/. Keys are assigned by position, starting at 0.
feature_set = dict(enumerate([
    'bag_of_words',
    'one_hot',
    'n_grams',
    'tf_idf',
    'word2vec',
    'combined_bow_negation',
    'lsa_topic_matrix',
    'lda_topic_matrix',
]))

# Load every feature set. Keys 4, 6 and 7 are read as .npy arrays with NumPy;
# all other keys are read as .npz files with scipy.sparse.
features = {}
for key, feature_name in feature_set.items():
    if key in (4, 6, 7):
        features[key] = np.load('features/' + feature_name + '.npy')
    else:
        features[key] = sparse.load_npz('features/' + feature_name + '.npz')

In [30]:
# Sentiment column of the review table as a NumPy array; used as the target.
y = review_df.loc[:, 'sentiment'].to_numpy()

# Display the number of labels as a quick sanity check.
y.shape

(11686,)

Classifiers

In [31]:
# Baseline classifiers (library defaults unless noted), keyed by the short name
# used to look up their grid in `param_grids`.
#
# Every estimator that accepts a seed for its internal randomness is given
# random_state=0 so that cross-validation scores are repeatable across
# kernel restarts. GaussianNB has no random component; SVC only uses its seed
# for probability estimates, which are not enabled here.
classifiers = {
    'gaussian_nb': GaussianNB(),
    'decision_tree': DecisionTreeClassifier(random_state=0),
    'random_forest': RandomForestClassifier(random_state=0),
    'svm': SVC(),
    'perceptron': Perceptron(tol=1e-3, random_state=0),
    'xgb': XGBClassifier(random_state=0),
    'logistic_regression': LogisticRegression(max_iter=1000, random_state=0)
}

In [32]:
# Hyper-parameter grids, one per entry in `classifiers` (same keys, same order).
# An empty grid means there is nothing to tune for that classifier.
param_grids = dict(
    gaussian_nb=dict(),
    decision_tree=dict(
        max_depth=[None, 10, 20],
    ),
    random_forest=dict(
        n_estimators=[100, 200],
        max_depth=[None, 10],
    ),
    svm=dict(
        C=[0.1, 1.0],
        kernel=['linear', 'rbf'],
    ),
    perceptron=dict(
        alpha=[0.0001, 0.001],
        penalty=[None, 'l2'],
    ),
    xgb=dict(
        max_depth=[3, 5],
        learning_rate=[0.01, 0.1],
        n_estimators=[100, 200],
    ),
    logistic_regression=dict(
        C=[0.1, 1.0],
        penalty=['l1', 'l2'],
        solver=['liblinear'],
    ),
)


In [33]:
# Pick the word2vec matrix out of the loaded feature sets (key 4 in
# `feature_set`). It is the input for the train/test split that feeds
# cross-validation and the (currently commented-out) grid search.
word2vec_features = features[4]


In [34]:
# Hold out 20% of the word2vec rows, together with their labels, as a test
# set. The fixed random_state makes the split repeatable.
X_train_w2v, X_test_w2v, y_train, y_test = train_test_split(
    word2vec_features,
    y,
    random_state=42,
    test_size=0.20,
)


In [35]:
# xgb expects class labels [0 1 2], while the data uses [-1 0 1].
# Remap: -1 -> 0, 0 -> 1, 1 -> 2
label_remap = {-1: 0, 0: 1, 1: 2}

# Wrap the NumPy label arrays in pandas Series so that .map() is available.
y_train_series, y_test_series = pd.Series(y_train), pd.Series(y_test)

# Remapped copies of the labels; the original arrays are left untouched.
y_train_mapped = y_train_series.map(label_remap)
y_test_mapped = y_test_series.map(label_remap)

Cross validation

In [36]:
# Metrics recorded for every classifier during 5-fold cross-validation.
cv_metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
cv_scores = {}

# Cross-validate each classifier on the word2vec training split.
for clf_name, clf in classifiers.items():
    print(f"Cross-validating {clf_name}...")
    # xgb is the only model given the remapped labels; the rest see -1/0/1.
    cv_target = y_train_mapped if clf_name == 'xgb' else y_train
    cv_results = cross_validate(clf, X_train_w2v, cv_target, scoring=cv_metrics, cv=5)
    cv_scores[clf_name] = cv_results

# (printed label, key in the cross_validate result) for each reported metric.
report_rows = (
    ("Accuracy:", 'test_accuracy'),
    ("Precision (Macro):", 'test_precision_macro'),
    ("Recall (Macro):", 'test_recall_macro'),
    ("F1 Score (Macro):", 'test_f1_macro'),
)

# Print the mean of each metric across the folds, per classifier.
for clf_name, cv_result in cv_scores.items():
    print(f"Classifier: {clf_name}")
    for row_label, result_key in report_rows:
        print(row_label, cv_result[result_key].mean())
    print("-------------------------")

Cross-validating gaussian_nb...
Cross-validating decision_tree...
Cross-validating random_forest...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Cross-validating svm...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Cross-validating perceptron...
Cross-validating xgb...
Cross-validating logistic_regression...
Classifier: gaussian_nb
Accuracy: 0.49775366735049487
Precision (Macro): 0.4680201446987492
Recall (Macro): 0.4919967844158806
F1 Score (Macro): 0.4497374358123659
-------------------------
Classifier: decision_tree
Accuracy: 0.516260289611248
Precision (Macro): 0.4132947214720045
Recall (Macro): 0.4135580374557807
F1 Score (Macro): 0.41329806193191787
-------------------------
Classifier: random_forest
Accuracy: 0.6485884241336984
Precision (Macro): 0.5338883666988832
Recall (Macro): 0.4741140066981724
F1 Score (Macro): 0.4505995440003511
-------------------------
Classifier: svm
Accuracy: 0.7647618475377893
Precision (Macro): 0.5761224022841114
Recall (Macro): 0.568636253886672
F1 Score (Macro): 0.5377405854249574
-------------------------
Classifier: perceptron
Accuracy: 0.7139490075907788
Precision (Macro): 0.577117976432924
Recall (Macro): 0.5707040786038377
F1 Score (Macro): 0.570952068

Grid search

In [11]:
# NOTE(review): this grid search is disabled (the whole cell is commented
# out), so `best_estimators` is never created when the notebook runs.
# Every hyper-parameter hard-coded in `best_classifiers` appears in
# `param_grids`, so those values were presumably copied from an earlier run
# of this cell — confirm, and consider caching the search results instead.
# best_estimators = {}

# # Apply grid search to each classifier using word2vec features
# for clf_name, clf in classifiers.items():
#     print(f"Grid search for {clf_name}")
#     param_grid = param_grids[clf_name]
#     if clf_name == 'xgb':
#         grid_search = GridSearchCV(clf, param_grid, scoring='accuracy', cv=3)
#         grid_search.fit(X_train_w2v, y_train_mapped)
#     else:
#         grid_search = GridSearchCV(clf, param_grid, scoring='accuracy', cv=3)
#         grid_search.fit(X_train_w2v, y_train)
#     best_estimators[clf_name] = grid_search.best_estimator_
#     print(f"Best parameters for {clf_name}: {grid_search.best_params_}")
#     print(f"Best accuracy for {clf_name}: {grid_search.best_score_}")
#     print("---------------------------------------------")


In [12]:
# Classifiers with hand-fixed hyper-parameters (all values are drawn from
# `param_grids`), evaluated on every feature set further down.
#
# Estimators with internal randomness are seeded with random_state=0 so the
# reported metrics are repeatable; this also brings logistic_regression in
# line with its definition in `classifiers` (liblinear shuffles the data).
best_classifiers = {
    'xgb': XGBClassifier(learning_rate=0.1, max_depth=5, n_estimators=200, random_state=0),
    'gaussian_nb': GaussianNB(),
    'decision_tree': DecisionTreeClassifier(max_depth=10, random_state=0),
    'random_forest': RandomForestClassifier(max_depth=10, n_estimators=200, random_state=0),
    'svm': SVC(C=1.0, kernel='rbf'),
    'perceptron': Perceptron(tol=1e-3, random_state=0, alpha=0.0001, penalty=None),
    'logistic_regression': LogisticRegression(C=1.0, penalty='l2', solver='liblinear', max_iter=1000, random_state=0)
}

In [15]:
# Train every classifier in `best_classifiers` on every feature set and record
# accuracy plus macro precision/recall/F1 on the held-out 20% test split.
# One dict per (feature set, classifier) pair is appended to `results`.
results = []

# xgb expects class labels 0..2: -1 -> 0, 0 -> 1, 1 -> 2
label_map = {-1: 0, 0: 1, 1: 2}

# Classifiers that are given dense arrays when the feature matrix is sparse.
dense_input_classifiers = ['gaussian_nb', 'multinomial_nb', 'perceptron']

# Loop through each feature set
for feature_key, feature_name in feature_set.items():

    X_train, X_test, y_train, y_test = train_test_split(features[feature_key], y, test_size=0.20, random_state=42)

    # Oversample the training split only, to balance the classes
    oversampler = RandomOverSampler(random_state=42)
    X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)

    # Remapped label copies, used only for fitting and scoring xgb
    y_train_resampled_series = pd.Series(y_train_resampled)
    y_test_series = pd.Series(y_test)
    y_train_resampled_mapped = y_train_resampled_series.map(label_map)
    y_test_mapped = y_test_series.map(label_map)

    # Ask the matrix itself whether it is sparse instead of repeating the
    # hard-coded keys of the dense feature sets.
    features_are_sparse = sparse.issparse(X_train_resampled)

    print(f"Using {feature_name} features:")

    # Loop through each classifier
    for clf_name, clf in best_classifiers.items():
        print(f"Training {clf_name} with {feature_name} features...")

        # Convert sparse to dense if necessary
        if features_are_sparse and clf_name in dense_input_classifiers:
            X_fit = X_train_resampled.toarray()
            X_eval = X_test.toarray()
        else:
            X_fit = X_train_resampled
            X_eval = X_test

        # xgb is fitted and scored on the remapped labels; everything else on
        # the original -1/0/1 labels.
        uses_mapped_labels = clf_name == 'xgb'
        y_fit = y_train_resampled_mapped if uses_mapped_labels else y_train_resampled
        y_true = y_test_mapped if uses_mapped_labels else y_test

        clf.fit(X_fit, y_fit)
        y_pred = clf.predict(X_eval)

        # Evaluate
        acc = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred, average='macro')
        recall = recall_score(y_true, y_pred, average='macro')
        f1 = f1_score(y_true, y_pred, average='macro')

        print(f"Accuracy of {clf_name} with {feature_name} features: {acc}")
        print(f"Precision of {clf_name} with {feature_name} features: {precision}")
        print(f"Recall of {clf_name} with {feature_name} features: {recall}")
        print(f"F1 Score of {clf_name} with {feature_name} features: {f1}")

        # Append the results to the list
        results.append({
            'feature_set': feature_name,
            'classifier': clf_name,
            'accuracy': acc,
            'precision': precision,
            'recall': recall,
            'f1_score': f1
        })
        print("--------------------------------------")

    print("-------------------------------------------------------------------")


Using bag_of_words features:
Training xgb with bag_of_words features...
Accuracy of xgb with bag_of_words features: 0.7745936698032506
Precision of xgb with bag_of_words features: 0.6817569352091645
Recall of xgb with bag_of_words features: 0.6981402374889135
F1 Score of xgb with bag_of_words features: 0.6863947300028698
--------------------------------------
Training gaussian_nb with bag_of_words features...
Accuracy of gaussian_nb with bag_of_words features: 0.5209580838323353
Precision of gaussian_nb with bag_of_words features: 0.4774245053273527
Recall of gaussian_nb with bag_of_words features: 0.45666741721028226
F1 Score of gaussian_nb with bag_of_words features: 0.4544013347189526
--------------------------------------
Training decision_tree with bag_of_words features...
Accuracy of decision_tree with bag_of_words features: 0.6407185628742516
Precision of decision_tree with bag_of_words features: 0.5690719819342406
Recall of decision_tree with bag_of_words features: 0.5810768926

In [16]:
# Tabulate the collected metric records, one row per entry in `results`.
results_df = pd.DataFrame(data=results)

# Write the table to a CSV file, leaving out the row index.
results_df.to_csv(path_or_buf='classification_results.csv', index=False)