# Modelling

### Libraries

In [None]:
# import libraries

import joblib

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import sparse
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import Perceptron
from xgboost import XGBClassifier
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV


### Data

In [None]:
# load review sentiment data
# Later cells read the 'sentiment' column as the target labels and pass the
# 'text' column to VADER.

review_df = pd.read_csv('data/review_sentiment.csv')

review_df

### Train/test sets

In [None]:
# select and load feature data
# `selected` is the key of the feature set to load from the features/ folder

feature_set = {
    0: 'bag_of_words',
    1: 'one_hot',
    2: 'n_grams',
    3: 'tf_idf',
    4: 'word2vec',
    5: 'combined_features'
}
selected = 5

feature_name = feature_set[selected]
if feature_name == 'word2vec':
    # word2vec features are stored as a NumPy .npy file
    X = np.load(f'features/{feature_name}.npy')
else:
    # every other feature set is stored as a SciPy sparse .npz file
    X = sparse.load_npz(f'features/{feature_name}.npz')

X.shape

In [None]:
# target labels
# The classifiers are trained to predict the 'sentiment' column of review_df.
# NOTE(review): X and y are split together below, so the saved feature matrix
# is presumably row-aligned with the CSV - confirm.

y = review_df['sentiment']

y.shape

In [None]:
# split data into training and test sets
# random_state pins the split, so a model saved in one session is still
# evaluated on rows it never saw when it is reloaded in another session.
# stratify=y keeps the class proportions the same in both sets.

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0, stratify=y
)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

print("\nLabel distribution in the training set:")
print(y_train.value_counts())

print("\nLabel distribution in the test set:")
print(y_test.value_counts())

In [None]:
# oversampling to balance the classes
# Only the training split is resampled; the test split is left untouched.
# random_state makes the resampled training set identical on every run.

oversampler = RandomOverSampler(random_state=0)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train.to_numpy())

print(X_train_resampled.shape, y_train_resampled.shape)

print("\nLabel distribution after oversampling:")
print(pd.DataFrame(y_train_resampled).value_counts())

### Classifier training

In [None]:
# classifiers
# Candidate models keyed by the name used to select, save and tune them.
# Every estimator with a randomised fit is seeded with random_state=0 so that
# repeated runs give the same model.
# NOTE(review): recent xgboost releases expect class labels 0..n_classes-1,
# while the plotting cell treats sentiment as -1/0/1 - confirm the labels are
# encoded before fitting 'xgb'.

classifiers = {
    'gaussian_nb': GaussianNB(),
    'logistic_regression': LogisticRegression(max_iter=1000, random_state=0),
    'decision_tree': DecisionTreeClassifier(random_state=0),
    'random_forest': RandomForestClassifier(random_state=0),
    'svm': SVC(),
    'perceptron': Perceptron(tol=1e-3, random_state=0),
    'xgb': XGBClassifier(random_state=0)
}

In [None]:
# train a classifier
# `selected` must be one of the keys of `classifiers`; the save cell reuses it
# to build the model file name.

selected = 'decision_tree'

# fit on the oversampled training split
clf = classifiers[selected]
clf.fit(X_train_resampled, y_train_resampled)

In [None]:
# save classifier model
# The file is named after `selected`, the classifier key set in the training cell.
# NOTE(review): '.plk' looks like a typo for '.pkl'; the load cell uses the same
# extension, so the two must be changed together (and existing files renamed).

joblib.dump(clf, 'models/' + selected + '.plk')

In [None]:
# load a classifier model
# NOTE: this rebinds `clf`, so the cells below use the loaded model rather than
# the one fitted in the training cell.
# joblib.load unpickles the file - only load model files from a trusted source.

model = 'gaussian_nb'
clf = joblib.load(f'models/{model}.plk')

In [None]:
# get predictions
# predict labels for the test split with whichever model `clf` currently refers to

y_pred_clf = clf.predict(X_test)
print(y_pred_clf)

In [None]:
# cross-validation
# score `clf` on the full data set (X, y) with 5-fold cross-validation

folds = 5
cv_metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
cv_scores = cross_validate(clf, X, y, cv=folds, scoring=cv_metrics)

### Grid search

In [None]:
# Parameter grid for each classifier
# Keys match the keys of `classifiers`; an empty grid means the estimator is
# only cross-validated with its current settings.
param_grids = {
    'gaussian_nb': {},
    'logistic_regression': {
        'C': [0.01, 0.1, 1.0, 10.0],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga']
    },
    'decision_tree': {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'random_forest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'svm': {
        'C': [0.1, 1.0, 10.0],
        'kernel': ['linear', 'rbf']
    },
    'perceptron': {
        'alpha': [0.0001, 0.001, 0.01],
        'penalty': [None, 'l2', 'l1', 'elasticnet']
    },
    'xgb': {
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.3],
        'n_estimators': [100, 200, 300]
    }
}

# best estimator found for each classifier, keyed by classifier name
best_estimators = {}

# Apply grid search to each classifier
# The loop variable is deliberately not named `clf`: that name holds the
# trained/loaded model used by the prediction cell and must not be overwritten.
# The search is fitted on the original (not oversampled) training split.
# NOTE(review): GaussianNB requires dense input; if X_train is a sparse matrix
# its fits are expected to fail - confirm before running with sparse features.
for clf_name, estimator in classifiers.items():
    print(f"Grid search for {clf_name}")
    param_grid = param_grids[clf_name]
    grid_search = GridSearchCV(estimator, param_grid, scoring='accuracy', cv=5)
    grid_search.fit(X_train, y_train)
    best_estimators[clf_name] = grid_search.best_estimator_
    print(f"Best parameters for {clf_name}: {grid_search.best_params_}")
    print(f"Best accuracy for {clf_name}: {grid_search.best_score_}")
    print("---------------------------------------------")


In [None]:
# VADER sentiment analysis

vader = SentimentIntensityAnalyzer()

# one polarity-score dict per review, kept for the score dataframe built later
res = [vader.polarity_scores(review) for review in review_df['text']]

# rounding the compound score to the nearest integer gives the predicted label
# NOTE(review): round() only yields +1/-1 when |compound| > 0.5, whereas the
# VADER README suggests +/-0.05 cut-offs - confirm the wide neutral band is intended.
y_pred_vader = [round(scores['compound']) for scores in res]

res

### Model performance

In [None]:
# cross-validation results
# display the scores computed by cross_validate in the cross-validation cell

cv_scores

In [None]:
def evaluate_model(y_test, y_pred):
    '''
    Print the confusion matrix, accuracy and class-weighted precision,
    recall and F1-score of the predictions y_pred against the labels y_test
    '''
    weighted_scorers = (
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1-score:", f1_score),
    )

    # compute every metric before anything is printed
    cm = confusion_matrix(y_test, y_pred)
    report = [("Accuracy:", accuracy_score(y_test, y_pred))]
    for label, scorer in weighted_scorers:
        report.append((label, scorer(y_test, y_pred, average='weighted')))

    print("Confusion Matrix:")
    print(cm)
    for label, value in report:
        print(label, value)

In [None]:
# performance metrics - classifier
# compare the test-split labels with the predictions stored in y_pred_clf

evaluate_model(y_test, y_pred_clf)

In [None]:
# performance metrics - VADER
# VADER scored every review, so it is evaluated against the labels of the whole
# data set rather than the test split used for the classifier.

y_true = review_df['sentiment'].values

evaluate_model(y_true, y_pred_vader)

In [None]:
# dataframe with VADER sentiment scores
# keep only the compound score and attach it to the reviews as an extra column

compound_scores = pd.DataFrame(res)['compound']
vader_df = pd.concat([review_df, compound_scores], axis=1)

vader_df

In [None]:
# visualize VADER compound scores by sentiment

# label value -> display name, in the order the bars are drawn
label_names = {-1: 'negative', 0: 'neutral', 1: 'positive'}

# one frame per label holding its compound scores, stacked in label order
per_label_frames = [
    pd.DataFrame({
        'sentiment': name,
        'compound': vader_df.loc[vader_df['sentiment'] == value, 'compound'],
    })
    for value, name in label_names.items()
]
sentiment_scores = pd.concat(per_label_frames)

ax = sns.barplot(data=sentiment_scores, x='sentiment', y='compound')
ax.set_xlabel('Sentiment')
ax.set_ylabel('VADER Compound Score')
ax.set_title('VADER Compound Score by Sentiment')
plt.show()