In [None]:
#Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import os
import json

### Loading Dataset

In [3]:
# Loading the final dataset with PCA reduced and embeddings
df = pd.read_pickle("/Users/diegolemos/Masters/Theses/code/data/processed/final_model_dataset.pkl")

# Converting 'embedding' column to 2D NumPy array
X = np.vstack(df['embedding'].values)

# Extracting the target labels
y = df['final_sentiment'].values

# Previewing shapes
print("Feature shape:", X.shape)
print("Labels shape:", y.shape)

# Checking rows
print("\nExample label:", y[0])
print("Example vector (first 5 values):", X[0][:5])

Feature shape: (1345727, 768)
Labels shape: (1345727,)

Example label: neutral
Example vector (first 5 values): [-0.35439885 -0.23273325  0.01678974 -0.5580163   0.07386743]


### Train and Tests Splits

In [5]:
# Train test split 80/20
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
print("Train set size:", X_train.shape)
print("Test set size:", X_test.shape)

Train set size: (1076581, 768)
Test set size: (269146, 768)


In [6]:
# Train test split 70/30
X_train_70, X_test_30, y_train_70, y_test_30 = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)
print("Train set size:", X_train_70.shape)
print("Test set size:", X_test_30.shape)

Train set size: (942008, 768)
Test set size: (403719, 768)


In [9]:
# Train test split 75/25
X_train_75, X_test_25, y_train_75, y_test_25 = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)
print("Train size:", X_train_75.shape)
print("Test size:", X_test_25.shape)

Train size: (1009295, 768)
Test size: (336432, 768)


### Support Verctor Machines (SVM)

In [None]:
#Standardizing features for 80/20 split
scaler_80 = StandardScaler()
X_train_scaled = scaler_80.fit_transform(X_train)
X_test_scaled = scaler_80.transform(X_test)

# Train SVM using SGD Classifier
svm_80 = SGDClassifier( loss='hinge', class_weight='balanced', max_iter=1000, tol=1e-3, random_state=42)

# Fitting the model
svm_80.fit(X_train_scaled, y_train)

# Predicting on test set
y_pred_svm_80 = svm_80.predict(X_test_scaled)

# Evaluating
report_svm = classification_report(y_test, y_pred_svm_80, output_dict=True)
print(classification_report(y_test, y_pred_svm_80))



              precision    recall  f1-score   support

    negative       0.57      0.76      0.65    153371
     neutral       0.25      0.14      0.18     65563
    positive       0.20      0.12      0.15     50212

    accuracy                           0.49    269146
   macro avg       0.34      0.34      0.33    269146
weighted avg       0.42      0.49      0.44    269146



In [None]:
# Standardizing features for 70/30 split
scaler_70 = StandardScaler()
X_train_70_scaled = scaler_70.fit_transform(X_train_70)
X_test_30_scaled = scaler_70.transform(X_test_30)

# Train SVM using SGD Classifier
svm_70 = SGDClassifier( loss='hinge', class_weight='balanced', max_iter=1000, tol=1e-3, random_state=42)

# Fitting the model
svm_70.fit(X_train_70_scaled, y_train_70)

# Predicting on test set
y_pred_svm_70 = svm_70.predict(X_test_30_scaled)

# Evaluating
report_svm_70 = classification_report(y_test_30, y_pred_svm_70, output_dict=True)
print(classification_report(y_test_30, y_pred_svm_70))


              precision    recall  f1-score   support

    negative       0.57      0.62      0.59    230057
     neutral       0.25      0.12      0.17     98344
    positive       0.18      0.26      0.21     75318

    accuracy                           0.43    403719
   macro avg       0.33      0.33      0.32    403719
weighted avg       0.42      0.43      0.42    403719



In [10]:
# Standardize features for 75/25 split
scaler_75 = StandardScaler()
X_train_75_scaled = scaler_75.fit_transform(X_train_75)
X_test_25_scaled = scaler_75.transform(X_test_25)

# Train SVM using SGD Classifier
svm_75 = SGDClassifier(loss='hinge', class_weight='balanced', max_iter=1000, tol=1e-3, random_state=42)

# Fitting the model
svm_75.fit(X_train_75_scaled, y_train_75)

# Predict
y_pred_svm_75 = svm_75.predict(X_test_25_scaled)

# Evaluating
report_svm_75 = classification_report(y_test_25, y_pred_svm_75, output_dict=True)
print(classification_report(y_test_25, y_pred_svm_75))


              precision    recall  f1-score   support

    negative       0.57      0.82      0.67    191714
     neutral       0.24      0.07      0.10     81953
    positive       0.18      0.11      0.14     62765

    accuracy                           0.51    336432
   macro avg       0.33      0.33      0.30    336432
weighted avg       0.42      0.51      0.43    336432



In [None]:
# Saving classification report
with open("/Users/diegolemos/Masters/Theses/code/results/sgd_svm_results_80_20.json", "w") as f:
    json.dump(report_svm, f, indent=4)

# Confusion matrix
labels = ["negative", "neutral", "positive"]
cm = confusion_matrix(y_test, y_pred_svm, labels=labels)

# Saving confusion matrix as image
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(cmap="Blues")
plt.title("SGD SVM Confusion Matrix (80/20 Split)")
plt.savefig("/Users/diegolemos/Masters/Theses/code/results/sgd_svm_confusion_matrix_80_20.png")
plt.close()

# Saving confusion matrix as CSV
pd.DataFrame(cm, index=labels, columns=labels).to_csv("/Users/diegolemos/Masters/Theses/code/results/sgd_svm_confusion_matrix_80_20.csv")

print("SVM results saved.")

SVM results saved.


In [26]:
# Save classification report
with open("../results/sgd_svm_results_70_30.json", "w") as f:
    json.dump(report_svm_70, f, indent=4)

# Confusion matrix
labels = ["negative", "neutral", "positive"]
cm = confusion_matrix(y_test_30, y_pred_svm_70, labels=labels)

# Saving confusion matrix as image
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(cmap="Blues")
plt.title("SGD SVM Confusion Matrix (70/30 Split)")
plt.savefig("../results/sgd_svm_confusion_matrix_70_30.png")
plt.close()

# Saving confusion matrix as CSV
pd.DataFrame(cm, index=labels, columns=labels).to_csv("../results/sgd_svm_confusion_matrix_70_30.csv")

print("70/30 SVM results saved.")

70/30 SVM results saved.


In [11]:
# Save classification report
with open("../results/sgd_svm_results_75_25.json", "w") as f:
    json.dump(report_svm_75, f, indent=4)

# Confusion matrix
labels = ["negative", "neutral", "positive"]
cm = confusion_matrix(y_test_25, y_pred_svm_75, labels=labels)

# Saving confusion matrix as image
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(cmap="Blues")
plt.title("SGD SVM Confusion Matrix (75/25 Split)")
plt.savefig("../results/sgd_svm_confusion_matrix_75_25.png")
plt.close()

# Saving confusion matrix as CSV
pd.DataFrame(cm, index=labels, columns=labels).to_csv("../results/sgd_svm_confusion_matrix_75_25.csv")

print("75/25 SVM results saved.")


75/25 SVM results saved.


### K-fold Cross-Validation (SVM)

In [None]:
#Setting up 5-fold cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

#Storagie
svm_fold_reports = []
fold = 1

for train_index, test_index in skf.split(X, y):
    print(f"\n Fold {fold}")
    
    X_train_cv, X_test_cv = X[train_index], X[test_index]
    y_train_cv, y_test_cv = y[train_index], y[test_index]
    
    #Standardizing features
    scaler = StandardScaler()
    X_train_cv_scaled = scaler.fit_transform(X_train_cv)
    X_test_cv_scaled = scaler.transform(X_test_cv)
    
    #SVM
    svm_cv = SGDClassifier(loss='hinge', class_weight='balanced', max_iter=1000, tol=1e-3, random_state=42)
    
    #Training
    svm_cv.fit(X_train_cv_scaled, y_train_cv)
    y_pred_cv = svm_cv.predict(X_test_cv_scaled)
    
    #Classification report
    report = classification_report(y_test_cv, y_pred_cv, output_dict=True)
    svm_fold_reports.append(report)
    
    print(classification_report(y_test_cv, y_pred_cv))
    fold += 1



 Fold 1
              precision    recall  f1-score   support

    negative       0.57      0.54      0.55    153371
     neutral       0.24      0.33      0.28     65563
    positive       0.20      0.13      0.16     50212

    accuracy                           0.41    269146
   macro avg       0.34      0.33      0.33    269146
weighted avg       0.42      0.41      0.41    269146


 Fold 2
              precision    recall  f1-score   support

    negative       0.57      0.76      0.65    153371
     neutral       0.24      0.09      0.13     65562
    positive       0.18      0.15      0.16     50213

    accuracy                           0.48    269146
   macro avg       0.33      0.33      0.31    269146
weighted avg       0.42      0.48      0.43    269146


 Fold 3
              precision    recall  f1-score   support

    negative       0.57      0.77      0.66    153371
     neutral       0.24      0.22      0.23     65562
    positive       0.19      0.01      0.02     

In [5]:
#Extracting F1, precision, recall for each class across folds
labels = ["negative", "neutral", "positive"]
metrics = ["precision", "recall", "f1-score"]

#Initializing dictionary
avg_results = {label: {metric: [] for metric in metrics} for label in labels}

#Accumulating metrics across folds
for report in svm_fold_reports:
    for label in labels:
        for metric in metrics:
            avg_results[label][metric].append(report[label][metric])

#Computing averages
avg_results_final = {
    label: {
        metric: round(np.mean(avg_results[label][metric]), 4)
        for metric in metrics
    }
    for label in labels
}

#Adding overall averages
macro_f1 = np.mean([avg_results_final[label]['f1-score'] for label in labels])
weighted_f1s = [report["weighted avg"]["f1-score"] for report in svm_fold_reports]

avg_results_final["macro avg F1"] = round(macro_f1, 4)
avg_results_final["weighted avg F1"] = round(np.mean(weighted_f1s), 4)

#Saving as JSON
with open("../results/sgd_svm_results_5fold.json", "w") as f:
    json.dump(avg_results_final, f, indent=4)

# Display summary
print("5-Fold SVM average results saved.\n")
print(json.dumps(avg_results_final, indent=4))

5-Fold SVM average results saved.

{
    "negative": {
        "precision": 0.5705,
        "recall": 0.7027,
        "f1-score": 0.6273
    },
    "neutral": {
        "precision": 0.2451,
        "recall": 0.1719,
        "f1-score": 0.1873
    },
    "positive": {
        "precision": 0.1883,
        "recall": 0.1277,
        "f1-score": 0.1404
    },
    "macro avg F1": 0.3183,
    "weighted avg F1": 0.4293
}


#### SVM Performance
On PCA-reduced BERT embeddings, the SVM model was fine-tuned with 5 fold cross validation. It was seen that although the model could indeed pick up some signal, it performed much worse than it should have. This implies that the dimensionality reduction might have eliminated feature required to correctly label them. Still, the SVM offered a practical point of reference among the conventional classifiers.

### Logistic Regression

In [None]:
#Logistic Regression
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#Training Logistic Regression using SGD 80/20
logreg_model = SGDClassifier(loss="log_loss", class_weight="balanced", max_iter=1000, tol=1e-3, random_state=42)

# Training
logreg_model.fit(X_train_scaled, y_train)

#Predicting
y_pred_logreg = logreg_model.predict(X_test_scaled)

# Evaluating
report_logreg = classification_report(y_test, y_pred_logreg, output_dict=True)
print(classification_report(y_test, y_pred_logreg))

              precision    recall  f1-score   support

    negative       0.57      0.91      0.70    153371
     neutral       0.25      0.05      0.08     65563
    positive       0.19      0.04      0.07     50212

    accuracy                           0.54    269146
   macro avg       0.34      0.33      0.28    269146
weighted avg       0.42      0.54      0.43    269146



In [12]:
# Train Logistic Regression using SGD 70/30
logreg_70 = SGDClassifier(loss="log_loss", class_weight="balanced", max_iter=1000, tol=1e-3, random_state=42)

# Training
logreg_70.fit(X_train_70_scaled, y_train_70)

# Predicting
y_pred_logreg_70 = logreg_70.predict(X_test_30_scaled)

# Evaluating
report_logreg_70 = classification_report(y_test_30, y_pred_logreg_70, output_dict=True)
print(classification_report(y_test_30, y_pred_logreg_70))

NameError: name 'X_train_70_scaled' is not defined

In [13]:
# Train Logistic Regression using SGD
logreg_75 = SGDClassifier(loss="log_loss", class_weight="balanced", max_iter=1000, tol=1e-3, random_state=42)

# Training
logreg_75.fit(X_train_75_scaled, y_train_75)

# Predict
y_pred_logreg_75 = logreg_75.predict(X_test_25_scaled)

# Evaluating
report_logreg_75 = classification_report(y_test_25, y_pred_logreg_75, output_dict=True)
print(classification_report(y_test_25, y_pred_logreg_75))


              precision    recall  f1-score   support

    negative       0.57      0.83      0.67    191714
     neutral       0.24      0.06      0.10     81953
    positive       0.18      0.11      0.14     62765

    accuracy                           0.51    336432
   macro avg       0.33      0.33      0.30    336432
weighted avg       0.42      0.51      0.43    336432



In [17]:
# Saving classification report
with open("../results/logreg_results_80_20.json", "w") as f:
    json.dump(report_logreg, f, indent=4)

# Confusion matrix
labels = ["negative", "neutral", "positive"]
cm = confusion_matrix(y_test, y_pred_logreg, labels=labels)

# Saving confusion matrix as image
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(cmap="Blues")
plt.title("Logistic Regression Confusion Matrix (80/20 Split)")
plt.savefig("../results/logreg_confusion_matrix_80_20.png")
plt.close()

# Saving confusion matrix as CSV
pd.DataFrame(cm, index=labels, columns=labels).to_csv("../results/logreg_confusion_matrix_80_20.csv")

print("Logistic Regression results saved.")


Logistic Regression results saved.


In [28]:
# Saving classification report
with open("../results/logreg_results_70_30.json", "w") as f:
    json.dump(report_logreg_70, f, indent=4)

# Confusion matrix
labels = ["negative", "neutral", "positive"]
cm = confusion_matrix(y_test_30, y_pred_logreg_70, labels=labels)

# Saving confusion matrix as image
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(cmap="Blues")
plt.title("Logistic Regression Confusion Matrix (70/30 Split)")
plt.savefig("../results/logreg_confusion_matrix_70_30.png")
plt.close()

# Saving confusion matrix as CSV
pd.DataFrame(cm, index=labels, columns=labels).to_csv("../results/logreg_confusion_matrix_70_30.csv")

print("70/30 Logistic Regression results saved.")

70/30 Logistic Regression results saved.


In [None]:
# Save classification report
with open("../results/logreg_results_75_25.json", "w") as f:
    json.dump(report_logreg_75, f, indent=4)

# Confusion matrix
labels = ["negative", "neutral", "positive"]
cm = confusion_matrix(y_test_25, y_pred_logreg_75, labels=labels)

# Saving confusion matrix as image
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(cmap="Blues")
plt.title("Logistic Regression Confusion Matrix (75/25 Split)")
plt.savefig("../results/logreg_confusion_matrix_75_25.png")
plt.close()

# Saving confusion matrix as CSV
pd.DataFrame(cm, index=labels, columns=labels).to_csv("../results/logreg_confusion_matrix_75_25.csv")

print("75/25 Logistic Regression results saved.")

75/25 Logistic Regression results saved.


### K-fold Cross-Validation (LR)

In [3]:
#Setting up 5-fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

logreg_fold_reports = []
fold = 1

for train_index, test_index in skf.split(X, y):
    print(f"\n Fold {fold}")
    
    X_train_cv, X_test_cv = X[train_index], X[test_index]
    y_train_cv, y_test_cv = y[train_index], y[test_index]
    
    #Scaling features
    scaler = StandardScaler()
    X_train_cv_scaled = scaler.fit_transform(X_train_cv)
    X_test_cv_scaled = scaler.transform(X_test_cv)
    
    #Logistic Regression model
    logreg_cv = SGDClassifier(loss="log_loss", class_weight="balanced", max_iter=1000, tol=1e-3, random_state=42)
    
    #Training
    logreg_cv.fit(X_train_cv_scaled, y_train_cv)
    y_pred_cv = logreg_cv.predict(X_test_cv_scaled)
    
    #Storing report
    report = classification_report(y_test_cv, y_pred_cv, output_dict=True)
    logreg_fold_reports.append(report)
    
    print(classification_report(y_test_cv, y_pred_cv))
    fold += 1


 Fold 1
              precision    recall  f1-score   support

    negative       0.57      0.88      0.69    153371
     neutral       0.25      0.08      0.12     65563
    positive       0.19      0.05      0.08     50212

    accuracy                           0.53    269146
   macro avg       0.34      0.33      0.30    269146
weighted avg       0.42      0.53      0.44    269146


 Fold 2
              precision    recall  f1-score   support

    negative       0.57      0.86      0.69    153371
     neutral       0.24      0.10      0.14     65562
    positive       0.19      0.04      0.07     50213

    accuracy                           0.52    269146
   macro avg       0.33      0.33      0.30    269146
weighted avg       0.42      0.52      0.44    269146


 Fold 3
              precision    recall  f1-score   support

    negative       0.57      0.87      0.69    153371
     neutral       0.24      0.03      0.05     65562
    positive       0.19      0.10      0.13     

In [4]:
#Labels and metrics
labels = ["negative", "neutral", "positive"]
metrics = ["precision", "recall", "f1-score"]

#Initializing dictionary
avg_logreg_results = {label: {metric: [] for metric in metrics} for label in labels}

#Collecting metrics from each fold
for report in logreg_fold_reports:
    for label in labels:
        for metric in metrics:
            avg_logreg_results[label][metric].append(report[label][metric])

#Computing average per label/metric
avg_logreg_final = {
    label: {
        metric: round(np.mean(avg_logreg_results[label][metric]), 4)
        for metric in metrics
    }
    for label in labels
}

#Adding macro and weighted avg
macro_f1 = np.mean([avg_logreg_final[label]['f1-score'] for label in labels])
weighted_f1s = [report["weighted avg"]["f1-score"] for report in logreg_fold_reports]

avg_logreg_final["macro avg F1"] = round(macro_f1, 4)
avg_logreg_final["weighted avg F1"] = round(np.mean(weighted_f1s), 4)

# Saving to JSON
with open("../results/logreg_results_5fold.json", "w") as f:
    json.dump(avg_logreg_final, f, indent=4)

# Final summary
print("5-Fold Logistic Regression results saved.\n")
print(json.dumps(avg_logreg_final, indent=4))

5-Fold Logistic Regression results saved.

{
    "negative": {
        "precision": 0.5708,
        "recall": 0.8523,
        "f1-score": 0.6834
    },
    "neutral": {
        "precision": 0.2444,
        "recall": 0.074,
        "f1-score": 0.1048
    },
    "positive": {
        "precision": 0.1923,
        "recall": 0.0779,
        "f1-score": 0.1072
    },
    "macro avg F1": 0.2985,
    "weighted avg F1": 0.435
}


#### LR Performance
Logistic Regression also did not do so well with the PCA-reduced embeddings. The model was simple and the decision boundary of the model is linear, hence, it might have hindered its performance to represent non-linear patterns in the feature space. The findings support the suspicion that flattening out and dimension reduction of BERT embedding can eliminate precious semantic information.

### Random Forest

In [None]:
# Random Forest 80/20
rf_model = RandomForestClassifier(n_estimators=100, class_weight="balanced", random_state=42, n_jobs=-1)

#Training
rf_model.fit(X_train, y_train)

#Predicting
y_pred_rf = rf_model.predict(X_test)

#Evaluating
report_rf = classification_report(y_test, y_pred_rf, output_dict=True)
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

    negative       0.57      1.00      0.73    153371
     neutral       0.33      0.00      0.00     65563
    positive       0.21      0.00      0.00     50212

    accuracy                           0.57    269146
   macro avg       0.37      0.33      0.24    269146
weighted avg       0.45      0.57      0.41    269146



In [None]:
#Random Forest 70/30
rf_70 = RandomForestClassifier(n_estimators=100, class_weight="balanced", random_state=42,n_jobs=-1)

#Training
rf_70.fit(X_train_70, y_train_70)

#Predicting
y_pred_rf_70 = rf_70.predict(X_test_30)

#Evaluating
report_rf_70 = classification_report(y_test_30, y_pred_rf_70, output_dict=True)
print(classification_report(y_test_30, y_pred_rf_70))

              precision    recall  f1-score   support

    negative       0.57      1.00      0.73    230057
     neutral       0.37      0.00      0.00     98344
    positive       0.24      0.00      0.00     75318

    accuracy                           0.57    403719
   macro avg       0.40      0.33      0.24    403719
weighted avg       0.46      0.57      0.41    403719



In [15]:
#Random Forest 75/25 split 
rf_75 = RandomForestClassifier(n_estimators=100, class_weight="balanced", random_state=42, n_jobs=-1)

#Training
rf_75.fit(X_train_75, y_train_75)

#Predicting
y_pred_rf_75 = rf_75.predict(X_test_25)

#Evaluating
report_rf_75 = classification_report(y_test_25, y_pred_rf_75, output_dict=True)
print(classification_report(y_test_25, y_pred_rf_75))

              precision    recall  f1-score   support

    negative       0.57      1.00      0.73    191714
     neutral       0.31      0.00      0.00     81953
    positive       0.19      0.00      0.00     62765

    accuracy                           0.57    336432
   macro avg       0.36      0.33      0.24    336432
weighted avg       0.44      0.57      0.41    336432



In [None]:
#Saving classification report
with open("../results/rf_results_80_20.json", "w") as f:
    json.dump(report_rf, f, indent=4)

#Confusion matrix
labels = ["negative", "neutral", "positive"]
cm = confusion_matrix(y_test, y_pred_rf, labels=labels)

#Saving confusion matrix image
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(cmap="Blues")
plt.title("Random Forest Confusion Matrix (80/20 Split)")
plt.savefig("../results/rf_confusion_matrix_80_20.png")
plt.close()

#Saving as CSV
pd.DataFrame(cm, index=labels, columns=labels).to_csv("../results/rf_confusion_matrix_80_20.csv")

print("Random Forest results saved.")

Random Forest results saved.


In [None]:
#Saving classification report
with open("../results/rf_results_70_30.json", "w") as f:
    json.dump(report_rf_70, f, indent=4)

#Confusion matrix
labels = ["negative", "neutral", "positive"]
cm = confusion_matrix(y_test_30, y_pred_rf_70, labels=labels)

#Saving confusion matrix as image
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(cmap="Blues")
plt.title("Random Forest Confusion Matrix (70/30 Split)")
plt.savefig("../results/rf_confusion_matrix_70_30.png")
plt.close()

#Saving confusion matrix as CSV
pd.DataFrame(cm, index=labels, columns=labels).to_csv("../results/rf_confusion_matrix_70_30.csv")

print("70/30 Random Forest results saved.")

70/30 Random Forest results saved.


In [16]:
#Saving classification report
with open("../results/rf_results_75_25.json", "w") as f:
    json.dump(report_rf_75, f, indent=4)

# Confusion matrix
labels = ["negative", "neutral", "positive"]
cm = confusion_matrix(y_test_25, y_pred_rf_75, labels=labels)

# Saving confusion matrix as image
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(cmap="Blues")
plt.title("Random Forest Confusion Matrix (75/25 Split)")
plt.savefig("../results/rf_confusion_matrix_75_25.png")
plt.close()

# Saving confusion matrix as CSV
pd.DataFrame(cm, index=labels, columns=labels).to_csv("../results/rf_confusion_matrix_75_25.csv")

print("75/25 Random Forest results saved.")

75/25 Random Forest results saved.


### K-fold Cross-Validation (RF)

In [4]:
#Setting up 5-fold CV
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

rf_fold_reports = []
fold = 1

for train_index, test_index in skf.split(X, y):
    print(f"\n Fold {fold}")
    
    X_train_cv, X_test_cv = X[train_index], X[test_index]
    y_train_cv, y_test_cv = y[train_index], y[test_index]

    #Random Forest
    rf_cv = RandomForestClassifier(n_estimators=100, class_weight="balanced", random_state=42, n_jobs=-1)
    
    # Training
    rf_cv.fit(X_train_cv, y_train_cv)
    y_pred_cv = rf_cv.predict(X_test_cv)

    #Collecting results
    report = classification_report(y_test_cv, y_pred_cv, output_dict=True)
    rf_fold_reports.append(report)
    
    print(classification_report(y_test_cv, y_pred_cv))
    fold += 1


 Fold 1
              precision    recall  f1-score   support

    negative       0.57      1.00      0.73    153371
     neutral       0.25      0.00      0.00     65563
    positive       0.14      0.00      0.00     50212

    accuracy                           0.57    269146
   macro avg       0.32      0.33      0.24    269146
weighted avg       0.41      0.57      0.41    269146


 Fold 2
              precision    recall  f1-score   support

    negative       0.57      1.00      0.73    153371
     neutral       0.29      0.00      0.00     65562
    positive       0.18      0.00      0.00     50213

    accuracy                           0.57    269146
   macro avg       0.35      0.33      0.24    269146
weighted avg       0.43      0.57      0.41    269146


 Fold 3
              precision    recall  f1-score   support

    negative       0.57      1.00      0.73    153371
     neutral       0.23      0.00      0.00     65562
    positive       0.28      0.00      0.00     

In [6]:
#Labels and metrics
labels = ["negative", "neutral", "positive"]
metrics = ["precision", "recall", "f1-score"]

#Initializing aggregation structure
avg_rf_results = {label: {metric: [] for metric in metrics} for label in labels}

#Collecting fold-wise results
for report in rf_fold_reports:
    for label in labels:
        for metric in metrics:
            avg_rf_results[label][metric].append(report[label][metric])

#Computing averages
avg_rf_final = {
    label: {
        metric: round(np.mean(avg_rf_results[label][metric]), 4)
        for metric in metrics
    }
    for label in labels
}

# Adding macro and weighted F1
macro_f1 = np.mean([avg_rf_final[label]["f1-score"] for label in labels])
weighted_f1s = [report["weighted avg"]["f1-score"] for report in rf_fold_reports]

avg_rf_final["macro avg F1"] = round(macro_f1, 4)
avg_rf_final["weighted avg F1"] = round(np.mean(weighted_f1s), 4)

#Saving to JSON
with open("../results/rf_results_5fold.json", "w") as f:
    json.dump(avg_rf_final, f, indent=4)

#Summary
print("5-Fold Random Forest results saved.\n")
print(json.dumps(avg_rf_final, indent=4))

5-Fold Random Forest results saved.

{
    "negative": {
        "precision": 0.5699,
        "recall": 0.9995,
        "f1-score": 0.7259
    },
    "neutral": {
        "precision": 0.2596,
        "recall": 0.0004,
        "f1-score": 0.0008
    },
    "positive": {
        "precision": 0.2296,
        "recall": 0.0002,
        "f1-score": 0.0003
    },
    "macro avg F1": 0.2423,
    "weighted avg F1": 0.4139
}


#### RF Performance
The Random Forest classifier performed slightly better than SVM and Logistic Regression because the latter two classifiers are not best to represent non-linear patterns. Nevertheless, the results were quite modest compared to those taking place with high dimensional BERT vectors. It is a confirmation that tree-based methods are able to partially over compensate on physician information being lost, however, they are yet affected by PCA information loss.

### Conclusion
According to all three common models the SVM, the Logistic Regression, and the Random Forest that were tested, there was a consistent decrease in performance when the reduced BERT embeddings were created using the PCA. Though PCA has been used to render the data manageable to these models it seems to have sacrificed the richness of original feature space. This experiment pointed out the trade-offs between a model-sufficiency and a computation-efficiency, and validated that although these old models are the useful comparative baselines, they are not optimally designed to such kind of representation of the data. Such observations aided the reasoning that we should further the use of the BiLSTM model that could directly operate on raw sequential data.