In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

## Training classifiers on ModernBERT embeddings

#### Importing the ModernBERT-embedded data

In [15]:
import pandas as pd

# Feature matrices for the three splits, read from the preprocessing exports.
X_train = pd.read_csv("../Preprocessing/X_train.csv").to_numpy()
X_valid = pd.read_csv("../Preprocessing/X_valid.csv").to_numpy()
X_test = pd.read_csv("../Preprocessing/X_test.csv").to_numpy()

# Matching label vectors, flattened to 1-D so they can be passed as targets.
y_train = pd.read_csv("../Preprocessing/y_train.csv").to_numpy().ravel()
y_valid = pd.read_csv("../Preprocessing/y_valid.csv").to_numpy().ravel()
y_test = pd.read_csv("../Preprocessing/y_test.csv").to_numpy().ravel()

# Sanity check: each X/y pair should report the same number of rows.
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_valid shape: {X_valid.shape}, y_valid shape: {y_valid.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

X_train shape: (10053, 768), y_train shape: (10053,)
X_valid shape: (2277, 768), y_valid shape: (2277,)
X_test shape: (2277, 768), y_test shape: (2277,)


### Logistic Regression

In [16]:
# Logistic-regression pipeline (single step, addressed as 'logreg' in the grid).
# random_state is pinned because the 'saga' solver selected below shuffles the
# data while optimising, so unseeded runs are not reproducible.
LR_pipeline = Pipeline([
    ('logreg', LogisticRegression(random_state=42)),
])


# Hyper-parameter grid. Only C is actually searched; max_iter and solver are
# single-valued entries, i.e. fixed settings routed through the grid.
LR_parameters = {
    'logreg__C': [0.1, 1.0, 10.0],
    'logreg__max_iter': [10000],
    'logreg__solver': ['saga'],
}


# 5-fold cross-validated search ranked by weighted F1, using all CPU cores.
LR_grid_search = GridSearchCV(LR_pipeline, LR_parameters, scoring='f1_weighted', cv=5, n_jobs=-1)


# Fit the grid search to the training data
LR_grid_search.fit(X_train, y_train)

In [17]:
# Best pipeline found by the search, used to predict the validation split.
best_LR_model = LR_grid_search.best_estimator_
LR_y_pred = best_LR_model.predict(X_valid)

# Print the winning parameters and their mean cross-validated score.
# The search is scored with 'f1_weighted', so best_score_ is a weighted F1,
# not an accuracy.
print("Best parameters: ", LR_grid_search.best_params_)
print("Best cross-validated weighted F1 score: ", LR_grid_search.best_score_)

Best parameters:  {'logreg__C': 0.1, 'logreg__max_iter': 10000, 'logreg__solver': 'saga'}
Best accuracy score:  0.6963365544327278


In [18]:
# Report on the data the model was fitted on (shows how closely it fits) ...
print(
    "Performance on the training set:",
    classification_report(y_train, best_LR_model.predict(X_train)),
    sep="\n",
)

# ... and on the validation split, reusing the predictions stored in LR_y_pred.
print(
    "Performance on the validation set:",
    classification_report(y_valid, LR_y_pred),
    sep="\n",
)

Performance on the training set:
              precision    recall  f1-score   support

    Negative       0.79      0.78      0.78      2933
     Neutral       0.68      0.68      0.68      3560
    Positive       0.80      0.81      0.80      3560

    accuracy                           0.75     10053
   macro avg       0.76      0.76      0.76     10053
weighted avg       0.75      0.75      0.75     10053

Performance on the validation set:
              precision    recall  f1-score   support

    Negative       0.61      0.75      0.67       170
     Neutral       0.26      0.65      0.37       214
    Positive       0.97      0.78      0.86      1893

    accuracy                           0.77      2277
   macro avg       0.61      0.73      0.64      2277
weighted avg       0.87      0.77      0.80      2277



In [19]:
# Per-class metrics of the tuned logistic regression on the test split.
# NOTE(review): this notebook reports test-set metrics for every candidate
# model; confirm the final model choice is made on validation results only.
print(
    "Performance on the test set:",
    classification_report(y_test, best_LR_model.predict(X_test)),
    sep="\n",
)

Performance on the test set:
              precision    recall  f1-score   support

    Negative       0.51      0.71      0.59       160
     Neutral       0.23      0.61      0.33       208
    Positive       0.97      0.76      0.85      1909

    accuracy                           0.74      2277
   macro avg       0.57      0.69      0.59      2277
weighted avg       0.87      0.74      0.79      2277



### Random Forest

In [20]:
# Random-forest pipeline (single step, addressed as 'rf' in the grid).
# random_state is pinned: bootstrap sampling and per-split feature sampling
# are random, so unseeded forests differ from run to run.
RF_pipeline = Pipeline([
    ('rf', RandomForestClassifier(random_state=42)),
])


# Define parameters for grid search
RF_parameters = {
    # 'log_loss' is omitted: scikit-learn documents it as the same Shannon
    # information-gain criterion as 'entropy', so listing both only repeats fits.
    'rf__criterion': ['gini', 'entropy'],
    'rf__n_estimators': [100, 200],
    'rf__max_depth': [10, 20],
    'rf__min_samples_leaf': [5],
}


# 5-fold cross-validated search ranked by weighted F1, using all CPU cores.
RF_grid_search = GridSearchCV(RF_pipeline, RF_parameters, scoring='f1_weighted', cv=5, n_jobs=-1)


# Fit the grid search to the training data
RF_grid_search.fit(X_train, y_train)

In [21]:
# Best pipeline found by the search, used to predict the validation split.
best_RF_model = RF_grid_search.best_estimator_
RF_y_pred = best_RF_model.predict(X_valid)

# Print the winning parameters and their mean cross-validated score.
# The search is scored with 'f1_weighted', so best_score_ is a weighted F1,
# not an accuracy.
print("Best parameters: ", RF_grid_search.best_params_)
print("Best cross-validated weighted F1 score: ", RF_grid_search.best_score_)

Best parameters:  {'rf__criterion': 'gini', 'rf__max_depth': 20, 'rf__min_samples_leaf': 5, 'rf__n_estimators': 200}
Best accuracy score:  0.5796808914762831


In [22]:
# Report on the data the forest was fitted on (shows how closely it fits) ...
print(
    "Performance on the training set:",
    classification_report(y_train, best_RF_model.predict(X_train)),
    sep="\n",
)

# ... and on the validation split, reusing the predictions stored in RF_y_pred.
print(
    "Performance on the validation set:",
    classification_report(y_valid, RF_y_pred),
    sep="\n",
)

Performance on the training set:
              precision    recall  f1-score   support

    Negative       1.00      0.99      1.00      2933
     Neutral       1.00      1.00      1.00      3560
    Positive       0.99      1.00      1.00      3560

    accuracy                           1.00     10053
   macro avg       1.00      1.00      1.00     10053
weighted avg       1.00      1.00      1.00     10053

Performance on the validation set:
              precision    recall  f1-score   support

    Negative       0.36      0.52      0.43       170
     Neutral       0.18      0.59      0.27       214
    Positive       0.94      0.66      0.77      1893

    accuracy                           0.64      2277
   macro avg       0.49      0.59      0.49      2277
weighted avg       0.83      0.64      0.70      2277



In [23]:
# Per-class metrics of the tuned random forest on the test split.
print(
    "Performance on the test set:",
    classification_report(y_test, best_RF_model.predict(X_test)),
    sep="\n",
)

Performance on the test set:
              precision    recall  f1-score   support

    Negative       0.35      0.59      0.44       160
     Neutral       0.17      0.54      0.26       208
    Positive       0.94      0.67      0.78      1909

    accuracy                           0.65      2277
   macro avg       0.49      0.60      0.49      2277
weighted avg       0.83      0.65      0.71      2277



### MLP

In [24]:
# MLP pipeline: SVD projection to 100 components -> scaling -> neural network.
# Both stochastic steps (TruncatedSVD and MLPClassifier) get a fixed
# random_state so that repeated runs of the grid search are comparable.
MLP_pipeline = Pipeline([
    ('svd', TruncatedSVD(n_components=100, random_state=42)),  # Dimensionality reduction
    ('scaler', StandardScaler(with_mean=False)),  # Scaling (variance only, no centering)
    ('mlp', MLPClassifier(random_state=42)),  # Classifier
])

In [25]:
# Search space for the 'mlp' step of the pipeline: 3 x 2 x 2 x 3 = 36 combinations.
MLP_parameters = {
    # One hidden layer, three candidate widths.
    'mlp__hidden_layer_sizes': [(width,) for width in (50, 100, 150)],
    # Hidden-layer non-linearity.
    'mlp__activation': ['relu', 'tanh'],
    # Weight optimiser.
    'mlp__solver': ['adam', 'sgd'],
    # Regularization strength.
    'mlp__alpha': [1e-4, 1e-3, 1e-2],
}

In [26]:
# 5-fold cross-validated search over MLP_parameters, ranked by weighted F1
# and run on all available CPU cores.
MLP_grid_search = GridSearchCV(
    estimator=MLP_pipeline,
    param_grid=MLP_parameters,
    scoring='f1_weighted',
    cv=5,
    n_jobs=-1,
)

# Run the search on the training split.
MLP_grid_search.fit(X_train, y_train)



In [27]:
# Best pipeline found by the search, used to predict the validation split.
best_MLP_model = MLP_grid_search.best_estimator_
MLP_y_pred = best_MLP_model.predict(X_valid)

# Print the winning parameters and their mean cross-validated score.
# The search is scored with 'f1_weighted', so best_score_ is a weighted F1,
# not an accuracy.
print("Best parameters: ", MLP_grid_search.best_params_)
print("Best cross-validated weighted F1 score: ", MLP_grid_search.best_score_)

Best parameters:  {'mlp__activation': 'relu', 'mlp__alpha': 0.001, 'mlp__hidden_layer_sizes': (50,), 'mlp__solver': 'sgd'}
Best accuracy score:  0.6524270540913601


In [28]:
# Report on the data the network was fitted on (shows how closely it fits) ...
print(
    "Performance on the training set:",
    classification_report(y_train, best_MLP_model.predict(X_train)),
    sep="\n",
)

# ... and on the validation split, reusing the predictions stored in MLP_y_pred.
print(
    "Performance on the validation set:",
    classification_report(y_valid, MLP_y_pred),
    sep="\n",
)

Performance on the training set:
              precision    recall  f1-score   support

    Negative       0.74      0.65      0.69      2933
     Neutral       0.58      0.64      0.61      3560
    Positive       0.74      0.74      0.74      3560

    accuracy                           0.68     10053
   macro avg       0.69      0.68      0.68     10053
weighted avg       0.68      0.68      0.68     10053

Performance on the validation set:
              precision    recall  f1-score   support

    Negative       0.54      0.68      0.60       170
     Neutral       0.22      0.64      0.32       214
    Positive       0.96      0.73      0.83      1893

    accuracy                           0.72      2277
   macro avg       0.57      0.68      0.58      2277
weighted avg       0.86      0.72      0.76      2277



In [29]:
# Per-class metrics of the tuned MLP pipeline on the test split.
print(
    "Performance on the test set:",
    classification_report(y_test, best_MLP_model.predict(X_test)),
    sep="\n",
)

Performance on the test set:
              precision    recall  f1-score   support

    Negative       0.48      0.63      0.54       160
     Neutral       0.20      0.62      0.30       208
    Positive       0.96      0.71      0.82      1909

    accuracy                           0.70      2277
   macro avg       0.55      0.65      0.55      2277
weighted avg       0.86      0.70      0.75      2277

