**Here we perform the training loops**

In [78]:
from DataFrameManager.dataframeManager import DataFrameManager
import numpy as np
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import xgboost as xgb


DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"
MODEL_NAME = 'roberta'
NUM_CLASSES = 3


In [79]:
dataFrameManage = DataFrameManager()

train_df = dataFrameManage.load_dataframe(filepath="Data/train.csv", encoding=DATASET_ENCODING, preprocess=False)
test_df = dataFrameManage.load_dataframe(filepath="Data/test.csv", encoding=DATASET_ENCODING, preprocess=False)

encode_map = {"NEGATIVE" : 0, "NEUTRAL" : 1, "POSITIVE" : 2}
    

train_labels = train_df["target"].map(encode_map).to_list()
test_labels = test_df["target"].map(encode_map).to_list()

In [80]:
train_embeddings = np.load(f'Data/train_embeddings_{MODEL_NAME}.npy', allow_pickle=True)
test_embeddings = np.load(f'Data/test_embeddings_{MODEL_NAME}.npy', allow_pickle=True)

In [81]:
# # # Start training


# # Define the hyperparameter grid
# param_grid = {
#     'C': [10.0, 50.0],
#     'max_iter': [1000, 1500],
#     'penalty': ['l2']
# }

# # # Create the Logistic Regression classifier
# classifier_lr = LogisticRegression(random_state=42)

# # Perform grid search with cross-validation
# grid_search = GridSearchCV(classifier_lr, param_grid, cv=5)
# grid_search.fit(train_embeddings, train_labels)

# # Get the best hyperparameters and best score
# best_params = grid_search.best_params_
# best_score = grid_search.best_score_

# print("Best Hyperparameters: ", best_params)
# print("Best Score: ", best_score)

# # Fit the model with the best hyperparameters on the entire training data
# best_classifier_lr = LogisticRegression(**best_params)
# best_classifier_lr.fit(train_embeddings, train_labels)

# # Predict on the test set
# predictions_lr = best_classifier_lr.predict(test_embeddings)

# # Calculate the accuracy score
# accuracy_lr = accuracy_score(test_labels, predictions_lr)
# print("Accuracy score for Logistic Regression: ", accuracy_lr)



**XGBoost**

In [None]:

# Define the hyperparameter grid
param_grid = {
    'eta': [0.1, 0.3],
    'max_depth': [3, 6],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
}

# Create the XGBoost classifier
classifier_xgb = xgb.XGBClassifier(random_state=42)

# Perform grid search with cross-validation
grid_search = GridSearchCV(classifier_xgb, param_grid, cv=5)
grid_search.fit(train_embeddings, train_labels)

# Get the best hyperparameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Hyperparameters: ", best_params)
print("Best Score: ", best_score)

# Fit the model with the best hyperparameters on the entire training data
best_classifier_xgb = xgb.XGBClassifier(**best_params, random_state=42)
best_classifier_xgb.fit(train_embeddings, train_labels)

# Predict on the test set
predictions_xgb = best_classifier_xgb.predict(test_embeddings)

# Calculate the accuracy score
accuracy_xgb = accuracy_score(test_labels, predictions_xgb)
print("Accuracy score for XGBoost: ", accuracy_xgb)

**MLP**

In [83]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define the neural network model
class NeuralNetwork(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(NeuralNetwork, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, num_classes)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        x = torch.softmax(self.fc4(x), dim=1)
        return x

# Create the neural network model
model = NeuralNetwork(train_embeddings.shape[1], 3)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Convert the data to PyTorch tensors
train_embeddings_tensor = torch.tensor(train_embeddings, dtype=torch.float32)
train_labels_tensor = torch.tensor(train_labels, dtype=torch.long)
test_embeddings_tensor = torch.tensor(test_embeddings, dtype=torch.float32)

# Train the model
num_epochs = 20
epoch_loss = []
with tqdm(total=num_epochs, desc='Epoch', unit='epoch') as pbar:
    for epoch in range(num_epochs):
        optimizer.zero_grad()
        outputs = model(train_embeddings_tensor)
        loss = criterion(outputs, train_labels_tensor)
        loss.backward()
        optimizer.step()

        epoch_loss.append(loss.item())

        pbar.set_postfix({'Train Loss': epoch_loss[-1]})
        pbar.update()

# Evaluate the model
with torch.no_grad():
    model.eval()
    outputs = model(test_embeddings_tensor)
    _, predicted_labels = torch.max(outputs, 1)


# Convert the tensors to numpy arrays
predicted_labels = predicted_labels.numpy()
true_labels = np.array(test_labels)

# Calculate the accuracy score
accuracy_nn = accuracy_score(true_labels, predicted_labels)
print("Accuracy score for Neural Network: ", accuracy_nn)


Epoch: 100%|██████████| 20/20 [00:00<00:00, 113.45epoch/s, Train Loss=0.963]

tensor([2, 2, 2,  ..., 2, 2, 2])
Accuracy score for Neural Network:  0.5082623935903856



