"FC layers referenced from https://towardsdatascience.com/math-neural-network-from-scratch-in-python-d6da9f29ce65"


In [1]:
import torch
import os
import numpy as np
import pickle
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, Dataset
import torch.optim as optim
from sklearn.metrics import precision_score, recall_score, f1_score, precision_recall_fscore_support
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest, f_classif, chi2
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
import plotly.graph_objs as go
from sklearn.manifold import TSNE
import plotly.io as pio
from sklearn.utils import class_weight
import tqdm as notebook_tqdm
from tqdm import tqdm

In [2]:
# !pip install ipywidgets

In [3]:
class FCLayer(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(FCLayer, self).__init__()
        self.fc = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        x = self.fc(x)
        return x

class ActivationLayer(nn.Module):
    def __init__(self, activation_fn):
        super(ActivationLayer, self).__init__()
        self.activation_fn = activation_fn

    def forward(self, x):
        x = self.activation_fn(x)
        return x

def tanh(x):
    return torch.tanh(x)

def sigmoid(x):
    return torch.sigmoid(x)

class MyNetwork(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(MyNetwork, self).__init__()
        self.fc1 = FCLayer(input_dim, hidden_dim)
        self.activation1 = ActivationLayer(tanh)
        self.fc2 = FCLayer(hidden_dim, output_dim)
        self.activation2 = ActivationLayer(sigmoid)

    def forward(self, x):
        x = self.fc1(x)
        x = self.activation1(x)
        x = self.fc2(x)
        x = self.activation2(x)
        return x

# loss function and its derivative
def mse(y_true, y_pred):
    return np.mean(np.power(y_true - y_pred, 2))

def mse_prime(y_true, y_pred):
    return 2 * (y_pred - y_true) / y_true.size


In [4]:
# Function to balance class distribution using oversampling
def oversample_data(X_train, Y_train):
    # Determine the class with the maximum number of instances
    max_class_count = np.max(np.bincount(Y_train))
    # Generate indices for oversampling each class
    indices_list = [np.where(Y_train == i)[0] for i in range(num_classes)]
    # Oversample minority classes to match the count of the majority class
    for i, indices in enumerate(indices_list):
        if len(indices) < max_class_count:
            oversampled_indices = np.random.choice(indices, size=max_class_count - len(indices), replace=True)
            X_train = np.concatenate((X_train, X_train[oversampled_indices]), axis=0)
            Y_train = np.concatenate((Y_train, Y_train[oversampled_indices]), axis=0)
    return X_train, Y_train


In [5]:
# loading files
checkFile = os.path.isfile("data/dump/train_labels.pkl")

if not checkFile:
    print("Please run the context_encoder notebook to save label file")
    
else:
    file = open('data/dump/train_labels.pkl', 'rb')
    y_train = pickle.load(file)
    y_train = torch.tensor(y_train)
    file.close()
    
file = open('data/dump/label_decoder.pkl', 'rb')
label_decoder = pickle.load(file)

In [6]:
# loading files 2
file_path = 'embed/updated_representation.pkl'

# Load the list from the file using pickle
with open(file_path, 'rb') as file:
    updated_representations = pickle.load(file)

    # Concatenate all the tensors representing individual utterances
    concatenated_tensors = []
    for dialogue_tensor in updated_representations:
        concatenated_tensors.extend(dialogue_tensor)

# Convert the concatenated list of tensors into a single tensor
tensor_utterances = torch.stack(concatenated_tensors)

checkFile = os.path.isfile("data/dump/1st_gat.pkl")
if not checkFile:
    print("Run relation-type encoder before running classifier")
    
else:
    file = open('data/dump/1st_gat.pkl', 'rb')
    cherry_picked_nodes, _ = pickle.load(file)
    file.close()

checkFile = os.path.isfile("data/dump/2nd_gat.pkl")
if not checkFile:
    print("Run relation-type encoder before running classifier")
    
else:
    file = open('data/dump/2nd_gat.pkl', 'rb')
    all_node_feats, _ = pickle.load(file)
    file.close()
    
_ = None
print(cherry_picked_nodes.shape, all_node_feats.shape)


torch.Size([12840, 300]) torch.Size([12840, 300])


EDA

In [7]:
# # Checking the structure of graph
# for n in range(10):
#     tensor_data_np = tensor_utterances[n].detach().numpy()

#     # Plot the data
#     plt.figure(figsize=(10, 5))
#     plt.plot(range(len(tensor_data_np)), tensor_data_np)
#     plt.title('Line Graph of Tensor Data')
#     plt.xlabel('Index')
#     plt.ylabel('Value')
#     plt.show()


In [8]:
# # Normalize the h' (1st GAT)
# data = cherry_picked_nodes.detach().numpy()
# data_normalized = data / np.linalg.norm(data, axis=1, keepdims=True)

# # Compute pairwise cosine similarities
# similarities = cosine_similarity(data_normalized)

# # Print or analyze the similarity matrix
# # print(similarities)
# plt.hist(similarities.flatten(), bins=50, density=True)
# plt.title('Distribution of Cosine Similarities')
# plt.xlabel('Cosine Similarity')
# plt.ylabel('Frequency')
# plt.show()


In [9]:
# # Normalize the h' (2nd GAT)
# data = all_node_feats.detach().numpy()
# data_normalized = data / np.linalg.norm(data, axis=1, keepdims=True)

# # Compute pairwise cosine similarities
# similarities = cosine_similarity(data_normalized)

# # Print or analyze the similarity matrix
# # print(similarities)
# plt.hist(similarities.flatten(), bins=50, density=True)
# plt.title('Distribution of Cosine Similarities')
# plt.xlabel('Cosine Similarity')
# plt.ylabel('Frequency')
# plt.show()


In [10]:
# # Normalize the u' or updated_representations
# data = tensor_utterances.detach().numpy()
# data_normalized = data / np.linalg.norm(data, axis=1, keepdims=True)

# # Compute pairwise cosine similarities
# similarities = cosine_similarity(data_normalized)

# plt.hist(similarities.flatten(), bins=50, density=True)
# plt.title('Distribution of Cosine Similarities')
# plt.xlabel('Cosine Similarity')
# plt.ylabel('Frequency')
# plt.show()


Prep data and EDA


In [11]:
X_train = tensor_utterances
Y_train = y_train

Part 1

In [12]:
# # Define the number of features (k) to select
# k = 100  # Adjust this value as needed

# # Initialize SelectKBest with the desired score function (e.g., f_classif for classification tasks)
# selector = SelectKBest(score_func=f_classif, k=k)

# # Fit SelectKBest on the training data and target variable
# selector.fit(X_train, Y_train)

# # Get the indices of the selected features
# selected_indices = selector.get_support(indices=True)

# # Get the scores of the selected features
# feature_scores = selector.scores_[selected_indices]

# # Display the scores along with their corresponding indices
# # for idx, score in zip(selected_indices, feature_scores):
# #     print(f"Feature index: {idx}, Score: {score}")

# X_train_selected = X_train[:, selected_indices]
# print(X_train_selected.shape)

Selected feature u'

In [13]:
# Apply Min-Max scaling to make the data non-negative
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Initialize SelectKBest with the desired score function (e.g., f_classif for classification tasks)
selector = SelectKBest(score_func=f_classif, k=100)
# Assuming X_train is your feature matrix (12840 instances x 300 dimensions)
# and y_train is your target labels

# Initialize a dictionary to store the indices of top features for each class
top_features_by_class = {}
top_scores = {}
# Calculate the relevance of each feature to each class using chi-squared test
for label in range(7):  # Assuming you have 7 classes
    # Create a binary mask indicating instances belonging to the current class
    mask = (Y_train == label)

    # SelectKBest with chi2 as the scoring function
    selector = SelectKBest(score_func=chi2, k=20)  # Select top 20 features
    selector.fit(X_train_scaled, mask)  # Fit SelectKBest to the data
    # Get the indices of the top 20 features
    top_features_indices = np.argsort(selector.scores_)[-20:]
    scores = selector.scores_[top_features_indices]
    # Store the indices in the dictionary
    top_features_by_class[label] = top_features_indices
    top_scores[label] = scores
    
# Print the top features for each class
for label, indices in top_features_by_class.items():
    print(f"Label {label_decoder[label]}: idx {', '.join(map(str, indices))}")
    print(top_scores[label])

Label anger: idx 136, 172, 268, 15, 115, 114, 92, 79, 5, 30, 11, 39, 63, 48, 116, 54, 46, 29, 126, 73
[3.1583449  3.1658403  3.26870161 3.30502297 3.36981681 3.50353464
 3.61726079 3.66916384 3.80734836 4.42814906 4.815598   4.96967837
 5.15128943 5.49479333 5.76961121 6.13670325 6.37304263 6.70896888
 7.08513609 7.24567632]
Label fear: idx 207, 29, 201, 126, 209, 188, 39, 173, 131, 11, 54, 152, 235, 134, 266, 28, 268, 41, 88, 140
[0.73271283 0.73879169 0.74346923 0.78549551 0.85724577 0.89225648
 0.90074511 0.91654341 0.94887727 0.9653221  0.99217627 0.99945618
 1.04076324 1.09991743 1.14010272 1.15816903 1.24364936 1.27709152
 1.34091566 1.77393706]
Label neutral: idx 114, 131, 42, 3, 18, 46, 156, 2, 32, 79, 39, 11, 134, 148, 202, 57, 116, 54, 140, 248
[1.05383006 1.06984748 1.1120533  1.13951188 1.19099611 1.20606924
 1.21872104 1.21980425 1.28606865 1.30701102 1.32600717 1.36228032
 1.36633109 1.38971465 1.46947126 1.52682526 1.58884734 1.97915876
 2.19278263 2.44663865]
Label sadn

In [14]:
concatenated_features_set = set()
for label, indices in top_features_by_class.items():
    concatenated_features_set.update(indices)

concatenated_features_indices = list(concatenated_features_set)

# concatenated_features_indices = []
# for indices in top_features_by_class.values():
#     concatenated_features_indices.extend(indices)


In [15]:
concatenated_features_indices = np.array(concatenated_features_indices)

# Select the desired features from X_train
selected_features1 = tensor_utterances[:, concatenated_features_indices]
print(selected_features1.shape)

torch.Size([12840, 61])


Selected h'

In [16]:
X_train = all_node_feats

In [17]:
# Apply Min-Max scaling to make the data non-negative
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Initialize SelectKBest with the desired score function (e.g., f_classif for classification tasks)
selector = SelectKBest(score_func=f_classif, k=100)
# Assuming X_train is your feature matrix (12840 instances x 300 dimensions)
# and y_train is your target labels

# Initialize a dictionary to store the indices of top features for each class
top_features_by_class = {}
top_scores = {}
# Calculate the relevance of each feature to each class using chi-squared test
for label in range(7):  # Assuming you have 7 classes
    # Create a binary mask indicating instances belonging to the current class
    mask = (Y_train == label)

    # SelectKBest with chi2 as the scoring function
    selector = SelectKBest(score_func=chi2, k=20)  # Select top 20 features
    selector.fit(X_train_scaled, mask)  # Fit SelectKBest to the data
    # Get the indices of the top 20 features
    top_features_indices = np.argsort(selector.scores_)[-20:]
    scores = selector.scores_[top_features_indices]
    # Store the indices in the dictionary
    top_features_by_class[label] = top_features_indices
    top_scores[label] = scores
    
# Print the top features for each class
for label, indices in top_features_by_class.items():
    print(f"Label {label_decoder[label]}: idx {', '.join(map(str, indices))}")
    print(top_scores[label])

Label anger: idx 211, 201, 57, 222, 246, 173, 36, 117, 174, 273, 282, 210, 212, 215, 140, 45, 284, 113, 11, 31
[2.13455972 2.15251128 2.15499232 2.22266763 2.2649745  2.27897326
 2.29728324 2.31104255 2.33724553 2.55868147 2.57201992 2.71197793
 2.72490873 3.02564703 3.1079814  3.52187924 3.52235189 3.55939197
 3.63854927 4.08898715]
Label fear: idx 117, 71, 52, 170, 11, 211, 122, 164, 284, 51, 292, 282, 215, 67, 31, 193, 19, 174, 222, 113
[0.55934097 0.56656762 0.57127437 0.60608704 0.65224372 0.65668475
 0.66614341 0.66914212 0.68621878 0.69066966 0.70170609 0.72912575
 0.73839261 0.76271454 0.76857636 0.77391966 0.78695672 0.93910621
 1.05312072 1.07342212]
Label neutral: idx 5, 184, 66, 33, 274, 249, 59, 17, 62, 178, 154, 113, 45, 282, 173, 32, 140, 31, 11, 222
[0.82560005 0.83252277 0.83983303 0.85075594 0.8595786  0.93403658
 0.94532112 0.98889631 0.98916552 0.99180731 1.02903966 1.0591744
 1.20437622 1.21701026 1.30554228 1.4180458  1.52794885 1.54705168
 1.73757364 1.89988713]


In [18]:
concatenated_features_set = set()
for label, indices in top_features_by_class.items():
    concatenated_features_set.update(indices)

concatenated_features_indices = list(concatenated_features_set)

In [19]:
concatenated_features_indices = np.array(concatenated_features_indices)

# Select the desired features from X_train
selected_features2 = tensor_utterances[:, concatenated_features_indices]
print(selected_features2.shape)

torch.Size([12840, 71])


In [20]:
selected_features1[0]

tensor([-0.0008, -0.0238,  0.0514,  0.0302,  0.0129,  0.0094,  0.0147,  0.0397,
        -0.1129,  0.0126, -0.0077,  0.0231, -0.0504,  0.0199, -0.0146,  0.0088,
         0.0034,  0.0209, -0.0678, -0.0067, -0.0448,  0.0393,  0.0117,  0.0035,
        -0.0213,  0.0123,  0.0290, -0.0074,  0.0315, -0.0259,  0.0545,  0.0257,
        -0.0372, -0.0385,  0.0625, -0.0148,  0.0632,  0.0496,  0.0337,  0.0311,
        -0.0082, -0.0564,  0.0866,  0.0339,  0.0394, -0.0050,  0.0256,  0.1272,
         0.0153,  0.0405,  0.0374,  0.0274,  0.0806,  0.0091,  0.0531,  0.0391,
         0.0323, -0.0449, -0.0175,  0.0234,  0.0116])

In [21]:
selected_features2[0]

tensor([-0.0008,  0.0302,  0.0129,  0.0397,  0.0126,  0.0098, -0.0077, -0.1129,
         0.0088, -0.0412, -0.0405, -0.0191,  0.0043,  0.0231,  0.0653,  0.0297,
        -0.0202, -0.0448, -0.0076,  0.0035, -0.0020, -0.0112,  0.0239, -0.0224,
         0.0887, -0.0495, -0.0144,  0.0282, -0.0287, -0.0410,  0.0545, -0.0296,
         0.0248,  0.0257,  0.0510,  0.0227, -0.0083,  0.0191,  0.0006, -0.0148,
        -0.0038, -0.0032,  0.0138, -0.0421,  0.0056,  0.0069,  0.0311, -0.0082,
         0.0106,  0.0866,  0.0394, -0.0208, -0.0058,  0.0151, -0.0192, -0.0533,
        -0.0043,  0.0827, -0.0217, -0.0266,  0.0263, -0.0332,  0.0436,  0.0509,
        -0.0261, -0.0588, -0.0175,  0.0046,  0.0074,  0.0086, -0.0266])

In [22]:
# pca = PCA(n_components=2)
# pca_result = pca.fit_transform(selected_features.detach().numpy())

# # Plot the PCA result with color-coded labels
# plt.figure(figsize=(8, 6))
# for label in np.unique(Y_train):
#     indices = Y_train == label
#     plt.scatter(pca_result[indices, 0], pca_result[indices, 1], label=f'{label_decoder[label]}', alpha=0.5)
#     plt.title('PCA Visualization of Selected Utterance Embeddings (Train) with Color-Coded Labels')
#     plt.xlabel('Principal Component 1')
#     plt.ylabel('Principal Component 2')
#     plt.legend()
#     plt.grid(True)
#     plt.show()

3d plottly

In [23]:
# X_train = selected_features
# X_train = X_train / np.linalg.norm(X_train, axis=1, keepdims=True)
# # Perform T-SNE dimensionality reduction
# tsne = TSNE(n_components=3, random_state=42)
# X_tsne = tsne.fit_transform(X_train)

# # Create a Plotly scatter plot
# fig = go.Figure(data=[go.Scatter3d(
#     x=X_tsne[:, 0],
#     y=X_tsne[:, 1],
#     z=X_tsne[:, 2],
#     mode='markers',
#     marker=dict(
#         size=3,
#         color=Y_train,  # Assuming Y_train contains labels for coloring
#         colorscale='Viridis',  # You can choose a different colorscale
#         opacity=0.8
#     )
# )])

# # Update layout
# fig.update_layout(title='3D T-SNE Plot', autosize=False,
#                   width=800, height=800)

# # Show the plot
# fig.show()

In [24]:
# Save the plot as an HTML file
# pio.write_html(fig, '3d_tsne_plot.html')

Selected feature's GAT

current progress (9pm March 6)

In [25]:
# Assuming cnn_bilstm_representations and gat_representations are PyTorch tensors
concatenated_representation = torch.cat((selected_features1, selected_features2), dim=1)

# concatenated_representation1 = torch.cat((tensor_utterances, cherry_picked_nodes), dim=1)
#  concatenated_representation2 = torch.cat((cherry_picked_nodes, all_node_feats), dim=1)
print(concatenated_representation.shape)

torch.Size([12840, 132])


Training and predicting


1st version (only feature engineering and u')

In [26]:
# print(selected_features.shape)
# # Generate sample data
# num_instances = len(selected_features)
# input_dim = selected_features.shape[1]
# num_classes = 7

# X_train = selected_features
# X_train = X_train / np.linalg.norm(X_train, axis=1, keepdims=True)
# Y_train = y_train
# # X_train = torch.randn(num_instances, input_dim)
# # Assuming Y_train is a vector containing the label indices (0 to num_classes-1) for each instance
# # Y_train = torch.randint(0, num_classes, (num_instances,))

# # Calculate class weights to balance the loss function
# class_counts = torch.bincount(Y_train)
# # class_weights = torch.tensor([0.15, 0.03, 0.20, 0.09, 0.15, 0.23, 0.04])

# # Initialize the model
# model = MyNetwork(input_dim, 7, num_classes)
# print(model)
# # Define loss function and optimizer
# criterion = nn.CrossEntropyLoss(weight=None)
# optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

# # Train the model
# num_epochs = 3000
# for epoch in range(num_epochs):
#     # Forward pass
#     outputs = model(X_train)
#     loss = criterion(outputs, Y_train)

#     # Backward and optimize
#     optimizer.zero_grad()
#     loss.backward()
#     optimizer.step()

#     if (epoch+1) % 100 == 0:  # Reduced printing frequency for faster training progress monitoring
#         print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


In [27]:
# # Predict on the training data
# with torch.no_grad():
#     outputs = model(X_train)
#     _, predicted = torch.max(outputs.data, 1)

# # Calculate accuracy
# accuracy = (predicted == Y_train).sum().item() / num_instances
# print(f'Training Accuracy: {accuracy * 100:.2f}%')

# unique_labels, label_counts = np.unique(predicted, return_counts=True)

# # Print the counts for each unique label
# for label, count in zip(unique_labels, label_counts):
#     print(f"Label {label}: {count} occurrences")
# print("------------------------")

# unique_labels, label_counts = np.unique(Y_train, return_counts=True)

# # Print the counts for each unique label
# for label, count in zip(unique_labels, label_counts):
#     print(f"Label {label}: {count} occurrences")

2nd version (feature engineered u', class weighting, data resampling, cost-sensitive learning, regularization)

1. Prep data - normalize and create data loader

In [37]:
# Generate sample data
num_instances = len(concatenated_representation)
input_dim = concatenated_representation.shape[1]
num_classes = 7

# Rescale input features
selected_features = concatenated_representation / np.linalg.norm(concatenated_representation, axis=1, keepdims=True)

# Apply data resampling (oversampling) to balance class distribution
X_train, Y_train = oversample_data(concatenated_representation, y_train)

# Calculate class weights for class weighting
class_counts = np.bincount(y_train)
total_instances = np.sum(class_counts)
# class_weights = torch.tensor([total_instances / (num_classes * count) for count in class_counts], dtype=torch.float32)

# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
Y_train_tensor = torch.tensor(Y_train, dtype=torch.long)
# print(X_train_tensor.shape, Y_train_tensor.shape)
# X_train_tensor = torch.tensor(selected_features)
# Y_train_tensor = torch.tensor(y_train)

unique_labels, label_counts = np.unique(Y_train, return_counts=True)

# Print the counts for each unique label
for label, count in zip(unique_labels, label_counts):
    print(f"Label {label}: {count} occurrences")

print(X_train_tensor.shape, Y_train_tensor.shape)
# Create a TensorDataset
train_dataset = TensorDataset(X_train_tensor, Y_train_tensor)

# Define batch size for DataLoader
batch_size = 8

# Create a PyTorch DataLoader
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Initialize the model
model = MyNetwork(input_dim, 50, num_classes)
print(class_weights)
# Define loss function and optimizer with class weights
criterion = nn.CrossEntropyLoss(weight=None)
optimizer = optim.Adam(model.parameters(), lr=0.0001)

Label 0: 5960 occurrences
Label 1: 5960 occurrences
Label 2: 5960 occurrences
Label 3: 5960 occurrences
Label 4: 5960 occurrences
Label 5: 5960 occurrences
Label 6: 5960 occurrences
torch.Size([41720, 132]) torch.Size([41720])
tensor([1.2229, 5.4269, 0.3078, 2.0939, 1.2311, 0.7934, 5.0392])


2. Training

In [38]:
# Train the model
num_epochs = 100
for epoch in range(num_epochs):
    total_loss = 0.0
    for inputs, labels in tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}'):
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item() * inputs.size(0)
    
    # Print average loss per epoch
    if (epoch + 1) % 100 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss / len(train_dataset):.4f}')

Epoch 1/100: 100%|███████████████████████████████████████████████████████████████| 5215/5215 [00:04<00:00, 1180.05it/s]
Epoch 2/100: 100%|███████████████████████████████████████████████████████████████| 5215/5215 [00:04<00:00, 1154.58it/s]
Epoch 3/100: 100%|███████████████████████████████████████████████████████████████| 5215/5215 [00:04<00:00, 1126.42it/s]
Epoch 4/100: 100%|███████████████████████████████████████████████████████████████| 5215/5215 [00:04<00:00, 1167.55it/s]
Epoch 5/100: 100%|███████████████████████████████████████████████████████████████| 5215/5215 [00:04<00:00, 1139.00it/s]
Epoch 6/100: 100%|███████████████████████████████████████████████████████████████| 5215/5215 [00:04<00:00, 1139.68it/s]
Epoch 7/100: 100%|███████████████████████████████████████████████████████████████| 5215/5215 [00:04<00:00, 1126.26it/s]
Epoch 8/100: 100%|███████████████████████████████████████████████████████████████| 5215/5215 [00:04<00:00, 1121.54it/s]
Epoch 9/100: 100%|██████████████████████

Epoch [100/100], Loss: 1.8030





In [40]:
# Set the model to evaluation mode
model.eval()

# Predict on the training data
with torch.no_grad():
    outputs = model(X_train_tensor)
    _, predicted = torch.max(outputs, 1)

# Convert predicted tensor to numpy array
predicted = predicted.numpy()

# Calculate F1 score per class
f1_per_class = f1_score(Y_train_tensor, predicted, average=None)
f1 = f1_score(Y_train_tensor, predicted, average='macro')

print(f'Training F1 Score: {f1:.4f}')

unique_labels, label_counts = np.unique(predicted, return_counts=True)

# Print F1 score for each class
for i, f1 in enumerate(f1_per_class):
    print(f'F1 Score for Class {label_decoder[i]}: {f1:.4f}')
    
# Print the counts for each unique label
for label, count in zip(unique_labels, label_counts):
    print(f"Label {label_decoder[label]}: {count} occurrences")

Training F1 Score: 0.3103
F1 Score for Class anger: 0.2079
F1 Score for Class fear: 0.3957
F1 Score for Class neutral: 0.1907
F1 Score for Class sadness: 0.3415
F1 Score for Class surprise: 0.3948
F1 Score for Class joy: 0.2340
F1 Score for Class disgust: 0.4074
Label anger: 3103 occurrences
Label fear: 8416 occurrences
Label neutral: 4506 occurrences
Label sadness: 7588 occurrences
Label surprise: 8512 occurrences
Label joy: 3347 occurrences
Label disgust: 6248 occurrences


3rd version is 2nd version + ensembled FC classifier