# CNN for Tag Prediction with ResNet
ResNet architecture originally was invented for time-series-prediction. It is also one of the more efficient architectures when it comes to CNNs. To investigate whether the architecture is applicable to the prediction problem, a smaller architecture is used for testing purposes, ResNet18. This architecture will be used with 1D convolutional layers to predict the new tags occurring in new analysis. To ensure optimal success, the CNN is trained with the analysis data of project hive, which provides the most data points.

In [1]:
import os
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
import numpy as np
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
import optuna
import math

In [2]:
# data import
current_dir = os.getcwd()

# construct path to the project data folder
data_dir = os.path.join(current_dir, '..', '..', '..', 'Data','Sonar_Issues')

# load SonarQube measure data
df = pd.read_csv(os.path.join(data_dir, 'measures+tags.csv'), low_memory=False)
df = df[df['PROJECT_ID'] == 'hive']
df['SQ_ANALYSIS_DATE'] = pd.to_datetime(df['SQ_ANALYSIS_DATE'])

# sort the df so that the dates are ordered from oldest to newest analysis
df = df.sort_values(by='SQ_ANALYSIS_DATE')
df

Unnamed: 0,PROJECT_ID,SQ_ANALYSIS_DATE,CLASSES,FILES,LINES,NCLOC,PACKAGE,STATEMENTS,FUNCTIONS,COMMENT_LINES,...,FUNCTION_COMPLEXITY,COGNITIVE_COMPLEXITY,LINES_TO_COVER,UNCOVERED_LINES,DUPLICATED_LINES,DUPLICATED_BLOCKS,DUPLICATED_FILES,COMMENT_LINES_DENSITY,DUPLICATED_LINES_DENSITY,TAGS
15553,hive,2008-09-02 23:58:59,613.0,358.0,67469.0,48651.0,29,26933.0,4334.0,2958.0,...,2.6,10623.0,31250.0,31250.0,16728,1204,66,5.7,24.8,"error-handling, clumsy, brain-overload, design..."
15552,hive,2008-09-17 00:28:22,613.0,358.0,67754.0,48873.0,29,27078.0,4340.0,2983.0,...,2.6,10691.0,31428.0,31428.0,16790,1208,66,5.8,24.8,"brain-overload, clumsy"
15551,hive,2008-09-17 20:13:00,613.0,358.0,67865.0,48976.0,29,27145.0,4346.0,2985.0,...,2.6,10701.0,31505.0,31505.0,16785,1208,66,5.7,24.7,"convention, design"
15550,hive,2008-09-18 00:09:17,661.0,397.0,71629.0,51241.0,33,28335.0,4538.0,3215.0,...,2.6,11061.0,32889.0,32889.0,17789,1228,74,5.9,24.8,"error-handling, clumsy, brain-overload, design..."
15549,hive,2008-09-18 17:37:59,664.0,399.0,72263.0,51707.0,33,28559.0,4592.0,3235.0,...,2.6,11206.0,33041.0,33041.0,17659,1224,75,5.9,24.4,"error-handling, clumsy, brain-overload, bad-pr..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13702,hive,2015-02-27 21:09:45,8327.0,3789.0,1071783.0,731599.0,364,352969.0,61412.0,75080.0,...,2.3,119218.0,431125.0,431125.0,139347,7774,791,9.3,13.0,"error-handling, clumsy, design, suspicious, pi..."
13701,hive,2015-02-27 21:30:05,8327.0,3789.0,1071783.0,731599.0,364,352969.0,61412.0,75080.0,...,2.3,119218.0,431125.0,431125.0,139347,7774,791,9.3,13.0,pitfall
13700,hive,2015-02-27 23:08:33,8468.0,3872.0,1087272.0,742901.0,387,357917.0,62390.0,76071.0,...,2.3,120954.0,437096.0,437096.0,140709,7913,810,9.3,12.9,"convention, pitfall"
13699,hive,2015-03-02 18:18:35,8477.0,3882.0,1088466.0,743721.0,387,358306.0,62458.0,76112.0,...,2.3,121067.0,437585.0,437585.0,140806,7917,813,9.3,12.9,"error-handling, design, unused, suspicious"


## Data preparation

### Prepare labels
The tags are given as a list of unique tags per analysis. However, to make it assessible for a CNN, the categories need to be one-hot-encoded. <br>
For this, all tags present in the data need to be defined. Afterwards, the multi-level-binarizer is used to transform the list of unique tags for each observation into a one-hot-encoded array.

In [3]:
all_tags = ['convention', 'brain-overload','unused','error-handling','bad-practice','pitfall',
            'clumsy','suspicious','design','antipattern','redundant','confusing','performance','obsolete']

In [4]:
# transform TAGS strings to lists
df.loc[:, 'TAGS'] = df['TAGS'].str.split(',')
# remove whitespaces
df.loc[:, 'TAGS'] = df['TAGS'].apply(lambda x: [item.strip() for item in x])

# save TAGS as raw_labels to be further processed
raw_labels = df['TAGS']
raw_labels

15553    [error-handling, clumsy, brain-overload, desig...
15552                             [brain-overload, clumsy]
15551                                 [convention, design]
15550    [error-handling, clumsy, brain-overload, desig...
15549    [error-handling, clumsy, brain-overload, bad-p...
                               ...                        
13702    [error-handling, clumsy, design, suspicious, p...
13701                                            [pitfall]
13700                                [convention, pitfall]
13699         [error-handling, design, unused, suspicious]
13698       [brain-overload, unused, antipattern, pitfall]
Name: TAGS, Length: 1856, dtype: object

In [5]:
# initialise mlb with all tag categories
mlb = MultiLabelBinarizer(classes=all_tags)
# fit the mlb with the list of lists of raw labels
mlb.fit(raw_labels)

print(f"MLB classes (order of one-hot columns): {mlb.classes_}")
num_classes = len(mlb.classes_)
print(f"Total number of possible labels: {num_classes}")

MLB classes (order of one-hot columns): ['convention' 'brain-overload' 'unused' 'error-handling' 'bad-practice'
 'pitfall' 'clumsy' 'suspicious' 'design' 'antipattern' 'redundant'
 'confusing' 'performance' 'obsolete']
Total number of possible labels: 14


In [6]:
# encode raw labels with one-hot method
one_hot_encoded_labels_array = mlb.transform(raw_labels)
print(f"\nOne-hot encoded labels (NumPy array):\n{one_hot_encoded_labels_array}")


One-hot encoded labels (NumPy array):
[[1 1 1 ... 1 1 1]
 [0 1 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 ...
 [1 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [0 1 1 ... 0 0 0]]


### Split data into timewindows
To accomodate the prediction task, the predictors and according labels need to be split into timewindows, to set up a shift window attention approach.

In [7]:
def scale_predictors(df, label):
    """This function scales numerical predictor variables. The label remains unscaled."""
    columns_to_scale = [col for col in df.select_dtypes(include=['number']) if col != label]
    scaler = StandardScaler()
    df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])
    return df

df_scaled = scale_predictors(df.select_dtypes(include='number'), 'TAGS')
df_scaled

Unnamed: 0,CLASSES,FILES,LINES,NCLOC,PACKAGE,STATEMENTS,FUNCTIONS,COMMENT_LINES,COMPLEXITY,CLASS_COMPLEXITY,FUNCTION_COMPLEXITY,COGNITIVE_COMPLEXITY,LINES_TO_COVER,UNCOVERED_LINES,DUPLICATED_LINES,DUPLICATED_BLOCKS,DUPLICATED_FILES,COMMENT_LINES_DENSITY,DUPLICATED_LINES_DENSITY
15553,-1.611448,-1.753882,-1.636504,-1.644758,-1.867228,-1.632894,-1.687940,-1.955492,-1.686843,0.028857,3.890931,-1.700907,-1.642291,-1.642291,-1.530234,-1.369866,-1.494250,-2.799625,2.711566
15552,-1.611448,-1.753882,-1.635665,-1.643786,-1.867228,-1.631576,-1.687632,-1.954361,-1.685957,0.028857,3.890931,-1.699043,-1.640975,-1.640975,-1.529098,-1.368268,-1.494250,-2.740282,2.711566
15551,-1.611448,-1.753882,-1.635338,-1.643336,-1.867228,-1.630968,-1.687323,-1.954270,-1.685602,0.094013,3.890931,-1.698769,-1.640406,-1.640406,-1.529189,-1.368268,-1.494250,-2.799625,2.680384
15550,-1.593440,-1.718480,-1.624254,-1.633426,-1.831254,-1.620154,-1.677453,-1.943865,-1.676188,-0.362080,3.890931,-1.688900,-1.630172,-1.630172,-1.510786,-1.360281,-1.468165,-2.680939,2.711566
15549,-1.592314,-1.716664,-1.622388,-1.631387,-1.831254,-1.618119,-1.674677,-1.942960,-1.673064,-0.296924,3.890931,-1.684926,-1.629048,-1.629048,-1.513169,-1.361878,-1.464904,-2.680939,2.586839
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13702,1.282620,1.360605,1.320827,1.343395,1.145643,1.329741,1.246249,1.307311,1.277539,-0.492393,-0.656656,1.275940,1.314624,1.314624,0.717393,1.253917,0.869667,-0.663264,-0.967876
13701,1.282620,1.360605,1.320827,1.343395,1.145643,1.329741,1.246249,1.307311,1.277539,-0.492393,-0.656656,1.275940,1.314624,1.314624,0.717393,1.253917,0.869667,-0.663264,-0.967876
13700,1.335519,1.435948,1.366436,1.392846,1.352497,1.374703,1.296525,1.352144,1.323948,-0.557549,-0.656656,1.323528,1.358777,1.358777,0.742359,1.309428,0.931618,-0.663264,-0.999058
13699,1.338896,1.445025,1.369952,1.396434,1.352497,1.378238,1.300021,1.353999,1.326650,-0.557549,-0.656656,1.326626,1.362393,1.362393,0.744137,1.311025,0.941400,-0.663264,-0.999058


In [8]:
total_original_time_points = 1856
n_features = 19
num_unique_labels = 14

print(f"Original df_scaled shape: {df_scaled.shape}")
print(f"Original labels shape: {one_hot_encoded_labels_array.shape}")

# define time window length (30 analysis)
timesteps = 30
# stride defines the overlap between time windows - time windows overlap for half their analysis here
stride = timesteps // 2

# generate sliding windows samples
X_windows_list = []
y_labels_list = []
num_generated_samples = total_original_time_points - timesteps + 1

for i in range(0, num_generated_samples, stride):
    window_data = df_scaled.iloc[i : i + timesteps].values
    X_windows_list.append(window_data)
    label_for_window = one_hot_encoded_labels_array[i + timesteps - 1]
    y_labels_list.append(label_for_window)

# convert resulting lists of numpy array to single pytorch tensors by stacking all generated windows and their labels into one np array
all_X_windows_np = np.array(X_windows_list)
all_y_labels_np = np.array(y_labels_list)

print(f"\nAfter windowing:")
print(f"  all_X_windows_np shape: {all_X_windows_np.shape} (samples, timesteps, features)")
print(f"  all_y_labels_np shape: {all_y_labels_np.shape} (samples, num_unique_labels)")


# convert arrays to pytorch tensors, permute X for Conv1D
all_X_data_tensor = torch.tensor(all_X_windows_np, dtype=torch.float32).permute(0, 2, 1)
all_y_data_tensor = torch.tensor(all_y_labels_np, dtype=torch.float32)

print(f"\nFinal PyTorch tensor shapes (before splitting):")
print(f"  all_X_data_tensor shape: {all_X_data_tensor.shape}")
print(f"  all_y_data_tensor shape: {all_y_data_tensor.shape}")
# check if y data is torch.float32 for BCEWithLogitsLoss
print(f"  all_y_data_tensor dtype: {all_y_data_tensor.dtype}")

Original df_scaled shape: (1856, 19)
Original labels shape: (1856, 14)

After windowing:
  all_X_windows_np shape: (122, 30, 19) (samples, timesteps, features)
  all_y_labels_np shape: (122, 14) (samples, num_unique_labels)

Final PyTorch tensor shapes (before splitting):
  all_X_data_tensor shape: torch.Size([122, 19, 30])
  all_y_data_tensor shape: torch.Size([122, 14])
  all_y_data_tensor dtype: torch.float32


### Train-Validation-Test-Split
For this model, the data is split into 3 different sets, using training and validation set for hyperparameter optimisation. The training and testing set are used to train and evaluate the model with optimised hyperparameters.

In [9]:
# split data into training, validation and testing set
train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1

train_split_idx = int(all_X_data_tensor.shape[0] * train_ratio)
val_split_idx = int(all_X_data_tensor.shape[0] * (train_ratio + val_ratio))

X_train_tensor = all_X_data_tensor[:train_split_idx]
y_train_tensor = all_y_data_tensor[:train_split_idx]

X_val_tensor = all_X_data_tensor[train_split_idx:val_split_idx]
y_val_tensor = all_y_data_tensor[train_split_idx:val_split_idx]

X_test_tensor = all_X_data_tensor[val_split_idx:]
y_test_tensor = all_y_data_tensor[val_split_idx:]

print(f"\nSplit dataset sizes:")
print(f"  Train set X shape: {X_train_tensor.shape}, y shape: {y_train_tensor.shape}")
print(f"  Validation set X shape: {X_val_tensor.shape}, y shape: {y_val_tensor.shape}")
print(f"  Test set X shape: {X_test_tensor.shape}, y shape: {y_test_tensor.shape}")


# creating pytorch dataset objects for dataloader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)


Split dataset sizes:
  Train set X shape: torch.Size([97, 19, 30]), y shape: torch.Size([97, 14])
  Validation set X shape: torch.Size([12, 19, 30]), y shape: torch.Size([12, 14])
  Test set X shape: torch.Size([13, 19, 30]), y shape: torch.Size([13, 14])


## 1D ResNet18 architecture
The architecture follows a ResNet18 architecture ([Diagram of original ResNet18 architecture](https://www.researchgate.net/profile/Poorya-Mohammadinasab/publication/373653509/figure/fig1/AS:11431281186311794@1693861891854/Original-ResNet-18-Architecture.png)). It is adapted to fit a 1D-problem, meaning that each predictor is passed as a 1D-vector. ResNet has the ability to "skip connections": this allows information to bypass some layers, helping the network learn more effectively even when it's very deep. It consists of a convolutional layer with a batch normalisation, ReLu activation and a max pool layer at the beginning. This is followed by four blocks. Each block contains two convolutional layers, again followed by batch normalization and ReLU activation. Once the data has passed through all four building block stages, an average pooling layer summarizes the learned features. This prepares the data for the final step: a linear transformation. This final layer outputs raw logits, which are essentially scores for each possible "tag" or category. These scores are then used to determine the probabilities that a specific tag appears in your analysis, making it suitable for multi-classification tasks where multiple tags can apply.

In [10]:
class BasicBlock1D(nn.Module):
    expansion = 1

    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        super(BasicBlock1D, self).__init__()
        self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm1d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv1d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm1d(out_channels)
        self.downsample = downsample

    def forward(self, x):
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        # this will ensure 'identity' has the same dimensions as 'out' if needed
        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out

class ResNet1D(nn.Module):
    def __init__(self, block, layers, in_channels, num_classes):
        super(ResNet1D, self).__init__()
        self.in_channels = 64

        self.conv1 = nn.Conv1d(in_channels, self.in_channels, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm1d(self.in_channels)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1)

        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=1)

        self.avgpool = nn.AdaptiveAvgPool1d(1)
        # no activation function, outputs raw logits
        self.fc = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, out_channels, num_blocks, stride=1):
        downsample = None
        if stride != 1 or self.in_channels != out_channels * block.expansion:
            downsample = nn.Sequential(
                nn.Conv1d(self.in_channels, out_channels * block.expansion, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm1d(out_channels * block.expansion),
            )
        layers = []
        layers.append(block(self.in_channels, out_channels, stride, downsample))
        self.in_channels = out_channels * block.expansion
        for _ in range(1, num_blocks):
            layers.append(block(self.in_channels, out_channels))
        return nn.Sequential(*layers)

    def forward(self, x):
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.maxpool(out)

        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)

        out = self.avgpool(out)
        out = torch.flatten(out, 1)
        out = self.fc(out)

        return out

def resnet18_1d(in_channels, num_classes):
    """
    ResNet-18 (1D)
    Equivalent to original ResNet-18 for images, but with Conv1d
    layers = [2, 2, 2, 2] means 2 BasicBlocks in each of the 4 stages
    """
    return ResNet1D(BasicBlock1D, [2, 2, 2, 2], in_channels, num_classes)

### Hyperparameter Optimisation and Training
Hyperparameter Optimisation is done using SciKit Learn's Optuna. Optuna tries promising combinations of hyperparameters over 50 different trials.

In [11]:
# setup function for Optuna
def objective(trial):
    # hyperparameters to test in trial
    lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64, 128])
    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"])
    
    # initialise model
    model = resnet18_1d(in_channels=n_features, num_classes=num_unique_labels)
    model.to(device)

    # create dataloarders for training and validation data
    train_loader_optuna = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader_optuna = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # initialise optimisers
    if optimizer_name == "Adam":
        optimizer = optim.Adam(model.parameters(), lr=lr)
    elif optimizer_name == "RMSprop":
        optimizer = optim.RMSprop(model.parameters(), lr=lr)
    else:
        optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)

    criterion = nn.BCEWithLogitsLoss()

    # training loops for trial (5 epochs per trial)
    N_EPOCHS_PER_TRIAL = 5
    best_val_loss_trial = float('inf')

    # run training
    for epoch in range(N_EPOCHS_PER_TRIAL):
        model.train()
        for batch_idx, (data, labels) in enumerate(train_loader_optuna):
            data, labels = data.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(data)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

        # evaluate model performance on validation set
        model.eval()
        running_val_loss = 0.0
        with torch.no_grad():
            for batch_idx, (data, labels) in enumerate(val_loader_optuna):
                data, labels = data.to(device), labels.to(device)
                outputs = model(data)
                loss = criterion(outputs, labels)
                running_val_loss += loss.item() * data.size(0)
        epoch_val_loss = running_val_loss / len(val_loader_optuna.dataset)

        # initialize Optuna's pruning for unpromising trials (saves runtime)
        trial.report(epoch_val_loss, epoch)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

        if epoch_val_loss < best_val_loss_trial:
            best_val_loss_trial = epoch_val_loss

    # return smallest validation loss
    return best_val_loss_trial

In [12]:
# hyperparameter optimisation function
if __name__ == '__main__':
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # create Optuna study that minimises validation loss
    study = optuna.create_study(direction="minimize", study_name="resnet1d_hpo")

    # run optimisation with 50 trials
    print("\nStarting Hyperparameter Optimization...")
    study.optimize(objective, n_trials=50)

    print("\nHyperparameter Optimization Finished.")
    print(f"Number of finished trials: {len(study.trials)}")
    print(f"Best trial parameters: {study.best_trial.params}")
    print(f"Best trial validation loss: {study.best_trial.value:.4f}")

    # extract parameters for the trial with the lowest validation loss
    print("\nRetraining the final model with best hyperparameters...")
    best_params = study.best_trial.params

    final_lr = best_params["lr"]
    final_batch_size = best_params["batch_size"]
    final_optimizer_name = best_params["optimizer"]

    final_model = resnet18_1d(in_channels=n_features, num_classes=num_unique_labels)
    final_model.to(device)

    # load test set as well as the others for the final model with optimised hyperparameters
    final_train_loader = DataLoader(train_dataset, batch_size=final_batch_size, shuffle=True)
    final_val_loader = DataLoader(val_dataset, batch_size=final_batch_size, shuffle=False)
    final_test_loader = DataLoader(test_dataset, batch_size=final_batch_size, shuffle=False)

    # weighting
    training_labels = y_train_tensor.cpu().numpy()

    num_classes = training_labels.shape[1]
    positive_counts = np.sum(training_labels == 1, axis=0)
    negative_counts = np.sum(training_labels == 0, axis=0)

    # calculate pos_weight for each class
    pos_weight = torch.zeros(num_classes)
    epsilon = 1e-6

    for i in range(num_classes):
        if positive_counts[i] == 0:
            print(f"Warning: Class {i} has no positive samples in the training data.")
            pos_weight[i] = torch.tensor(1.0)
        else:
            pos_weight[i] = torch.tensor(negative_counts[i] / (positive_counts[i] + epsilon))

    # add pos_weight to device
    pos_weight = pos_weight.to(device)
    
    print(f"Calculated pos_weight for each class: {pos_weight}")
    
    # intiating Loss Function with pos_weight
    final_criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

    if final_optimizer_name == "Adam":
        final_optimizer = optim.Adam(final_model.parameters(), lr=final_lr)
    elif final_optimizer_name == "RMSprop":
        final_optimizer = optim.RMSprop(final_model.parameters(), lr=final_lr)
    else: # SGD
        final_optimizer = optim.SGD(final_model.parameters(), lr=final_lr, momentum=0.9)

    # train cnn with optimised hyperparameters
    FINAL_NUM_EPOCHS = 50
    final_best_val_loss = float('inf')
    final_epochs_no_improve = 0
    # for early-stopping (after 10 epochs of no improvement)
    final_patience = 10

    # model path for saving the best model
    model_save_dir = os.path.join(current_dir, '..', '..', '..', 'Data', 'Models', 'CodeSmellTags', 'ResNet_1D')
    final_model_save_path = os.path.join(model_save_dir, 'optimised_best_resnet_cnn.pth')

    # run training with optimised hyperparameters
    for epoch in range(FINAL_NUM_EPOCHS):
        final_model.train()
        running_train_loss = 0.0
        for batch_idx, (data, labels) in enumerate(final_train_loader):
            data, labels = data.to(device), labels.to(device)
            final_optimizer.zero_grad()
            outputs = final_model(data)
            loss = final_criterion(outputs, labels)
            loss.backward()
            final_optimizer.step()
            running_train_loss += loss.item() * data.size(0)
        epoch_train_loss = running_train_loss / len(final_train_loader.dataset)

        final_model.eval()
        running_val_loss = 0.0
        with torch.no_grad():
            for batch_idx, (data, labels) in enumerate(final_val_loader):
                data, labels = data.to(device), labels.to(device)
                outputs = final_model(data)
                loss = final_criterion(outputs, labels)
                running_val_loss += loss.item() * data.size(0)
        epoch_val_loss = running_val_loss / len(final_val_loader.dataset)

        print(f"Final Training - Epoch {epoch+1}/{FINAL_NUM_EPOCHS}, Train Loss: {epoch_train_loss:.4f}, Val Loss: {epoch_val_loss:.4f}")

        # save model if validation loss decreases
        if epoch_val_loss < final_best_val_loss:
            final_best_val_loss = epoch_val_loss
            final_epochs_no_improve = 0
            torch.save(final_model.state_dict(), final_model_save_path)
            print(f"  Final model val loss improved. Saving to {final_model_save_path}. Best Loss: {final_best_val_loss:.4f}")
        else:
            final_epochs_no_improve += 1
            if final_epochs_no_improve >= final_patience:
                print(f"  Final training early stopping triggered after {epoch+1} epochs.")
                break

    print("\nFinal model training complete.")

[I 2025-05-30 17:27:49,789] A new study created in memory with name: resnet1d_hpo


Using device: cpu

Starting Hyperparameter Optimization...


[I 2025-05-30 17:27:56,271] Trial 0 finished with value: 0.6766470074653625 and parameters: {'lr': 0.00020652648689075718, 'batch_size': 128, 'optimizer': 'Adam'}. Best is trial 0 with value: 0.6766470074653625.
[I 2025-05-30 17:28:08,073] Trial 1 finished with value: 7.0781683921813965 and parameters: {'lr': 6.068281157818735e-05, 'batch_size': 16, 'optimizer': 'SGD'}. Best is trial 0 with value: 0.6766470074653625.
[I 2025-05-30 17:28:20,458] Trial 2 finished with value: 22.388578414916992 and parameters: {'lr': 0.015375219589336199, 'batch_size': 16, 'optimizer': 'SGD'}. Best is trial 0 with value: 0.6766470074653625.
[I 2025-05-30 17:28:24,402] Trial 3 finished with value: 0.5280751585960388 and parameters: {'lr': 0.0028463669690439535, 'batch_size': 64, 'optimizer': 'Adam'}. Best is trial 3 with value: 0.5280751585960388.
[I 2025-05-30 17:28:26,987] Trial 4 finished with value: 0.6991267204284668 and parameters: {'lr': 0.0014616391228002826, 'batch_size': 128, 'optimizer': 'SGD'}.


Hyperparameter Optimization Finished.
Number of finished trials: 50
Best trial parameters: {'lr': 0.0005383280210998608, 'batch_size': 32, 'optimizer': 'RMSprop'}
Best trial validation loss: 0.4809

Retraining the final model with best hyperparameters...
Calculated pos_weight for each class: tensor([ 1.0638,  1.6216,  2.4643,  1.6216,  2.2333,  3.2174,  3.4091,  2.5926,
         3.2174,  4.7059,  9.7778, 47.5000, 15.1667, 95.9999])
Final Training - Epoch 1/50, Train Loss: 1.5180, Val Loss: 1.5510
  Final model val loss improved. Saving to C:\Users\carol\Dropbox\DataScience\Semester4\MasterProjectSonarQube\Scripts\Model\tagPrediction\..\..\..\Data\Models\CodeSmellTags\ResNet_1D\optimised_best_resnet_cnn.pth. Best Loss: 1.5510
Final Training - Epoch 2/50, Train Loss: 1.3668, Val Loss: 1.5872
Final Training - Epoch 3/50, Train Loss: 1.1896, Val Loss: 1.7864
Final Training - Epoch 4/50, Train Loss: 1.1545, Val Loss: 1.9187
Final Training - Epoch 5/50, Train Loss: 1.0074, Val Loss: 2.2691


### Evaluation

In [13]:
# evaluate model performance on test set
print("\nEvaluating on test set...")
# load the best model saved
final_model.load_state_dict(torch.load(final_model_save_path))
final_model.to(device)
final_model.eval()

test_loss = 0.0
all_preds = []
all_targets = []

with torch.no_grad():
    for batch_idx, (data, labels) in enumerate(final_test_loader):
        data, labels = data.to(device), labels.to(device)
        outputs = final_model(data)
        loss = final_criterion(outputs, labels)
        test_loss += loss.item() * data.size(0)
        probabilities = torch.sigmoid(outputs)
        all_preds.append(probabilities.cpu().numpy())
        all_targets.append(labels.cpu().numpy())

avg_test_loss = test_loss / len(final_test_loader.dataset)
print(f"Test Loss: {avg_test_loss:.4f}")

all_preds = np.vstack(all_preds)
all_targets = np.vstack(all_targets)

# calculate detailed metrics
threshold = 0.5
binary_preds = (all_preds > threshold).astype(int)

print("\nDetailed Test Set Metrics:")
print(f"  Micro F1-score: {f1_score(all_targets, binary_preds, average='micro'):.4f}")
print(f"  Micro Precision: {precision_score(all_targets, binary_preds, average='micro'):.4f}")
print(f"  Micro Recall: {recall_score(all_targets, binary_preds, average='micro'):.4f}")
print(f"  Macro F1-score: {f1_score(all_targets, binary_preds, average='macro'):.4f}")
print(f"  Macro Precision: {precision_score(all_targets, binary_preds, average='macro'):.4f}")
print(f"  Macro Recall: {recall_score(all_targets, binary_preds, average='macro'):.4f}")


Evaluating on test set...
Test Loss: 0.8045

Detailed Test Set Metrics:
  Micro F1-score: 0.3099
  Micro Precision: 0.2821
  Micro Recall: 0.3438
  Macro F1-score: 0.0836
  Macro Precision: 0.0604
  Macro Recall: 0.1429


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Result
CNNs are not suited to model the problem, since there is too little data to properly train, validate and test a CNN. Even though the model architecture fits the issue in theory, the result isn't properly trained enough to give a strong idea about performance.