# Similar to Baseline Learning, we start off by training the model on the debugging dataset, in the same manner as described in the Baseline Learning notebook

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader



# Setting up the datasets, both main and debugging

In [None]:
# Both datasets are read from pickle files addressed by relative path.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
debugging_dataset = pd.read_pickle('debugging_dataset.pkl')
working_dataset = pd.read_pickle('working_dataset.pkl')

# Target columns (ground truth), one per grade label, in this fixed order.
grade_columns = [
    'A+', 'A', 'A-',
    'B+', 'B', 'B-',
    'C+', 'C', 'C-',
    'D+', 'D', 'D-',
    'F', 'W',
]

# Input columns; each one is treated as a categorical feature.
categorical_columns = [
    'Year',
    'Term',
    'Subject',
    'Sched Type',
    'Number',
    'Course Title',
]


FileNotFoundError: [Errno 2] No such file or directory: 'debugging_dataset.pkl'

# Dataloader Definition and initialization

## We will one-hot encode each category first, computing required embedding space for each category.

In [None]:
# Width reserved for each one-hot block: the larger of the two datasets'
# distinct-value counts for that column.
max_categories = {}
for col in categorical_columns:
    debug_count = debugging_dataset[col].nunique()
    work_count = working_dataset[col].nunique()
    max_categories[col] = max(debug_count, work_count)
print("Maximum categories for each column:", max_categories)


The split is 0.70 : 0.15 : 0.15 for train, validation, and test.

In [None]:
# Target proportions of the three-way split.
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15

# train_test_split is two-way, so it is applied twice: first everything that is
# not training data is set aside, then that remainder is divided between
# validation and test.
holdout_fraction = 1 - train_ratio
test_share_of_holdout = test_ratio / (val_ratio + test_ratio)

debug_train, debug_temp = train_test_split(
    debugging_dataset, test_size=holdout_fraction, random_state=42
)
debug_val, debug_test = train_test_split(
    debug_temp, test_size=test_share_of_holdout, random_state=42
)

work_train, work_temp = train_test_split(
    working_dataset, test_size=holdout_fraction, random_state=42
)
work_val, work_test = train_test_split(
    work_temp, test_size=test_share_of_holdout, random_state=42
)

print(f"Debugging Train: {len(debug_train)}, Val: {len(debug_val)}, Test: {len(debug_test)}")
print(f"Working Train: {len(work_train)}, Val: {len(work_val)}, Test: {len(work_test)}")


In [None]:
def one_hot_encode_fixed(dataframe, column, max_categories, categories=None):
    """One-hot encode ``dataframe[column]`` into a fixed-width float matrix.

    Args:
        dataframe: DataFrame holding the column to encode.
        column: Name of the categorical column.
        max_categories: Width of the returned matrix. Columns beyond the
            encoder's own categories are left as zeros.
        categories: Optional sequence of category values fixing the column
            layout. When omitted, the encoder is fitted on ``dataframe`` alone,
            so the category-to-column mapping depends on which values occur in
            that frame and separately encoded splits do not necessarily share
            a layout. Pass the same sequence for every split to keep them
            aligned (scikit-learn expects numeric categories to be sorted).
            Values not listed encode as an all-zero row.

    Returns:
        ``numpy.ndarray`` of shape ``(len(dataframe), max_categories)``.

    Raises:
        ValueError: If the encoder yields more than ``max_categories`` columns.
    """
    encoder = OneHotEncoder(
        categories="auto" if categories is None else [list(categories)],
        sparse_output=False,
        handle_unknown="ignore",
    )
    one_hot = encoder.fit_transform(dataframe[[column]])

    n_encoded = one_hot.shape[1]
    if n_encoded > max_categories:
        # Fail with an explicit message instead of an opaque broadcasting error.
        raise ValueError(
            f"Column {column!r} has {n_encoded} categories, "
            f"which exceeds max_categories={max_categories}"
        )

    padded = np.zeros((len(dataframe), max_categories))
    padded[:, :n_encoded] = one_hot
    return padded


def encode_categorical_features(dataframe, categorical_columns, max_categories,
                                category_values=None):
    """One-hot encode several columns of ``dataframe``.

    Args:
        dataframe: DataFrame holding the columns to encode.
        categorical_columns: Names of the columns to encode.
        max_categories: Mapping of column name -> fixed one-hot width.
        category_values: Optional mapping of column name -> sequence of category
            values, forwarded to ``one_hot_encode_fixed`` so that different
            splits share one column layout. Columns missing from the mapping
            fall back to fitting on ``dataframe``.

    Returns:
        Dict mapping each column name to its padded one-hot matrix.
    """
    category_values = category_values or {}
    return {
        col: one_hot_encode_fixed(
            dataframe, col, max_categories[col], category_values.get(col)
        )
        for col in categorical_columns
    }


In [None]:
def _encode_split(split):
    """Encode one split with the shared column list and fixed widths."""
    return encode_categorical_features(split, categorical_columns, max_categories)


# NOTE(review): every call below fits its own encoder on the split it receives,
# so the category-to-column mapping is not guaranteed to match across splits.
debug_train_encoded = _encode_split(debug_train)
debug_val_encoded = _encode_split(debug_val)
debug_test_encoded = _encode_split(debug_test)

work_train_encoded = _encode_split(work_train)
work_val_encoded = _encode_split(work_val)
work_test_encoded = _encode_split(work_test)

# Report the encoded shape of every feature for both training splits.
for col in categorical_columns:
    debug_shape = debug_train_encoded[col].shape
    work_shape = work_train_encoded[col].shape
    print(f"{col} (Debug Train): {debug_shape}")
    print(f"{col} (Work Train): {work_shape}")

## Build the dataloaders and initialize them, making sure to shuffle the training sets to avoid overfitting

In [None]:
class GradeDataset(Dataset):
    """Dataset pairing one-hot encoded categorical features with grade targets.

    Args:
        encoded_features: Mapping of column name -> 2-D array-like; every value
            is converted to a ``float32`` tensor and kept in
            ``self.encoded_features``.
        grade_columns: Names of the target columns selected from
            ``targets_dataframe``.
        targets_dataframe: DataFrame providing the target columns.
        exclude_columns: Feature names left out of the items returned by
            ``__getitem__``. Defaults to ``('Course Title',)``, which matches
            the previously hard-coded behaviour. A single string is treated as
            one column name.
    """

    def __init__(self, encoded_features, grade_columns, targets_dataframe,
                 exclude_columns=('Course Title',)):
        self.encoded_features = {
            col: torch.tensor(encoded_features[col], dtype=torch.float32)
            for col in encoded_features
        }
        self.targets = torch.tensor(
            targets_dataframe[grade_columns].values, dtype=torch.float32
        )
        if isinstance(exclude_columns, str):
            exclude_columns = (exclude_columns,)
        excluded = set(exclude_columns)
        # Resolved once so __getitem__ does not re-filter on every access.
        self.feature_columns = [
            col for col in self.encoded_features if col not in excluded
        ]

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, idx):
        """Return ``(features, target)`` for row ``idx``.

        ``features`` maps each non-excluded column name to that row's one-hot
        vector; ``target`` is the row of grade values.
        """
        features = {col: self.encoded_features[col][idx] for col in self.feature_columns}
        return features, self.targets[idx]


In [None]:
def _as_dataset(encoded, frame):
    """Wrap one encoded split and its source frame in a GradeDataset."""
    return GradeDataset(encoded, grade_columns, frame)


def _as_loader(dataset, shuffle):
    """Build a batch-size-64 DataLoader; only training loaders are shuffled."""
    return DataLoader(dataset, batch_size=64, shuffle=shuffle)


debug_train_dataset = _as_dataset(debug_train_encoded, debug_train)
debug_val_dataset = _as_dataset(debug_val_encoded, debug_val)
debug_test_dataset = _as_dataset(debug_test_encoded, debug_test)

work_train_dataset = _as_dataset(work_train_encoded, work_train)
work_val_dataset = _as_dataset(work_val_encoded, work_val)
work_test_dataset = _as_dataset(work_test_encoded, work_test)

debug_train_loader = _as_loader(debug_train_dataset, shuffle=True)
debug_val_loader = _as_loader(debug_val_dataset, shuffle=False)
debug_test_loader = _as_loader(debug_test_dataset, shuffle=False)

work_train_loader = _as_loader(work_train_dataset, shuffle=True)
work_val_loader = _as_loader(work_val_dataset, shuffle=False)
work_test_loader = _as_loader(work_test_dataset, shuffle=False)



In [None]:
# Check the features in the dataset: print the first batch of each training
# loader, showing every feature name with its batch shape and the target shape.
for loader in (debug_train_loader, work_train_loader):
    for features, targets in loader:
        feature_shapes = {name: tuple(tensor.shape) for name, tensor in features.items()}
        print(f"Features keys and shapes: {feature_shapes}")
        print(f"Targets shape: {targets.shape}")
        break

## Build 2-layer NN model

In [None]:
import torch.nn.functional as F
import torch.nn as nn
class GradePredictor(nn.Module):
    """Feed-forward network predicting a grade distribution.

    Each input feature goes through its own ``Linear -> ReLU -> Dropout``
    branch; the branch outputs are concatenated and passed through a shared
    two-layer head whose output is normalised with softmax.

    Args:
        feature_sizes: Mapping of feature name -> input width of that feature.
        output_dim: Number of output values (one per grade column).
        dropout_rate: Dropout probability used in every branch and in the head.
        hidden_size: Output width of each per-feature branch.
        fc_hidden_size: Width of the hidden layer in the shared head.

    Raises:
        ValueError: If ``feature_sizes`` is empty.
    """

    def __init__(self, feature_sizes, output_dim, dropout_rate=0.3,
                 hidden_size=64, fc_hidden_size=128):
        super().__init__()
        if not feature_sizes:
            raise ValueError("feature_sizes must contain at least one feature")

        # One independent branch per feature.
        self.feature_layers = nn.ModuleDict({
            col: nn.Sequential(
                nn.Linear(size, hidden_size),
                nn.ReLU(),
                nn.Dropout(dropout_rate),
            )
            for col, size in feature_sizes.items()
        })

        # Shared head over the concatenated branch outputs.
        self.fc = nn.Sequential(
            nn.Linear(len(feature_sizes) * hidden_size, fc_hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(fc_hidden_size, output_dim),
        )

    def forward(self, features):
        """Return softmax-normalised predictions for a batch.

        Args:
            features: Mapping of feature name -> batch tensor; it must contain
                every key of ``feature_sizes``.

        Returns:
            Tensor with ``output_dim`` columns whose rows each sum to 1.
        """
        feature_outputs = [layer(features[col]) for col, layer in self.feature_layers.items()]
        combined = torch.cat(feature_outputs, dim=1)
        logits = self.fc(combined)
        return F.softmax(logits, dim=1)


### just small experiment--train model for each grade separately

In [None]:
class GradePredictor(nn.Module):
    """Single-output variant used for the per-grade experiment.

    Every feature has its own ``Linear(size, 64) -> ReLU -> Dropout(0.3)``
    branch; the concatenated branch outputs feed a head that ends in one raw
    (un-normalised) value.

    NOTE(review): this reuses the name ``GradePredictor`` and so replaces the
    two-argument, softmax-output class defined earlier in the file. Code that
    builds ``GradePredictor(feature_sizes, output_dim)`` needs that earlier
    definition to be the one currently bound.
    """

    def __init__(self, feature_sizes):
        super().__init__()

        branches = {}
        for col, size in feature_sizes.items():
            branches[col] = nn.Sequential(
                nn.Linear(size, 64),
                nn.ReLU(),
                nn.Dropout(0.3),
            )
        self.feature_layers = nn.ModuleDict(branches)

        concat_width = len(feature_sizes) * 64
        self.fc = nn.Sequential(
            nn.Linear(concat_width, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 1),  # a single value for the one grade being modelled
        )

    def forward(self, features):
        """Map a dict of feature tensors to one raw prediction per row."""
        branch_outputs = []
        for col, layer in self.feature_layers.items():
            branch_outputs.append(layer(features[col]))
        return self.fc(torch.cat(branch_outputs, dim=1))


def train_model_for_grade(grade, train_loader, val_loader, feature_sizes, epochs=20, lr=0.0001):
    """Train a fresh single-output ``GradePredictor`` on one grade column.

    Relies on the module-level names ``GradePredictor``, ``grade_columns`` and
    ``device``.

    Args:
        grade: Name of the grade to model; must be an entry of ``grade_columns``.
        train_loader: Iterable of ``(features, targets)`` training batches.
        val_loader: Iterable of ``(features, targets)`` validation batches.
        feature_sizes: Mapping of feature name -> input width, passed to the model.
        epochs: Number of passes over ``train_loader``.
        lr: Learning rate for the Adam optimizer.

    Returns:
        ``(model, train_losses, val_losses)`` where each loss list holds one
        value per epoch: the mean of the per-batch MSE losses.

    Raises:
        ValueError: If ``grade`` is not in ``grade_columns``.
    """
    print(f"\nTraining model for grade: {grade}")

    # Column of the target matrix holding this grade. Looked up once here rather
    # than on every batch, so an unknown grade fails before any training starts.
    grade_idx = grade_columns.index(grade)

    model = GradePredictor(feature_sizes).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    train_losses = []
    val_losses = []

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        for features, targets in train_loader:
            features = {col: features[col].to(device) for col in features}
            targets = targets[:, grade_idx].to(device)
            optimizer.zero_grad()
            # The model emits shape (batch, 1); squeeze to match the 1-D targets.
            outputs = model(features).squeeze(1)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        train_losses.append(train_loss / len(train_loader))

        # Validation: no gradient tracking, dropout disabled via eval().
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for features, targets in val_loader:
                features = {col: features[col].to(device) for col in features}
                targets = targets[:, grade_idx].to(device)
                outputs = model(features).squeeze(1)
                loss = criterion(outputs, targets)
                val_loss += loss.item()

        val_losses.append(val_loss / len(val_loader))

        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss/len(train_loader):.4f}, Val Loss: {val_loss/len(val_loader):.4f}")

    return model, train_losses, val_losses


import matplotlib.pyplot as plt

# This cell sits before the cell that defines `device` and `feature_sizes` and
# before matplotlib is imported, so a top-to-bottom run would stop with a
# NameError. Define them here, with the same values the later cell assigns.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
feature_sizes = {
    col: debug_train_encoded[col].shape[1]
    for col in debug_train_encoded
    if col != 'Course Title'  # the dataset does not return this feature
}

trained_models = {}  # grade -> trained model
grade_results = {}   # grade -> {"train_losses": [...], "val_losses": [...]}

# Train one independent regressor per grade column on the debugging split.
for grade in grade_columns:
    model, train_losses, val_losses = train_model_for_grade(
        grade, debug_train_loader, debug_val_loader, feature_sizes, epochs=20, lr=0.0001
    )
    trained_models[grade] = model
    grade_results[grade] = {
        "train_losses": train_losses,
        "val_losses": val_losses
    }

# Plot training and validation loss for each grade
for grade in grade_columns:
    plt.figure(figsize=(8, 5))
    plt.plot(grade_results[grade]["train_losses"], label="Train Loss")
    plt.plot(grade_results[grade]["val_losses"], label="Validation Loss")
    plt.title(f"Loss for Grade: {grade}")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()
    plt.show()



In [None]:
# Average the final-epoch loss of every per-grade model to get a single
# train figure and a single validation figure for the whole experiment.
overall_train_losses = [
    grade_results[grade]["train_losses"][-1] for grade in grade_columns
]
overall_val_losses = [
    grade_results[grade]["val_losses"][-1] for grade in grade_columns
]

mean_train_loss = np.mean(overall_train_losses)
mean_val_loss = np.mean(overall_val_losses)
print(f"Overall Train Loss: {mean_train_loss:.4f}")
print(f"Overall Validation Loss: {mean_val_loss:.4f}")

## Train the model on debugging set for a dry-run

In [None]:
# Input width of every feature branch, taken from the encoded training split.
# 'Course Title' is left out because the dataset does not return it.
feature_sizes = {
    col: encoded.shape[1]
    for col, encoded in debug_train_encoded.items()
    if col != 'Course Title'
}
print(feature_sizes)

# Use the GPU when one is available.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# One output per grade column.
output_dim = len(grade_columns)
model = GradePredictor(feature_sizes, output_dim)
model = model.to(device)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over ``loader`` and return the mean per-batch loss.

    Trains (with parameter updates) when ``optimizer`` is given; otherwise
    evaluates with gradient tracking disabled.
    """
    training = optimizer is not None
    model.train(training)
    total = 0.0
    with torch.set_grad_enabled(training):
        for features, targets in loader:
            features = {col: features[col].to(device) for col in features}
            targets = targets.to(device)
            if training:
                optimizer.zero_grad()
            loss = criterion(model(features), targets)
            if training:
                loss.backward()
                optimizer.step()
            total += loss.item()
    return total / len(loader)


def train_model(model, train_loader, val_loader, optimizer, criterion, epochs=10, device=None):
    """Train ``model`` and record the per-epoch training and validation loss.

    Args:
        model: Module called with a dict of feature tensors.
        train_loader: Sized iterable of ``(features, targets)`` training batches.
        val_loader: Sized iterable of ``(features, targets)`` validation batches.
        optimizer: Optimizer updating ``model``'s parameters.
        criterion: Loss function called as ``criterion(outputs, targets)``.
        epochs: Number of passes over ``train_loader``.
        device: Device the batches are moved to. Defaults to the device of the
            model's first parameter (CPU if the model has no parameters), so the
            function no longer depends on a module-level ``device``.

    Returns:
        ``(train_losses, val_losses)``: one mean per-batch loss per epoch each.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    train_losses = []
    val_losses = []
    for epoch in range(epochs):
        train_losses.append(_run_epoch(model, train_loader, criterion, device, optimizer))
        val_losses.append(_run_epoch(model, val_loader, criterion, device))

        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_losses[-1]:.4f}, Val Loss: {val_losses[-1]:.4f}")

    return train_losses, val_losses

# Dry run: 20 epochs on the debugging split.
train_losses, val_losses = train_model(
    model,
    debug_train_loader,
    debug_val_loader,
    optimizer,
    criterion,
    epochs=20,
)

# Plot both loss curves against the epoch index.
import matplotlib.pyplot as plt

for curve, curve_label in ((train_losses, 'Train Loss'), (val_losses, 'Validation Loss')):
    plt.plot(curve, label=curve_label)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()



# Now, we build the model on the main dataset and do the hyper parameter settings

# Analyze the effect of batch size with Adam -- [16, 32, 64, 128, 256, 512]
Also, we further dropped the "Course Title" feature, as it kept leading the model to overfit.

In [None]:
# Batch sizes to compare; a fresh model is trained on the working dataset for each.
batch_sizes = [16, 32, 64, 128, 256, 512]
# Maps batch size -> {'train_losses': [...], 'val_losses': [...]}, one value per completed epoch.
batch_results = {}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
for batch_size in batch_sizes:
    print(f"\nTraining with batch size: {batch_size}")

    # Only the training loader is shuffled; validation order stays fixed.
    train_loader = DataLoader(work_train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(work_val_dataset, batch_size=batch_size, shuffle=False)

    # New model, optimizer and scheduler per batch size so runs share no state.
    # NOTE(review): this call needs the two-argument GradePredictor(feature_sizes, output_dim)
    # definition to be the one currently bound to that name.
    model = GradePredictor(feature_sizes, output_dim).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.00001, weight_decay=1e-4)  # Added weight decay
    criterion = nn.MSELoss()
    # Multiplies the learning rate by 0.5 when the validation loss plateaus.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=2, factor=0.5)

    train_losses, val_losses = [], []
    # Early-stopping state: stop after `patience` consecutive epochs without improvement.
    best_val_loss = float('inf')
    patience = 3
    trigger_times = 0

    for epoch in range(10):
        # --- training pass ---
        model.train()
        train_loss = 0.0
        for features, targets in train_loader:
            features = {col: features[col].to(device) for col in features}
            targets = targets.to(device)
            optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        # Stored value is the mean of the per-batch losses.
        train_losses.append(train_loss / len(train_loader))

        # --- validation pass (no gradient tracking) ---
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for features, targets in val_loader:
                features = {col: features[col].to(device) for col in features}
                targets = targets.to(device)
                outputs = model(features)
                loss = criterion(outputs, targets)
                val_loss += loss.item()
        val_losses.append(val_loss / len(val_loader))

        # The scheduler is driven by the averaged validation loss.
        scheduler.step(val_losses[-1])

        print(f"Epoch {epoch+1}, Train Loss: {train_losses[-1]:.4f}, Val Loss: {val_losses[-1]:.4f}")

        # Early stopping compares the summed (not averaged) validation loss; within one
        # run the number of validation batches is constant, so the ordering is the same.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            trigger_times = 0
        else:
            trigger_times += 1
            if trigger_times >= patience:
                print("Early stopping!")
                break

    batch_results[batch_size] = {
        'train_losses': train_losses,
        'val_losses': val_losses
    }

# Plot results: one validation-loss curve per batch size.
# Curves can differ in length because of early stopping.
plt.figure(figsize=(10, 6))
for batch_size, results in batch_results.items():
    plt.plot(results['val_losses'], label=f'Batch size {batch_size}')
plt.title('Validation Loss vs. Epochs for Batch Size')
plt.xlabel('Epochs')
plt.ylabel('Validation Loss')
plt.legend()
plt.show()

## Because the model easily overfits the current data points, a larger batch size (more data points per update) makes the model take more epochs to converge

## Then we investigate effects of different optimizers -- [SGD, Adam, RMSprop] with the batch size 64

In [None]:
import itertools

# Optimizer classes to compare, keyed by the label used in logs and plots.
optimizers = {
    'Adam': torch.optim.Adam,
    'SGD': torch.optim.SGD,
    'RMSprop': torch.optim.RMSprop
}
# Every optimizer is tried with each of these learning rates.
learning_rates = [0.0001, 0.00001, 0.000001]

# Maps (optimizer name, learning rate) -> {'train_losses': [...], 'val_losses': [...]}.
opt_lr_results = {}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for opt_name, opt_class in optimizers.items():
    for lr in learning_rates:
        print(f"\nTraining with optimizer: {opt_name}, Learning rate: {lr}")

        # Batch size is fixed at 64 for this comparison.
        train_loader = DataLoader(work_train_dataset, batch_size=64, shuffle=True)
        val_loader = DataLoader(work_val_dataset, batch_size=64, shuffle=False)

        # New model per combination so runs share no state.
        # NOTE(review): this call needs the two-argument GradePredictor(feature_sizes, output_dim)
        # definition to be the one currently bound to that name.
        model = GradePredictor(feature_sizes, output_dim).to(device)
        # Each optimizer is built with only lr and weight_decay; all other options keep their defaults.
        optimizer = opt_class(model.parameters(), lr=lr, weight_decay=1e-4)
        criterion = nn.MSELoss()
        # Multiplies the learning rate by 0.5 when the validation loss plateaus.
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=2, factor=0.5)

        train_losses, val_losses = [], []
        # Early-stopping state: stop after `patience` consecutive epochs without improvement.
        best_val_loss = float('inf')
        patience = 3
        trigger_times = 0

        # Training
        for epoch in range(10):
            model.train()
            train_loss = 0.0
            for features, targets in train_loader:
                features = {col: features[col].to(device) for col in features}
                targets = targets.to(device)
                optimizer.zero_grad()
                outputs = model(features)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()
                train_loss += loss.item()
            # Stored value is the mean of the per-batch losses.
            train_losses.append(train_loss / len(train_loader))

            # Validation pass (no gradient tracking).
            model.eval()
            val_loss = 0.0
            with torch.no_grad():
                for features, targets in val_loader:
                    features = {col: features[col].to(device) for col in features}
                    targets = targets.to(device)
                    outputs = model(features)
                    loss = criterion(outputs, targets)
                    val_loss += loss.item()
            val_losses.append(val_loss / len(val_loader))

            # The scheduler is driven by the averaged validation loss.
            scheduler.step(val_losses[-1])

            print(f"Epoch {epoch+1}, Train Loss: {train_losses[-1]:.4f}, Val Loss: {val_losses[-1]:.4f}")

            # Early stopping compares the summed validation loss; the validation loader
            # is the same size for every epoch of a run, so the ordering is unaffected.
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                trigger_times = 0
            else:
                trigger_times += 1
                if trigger_times >= patience:
                    print("Early stopping!")
                    break

        opt_lr_results[(opt_name, lr)] = {
            'train_losses': train_losses,
            'val_losses': val_losses
        }

# Plot results: one validation-loss curve per (optimizer, learning rate) pair.
plt.figure(figsize=(12, 8))
for (opt_name, lr), results in opt_lr_results.items():
    label = f"{opt_name}, LR={lr}"
    plt.plot(results['val_losses'], label=label)
plt.title('Validation Loss vs. Epochs for Optimizer and Learning Rate Combinations')
plt.xlabel('Epochs')
plt.ylabel('Validation Loss')
plt.legend()
plt.show()




## With the current dataset, Adam appears to be the most effective optimizer for finding the best parameters.

## Now, we are doing more extensive hyper-parameter searching with dropout rates, weight decays, patience number for learning schedule, and hidden size for the model

In [None]:
import itertools

# Define hyperparameter ranges
dropout_rates = [0.2, 0.3, 0.5]
weight_decays = [0, 1e-4, 1e-3]
patiences = [2, 3, 5]
hidden_sizes = [64, 128, 256]  # Number of neurons in the hidden layers

# Maps (dropout, weight_decay, patience, hidden_size) ->
# {'train_losses': [...], 'val_losses': [...]}, one value per completed epoch.
hyperparam_results = {}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Defined once, outside the search loop: the class itself does not depend on the
# loop variables, only its constructor arguments do.
class ModifiedGradePredictor(nn.Module):
    """GradePredictor variant with configurable dropout and hidden width.

    ``hidden_size`` sets the width of every per-feature branch and of the
    hidden layer in the shared head; the output is softmax-normalised.
    """

    def __init__(self, feature_sizes, output_dim, dropout_rate, hidden_size):
        super().__init__()
        self.feature_layers = nn.ModuleDict({
            col: nn.Sequential(
                nn.Linear(size, hidden_size),
                nn.ReLU(),
                nn.Dropout(dropout_rate)
            )
            for col, size in feature_sizes.items()
        })
        self.fc = nn.Sequential(
            nn.Linear(len(feature_sizes) * hidden_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_size, output_dim)
        )

    def forward(self, features):
        feature_outputs = [layer(features[col]) for col, layer in self.feature_layers.items()]
        combined = torch.cat(feature_outputs, dim=1)
        return F.softmax(self.fc(combined), dim=1)


# Iterate over all combinations of hyperparameters
for dropout, weight_decay, patience, hidden_size in itertools.product(dropout_rates, weight_decays, patiences, hidden_sizes):
    print(f"\nTraining with Dropout={dropout}, Weight Decay={weight_decay}, Patience={patience}, Hidden Size={hidden_size}")

    train_loader = DataLoader(work_train_dataset, batch_size=64, shuffle=True)
    val_loader = DataLoader(work_val_dataset, batch_size=64, shuffle=False)

    # New model, optimizer and scheduler per combination so runs share no state.
    model = ModifiedGradePredictor(feature_sizes, output_dim, dropout, hidden_size).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.00001, weight_decay=weight_decay)
    criterion = nn.MSELoss()
    # NOTE(review): the same `patience` value drives both this scheduler and the
    # early-stopping check below.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=patience, factor=0.5)

    train_losses, val_losses = [], []
    best_val_loss = float('inf')
    trigger_times = 0

    for epoch in range(10):
        # Training pass.
        model.train()
        train_loss = 0.0
        for features, targets in train_loader:
            features = {col: features[col].to(device) for col in features}
            targets = targets.to(device)
            optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_losses.append(train_loss / len(train_loader))

        # Validation pass (no gradient tracking).
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for features, targets in val_loader:
                features = {col: features[col].to(device) for col in features}
                targets = targets.to(device)
                outputs = model(features)
                loss = criterion(outputs, targets)
                val_loss += loss.item()
        val_losses.append(val_loss / len(val_loader))

        scheduler.step(val_losses[-1])

        print(f"Epoch {epoch+1}, Train Loss: {train_losses[-1]:.4f}, Val Loss: {val_losses[-1]:.4f}")

        # Early stopping after `patience` consecutive epochs without improvement.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            trigger_times = 0
        else:
            trigger_times += 1
            if trigger_times >= patience:
                print("Early stopping!")
                break

    # Save results for the current hyperparameter combination
    hyperparam_results[(dropout, weight_decay, patience, hidden_size)] = {
        'train_losses': train_losses,
        'val_losses': val_losses
    }

# Plot results: one validation-loss curve per hyperparameter combination.
plt.figure(figsize=(12, 8))
for (dropout, weight_decay, patience, hidden_size), results in hyperparam_results.items():
    label = f"Dropout={dropout}, WD={weight_decay}, Pat={patience}, H={hidden_size}"
    plt.plot(results['val_losses'], label=label)
plt.title('Validation Loss vs. Epochs for Hyperparameter Combinations')
plt.xlabel('Epochs')
plt.ylabel('Validation Loss')
plt.legend()
plt.show()



In [None]:
# Pick the combination whose best (minimum) per-epoch validation loss is the
# lowest; when several combinations tie, the first one encountered is kept.
best_hyperparams = None
lowest_val_loss = float('inf')

for params, results in hyperparam_results.items():
    min_val_loss = min(results['val_losses'])
    if not min_val_loss < lowest_val_loss:
        continue
    lowest_val_loss, best_hyperparams = min_val_loss, params

# The key tuple is ordered (dropout, weight decay, patience, hidden size).
(best_dropout,
 best_weight_decay,
 best_patience,
 best_hidden_size) = best_hyperparams
print(f"Best Hyperparameters: Dropout={best_dropout}, Weight Decay={best_weight_decay}, "
      f"Patience={best_patience}, Hidden Size={best_hidden_size}")
print(f"Lowest Validation Loss: {lowest_val_loss:.4f}")


## The best model found through the parameter search uses dropout=0.5, weight decay=0, patience=5, and hidden size=256.

In [None]:
print("\nRe-training the model with the best hyperparameters...")

# Loaders over the working dataset for the final run; only training is shuffled.
train_loader = DataLoader(
    work_train_dataset,
    batch_size=64,
    shuffle=True,
)
val_loader = DataLoader(
    work_val_dataset,
    batch_size=64,
    shuffle=False,
)

class GradePredictor(nn.Module):
    """Final grade-distribution model with configurable dropout and width.

    Each feature goes through its own ``Linear -> ReLU -> Dropout`` branch of
    width ``hidden_size``; the concatenated branch outputs pass through a head
    whose hidden layer is also ``hidden_size`` wide, the same layout as
    ``ModifiedGradePredictor`` used during the hyperparameter search, so the
    retrained model matches the architecture that was selected.

    Args:
        feature_sizes: Mapping of feature name -> input width of that feature.
        output_dim: Number of output values (one per grade column).
        dropout_rate: Dropout probability used in every branch and in the head.
        hidden_size: Width of each branch and of the head's hidden layer.
    """

    def __init__(self, feature_sizes, output_dim, dropout_rate, hidden_size):
        super().__init__()
        self.feature_layers = nn.ModuleDict({
            col: nn.Sequential(
                nn.Linear(size, hidden_size),
                nn.ReLU(),
                nn.Dropout(dropout_rate)
            )
            for col, size in feature_sizes.items()
        })
        self.fc = nn.Sequential(
            nn.Linear(len(feature_sizes) * hidden_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_size, output_dim)
        )

    def forward(self, features):
        """Return softmax-normalised predictions for a batch of feature dicts."""
        # Process each feature independently
        feature_outputs = [layer(features[col]) for col, layer in self.feature_layers.items()]

        # Concatenate all feature outputs
        combined = torch.cat(feature_outputs, dim=1)
        logits = self.fc(combined)

        # Normalise each row so the predicted values sum to 1
        return F.softmax(logits, dim=1)

# Build the final model from the hyperparameters selected by the search.
best_model = GradePredictor(
    feature_sizes=feature_sizes,
    output_dim=output_dim,
    dropout_rate=best_dropout,
    hidden_size=best_hidden_size
).to(device)

# `criterion` is reused later by the test-set evaluation.
optimizer = torch.optim.Adam(best_model.parameters(), lr=0.00001, weight_decay=best_weight_decay)
criterion = nn.MSELoss()
# Multiplies the learning rate by 0.5 when the validation loss plateaus.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=best_patience, factor=0.5)

train_losses, val_losses = [], []
# NOTE(review): best_val_loss is initialised but never updated; this run has no early stopping.
best_val_loss = float('inf')

# Always runs the full 10 epochs.
for epoch in range(10):
    # Training pass.
    best_model.train()
    train_loss = 0.0
    for features, targets in train_loader:
        features = {col: features[col].to(device) for col in features}
        targets = targets.to(device)
        optimizer.zero_grad()
        outputs = best_model(features)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    # Stored value is the mean of the per-batch losses.
    train_losses.append(train_loss / len(train_loader))

    # Validation pass (no gradient tracking).
    best_model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for features, targets in val_loader:
            features = {col: features[col].to(device) for col in features}
            targets = targets.to(device)
            outputs = best_model(features)
            loss = criterion(outputs, targets)
            val_loss += loss.item()
    val_losses.append(val_loss / len(val_loader))

    scheduler.step(val_losses[-1])

    print(f"Epoch {epoch+1}, Train Loss: {train_losses[-1]:.4f}, Val Loss: {val_losses[-1]:.4f}")

# Reports the last epoch's validation loss, which is not necessarily the lowest one.
print(f"\nFinal Validation Loss with Best Hyperparameters: {val_losses[-1]:.4f}")


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

test_loader = DataLoader(work_test_dataset, batch_size=64, shuffle=False)

# Evaluate the retrained model on the held-out test split, collecting every
# prediction and target so the metrics can be computed over the whole split.
best_model.eval()
test_loss = 0.0
all_predictions = []
all_targets = []

with torch.no_grad():
    for features, targets in test_loader:
        features = {col: features[col].to(device) for col in features}
        targets = targets.to(device)
        outputs = best_model(features)
        loss = criterion(outputs, targets)
        test_loss += loss.item()
        all_predictions.append(outputs.cpu().numpy())
        all_targets.append(targets.cpu().numpy())

# Mean of the per-batch losses.
test_loss = test_loss / len(test_loader)
print(f"\nFinal Test Loss: {test_loss:.4f}")

all_predictions = np.vstack(all_predictions)
all_targets = np.vstack(all_targets)

mae = mean_absolute_error(all_targets, all_predictions)
# `squared=False` is deprecated and removed in recent scikit-learn releases, so
# the root is taken explicitly. For multi-output input it produced the RMSE of
# each output column averaged uniformly; this reproduces that value.
per_output_mse = mean_squared_error(all_targets, all_predictions, multioutput="raw_values")
rmse = float(np.mean(np.sqrt(per_output_mse)))

print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

# Per-grade metrics, in the order of grade_columns.
category_mae = []
category_rmse = []

for i, grade in enumerate(grade_columns):
    mae_category = mean_absolute_error(all_targets[:, i], all_predictions[:, i])
    rmse_category = float(np.sqrt(mean_squared_error(all_targets[:, i], all_predictions[:, i])))
    category_mae.append(mae_category)
    category_rmse.append(rmse_category)
    print(f"{grade} - MAE: {mae_category:.4f}, RMSE: {rmse_category:.4f}")


# Conclusion

Regarding the batch size, we saw that convergence is faster with smaller batch sizes, which suggests that the dataset is easy to predict.

Also, through the investigation with different optimizer, we saw the effectiveness of Adam optimizer, which seems superior to other optimizers, including SGD and RMSprop in finding the optimal parameters.

Lastly, with the extensive hyperparameter search, we found the best model with dropout=0.5, weight decay=0, patience=5, and hidden size=256. Its best performance was 0.0523 in MAE and 0.0685 in RMSE. Compared with the previous baseline (0.0518 and 0.0910 for MAE and RMSE, respectively), the MAE is slightly worse, but the RMSE improves substantially, which indicates that the model is much better at avoiding outlier predictions.