In [None]:
from datetime import datetime

import numpy as np
from sklearn.preprocessing import StandardScaler

import dataset_utils as dataset
import torch as t

from ML_cup.pytorch import *
import os

In [None]:
# Pick the compute device. CUDA state is only queried inside the branch where
# CUDA is known to be available: asking for the current CUDA device on a
# CPU-only machine raises instead of returning a value.
# The `t` alias is the torch import made explicitly at the top of the notebook.
if t.cuda.is_available():
    print("GPU is available")
    t.set_default_device(t.device("cuda"))
    print(f"Using device: {t.cuda.current_device()}")
else:
    print("Using device: cpu")
print(f"Available cpu count: {os.cpu_count()}")

# load data
dev_data = dataset.load_dataset("../data/ML-CUP24-TR.csv")
blind_data = dataset.load_dataset("../data/ML-CUP24-TS.csv")
# The scalers returned for the development set are reused below: X_scaler for
# the blind set, y_scaler by the training/evaluation cells.
dev_data, X_scaler, y_scaler = dataset.rescale_dataset(dev_data)
# NOTE(review): the call above unpacks three values while this one binds a
# single name -- confirm what rescale_dataset returns when a scaler is passed.
blind_data = dataset.rescale_dataset(blind_data, X_scaler)

# Base seed for the whole notebook; the seed sweep further down derives its
# candidate seeds from this value.
rand = 1741091302
np.random.seed(rand)

In [None]:
# Split the dev data into train and validation with k-fold cross validation
# NOTE(review): batch_size=0 presumably selects full-batch loading -- confirm
# against torch_k_fold.
train_loaders, val_loaders, dev_loader, test_loader = dataset.torch_k_fold(dataset=dev_data, folds=5, batch_size=0)
# blind test
from ML_cup.dataset_utils import CupDataset

# Use the GPU only when one is present so this cell also runs on CPU-only
# machines, matching the optional-GPU handling in the setup cell.
blind_device = t.device("cuda" if t.cuda.is_available() else "cpu")
# A single batch holding the entire blind set.
blind_loader = DataLoader(CupDataset(blind_data, device=blind_device), batch_size=len(blind_data))

In [None]:
# Adam hyper-parameters shared by every optimizer built in the cells below.
lr = 0.0001
weight_decay = 0.0005


def MLP(input_dim: int = 12, hidden_dim: int = 200, output_dim: int = 3) -> t.nn.Module:
    """Build a fresh, untrained MLP with two tanh hidden layers.

    Args:
        input_dim: Number of input features (default 12).
        hidden_dim: Width of each of the two hidden layers (default 200).
        output_dim: Number of outputs (default 3).

    Returns:
        A ``t.nn.Sequential`` of Linear -> Tanh -> Linear -> Tanh -> Linear.
        Calling ``MLP()`` with no arguments yields the 12-200-200-3 network.
    """
    return t.nn.Sequential(
        t.nn.Linear(input_dim, hidden_dim),
        t.nn.Tanh(),
        t.nn.Linear(hidden_dim, hidden_dim),
        t.nn.Tanh(),
        t.nn.Linear(hidden_dim, output_dim),
    )

# test different seeds in kfold
# Seeds must be integers, so the smaller candidates use floor division; no
# float (and in particular no non-integral value) is handed to the seeding
# calls below.
seeds = [rand // 10, rand // 2, rand, rand * 2, rand * 10]

# One entry per seed: the metric averaged over all folds for that seed.
train_mee_across_seeds = []
val_mee_across_seeds = []
test_mee_across_seeds = []  # not filled anywhere in this cell

for seed in seeds:
    # Per-fold results for the current seed.
    train_mee_seed = []
    val_mee_seed = []
    for train_loader, val_loader in zip(train_loaders, val_loaders):
        # Re-seed torch's RNG right before each model is built, then train a
        # brand-new model and optimizer for this fold.
        t.manual_seed(seed)
        model = MLP()
        optimizer = t.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
        # Only the 3rd and 4th of the six returned values are used here.
        _, _, train_mee, val_mee, _, _ = torch_train(model, train_loader, optimizer, epochs=1000, val_loader=val_loader,
                                                     verbose=False, return_last=True, y_scaler=y_scaler,
                                                     random_seed=seed, patience=5, skip_plot_points=0,
                                                     clip=1.0)
        train_mee_seed.append(train_mee)
        val_mee_seed.append(val_mee)
    # Average over folds so each seed contributes one number per metric.
    avg_train_mee = np.mean(train_mee_seed)
    avg_val_mee = np.mean(val_mee_seed)
    train_mee_across_seeds.append(avg_train_mee)
    val_mee_across_seeds.append(avg_val_mee)

print(f"Train MEE: {train_mee_across_seeds}, Mean: {np.mean(train_mee_across_seeds)}")
print(f"Validation MEE: {val_mee_across_seeds}, Mean: {np.mean(val_mee_across_seeds)}")

In [None]:
# Re-run every fold with the seed chosen from the sweep and keep the
# individual per-fold scores instead of their average.
# NOTE(review): true division makes `seed` a float -- confirm that every
# consumer of `seed` (here and in later cells) accepts a float.
seed = rand / 2

# Settings that are identical for every fold in this cell.
fold_train_kwargs = dict(
    epochs=1000,
    verbose=False,
    return_last=True,
    y_scaler=y_scaler,
    random_seed=seed,
    patience=5,
    skip_plot_points=0,
    clip=1.0,
)

train_mee_seed, val_mee_seed = [], []
for train_loader, val_loader in zip(train_loaders, val_loaders):
    # Fresh RNG state, model and optimizer for each fold.
    t.manual_seed(seed)
    model = MLP()
    optimizer = t.optim.Adam(model.parameters(), weight_decay=weight_decay, lr=lr)
    _, _, train_mee, val_mee, _, _ = torch_train(
        model, train_loader, optimizer, val_loader=val_loader, **fold_train_kwargs
    )
    train_mee_seed.append(train_mee)
    val_mee_seed.append(val_mee)

print(train_mee_seed)
print(val_mee_seed)

In [None]:
# Train once more on a single fold (index 3), this time verbosely, and keep
# the model returned by torch_train for the evaluation cells that follow.
train_loader, val_loader = train_loaders[3], val_loaders[3]

t.manual_seed(seed)
model = MLP()
optimizer = t.optim.Adam(model.parameters(), weight_decay=weight_decay, lr=lr)

final_run = torch_train(
    model,
    train_loader,
    optimizer,
    epochs=1000,
    val_loader=val_loader,
    verbose=True,
    return_last=True,
    y_scaler=y_scaler,
    random_seed=seed,
    patience=5,
    skip_plot_points=100,
    clip=1.0,
)
# The last element replaces `model`; the fifth element is not used.
train_loss, val_loss, train_mee, val_mee, _, model = final_run

print(f"Train MEE: {train_mee}, Validation MEE: {val_mee}")
print(f"Train Loss: {train_loss}, Validation Loss: {val_loss}")


In [None]:
# Evaluate the trained model on the held-out test loader and show the result.
print("Evaluating the model on the test set")
test_output = torch_predict(model, test_loader, y_scaler=y_scaler)
print(test_output)

In [None]:
# Run the trained model over the blind loader.
# NOTE(review): presumably returns predictions already mapped back through
# y_scaler -- confirm against blind_test.
predictions = blind_test(
    model,
    blind_loader,
    seed,
    y_scaler=y_scaler,
)

In [None]:
# Write the raw predictions to disk as comma-separated text, one row per
# sample, using the '%f' float format.
np.savetxt(
    'predictions.csv',
    predictions,
    fmt='%f',
    delimiter=',',
)

In [None]:
import pandas as pd
import numpy as np

# Read back the predictions written by the previous cell and wrap them in a
# DataFrame (columns are labelled 0, 1, 2, ... by default).
predictions = np.loadtxt('predictions.csv', delimiter=',')
df = pd.DataFrame(predictions)

# Put the row position in a leading 'Index' column.
# NOTE(review): this index starts at 0 and to_csv also emits a header row --
# confirm both match the layout expected for the submission file.
df.insert(0, 'Index', df.index)

# Write the final file without pandas' own index column.
df.to_csv('MAG_ML-CUP24-TS.csv', index=False)