In [None]:
""" Import Package """
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.decomposition import PCA
import os
import random
from tqdm import tqdm
import wandb
from sklearn.preprocessing import PolynomialFeatures
import time
import copy
from sklearn.preprocessing import StandardScaler
from torch.optim.lr_scheduler import _LRScheduler
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import combinations

""" Global variables """
# Root folder holding the competition CSV files; every path below is derived from it.
DATA = "./data"

# Prediction output produced at the end of the notebook.
OUTPUT_PREDICTION = f"{DATA}/neural_network_predictions.csv"

# Input files: labelled training rows and unlabelled test rows.
TEST_DATA_PATH = f"{DATA}/test.csv"
TRAIN_DATA_PATH = f"{DATA}/training.csv"

First, we want to find the relationships among the features in the dataset to help us decide which combinations of features to use.

In [None]:
""" Load dataset """
class SearchEngineDataLoader(Dataset):
    """Dataset that serves rows of a CSV file as float32 tensors.

    Despite its name this is a ``torch.utils.data.Dataset``, not a DataLoader.

    Attributes:
        data: The full DataFrame read from ``file_path`` (all columns kept).
        features: 2-D array with the selected feature columns, one row per sample.
        labels: Values of the ``'relevance'`` column when ``train`` is true,
            otherwise ``None``.
    """

    def __init__(self, file_path: str, train: bool, features: list):
        """Read the CSV and extract the feature matrix (and labels if training).

        Args:
            file_path: Path of the CSV file to read.
            train: When true the ``'relevance'`` column is read as labels.
            features: Names of the columns to use as model inputs.
        """
        self.data = pd.read_csv(file_path)
        # Extract the feature matrix exactly once.
        self.features = self.data[features].values
        self.labels = self.data['relevance'].values if train else None

    def __len__(self):
        """Return the number of samples (rows of the feature matrix)."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for labelled data, else just ``features``.

        The label is returned with shape ``(1,)`` because of the ``unsqueeze(0)``.
        """
        features = torch.tensor(self.features[idx], dtype=torch.float32)
        if self.labels is None:
            return features
        label = torch.tensor(self.labels[idx], dtype=torch.float32).unsqueeze(0)
        return features, label


def get_dataset(train_file_path: str,
                test_file_path: str,
                features: list):
    """Build the labelled training dataset and the unlabelled test dataset.

    Both datasets select the same ``features`` columns; only the training one
    is created with ``train=True``.

    Returns:
        ``(train_dataset, test_dataset)``.
    """
    labelled = SearchEngineDataLoader(file_path=train_file_path,
                                      features=features,
                                      train=True)
    unlabelled = SearchEngineDataLoader(file_path=test_file_path,
                                        features=features,
                                        train=False)
    return labelled, unlabelled


def get_dataloader(train_dataset: SearchEngineDataLoader,
                   test_dataset: SearchEngineDataLoader,
                   batch_size: int,
                   val_proportion: float = 0,
                   pin_memory: bool = True,
                   shuffle: bool = True,
                   seed: int = 1) -> "tuple[DataLoader, DataLoader | None, DataLoader, int, int]":
    """Wrap the datasets in DataLoaders, optionally holding out a validation split.

    Args:
        train_dataset: Training dataset; its ``features`` attribute must be 2-D
            because the input width is read from ``features.shape[1]``.
        test_dataset: Dataset used for the test loader.
        batch_size: Batch size shared by all loaders.
        val_proportion: Fraction of ``train_dataset`` held out for validation.
            Values ``<= 0`` disable the split.
        pin_memory: Forwarded to every ``DataLoader``.
        shuffle: Whether the *training* loader shuffles; validation and test
            loaders never shuffle.
        seed: Seed of the generator that drives the train/validation split.

    Returns:
        ``(train_loader, val_loader, test_loader, in_channels, num_classes)``.
        ``val_loader`` is ``None`` when no validation split was requested and
        ``num_classes`` is always 1 (single-logit binary classification).

    Raises:
        ValueError: If ``val_proportion`` is 1 or more, which would leave no
            training samples.
    """
    if val_proportion >= 1:
        raise ValueError(
            f"val_proportion must be smaller than 1, got {val_proportion}")

    # Read the input width before the dataset is possibly replaced by a Subset,
    # which does not expose ``features``.
    in_channels = train_dataset.features.shape[1]
    num_classes = 1  # Assuming binary classification

    val_dataset = None
    if val_proportion > 0:
        val_size = int(len(train_dataset) * val_proportion)
        train_size = len(train_dataset) - val_size
        # A dedicated generator makes the split reproducible for a given seed.
        generator = torch.Generator().manual_seed(seed)
        train_dataset, val_dataset = random_split(train_dataset,
                                                  [train_size, val_size],
                                                  generator=generator)

    def _build_loader(dataset, do_shuffle: bool) -> DataLoader:
        """Create a loader with the settings common to all three splits."""
        return DataLoader(dataset,
                          batch_size=batch_size,
                          shuffle=do_shuffle,
                          num_workers=0,
                          pin_memory=pin_memory)

    train_loader = _build_loader(train_dataset, shuffle)
    val_loader = _build_loader(val_dataset, False) if val_dataset is not None else None
    test_loader = _build_loader(test_dataset, False)

    return train_loader, val_loader, test_loader, in_channels, num_classes


# Columns used as model inputs.
features = [
    'query_length',
    'is_homepage',
    'sig1',
    'sig2',
    'sig3',
    'sig4',
    'sig6',
    'sig7',
    'sig8',
]
# Mini-batch size used for every DataLoader built from these datasets.
batch_size = 512

# Read both CSV files; only the training dataset carries labels.
train_dataset, test_dataset = get_dataset(train_file_path=TRAIN_DATA_PATH,
                                          test_file_path=TEST_DATA_PATH,
                                          features=features)

In [None]:
# Standardise every feature column. The scaler is fitted on the training
# features only and the same fitted statistics are then applied to the test
# features.
# NOTE(review): this runs before the train/validation split is made, so the
# rows later held out for validation also contribute to the fitted statistics.
scaler = StandardScaler().fit(train_dataset.features)
train_dataset.features = scaler.transform(train_dataset.features)
test_dataset.features = scaler.transform(test_dataset.features)

In [None]:
""" Get Dataloader """
# Hold out 20% of the training rows for validation; the split is seeded so it
# is the same on every run.
(train_loader,
 val_loader,
 test_loader,
 in_channels,
 num_class) = get_dataloader(
    train_dataset=train_dataset,
    test_dataset=test_dataset,
    batch_size=batch_size,
    val_proportion=0.2,
    pin_memory=True,
    shuffle=True,
    seed=1,
)
# `train_dataset` is still the full dataset here, so the sample count printed
# below includes the rows held out for validation.
print(f"Sample numbers: {train_dataset.features.shape[0]}")
print(f"Feature numbers: {train_dataset.features.shape[1]}")

In [None]:
""" Modeling using Neural Network """
def torch_set_random_seed(seed: int = 1) -> None:
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Value passed to every seeding function and stored (as a string)
            in the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Same seed for every random number source the notebook relies on.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Disable cuDNN autotuning and ask for deterministic kernels.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True


def train(model: nn.Module, 
          train_loader: DataLoader, 
          loss_function: nn.Module,
          optimizer: optim.Optimizer,
          epoch: int,
          device: str) -> float:
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: Network to optimise; switched to training mode.
        train_loader: Yields ``(features, labels)`` batches.
        loss_function: Called as ``loss_function(outputs, labels)``.
        optimizer: Optimiser stepped once per batch.
        epoch: Accepted for the caller's convenience; not used here.
        device: Device the batches are moved to before the forward pass.

    Returns:
        The mean of the per-batch losses (sum divided by the number of batches).
    """
    model.train()
    running_loss = 0.0
    for batch_inputs, batch_targets in train_loader:
        batch_inputs, batch_targets = batch_inputs.to(device), batch_targets.to(device)

        # Standard step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        batch_loss = loss_function(model(batch_inputs), batch_targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(train_loader)


@torch.no_grad()
def evaluate(model: nn.Module,
             eval_loader: DataLoader, 
             loss_function: nn.Module, 
             threshold: float,
             device: str) -> tuple[float, float]:
    """Score ``model`` on ``eval_loader`` without tracking gradients.

    Args:
        model: Network to score; switched to evaluation mode.
        eval_loader: Yields ``(features, labels)`` batches.
        loss_function: Called as ``loss_function(outputs, labels)``.
        threshold: A sample is predicted positive when the sigmoid of its
            output is strictly greater than this value.
        device: Device the batches are moved to before the forward pass.

    Returns:
        ``(accuracy, average_loss)``: the fraction of predictions equal to the
        labels, and the mean of the per-batch losses.
    """
    model.eval()
    n_correct = 0.0
    n_seen = 0
    loss_sum = 0.0

    # Gradient tracking is already disabled by the decorator.
    for batch_inputs, batch_targets in eval_loader:
        batch_inputs, batch_targets = batch_inputs.to(device), batch_targets.to(device)

        logits = model(batch_inputs)
        loss_sum += loss_function(logits, batch_targets).item()

        hard_predictions = torch.sigmoid(logits).gt(threshold).float()
        n_correct += hard_predictions.eq(batch_targets).sum().item()
        n_seen += batch_targets.size(0)

    return n_correct / n_seen, loss_sum / len(eval_loader)


""" Train model on dataloader """
# Fixed settings used by the hyperparameter search that follows.
min_lr = 1e-4       # floor of the cosine learning-rate schedule
random_seed = 1     # seed applied before every training run
split_seed = 1

# Pick the compute device: CUDA first, then Apple MPS, otherwise the CPU.
device = torch.device(
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Running on device: {device}")

# Folder that receives the saved checkpoint.
output_dir = "output_models"

# Running record of the best validation accuracy and the settings that gave it.
overall_best_acc = 0
optimal_weight_decay = optimal_initial_lr = optimal_total_epoch = 0

# Model constructors come from the project-local `models` package. They are
# imported once here instead of on every iteration of the search loops.
from models.resnet import resnet18, resnet34, resnet50, resnet101, resnet152
from models.vgg import vgg11, vgg13, vgg16, vgg19

# Grid search over weight decay, initial learning rate and epoch budget.
for weight_decay in [1e-2, 1e-3, 1e-4]:
    for initial_lr in [1e-1, 5e-2, 1e-2]:
        for total_epoch in [10, 20]:
            # Best validation accuracy reached by this hyperparameter combination.
            best_acc = 0.0
            # NOTE(review): split_seed only appears in the log line below; the
            # train/validation split itself was fixed when the loaders were built.
            for split_seed in [1]:
                # Re-seed so every combination starts from the same RNG state.
                torch_set_random_seed(random_seed)
                hyperparams_config = {
                    "epoch": total_epoch,
                    "initial_lr": initial_lr,
                    "min_lr": min_lr,
                    # Recorded so runs that differ only in weight decay can be
                    # told apart in the experiment tracker.
                    "weight_decay": weight_decay,
                    "random_seed": random_seed
                }
                # One timestamp for both name and id, so they cannot disagree
                # when the clock ticks between two separate calls.
                run_stamp = str(int(time.time()))
                wandb.init(
                    project=f"STATS-Project-resnet18-final",
                    name=run_stamp,
                    id=run_stamp,
                    config=hyperparams_config,
                    mode='online'
                )

                try:
                    model = resnet50(in_channels, num_class).to(device)

                    loss_function = nn.BCEWithLogitsLoss()
                    optimizer = optim.SGD(model.parameters(),
                                          lr=initial_lr,
                                          momentum=0.9,
                                          weight_decay=weight_decay)
                    # Cosine decay from `initial_lr` down to `min_lr` over the run.
                    lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                                        T_max=total_epoch,
                                                                        eta_min=min_lr,
                                                                        last_epoch=-1)

                    # Best validation accuracy seen so far in this run.
                    cur_best_acc = 0
                    with tqdm(total=total_epoch, desc=f'Training', unit='epoch') as pbar:
                        for epoch in range(1, total_epoch + 1):
                            train_loss = train(model,
                                               train_loader,
                                               loss_function,
                                               optimizer,
                                               epoch,
                                               device)
                            top1_acc, eval_loss = evaluate(model,
                                                           val_loader,
                                                           loss_function,
                                                           0.5,
                                                           device)
                            lr_scheduler.step()

                            if cur_best_acc < top1_acc:
                                cur_best_acc = top1_acc
                                if best_acc < cur_best_acc:
                                    best_acc = cur_best_acc
                                    # Overwrite the checkpoint only when this model is at
                                    # least as good as the best of every earlier
                                    # combination. Without this guard each combination
                                    # would replace the file with its own best model, and
                                    # the file left on disk would belong to the last
                                    # combination rather than the reported optimum.
                                    # `<=` mirrors the tie-breaking used below when the
                                    # optimal hyperparameters are recorded.
                                    if overall_best_acc <= best_acc:
                                        os.makedirs(output_dir, exist_ok=True)
                                        output_pth = f"{output_dir}/best_model.pth"
                                        torch.save(model, output_pth)

                            # Learning rate after the scheduler step (last param group).
                            lr = optimizer.param_groups[-1]['lr']

                            wandb.log({"epoch": epoch, "train_loss": train_loss, 'lr': lr,
                                       "top1_acc": top1_acc, "eval_loss": eval_loss,
                                       "best top1 acc": cur_best_acc})

                            pbar.set_postfix({'Train loss': train_loss,
                                              'overall_best_acc': overall_best_acc,
                                              'Best top1 acc': cur_best_acc,
                                              'Top1 acc': top1_acc})
                            pbar.update(1)
                finally:
                    # Close the tracker run even if training raised, so the next
                    # `wandb.init` does not start on top of a dangling run.
                    wandb.finish()

                print(f"Seed {split_seed} has best_acc: {cur_best_acc}")
            # Ties go to the combination evaluated later.
            if overall_best_acc <= best_acc:
                overall_best_acc = best_acc
                optimal_weight_decay = weight_decay
                optimal_initial_lr = initial_lr
                optimal_total_epoch = total_epoch

print(f"Optimal validation accuracy: {overall_best_acc}")
print(f"Optimal weight decay: {optimal_weight_decay}")
print(f"Optimal initial learning rate: {optimal_initial_lr}")
print(f"Optimal total epoch: {optimal_total_epoch}")


In [None]:
""" Predict on test dataset using the best model """
@torch.no_grad()
def predict_and_save_results(model, test_loader, output_file, threshold):
    """Predict a binary label for every test sample and write them to a CSV.

    Args:
        model: Trained network producing one output per sample.
        test_loader: Non-shuffled loader yielding feature batches. Its dataset
            must expose a ``data`` DataFrame with ``query_id`` and ``url_id``
            columns in the same row order as the samples.
        output_file: Destination path of the CSV.
        threshold: A sample is labelled 1 when the sigmoid of its output is
            strictly greater than this value, otherwise 0.

    Raises:
        ValueError: If the model does not return exactly one value per sample,
            or if more predictions were produced than there are rows in
            ``test_loader.dataset.data``.

    The CSV has the columns ``id`` (the integer ``query_id`` and ``url_id``
    concatenated as strings) and ``relevance``.
    """
    model.eval()

    # Run on whatever device the model lives on instead of relying on a
    # notebook-level global.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    predicted = []
    for features in test_loader:
        outputs = model(features.to(run_device))
        batch_labels = (torch.sigmoid(outputs) > threshold).reshape(-1)
        if batch_labels.numel() != features.shape[0]:
            raise ValueError(
                "model must return exactly one value per sample, got output shape "
                f"{tuple(outputs.shape)} for a batch of {features.shape[0]}")
        predicted.extend(batch_labels.to(torch.int64).tolist())

    frame = test_loader.dataset.data
    if len(predicted) > len(frame):
        raise ValueError(
            f"got {len(predicted)} predictions for only {len(frame)} rows of id data")

    # Predictions are in row order, so the first len(predicted) rows carry the
    # matching ids regardless of the loader's batch size.
    id_rows = frame.iloc[:len(predicted)]
    sample_ids = (id_rows['query_id'].astype('int64').astype(str)
                  + id_rows['url_id'].astype('int64').astype(str))

    results_df = pd.DataFrame({'id': sample_ids.tolist(), 'relevance': predicted})
    results_df.to_csv(output_file, index=False)

# The checkpoint was written with `torch.save(model, ...)`, i.e. it is a fully
# pickled module rather than a state dict. Recent PyTorch releases default
# `torch.load` to `weights_only=True`, which refuses such files, so the full
# unpickling is requested explicitly.
# SECURITY: unpickling can execute arbitrary code; only load checkpoints that
# this notebook produced itself.
best_model = torch.load(f"{output_dir}/best_model.pth",
                        map_location=device,
                        weights_only=False).to(device)
predict_and_save_results(best_model, test_loader, OUTPUT_PREDICTION, 0.5)
print(f"Predictions saved at {OUTPUT_PREDICTION}")