In [5]:
import optuna
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import mean_squared_error
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from huggingface_hub import hf_hub_download
import warnings
warnings.filterwarnings("ignore")

!set HF_HUB_DISABLE_SYMLINKS_WARNING=true

# Load dataset
dataset = 'reg_cat/analcatdata_supreme.csv'
REPO_ID = "inria-soda/tabular-benchmark"
data = pd.read_csv(
    hf_hub_download(repo_id=REPO_ID, filename=dataset, repo_type="dataset")
)

X = data.drop('Log_exposure', axis=1)
y = data['Log_exposure']

# Standardize continuous features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1)

# Create DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [6]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:x.size(0), :]

class EnhancedLagLlama(nn.Module):
    def __init__(self, input_dim, embed_dim, num_heads, num_layers, dropout):
        super(EnhancedLagLlama, self).__init__()
        self.embedding = nn.Linear(input_dim, embed_dim)
        self.positional_encoding = PositionalEncoding(embed_dim)
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, dropout=dropout)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)
        self.decoder = nn.Linear(embed_dim, input_dim)
        self.fc = nn.Linear(input_dim, 1)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        x = self.embedding(x)
        x = x.unsqueeze(1)  # Add a sequence dimension
        x = self.positional_encoding(x)
        x = self.dropout(x)
        x = self.transformer(x)
        x = x.squeeze(1)  # Remove the sequence dimension
        x = self.decoder(x)
        x = self.fc(x)
        return x

def pretrain(model, dataloader, optimizer, criterion, device, mask_prob=0.15):
    model.train()
    for X_batch, _ in dataloader:
        X_batch = X_batch.to(device)
        mask = (torch.rand(X_batch.shape) < mask_prob).float().to(device)
        masked_X_batch = X_batch * (1 - mask)
        
        optimizer.zero_grad()
        outputs = model(masked_X_batch)
        loss = criterion(outputs, X_batch)
        loss.backward()
        optimizer.step()

def objective(trial):
    # Define hyperparameters to tune
    num_heads = trial.suggest_int('num_heads', 2, 8)
    embed_dim = trial.suggest_int('embed_dim', num_heads * 8, num_heads * 32, step=num_heads * 8)
    num_layers = trial.suggest_int('num_layers', 2, 6)
    dropout = trial.suggest_float('dropout', 0.1, 0.5)
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True)

    # Initialize the model
    input_dim = X_train.shape[1]
    model = EnhancedLagLlama(input_dim, embed_dim, num_heads, num_layers, dropout)
    model.to(device)
    
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Pre-training with feature masking
    pretrain(model, train_loader, optimizer, criterion, device)

    # Fine-tuning loop
    num_epochs = 20
    for epoch in range(num_epochs):
        model.train()
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            predictions = model(X_batch)
            loss = criterion(predictions, y_batch)
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        predictions = model(X_test_tensor.to(device))
        rmse = torch.sqrt(criterion(predictions, y_test_tensor.to(device))).item()
    
    return rmse

# Run the optimization
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30)

print("Best hyperparameters: ", study.best_params)
print("Best RMSE: ", study.best_value)


[I 2024-06-03 17:21:54,604] A new study created in memory with name: no-name-0dd996e0-5a23-44dd-bcad-a2869122b652
[I 2024-06-03 17:22:21,042] Trial 0 finished with value: 0.5153371691703796 and parameters: {'num_heads': 2, 'embed_dim': 16, 'num_layers': 5, 'dropout': 0.22177243325976959, 'learning_rate': 0.00035895319981955594}. Best is trial 0 with value: 0.5153371691703796.
[I 2024-06-03 17:23:42,168] Trial 1 finished with value: 0.7503073811531067 and parameters: {'num_heads': 5, 'embed_dim': 120, 'num_layers': 6, 'dropout': 0.2280790091688426, 'learning_rate': 1.8484811506031174e-05}. Best is trial 0 with value: 0.5153371691703796.
[I 2024-06-03 17:25:06,970] Trial 2 finished with value: 0.9310290217399597 and parameters: {'num_heads': 7, 'embed_dim': 168, 'num_layers': 5, 'dropout': 0.49503873138427207, 'learning_rate': 0.0006581859085801193}. Best is trial 0 with value: 0.5153371691703796.
[I 2024-06-03 17:25:42,156] Trial 3 finished with value: 0.7204582691192627 and parameters:

Best hyperparameters:  {'num_heads': 4, 'embed_dim': 64, 'num_layers': 5, 'dropout': 0.2565767626966763, 'learning_rate': 4.5915253774484096e-05}
Best RMSE:  0.26530590653419495
