In [None]:
import glob
import gc
import polars as pl
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

def polars_to_pandas(polars_df):
    """Return the result of calling ``to_pandas()`` on *polars_df*."""
    pandas_df = polars_df.to_pandas()
    return pandas_df

def optimize_memory(df):
    """Downcast 64-bit numeric columns of *df* in place and return *df*.

    ``float64`` columns are passed through ``pd.to_numeric(downcast='float')``
    first, then ``int64`` columns through ``pd.to_numeric(downcast='integer')``.
    Columns of any other dtype are left untouched.
    """
    # (dtype to look for, downcast mode handed to pd.to_numeric)
    downcast_plan = (
        ('float64', 'float'),
        ('int64', 'integer'),
    )
    for wide_dtype, downcast_mode in downcast_plan:
        wide_columns = df.select_dtypes(include=[wide_dtype]).columns
        for name in wide_columns:
            df[name] = pd.to_numeric(df[name], downcast=downcast_mode)
    return df

class LoadData:
    """Read a selection of partitioned parquet files into one Polars frame.

    Attributes:
        file_paths: candidate parquet file paths.
        partition_ids: ids of the partitions to keep, or ``None`` to keep
            every path in ``file_paths``.
    """

    def __init__(self, file_paths, partition_ids=None):
        self.file_paths = file_paths
        self.partition_ids = partition_ids

    def _select_files(self):
        """Return the paths that belong to one of the requested partitions.

        A path is kept when it contains ``partition_id=<id>`` and the id is
        not followed by a further digit, so asking for partition 1 no longer
        also selects partitions 10, 11, ... as a plain substring test did.
        """
        if self.partition_ids is None:
            return list(self.file_paths)

        import re  # local import keeps this class independent of file-level imports

        id_patterns = [
            re.compile(rf'partition_id={re.escape(str(pid))}(?!\d)')
            for pid in self.partition_ids
        ]
        return [
            fp for fp in self.file_paths
            if any(pattern.search(str(fp)) for pattern in id_patterns)
        ]

    def load_and_concat(self):
        """Read every selected parquet file and concatenate the results.

        Returns:
            The frame produced by ``pl.concat(..., rechunk=False)`` over the
            selected files.

        Raises:
            ValueError: if no file matches the requested partitions.
        """
        selected_files = self._select_files()
        if not selected_files:
            # Fail early with the selection spelled out instead of erroring
            # later inside the concatenation of an empty list.
            raise ValueError(
                f'no parquet files matched partition_ids={self.partition_ids!r}'
            )

        partitioned_data = [pl.read_parquet(file_path) for file_path in selected_files]
        df = pl.concat(partitioned_data, rechunk=False)

        # Drop the list of per-file frames before returning.
        del partitioned_data
        gc.collect()

        return df

# Example usage: read partitions 6 through 9 of the training set
partition_ids = list(range(6, 10))
file_paths_all = sorted(glob.glob('Data/train.parquet/*/*.parquet'))
df_selected = LoadData(
    file_paths=file_paths_all,
    partition_ids=partition_ids,
).load_and_concat()

# The path list is not needed once the data is loaded
del file_paths_all
gc.collect()

# Convert Polars to Pandas, then release the Polars frame
df_selected_pd = polars_to_pandas(df_selected)
del df_selected
gc.collect()

# Order rows by date_id ascending, then time_id ascending, and renumber them
df_selected_pd.sort_values(by=['date_id', 'time_id'], inplace=True)
df_selected_pd.reset_index(drop=True, inplace=True)
gc.collect()


In [None]:
import joblib

# Every column except the two ordering keys, the target and the sample weight
# is imputed and scaled below.
cols_to_fill = [col for col in df_selected_pd.columns if col not in ['date_id', 'time_id', 'responder_6', 'weight']]

# NOTE(review): presumably 31 days x ~850 rows per day -- confirm the per-day row count.
window_size = 31 * 850

for col in cols_to_fill:
    # Fill gaps with a centered rolling mean over 26350 rows.
    # NOTE(review): a centered window also averages rows that come after the
    # gap, and it runs before the train/validation split -- confirm this is intended.
    rolling_mean_series = df_selected_pd[col].rolling(window=window_size, center=True, min_periods=1).mean()
    df_selected_pd[col] = df_selected_pd[col].fillna(rolling_mean_series)

# Anything the rolling mean could not fill becomes 0.
df_selected_pd = df_selected_pd.fillna(0)
df_selected_pd = optimize_memory(df_selected_pd)

# The last 120 date_id values form the validation set.
max_date_id = df_selected_pd['date_id'].max()
split_index = max_date_id - 120
# Explicit copies: the scaled columns are assigned into these frames below, and
# assigning into a frame obtained by filtering triggers pandas' chained-assignment warning.
df_train = df_selected_pd[df_selected_pd['date_id'] <= split_index].copy()
df_val = df_selected_pd[df_selected_pd['date_id'] > split_index].copy()
del df_selected_pd
gc.collect()

# Fit the scaler on training rows only, then apply it to both splits.
scaler = StandardScaler()
scaler.fit(df_train[cols_to_fill])
df_train[cols_to_fill] = scaler.transform(df_train[cols_to_fill])
df_val[cols_to_fill] = scaler.transform(df_val[cols_to_fill])

# Persist the fitted scaler to scaler.joblib.
joblib.dump(scaler, 'scaler.joblib')

In [None]:
import gc
import mlflow
import mlflow.pytorch
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

####################################
# 1) Dataset for On-The-Fly Sequences
####################################

class OnTheFlySequenceDataset(Dataset):
    """Dataset that slices fixed-length row windows out of a DataFrame on demand.

    Item ``i`` covers rows ``i .. i + seq_length - 1``: the features of the
    whole window, plus the target and weight of the window's last row.
    """

    def __init__(self, df, feature_cols, target_col, seq_length):
        """
        df: DataFrame (expected to be already sorted, scaled, imputed) with
            columns including feature_cols, target_col and 'weight'.
        feature_cols: list of column names to use as features.
        target_col: the name of the target column (e.g. 'responder_6').
        seq_length: number of timesteps per sequence window (must be >= 1).

        Raises ValueError if seq_length is smaller than 1.
        """
        if seq_length < 1:
            raise ValueError("seq_length must be at least 1")

        # Renumber rows 0..n-1 so the label-based .loc slices below are positional.
        self.df = df.reset_index(drop=True)
        self.feature_cols = feature_cols
        self.target_col = target_col
        self.seq_length = seq_length

        # The maximum valid index for starting a sequence of length seq_length
        # (negative when the frame is shorter than one window).
        self.max_index = len(self.df) - self.seq_length

    def __len__(self):
        # Valid starts are 0..max_index inclusive, hence the +1; clamp so a
        # frame shorter than seq_length yields an empty dataset.
        return max(0, self.max_index + 1)

    def __getitem__(self, idx):
        """Return ``(X, y, w)`` for the window starting at row ``idx``.

        X has shape [seq_length, len(feature_cols)]; y and w are 0-d tensors.
        Negative indices count from the end. Raises IndexError when ``idx``
        is out of range.
        """
        n_windows = len(self)
        if idx < 0:
            idx += n_windows
        if not 0 <= idx < n_windows:
            raise IndexError(
                f"window index out of range for dataset of length {n_windows}"
            )

        last_row = idx + self.seq_length - 1
        # .loc slicing is inclusive on both ends, so this spans seq_length rows.
        x_slice = self.df.loc[idx:last_row, self.feature_cols]
        y_slice = self.df.loc[last_row, self.target_col]
        w_slice = self.df.loc[last_row, 'weight']

        # Convert to torch tensors
        X = torch.tensor(x_slice.values, dtype=torch.float32)
        y = torch.tensor(y_slice, dtype=torch.float32)
        w = torch.tensor(w_slice, dtype=torch.float32)
        return X, y, w

####################################
# 2) Weighted MSE & Weighted R²
####################################

def weighted_mse_loss(y_pred, y_true, w):
    """
    Weighted MSE used as the training objective.

    y_pred is flattened to 1-D; y_true and w are expected to be shape [batch_size].
    Returns the mean over the batch of w * (y_true - y_pred)^2.
    """
    residual = y_true - y_pred.view(-1)
    return (w * residual.pow(2)).mean()

def compute_weighted_r2(model, loader, device):
    """
    Weighted R² of `model` over every batch yielded by `loader`.

    R² = 1 - (Sum(w*(y - pred)^2) / Sum(w*y^2))

    Puts the model in eval mode and runs without gradient tracking.
    Returns 0.0 when Sum(w*y^2) is zero.
    """
    model.eval()
    residual_sum = 0.0
    total_sum = 0.0
    with torch.no_grad():
        for batch in loader:
            features, targets, weights = (t.to(device) for t in batch)
            predictions = model(features).view(-1)
            squared_error = (targets - predictions) ** 2
            residual_sum += torch.sum(weights * squared_error).item()
            total_sum += torch.sum(weights * (targets ** 2)).item()

    if total_sum == 0:
        return 0.0
    return 1.0 - (residual_sum / total_sum)

####################################
# 3) Define a Two-Layer LSTM Model
####################################

class TwoLayerLSTM(nn.Module):
    """Two stacked LSTM layers followed by a linear head on the last time step."""

    def __init__(self, input_dim, hidden_dim, output_dim=1, dropout=0.2):
        super().__init__()
        lstm_config = {
            "input_size": input_dim,
            "hidden_size": hidden_dim,
            "num_layers": 2,  # two LSTM layers
            "batch_first": True,
            "dropout": dropout,
        }
        self.lstm = nn.LSTM(**lstm_config)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Map x of shape [batch_size, seq_length, input_dim] to [batch_size, output_dim]."""
        sequence_out, _ = self.lstm(x)
        # Only the output at the final time step feeds the linear head.
        final_step = sequence_out[:, -1, :]
        return self.fc(final_step)

####################################
# 4) Training Loop
####################################

def train_lstm(
    model,
    train_loader,
    val_loader,
    epochs=10,
    lr=1e-4,
    weight_decay=1e-5,
    device='cpu'
):
    """
    Train the LSTM using weighted MSE loss and track Weighted R² on validation.
    Logs metrics to MLflow each epoch.

    model: module called as model(batch_x); its output is flattened to [batch_size].
    train_loader / val_loader: iterables yielding (batch_x, batch_y, batch_w).
    epochs: number of passes over train_loader.
    lr, weight_decay: passed to the Adam optimizer.
    device: device the model and every batch are moved to.

    Raises ValueError if train_loader yields no samples during an epoch.
    """
    # Move the model first so the optimizer is built over parameters that
    # already live on the target device.
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        samples_seen = 0

        for batch_x, batch_y, batch_w in train_loader:
            batch_x, batch_y, batch_w = batch_x.to(device), batch_y.to(device), batch_w.to(device)

            optimizer.zero_grad()
            y_pred = model(batch_x).view(-1)  # shape: [batch_size]
            loss = weighted_mse_loss(y_pred, batch_y, batch_w)
            loss.backward()

            # Optional: Gradient clipping for stability if sequences are large
            # nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

            # The loss is a per-batch mean, so weight it by the batch size to
            # obtain a per-sample average at the end of the epoch.
            batch_len = len(batch_x)
            running_loss += loss.item() * batch_len
            samples_seen += batch_len

        if samples_seen == 0:
            raise ValueError("train_loader yielded no samples; cannot compute the training loss")

        # Divide by the samples actually processed so the average stays correct
        # even when the loader skips an incomplete final batch.
        train_loss = running_loss / samples_seen

        # Compute validation Weighted R²
        val_weighted_r2 = compute_weighted_r2(model, val_loader, device)

        # Log metrics to MLflow
        mlflow.log_metric("train_loss", train_loss, step=epoch)
        mlflow.log_metric("val_weighted_r2", val_weighted_r2, step=epoch)

        print(f"Epoch [{epoch+1}/{epochs}], "
              f"Train Loss: {train_loss:.6f}, "
              f"Val Weighted R²: {val_weighted_r2:.6f}")

####################################
# 5) Putting It All Together w/ MLflow
####################################

def main(df_train, df_val, cols_to_fill):
    """Build datasets, loaders and the LSTM, then train and log one MLflow run.

    df_train / df_val: prepared DataFrames (expected to be already scaled,
        imputed and sorted) holding the feature columns, 'responder_6' and 'weight'.
    cols_to_fill: names of the columns used as model features.
    """
    feature_cols = cols_to_fill  # your feature column names
    target_col = 'responder_6'
    device = torch.device("cpu")

    # Hyperparameters, keyed by the names they are logged under in MLflow
    hyperparams = {
        "seq_length": 100,      # example
        "batch_size": 64,
        "hidden_dim": 64,
        "dropout": 0.2,
        "learning_rate": 1e-4,
        "weight_decay": 1e-5,
        "epochs": 10,
    }

    # Datasets and (unshuffled) loaders for both splits
    train_dataset = OnTheFlySequenceDataset(
        df_train, feature_cols, target_col, hyperparams["seq_length"]
    )
    val_dataset = OnTheFlySequenceDataset(
        df_val, feature_cols, target_col, hyperparams["seq_length"]
    )
    train_loader = DataLoader(
        train_dataset, batch_size=hyperparams["batch_size"], shuffle=False
    )
    val_loader = DataLoader(
        val_dataset, batch_size=hyperparams["batch_size"], shuffle=False
    )

    # Model with one input per feature column and a single output
    model = TwoLayerLSTM(
        input_dim=len(feature_cols),
        hidden_dim=hyperparams["hidden_dim"],
        output_dim=1,
        dropout=hyperparams["dropout"],
    )

    # Point MLflow at the tracking server and open the run
    mlflow.set_tracking_uri("http://localhost:5000")
    mlflow.set_experiment("Financial_LSTM_Experiment")
    with mlflow.start_run(run_name="TwoLayerLSTM_CustomLoss"):

        # Record each hyperparameter on the run
        for param_name, param_value in hyperparams.items():
            mlflow.log_param(param_name, param_value)

        # Train
        train_lstm(
            model=model,
            train_loader=train_loader,
            val_loader=val_loader,
            epochs=hyperparams["epochs"],
            lr=hyperparams["learning_rate"],
            weight_decay=hyperparams["weight_decay"],
            device=device,
        )

        # Score the trained model once more on the validation split
        final_val_r2 = compute_weighted_r2(model, val_loader, device)
        mlflow.log_metric("final_val_weighted_r2", final_val_r2)

        print(f"Final Val Weighted R²: {final_val_r2:.6f}")

        # Store the trained model as a run artifact
        mlflow.pytorch.log_model(model, artifact_path="lstm_model")

        # Drop this function's references to the frames
        del df_train, df_val
        gc.collect()


if __name__ == "__main__":
    main(df_train, df_val, cols_to_fill)