In [None]:
# Install tab-transformer-pytorch from a local directory of distributions:
# `--no-index` disables the package index and `--find-links` points pip at the
# attached Kaggle input folder instead. `-q` keeps the output quiet.
!pip install tab-transformer-pytorch -q --no-index --find-links=/kaggle/input/jane-street-import/tab-transformer-pytorch

In [None]:
# Install hyper_connections from the default package index (unpinned, so the
# resolved version may differ between runs).
# NOTE(review): presumably a runtime dependency of tab-transformer-pytorch that
# the offline install above does not bring in — confirm, and pin a version.
!pip install hyper_connections

# Load data

In [None]:
import os,gc
import pickle
import polars as pl
import numpy as np
import pandas as pd
import joblib

In [None]:
def reduce_mem_usage(df, float16_as32=True):
    """Downcast the numeric columns of ``df`` to narrower dtypes to save memory.

    The frame is modified in place and also returned. Only plain NumPy
    integer and floating-point columns are touched; bool, object, category,
    datetime and pandas extension dtypes are left exactly as they are.

    Args:
        df: pandas DataFrame to shrink.
        float16_as32: when True, float columns whose values fit in the float16
            range are stored as float32 (more precision); when False they are
            stored as float16.

    Returns:
        The same DataFrame object, with downcast columns.
    """
    # memory_usage() gives bytes per column (plus the index); sum and convert B -> MB.
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate integer types, narrowest first. Unsigned columns stay unsigned.
    signed_candidates = (np.int8, np.int16, np.int32, np.int64)
    unsigned_candidates = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype
        # Skip everything that is not a NumPy signed-int / unsigned-int / float
        # column. Checking the dtype kind (instead of the dtype's string prefix)
        # keeps bool and unsigned columns from being converted to floats and
        # keeps datetime/extension columns away from numeric comparisons.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min, c_max = df[col].min(), df[col].max()

        if col_type.kind in 'iu':
            # An empty column has NaN bounds; there is nothing to size against.
            if pd.isna(c_min) or pd.isna(c_max):
                continue
            lo, hi = int(c_min), int(c_max)
            candidates = signed_candidates if col_type.kind == 'i' else unsigned_candidates
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column whose extreme value equals the type
                # limit (e.g. 127 for int8) still fits in that type.
                if limits.min <= lo and hi <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Floating-point column: pick the storage type from the value range.
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                if float16_as32:
                    # Values fit float16 but the caller asked for float32 precision.
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                # Out of float32 range, infinite, or all-NaN bounds.
                df[col] = df[col].astype(np.float64)

    # Memory after the conversion.
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Relative reduction; guard against a zero-sized starting frame.
    decreased = 100 * (start_mem - end_mem) / start_mem if start_mem > 0 else 0.0
    print('Decreased by {:.1f}%'.format(decreased))

    return df

In [None]:
class CONFIG():
    """Dataset configuration: input paths, column names and the training start date."""

    def __init__(self):
        # Parquet inputs for the training and validation splits.
        self.train_data_path = '/kaggle/input/data-create-create-lags/training.parquet'
        self.valid_data_path = '/kaggle/input/data-create-create-lags/validation.parquet'

        # 79 base features followed by the 9 lag-1 responder columns.
        base_features = []
        for i in range(79):
            base_features.append(f"feature_{i:02d}")
        lagged_responders = []
        for idx in range(9):
            lagged_responders.append(f"responder_{idx}_lag_1")
        self.feature_names = base_features + lagged_responders

        # Target and sample-weight columns.
        self.label_name = 'responder_6'
        self.weight_name = 'weight'

        # Categorical features; every other feature is treated as continuous.
        self.feature_cat = ["feature_09", "feature_10", "feature_11"]
        categorical = self.feature_cat
        self.feature_cont = list(filter(lambda name: name not in categorical, self.feature_names))

        # Rows with date_id below this value are dropped from the training scan.
        self.train_start_dt = 1100


my_config = CONFIG()

In [None]:
# Lazily scan the training parquet, keeping only rows whose date_id is at or
# after the configured training start date.
train_df = pl.scan_parquet(my_config.train_data_path)
train_df = train_df.filter(pl.col("date_id") >= my_config.train_start_dt)
valid_df = pl.scan_parquet(my_config.valid_data_path)

# Precomputed per-column statistics used for standardisation.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load statistics files from a trusted source.
data_stats = joblib.load('/kaggle/input/my-own-js/data_stats.pkl')
means, stds = data_stats['mean'], data_stats['std']

def standardize(df, feature_cols, means, stds):
    """Return ``df`` with each column in ``feature_cols`` replaced by ``(x - mean) / std``.

    ``means`` and ``stds`` are indexed by column name. No guard is applied
    against a zero standard deviation.
    """
    scaled_columns = []
    for name in feature_cols:
        centred = pl.col(name) - means[name]
        scaled_columns.append((centred / stds[name]).alias(name))
    return df.with_columns(scaled_columns)

# Per-feature lookup tables from raw categorical value to a contiguous integer
# code starting at 0 (22 codes for feature_09, 9 for feature_10, 31 for
# feature_11). Raw values that are not listed here are encoded as -1 by
# encode_column.
# NOTE(review): presumably the listed values are the ones observed in the
# training data — confirm against the dataset.
category_mappings = {'feature_09': {2: 0, 4: 1, 9: 2, 11: 3, 12: 4, 14: 5, 15: 6, 25: 7, 26: 8, 30: 9, 34: 10, 42: 11, 44: 12, 46: 13, 49: 14, 50: 15, 57: 16, 64: 17, 68: 18, 70: 19, 81: 20, 82: 21},
 'feature_10': {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 10: 7, 12: 8},
 'feature_11': {9: 0, 11: 1, 13: 2, 16: 3, 24: 4, 25: 5, 34: 6, 40: 7, 48: 8, 50: 9, 59: 10, 62: 11, 63: 12, 66: 13,
  76: 14, 150: 15, 158: 16, 159: 17, 171: 18, 195: 19, 214: 20, 230: 21, 261: 22, 297: 23, 336: 24, 376: 25, 388: 26, 410: 27, 522: 28, 534: 29, 539: 30},
}

def encode_column(df, column, mapping):
    """Return ``df`` with ``column`` replaced by its integer code from ``mapping``.

    Each element is looked up in ``mapping``; values that are not present are
    encoded as -1. The resulting column is typed as Int16.
    """
    encoded = pl.col(column).map_elements(
        lambda category: mapping.get(category, -1),
        return_dtype=pl.Int16,
    )
    return df.with_columns(encoded.alias(column))

# Step 1: encode the categorical features as integer codes, using the same
# mapping for the training and validation frames.
for col in my_config.feature_cat:
    train_df = encode_column(train_df, col, category_mappings[col])
    valid_df = encode_column(valid_df, col, category_mappings[col])

# Step 2: standardise the continuous features with the precomputed means/stds.
train_df = standardize(train_df, my_config.feature_cont, means, stds)
valid_df = standardize(valid_df, my_config.feature_cont, means, stds)

# Step 3: missing values are filled with 0 further down, on the pandas frames
# (after collection), not on the lazy frames. The lazy variant is kept disabled:

# train_df = train_df.fill_nan(0)
# valid_df = valid_df.fill_nan(0)

# Leftover progress marker printed before the (expensive) collection step.
print(3)
# Materialise the lazy training query and convert it to pandas.
df = train_df.collect().to_pandas()
# float16_as32=False: float columns whose values fit the float16 range are
# stored as float16.
df = reduce_mem_usage(df, False)

# Same for validation. From here on `valid_df` is a pandas DataFrame, no
# longer a polars LazyFrame.
valid_df = valid_df.collect().to_pandas()
valid_df = reduce_mem_usage(valid_df, False)
# Leftover progress marker printed after both frames are in memory.
print(3)
# Fill missing values in every feature column with 0. `feature_names` includes
# the encoded categorical columns, where 0 is also the code of the first mapped
# category.
# NOTE(review): confirm the categorical columns cannot contain missing values.
df[my_config.feature_names] = df[my_config.feature_names].fillna(0)
valid_df[my_config.feature_names] = valid_df[my_config.feature_names].fillna(0)
# NOTE(review): the validation rows are appended to the training frame while
# `valid_df` is still used as the validation set later on, so validation
# metrics are computed on rows whose dates may also be trained on.
df = pd.concat([df, valid_df]).reset_index(drop=True)# A trick to boost LB from 0.0045->0.005

# Training configuration

In [None]:
import os
import warnings
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from pytorch_lightning import (LightningDataModule, LightningModule, Trainer)
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, Timer
from pytorch_lightning.loggers import WandbLogger
import wandb
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from tab_transformer_pytorch import FTTransformer


class custom_args():
    """Container for the run, optimisation and model-size settings used in training."""

    def __init__(self):
        settings = {
            # Runtime / bookkeeping.
            'usegpu': True,
            'gpuid': 0,
            'seed': 42,
            'model': 'nn',
            'use_wandb': False,
            'project': 'js-tabm-with-lags',
            'dname': "./input_df/",
            'loader_workers': 4,
            # Optimisation.
            'bs': 4096,
            'lr': 1e-3,
            'weight_decay': 2e-4,
            # Input layout: 85 continuous and 3 categorical features. Each
            # cardinality is one more than the number of values listed for that
            # feature in category_mappings (22, 9 and 31).
            'n_cont_features': 85,
            'n_cat_features': 3,
            'cat_cardinalities': [23, 10, 32],
            # Training schedule.
            'patience': 5,
            'max_epochs': 10,
            'N_fold': 3,
        }
        # Assign in declaration order so vars(self) keeps the same key order.
        for name, value in settings.items():
            setattr(self, name, value)

        # Reference FT-Transformer settings noted by the author (paper values):
        # dim=32, dim_out=1, depth=6, heads=8, attn_dropout=0.1, ff_dropout=0.1.

# PyTorch Data Module Definition

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset that holds the whole frame as tensors on one device.

    Continuous features, labels and weights are stored as float32 tensors and
    categorical features as int32 tensors, all moved to ``accelerator`` up front.
    """

    def __init__(self, df, accelerator, num_features, cat_features, label_name,weight_name):
        def to_device(columns, dtype):
            # Convert the selected column(s) to a tensor of `dtype` on the target device.
            return torch.as_tensor(df[columns].values, dtype=dtype).to(accelerator)

        self.num_features = to_device(num_features, torch.float32)
        self.cat_features = to_device(cat_features, torch.int32)

        self.labels = to_device(label_name, torch.float32)
        self.weights = to_device(weight_name, torch.float32)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Order matters: consumers unpack (x_cont, x_cat, y, w).
        return (
            self.num_features[idx],
            self.cat_features[idx],
            self.labels[idx],
            self.weights[idx],
        )


class DataModule(LightningDataModule):
    """Builds date-based training folds and an optional fixed validation set.

    Args:
        train_df: pandas DataFrame with a ``date_id`` column plus the feature,
            label and weight columns.
        batch_size: batch size used by both dataloaders.
        valid_df: optional DataFrame used as-is for validation.
        accelerator: device the dataset tensors are moved to.
        num_features: names of the continuous feature columns (default: none).
        cat_features: names of the categorical feature columns (default: none).
        label_name: name of the target column.
            NOTE(review): the default 'repsond6' looks like a typo; callers in
            this notebook always pass the name explicitly — confirm before
            relying on the default.
        weight_name: name of the sample-weight column.
    """

    def __init__(self, train_df, batch_size, valid_df=None, accelerator='cpu',num_features=None, cat_features=None, label_name='repsond6',weight_name='weight'):
        super().__init__()
        self.df = train_df
        self.batch_size = batch_size
        # Distinct dates, in the order pandas' unique() returns them; the fold
        # assignment in setup() depends on the position in this array.
        self.dates = self.df['date_id'].unique()
        self.accelerator = accelerator
        self.train_dataset = None
        self.valid_df = valid_df
        self.val_dataset = None
        # Copy into fresh lists so instances never share a default (or the
        # caller's) list object.
        self.num_features = list(num_features) if num_features is not None else []
        self.cat_features = list(cat_features) if cat_features is not None else []
        self.label_name = label_name
        self.weight_name = weight_name

    def _make_dataset(self, frame):
        """Wrap ``frame`` in a CustomDataset using this module's column settings."""
        return CustomDataset(frame, self.accelerator,self.num_features,self.cat_features,self.label_name,self.weight_name)

    def setup(self, fold=0, N_fold=5, stage=None):
        """Create the datasets for one fold.

        Every date whose position in ``self.dates`` satisfies
        ``position % N_fold == fold`` is left out of the training set. The
        validation set, when a validation frame was given, is that whole frame
        regardless of ``fold``.
        """
        selected_dates = [date for ii, date in enumerate(self.dates) if ii % N_fold != fold]
        df_train = self.df.loc[self.df['date_id'].isin(selected_dates)]
        self.train_dataset = self._make_dataset(df_train)
        if self.valid_df is not None:
            self.val_dataset = self._make_dataset(self.valid_df)

    def train_dataloader(self, n_workers=0):
        """Return a shuffling DataLoader over the current training fold."""
        if self.train_dataset is None:
            raise RuntimeError("train_dataloader() called before setup(); no training dataset exists yet.")
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=n_workers)

    def val_dataloader(self, n_workers=0):
        """Return a non-shuffling DataLoader over the validation set."""
        if self.val_dataset is None:
            raise RuntimeError("val_dataloader() needs a validation dataset: pass valid_df and call setup() first.")
        return DataLoader(self.val_dataset, batch_size=self.batch_size, shuffle=False, num_workers=n_workers)
    

# NN Model Definition

In [None]:
class R2Loss(nn.Module):
    """Loss equal to the sum of squared errors divided by the sum of squared targets.

    The denominator uses the raw (un-centred) targets and gets a tiny constant
    added to avoid division by zero.
    """

    def __init__(self):
        super().__init__()

    def forward(self, y_pred, y_true):
        residual = y_pred - y_true
        squared_error_sum = residual.pow(2).sum()
        squared_target_sum = y_true.pow(2).sum()
        return squared_error_sum / (squared_target_sum + 1e-38)

# Weighted, zero-mean R2 metric used for validation and training reports
def r2_val(y_true, y_pred, sample_weight):
    """Return 1 - weighted mean squared error / weighted mean of squared targets."""
    weighted_sq_error = np.average((y_pred - y_true) ** 2, weights=sample_weight)
    weighted_sq_target = np.average((y_true) ** 2, weights=sample_weight)
    return 1 - weighted_sq_error / (weighted_sq_target + 1e-38)


class FTTransformerModel(LightningModule):
    """LightningModule that trains an ``FTTransformer`` regressor with a weighted MSE loss.

    Per-batch predictions, targets and weights are buffered during each epoch
    so that a weighted R2 (see ``r2_val``) can be logged at epoch end.

    Args:
        n_cont_features: number of continuous input features.
        cat_cardinalities: number of distinct values for each categorical feature.
        lr: learning rate for Adam.
        weight_decay: weight decay for Adam.
    """

    def __init__(self, n_cont_features, cat_cardinalities, lr, weight_decay):
        super().__init__()
        self.save_hyperparameters()
        self.model = FTTransformer(
                categories = cat_cardinalities,      # tuple containing the number of unique values within each category
                num_continuous = n_cont_features,                # number of continuous values
                dim = 8,                           # dimension, paper set at 32
                dim_out = 1,                        # binary prediction, but could be anything
                depth = 3,                          # depth, paper recommended 6
                heads = 2,                          # heads, paper recommends 8
                attn_dropout = 0.2,                 # post-attention dropout
                ff_dropout = 0.2                    # feed forward dropout
            )
        self.lr = lr
        self.weight_decay = weight_decay
        # Buffers of (prediction, target, weight) tuples, one entry per batch.
        self.training_step_outputs = []
        self.validation_step_outputs = []

    def forward(self, x_cont, x_cat):
        # The wrapped model is called with the categorical inputs first; the
        # trailing size-1 output dimension is dropped.
        return self.model(x_cat, x_cont).squeeze(-1)

    @staticmethod
    def _weighted_mse(y_hat, y, w):
        """Mean over the batch of (squared error * sample weight)."""
        return (F.mse_loss(y_hat, y, reduction='none') * w).mean()

    @staticmethod
    def _gather(step_outputs):
        """Concatenate buffered (prediction, target, weight) tuples into NumPy arrays."""
        preds, targets, weights = (
            torch.cat(parts).detach().cpu().numpy() for parts in zip(*step_outputs)
        )
        return preds, targets, weights

    def training_step(self, batch):
        x_cont, x_cat, y, w = batch
        y_hat = self(x_cont, x_cat)
        loss = self._weighted_mse(y_hat, y, w)
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True, batch_size=x_cont.size(0))
        # Store a detached copy: keeping `y_hat` itself would keep every
        # batch's autograd graph alive until the end of the epoch.
        self.training_step_outputs.append((y_hat.detach(), y, w))
        return loss

    def validation_step(self, batch):
        x_cont, x_cat, y, w = batch
        y_hat = self(x_cont, x_cat)
        loss = self._weighted_mse(y_hat, y, w)
        self.log('val_loss', loss, on_step=False, on_epoch=True, prog_bar=True, logger=True, batch_size=x_cont.size(0))
        self.validation_step_outputs.append((y_hat.detach(), y, w))
        return loss

    def on_validation_epoch_end(self):
        """Log the weighted R2 over the validation set, then reset the buffer.

        Nothing is logged during the trainer's sanity check; the buffer is
        still cleared so those batches do not leak into the first real epoch.
        """
        if not self.trainer.sanity_checking:
            prob, y, weights = self._gather(self.validation_step_outputs)
            val_r_square = r2_val(y, prob, weights)
            self.log("val_r_square", val_r_square, prog_bar=True, on_step=False, on_epoch=True)
        self.validation_step_outputs.clear()

    def configure_optimizers(self):
        """Adam plus a ReduceLROnPlateau scheduler that halves the LR when ``val_loss`` stalls."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr, weight_decay=self.weight_decay)
        # NOTE(review): `verbose` is deprecated in recent PyTorch releases —
        # confirm it is still accepted by the installed version.
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5,
                                                               verbose=True)
        return {
            'optimizer': optimizer,
            'lr_scheduler': {
                'scheduler': scheduler,
                'monitor': 'val_loss',
            }
        }

    def on_train_epoch_end(self):
        """Log the weighted R2 over the training epoch and print all logged metrics."""
        if self.trainer.sanity_checking:
            return

        prob, y, weights = self._gather(self.training_step_outputs)
        train_r_square = r2_val(y, prob, weights)
        self.log("train_r_square", train_r_square, prog_bar=True, on_step=False, on_epoch=True)
        self.training_step_outputs.clear()

        epoch = self.trainer.current_epoch
        metrics = {k: v.item() if isinstance(v, torch.Tensor) else v for k, v in self.trainer.logged_metrics.items()}
        formatted_metrics = {k: f"{v:.5f}" for k, v in metrics.items()}
        print(f"Epoch {epoch}: {formatted_metrics}")

# Training

In [None]:
args = custom_args()

# Pick the compute device: CUDA only when it is both requested and available.
if torch.cuda.is_available() and args.usegpu:
    device = torch.device(f'cuda:{args.gpuid}')
    accelerator = 'gpu'
else:
    device = torch.device('cpu')
    accelerator = 'cpu'
# Dataset tensors are kept on the CPU.
loader_device = 'cpu'


# Initialize Data Module
data_module = DataModule(
    df,
    batch_size=args.bs,
    valid_df=valid_df,
    accelerator=loader_device,
    num_features=my_config.feature_cont,
    cat_features=my_config.feature_cat,
    label_name=my_config.label_name,
    weight_name=my_config.weight_name,
)

# Drop the notebook-level names for the frames (the data module keeps its own
# references) and run a garbage-collection pass.
del df, valid_df
gc.collect()

In [None]:

# Seed once, then train one model per fold.
pl.seed_everything(args.seed)
for fold in range(args.N_fold):
    # Rebuild the training dataset with this fold's dates held out.
    data_module.setup(fold, args.N_fold)
    # Initialize Model
    model = FTTransformerModel(
        n_cont_features = args.n_cont_features, 
        cat_cardinalities = args.cat_cardinalities,
        lr=args.lr,
        weight_decay=args.weight_decay
    )
    # Initialize Logger
    if args.use_wandb:
        # `config` is the wandb.init keyword for run hyper-parameters.
        wandb_run = wandb.init(project=args.project, config=vars(args), reinit=True)
        logger = WandbLogger(experiment=wandb_run)
    else:
        logger = None
    # Initialize Callbacks: both stop/checkpoint decisions follow the weighted
    # validation R2 logged by the model (higher is better).
    early_stopping = EarlyStopping('val_r_square', patience=args.patience, mode='max', verbose=False)
    # NOTE(review): `filename` contains a directory component ("./models/");
    # confirm where the checkpoint actually lands, or move the directory into
    # `dirpath`.
    checkpoint_callback = ModelCheckpoint(monitor='val_r_square', mode='max', save_top_k=1, verbose=False, filename=f"./models/nn_{fold}.model") 
    timer = Timer()
    # Initialize Trainer
    trainer = Trainer(
        max_epochs=args.max_epochs,
        accelerator=accelerator,
        # Follow the resolved accelerator rather than the raw `usegpu` flag:
        # a GPU index list is only meaningful when actually running on GPU.
        devices=[args.gpuid] if accelerator == 'gpu' else 'auto',
        logger=logger,
        callbacks=[early_stopping, checkpoint_callback, timer],
        enable_progress_bar=True
    )
    # Start Training
    trainer.fit(model, data_module.train_dataloader(args.loader_workers), data_module.val_dataloader(args.loader_workers))
    # The best checkpoint is written by `checkpoint_callback`.
    print(f'Fold-{fold} Training completed in {timer.time_elapsed("train"):.2f}s')