In [None]:
import os
import pickle
import polars as pl
import numpy as np
import pandas as pd

In [None]:
def reduce_mem_usage(df, float16_as32=True):
    """Downcast numeric columns of ``df`` to narrower dtypes and report the memory saved.

    Columns are reassigned on ``df`` itself, so the caller's frame is modified.

    * Signed / unsigned integer columns move to the smallest type of the same
      signedness (8, 16, 32 or 64 bits) whose inclusive range contains the
      column's min and max.
    * Float columns whose min and max fit the float16 range become float16, or
      float32 when ``float16_as32`` is true; columns that only fit the float32
      range become float32. Float columns that fit neither range (for example
      because they hold ``inf`` or only NaN) keep their dtype.
    * Everything else (object, category, bool, datetime, pandas extension
      dtypes, ...) is left untouched.

    Args:
        df: pandas DataFrame to shrink.
        float16_as32: if true, use float32 wherever float16 would otherwise be
            chosen, trading memory for precision.

    Returns:
        The same DataFrame object, with downcast columns.
    """
    # memory_usage() gives bytes per column (plus the index); convert B -> MB.
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype
        # Only plain NumPy integer/float columns are candidates. Bool, datetime,
        # object, category and extension dtypes must not be pushed through the
        # float branch (that would turn flags and ids into lossy floats).
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        # min()/max() skip NaN; for an empty or all-NaN column they are NaN, so
        # every range check below is False and the column is left as it is.
        c_min, c_max = df[col].min(), df[col].max()

        if col_type.kind in 'iu':
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                bounds = np.iinfo(candidate)
                # Inclusive bounds: a column containing exactly 127 still fits int8.
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif np.finfo(np.float16).min <= c_min and c_max <= np.finfo(np.float16).max:
            # Values fit float16; the caller chooses between size and precision.
            df[col] = df[col].astype(np.float32 if float16_as32 else np.float16)
        elif np.finfo(np.float32).min <= c_min and c_max <= np.finfo(np.float32).max:
            df[col] = df[col].astype(np.float32)
        # Otherwise keep the current float dtype instead of forcing float64,
        # which could only ever make the column larger.

    # Memory footprint after the conversion and the relative saving.
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
# Model inputs: 79 raw features followed by the 1-step lag of each of the 9 responders.
feature_names = [
    *("feature_{:02d}".format(i) for i in range(79)),
    *("responder_{}_lag_1".format(idx) for idx in range(9)),
]
# Columns treated as categorical (embedded); every other input is continuous.
feature_cat = ["feature_09", "feature_10", "feature_11"]
feature_cont = [name for name in feature_names if name not in feature_cat]
# Regression target and per-row sample weight columns.
label_name = 'responder_6'
weight_name = 'weight'

In [None]:
# Input parquet files (training and validation partitions).
train_data_path = '/kaggle/input/data-create-create-lags/training.parquet'
valid_data_path = '/kaggle/input/data-create-create-lags/validation.parquet'

# date_id cut-offs for the two splits.
train_start_dt = 1100
valid_start_dt = 1638  # original author's note: "last 60 days" (alternative cut-off: 1670)

# Scan both files and stack them; the result is collected in a later cell.
df = pl.concat([pl.scan_parquet(path) for path in (train_data_path, valid_data_path)])

# NOTE(review): the training filter has no upper bound (the `.lt(valid_start_dt)`
# bound was left disabled), so train_df also contains every row selected into
# valid_df. Confirm this overlap is intended before trusting validation metrics.
train_df = df.filter(pl.col("date_id") > train_start_dt)
valid_df = df.filter(pl.col("date_id") >= valid_start_dt)

# pytorch

In [None]:
import os
import warnings
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from pytorch_lightning import (LightningDataModule, LightningModule, Trainer)
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, Timer
from pytorch_lightning.loggers import WandbLogger
import wandb
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader



In [None]:
class CustomDataset(Dataset):
    """Row-wise dataset over a pandas DataFrame, fully materialised as tensors.

    Column selection relies on the module-level ``feature_cont``,
    ``feature_cat``, ``label_name`` and ``weight_name`` definitions.

    Args:
        df: pandas DataFrame holding the feature, label and weight columns.
        accelerator: despite the name, this is passed straight to
            ``Tensor.to`` and must therefore be a torch device spec
            (e.g. ``'cpu'`` or ``'cuda:0'``).
    """

    def __init__(self, df, accelerator):
        def column_tensor(columns, dtype):
            # torch.as_tensor with an explicit dtype replaces the legacy typed
            # constructors (torch.FloatTensor / torch.IntTensor).
            return torch.as_tensor(df[columns].to_numpy(), dtype=dtype).to(accelerator)

        # 2-D tensors: one row per sample, one column per feature.
        self.cont_features = column_tensor(feature_cont, torch.float32)
        self.cat_features = column_tensor(feature_cat, torch.int32)

        # 1-D tensors: one value per sample.
        self.labels = column_tensor(label_name, torch.float32)
        self.weights = column_tensor(weight_name, torch.float32)

    def __len__(self):
        """Number of samples (rows)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(x_cont, x_cat, y, w)`` for the sample at ``idx``."""
        return (
            self.cont_features[idx],
            self.cat_features[idx],
            self.labels[idx],
            self.weights[idx],
        )


class DataModule(LightningDataModule):
    """Builds the train/validation datasets and loaders from pandas DataFrames.

    Args:
        train_df: training DataFrame; must contain a ``date_id`` column.
        batch_size: batch size used by both loaders.
        valid_df: optional validation DataFrame.
        accelerator: device spec forwarded to ``CustomDataset``.
    """

    def __init__(self, train_df, batch_size, valid_df=None, accelerator='cpu'):
        super().__init__()
        self.df = train_df
        self.batch_size = batch_size
        # Distinct dates, in order of first appearance; used for fold assignment.
        self.dates = train_df['date_id'].unique()
        self.accelerator = accelerator
        self.valid_df = valid_df
        # Populated by setup().
        self.train_dataset = None
        self.val_dataset = None

    def setup(self, fold=0, N_fold=5, stage=None):
        """Create the datasets for one fold.

        With ``N_fold == 1`` all training rows are used. Otherwise every date
        whose position in ``self.dates`` satisfies ``position % N_fold == fold``
        is dropped from the training set. The validation dataset is always
        built from ``valid_df`` when one was supplied.

        NOTE(review): this signature takes fold arguments before ``stage``, so
        it is meant to be called manually rather than by the Trainer.
        """
        df_train = self.df
        if N_fold != 1:
            kept_dates = [
                date for position, date in enumerate(self.dates)
                if position % N_fold != fold
            ]
            in_fold = self.df['date_id'].isin(kept_dates)
            df_train = self.df.loc[in_fold]

        self.train_dataset = CustomDataset(df_train, self.accelerator)
        if self.valid_df is not None:
            self.val_dataset = CustomDataset(self.valid_df, self.accelerator)

    def train_dataloader(self, n_workers=0):
        """Shuffled loader over the training dataset."""
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=n_workers,
        )

    def val_dataloader(self, n_workers=0):
        """Sequential loader over the validation dataset."""
        return DataLoader(
            self.val_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=n_workers,
        )

In [None]:
# Custom R2 metric for validation
def r2_val(y_true, y_pred, sample_weight):
    """Weighted R² measured against a zero baseline.

    Computes ``1 - avg_w((y_pred - y_true)^2) / (avg_w(y_true^2) + 1e-38)``,
    where ``avg_w`` is the ``sample_weight``-weighted average. The target mean
    is not subtracted in the denominator; the tiny constant guards against an
    all-zero target.
    """
    weighted_sq_error = np.average((y_pred - y_true) ** 2, weights=sample_weight)
    weighted_sq_target = np.average((y_true) ** 2, weights=sample_weight)
    return 1 - weighted_sq_error / (weighted_sq_target + 1e-38)


class NN(LightningModule):
    """MLP regressor over embedded categorical and projected continuous features.

    Args:
        emb_dims: iterable of ``(num_embeddings, embedding_dim)`` pairs, one per
            categorical input column.
        cat_dropout: dropout probability applied to the concatenated embeddings.
        cont_dim_original: number of continuous input features.
        cont_dim: width of the dense projection of the continuous features.
        cont_dropout: dropout probability applied to the normalised continuous
            features.
        hidden_dims: output width of each hidden linear layer.
        dropouts: dropout probability per hidden layer; layers beyond
            ``len(dropouts)`` get no dropout.
        lr: Adam learning rate.
        weight_decay: Adam weight decay.
    """

    def __init__(self, emb_dims, cat_dropout, cont_dim_original, cont_dim, cont_dropout, hidden_dims, dropouts, lr, weight_decay):
        super().__init__()
        self.save_hyperparameters()

        # Categorical branch: one embedding table per categorical column.
        self.emb = nn.ModuleList([nn.Embedding(x, y) for x, y in emb_dims])
        no_of_embs = sum(y for _, y in emb_dims)
        self.no_of_embs = no_of_embs
        self.cat_dropout = nn.Dropout(cat_dropout)

        # Continuous branch: batch norm -> dropout -> linear -> ReLU.
        self.cont_batchnorm = nn.BatchNorm1d(cont_dim_original)
        self.cont_dropout = nn.Dropout(cont_dropout)
        self.cont_dense = nn.Linear(cont_dim_original, cont_dim)
        self.cont_activation = nn.ReLU()

        # Head over the concatenated branches. Each hidden stage is
        # BatchNorm -> (SiLU, except before the first stage) -> (Dropout) -> Linear,
        # followed by a final Linear to one unit and a Tanh.
        layers = []
        in_dim = cont_dim + no_of_embs
        for i, hidden_dim in enumerate(hidden_dims):
            layers.append(nn.BatchNorm1d(in_dim))
            if i > 0:
                layers.append(nn.SiLU())
            if i < len(dropouts):
                layers.append(nn.Dropout(dropouts[i]))
            layers.append(nn.Linear(in_dim, hidden_dim))
            in_dim = hidden_dim
        layers.append(nn.Linear(in_dim, 1))
        layers.append(nn.Tanh())
        self.model = nn.Sequential(*layers)

        self.lr = lr
        self.weight_decay = weight_decay
        # (prediction, target, weight) per validation batch, consumed at epoch end.
        self.validation_step_outputs = []

    def forward(self, x_cont, x_cat):
        """Predict one value per row.

        ``x_cont`` is fed to ``BatchNorm1d(cont_dim_original)``; column ``i`` of
        ``x_cat`` holds the indices for embedding table ``i``. The Tanh output
        is multiplied by 5, so predictions lie in (-5, 5).
        """
        # Continuous branch.
        x_cont = self.cont_batchnorm(x_cont)
        x_cont = self.cont_dropout(x_cont)
        x_cont = self.cont_dense(x_cont)
        x_cont = self.cont_activation(x_cont)

        # Categorical branch: look up and concatenate the embeddings.
        x_cat = [emb_layer(x_cat[:, i]) for i, emb_layer in enumerate(self.emb)]
        x_cat = torch.cat(x_cat, 1)
        x_cat = self.cat_dropout(x_cat)

        # Joint head.
        x = torch.cat([x_cont, x_cat], 1)

        return 5 * self.model(x).squeeze(-1)

    def training_step(self, batch):
        """Weighted squared error, averaged over the batch; logged as ``train_loss``."""
        x_cont, x_cat, y, w = batch
        y_hat = self(x_cont, x_cat)
        loss = F.mse_loss(y_hat, y, reduction='none') * w
        loss = loss.mean()
        self.log('train_loss', loss, on_step=False, on_epoch=True, batch_size=x_cont.size(0))
        return loss

    def validation_step(self, batch):
        """Same loss as training, logged as ``val_loss``; keeps the batch outputs for epoch-level R²."""
        x_cont, x_cat, y, w = batch
        y_hat = self(x_cont, x_cat)
        loss = F.mse_loss(y_hat, y, reduction='none') * w
        loss = loss.mean()
        self.log('val_loss', loss, on_step=False, on_epoch=True, batch_size=x_cont.size(0))
        self.validation_step_outputs.append((y_hat, y, w))
        return loss

    def on_validation_epoch_end(self):
        """Log the weighted R² (``r2_val``) over all validation batches as ``val_r_square``.

        Nothing is logged during the Trainer's sanity check or when no
        validation batch was collected; the buffer is cleared either way.
        """
        outputs = self.validation_step_outputs
        if outputs and not self.trainer.sanity_checking:
            prob = torch.cat([out[0] for out in outputs]).cpu().numpy()
            y = torch.cat([out[1] for out in outputs]).cpu().numpy()
            weights = torch.cat([out[2] for out in outputs]).cpu().numpy()
            val_r_square = r2_val(y, prob, weights)
            self.log("val_r_square", val_r_square, prog_bar=True, on_step=False, on_epoch=True)
        outputs.clear()

    def configure_optimizers(self):
        """Adam plus a ReduceLROnPlateau schedule driven by ``val_loss``."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr, weight_decay=self.weight_decay)
        # `verbose` is intentionally not passed: it is deprecated in
        # torch.optim.lr_scheduler and was dropped from the signature in later
        # releases.
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)
        return {
            'optimizer': optimizer,
            'lr_scheduler': {
                'scheduler': scheduler,
                'monitor': 'val_loss',
            }
        }

    def on_train_epoch_end(self):
        """Print every metric logged so far, formatted to five decimals."""
        if self.trainer.sanity_checking:
            return
        epoch = self.trainer.current_epoch
        metrics = {k: v.item() if isinstance(v, torch.Tensor) else v for k, v in self.trainer.logged_metrics.items()}
        formatted_metrics = {k: f"{v:.5f}" for k, v in metrics.items()}
        print(f"Epoch {epoch}: {formatted_metrics}")

# Create PyTorch Data Module

In [None]:
import polars as pl

# Per-column lookup from the raw value of each categorical feature to a
# contiguous index starting at 0 (22, 9 and 31 known values respectively).
# Values missing from a column's dict are handled by encode_column's fallback.
category_mappings = {'feature_09': {2: 0, 4: 1, 9: 2, 11: 3, 12: 4, 14: 5, 15: 6, 25: 7, 26: 8, 30: 9, 34: 10, 42: 11, 44: 12, 46: 13, 49: 14, 50: 15, 57: 16, 64: 17, 68: 18, 70: 19, 81: 20, 82: 21},
 'feature_10': {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 10: 7, 12: 8},
 'feature_11': {9: 0, 11: 1, 13: 2, 16: 3, 24: 4, 25: 5, 34: 6, 40: 7, 48: 8, 50: 9, 59: 10, 62: 11, 63: 12, 66: 13,
  76: 14, 150: 15, 158: 16, 159: 17, 171: 18, 195: 19, 214: 20, 230: 21, 261: 22, 297: 23, 336: 24, 376: 25, 388: 26, 410: 27, 522: 28, 534: 29, 539: 30},
}

def encode_column(df, column, mapping):
    """Replace ``column`` of a polars frame with its integer code from ``mapping``.

    Values absent from ``mapping`` are encoded as -1; the result is stored as
    Int16 under the original column name.

    NOTE(review): the encoded column is later used as an ``nn.Embedding`` index;
    a -1 code would presumably be rejected there - confirm unseen categories
    cannot occur, or map them to a valid index.
    """
    encoded = pl.col(column).map_elements(
        lambda category: mapping.get(category, -1),
        return_dtype=pl.Int16,
    )
    return df.with_columns(encoded.alias(column))

# 1. Encode the categorical columns of both lazy frames with the shared mappings.
for col in feature_cat:
    train_df, valid_df = (
        encode_column(frame, col, category_mappings[col])
        for frame in (train_df, valid_df)
    )

# 2. Execute the lazy queries and hand the results over to pandas.
train_df, valid_df = train_df.collect().to_pandas(), valid_df.collect().to_pandas()

# 3. Downcast numeric columns (float16 allowed) to cut memory.
train_df, valid_df = reduce_mem_usage(train_df, False), reduce_mem_usage(valid_df, False)

# (Merging validation rows into train_df via pd.concat was left disabled here.)
print(train_df.shape,valid_df.shape)

# 4. Missing model inputs are filled with 0.
train_df[feature_names] = train_df[feature_names].fillna(0)
valid_df[feature_names] = valid_df[feature_names].fillna(0)

import pytorch_lightning as pl

In [None]:
class custom_args():
    """Plain container for the run configuration (hardware, training, model sizes)."""

    def __init__(self):
        # --- runtime / bookkeeping ---
        self.usegpu = True
        self.gpuid = 0
        self.seed = 42
        self.model = 'nn'
        self.use_wandb = False
        self.project = 'js-xs-nn-with-lags'
        self.dname = "./input_df/"
        self.loader_workers = 4

        # --- optimisation ---
        self.bs = 8192
        self.lr = 1e-3
        self.weight_decay = 5e-4
        self.dropouts = [0.1, 0.1]
        self.n_hidden = [256, 256, 128]
        self.patience = 5
        self.max_epochs = 50
        self.N_fold = 1

        # --- categorical branch ---
        self.n_cont_features = 85
        # One (num_embeddings, embedding_dim) pair per categorical column;
        # the embedding width is (cardinality + 1) // 3, capped at 50.
        self.emb_dims = []
        for n_categories in (23, 10, 32):
            self.emb_dims.append((n_categories, min(50, (n_categories + 1) // 3)))
        self.cat_dropout = 0.1

        # --- continuous branch ---
        self.cont_dim = 512
        self.cont_dropout = 0.1
        

# Create Model and Training

In [None]:
args = custom_args()

# Resolve the hardware once: fall back to CPU when CUDA is requested but absent.
device = torch.device(f'cuda:{args.gpuid}' if torch.cuda.is_available() and args.usegpu else 'cpu')
accelerator = 'gpu' if torch.cuda.is_available() and args.usegpu else 'cpu'
# Dataset tensors stay on the CPU; the Trainer moves each batch to the accelerator.
loader_device = 'cpu'

data_module = DataModule(train_df, batch_size=args.bs, valid_df=valid_df, accelerator=loader_device)

# Drop the notebook-level names; data_module keeps its own references to the frames.
import gc
del train_df,valid_df
gc.collect()


pl.seed_everything(args.seed)

# Train one model per fold.
for fold in range(args.N_fold):
    data_module.setup(fold, args.N_fold)
    model = NN(
        emb_dims = args.emb_dims, 
        cat_dropout = args.cat_dropout,
        cont_dim_original = args.n_cont_features,
        cont_dim = args.cont_dim,
        cont_dropout = args.cont_dropout,
        hidden_dims=args.n_hidden,
        dropouts=args.dropouts,
        lr=args.lr,
        weight_decay=args.weight_decay
    )
    # Logger: Weights & Biases when enabled, otherwise the Trainer's default.
    if args.use_wandb:
        wandb_run = wandb.init(project=args.project, config=vars(args), reinit=True)
        logger = WandbLogger(experiment=wandb_run)
    else:
        logger = None

    # Early stopping and checkpointing both track the validation R² (higher is
    # better). An earlier variant monitored 'val_loss' with mode='min' instead.
    early_stopping = EarlyStopping('val_r_square', patience=args.patience, mode='max', verbose=False)
    checkpoint_callback = ModelCheckpoint(monitor='val_r_square', mode='max', save_top_k=1, verbose=False, filename=f"./models/nn_fold{fold}_train{train_start_dt}_valid{valid_start_dt}.model") 
    
    timer = Timer()
    # Pin a specific GPU only when the GPU accelerator was actually selected.
    # Keying this on args.usegpu alone would hand a device-index list to the
    # CPU accelerator whenever CUDA is unavailable.
    trainer = Trainer(
        max_epochs=args.max_epochs,
        accelerator=accelerator,
        devices=[args.gpuid] if accelerator == 'gpu' else 'auto',
        logger=logger,
        callbacks=[early_stopping, checkpoint_callback, timer],
        enable_progress_bar=True
    )
    # Loaders are passed explicitly so the worker count can be set per run.
    trainer.fit(model, data_module.train_dataloader(args.loader_workers), data_module.val_dataloader(args.loader_workers))
    # The best checkpoint is written by checkpoint_callback.
    print(f'Fold-{fold} Training completed in {timer.time_elapsed("train"):.2f}s')