In [1]:
import pandas as pd
import polars as pl
import numpy as np
import os
from tqdm.auto import tqdm
from matplotlib import pyplot as plt
import pickle

from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None

In [2]:
class CONFIG:
    seed = 42
    target_col = "responder_6"
    feature_cols = ["symbol_id", "time_id"] \
        + [f"feature_{idx:02d}" for idx in range(79)] \
        + [f"responder_{idx}_lag_1" for idx in range(9)]
    label_col = "weight"
    categorical_cols = []

In [3]:
##del feature
col = ['feature_09', 'feature_10', 'feature_11', 'feature_63', 'feature_16', 'feature_40', 'feature_64', 'feature_71', 'feature_43']
for c in col:
    CONFIG.feature_cols.remove(c)

In [4]:
df = pl.scan_parquet("./training.parquet").collect().to_pandas()
valid = pl.scan_parquet("./validation.parquet").collect().to_pandas()
df = pd.concat([df, valid]).reset_index(drop=True)
df.shape, valid.shape

((10955824, 104), (527560, 104))

In [5]:
X_train = df[ CONFIG.feature_cols ]
y_train = df[ CONFIG.target_col ]
w_train = df[ "weight" ]
X_valid = valid[ CONFIG.feature_cols ]
y_valid = valid[ CONFIG.target_col ]
w_valid = valid[ "weight" ]

X_train.shape, y_train.shape, w_train.shape, X_valid.shape, y_valid.shape, w_valid.shape

((10955824, 81), (10955824,), (10955824,), (527560, 81), (527560,), (527560,))

In [6]:
import os
import warnings
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from pytorch_lightning import (LightningDataModule, LightningModule, Trainer)
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, Timer
from pytorch_lightning.loggers import WandbLogger
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader


class custom_args():
    def __init__(self):
        self.usegpu = True
        self.gpuid = 0
        self.seed = 42
        self.model = 'nn'
        self.project = 'js-xs-nn-with-lags'
        self.dname = "./input_df/"
        self.loader_workers = 0
        self.bs = 8192
        self.lr = 1e-3
        self.weight_decay = 5e-4
        self.dropouts = [0.15, 0.15]
        self.n_hidden = [512, 512, 256]
        self.patience = 10
        self.max_epochs = 2000
        self.N_fold = 5


my_args = custom_args()

In [16]:
class CustomDataset(Dataset):
    def __init__(self, df, accelerator):
        self.features = torch.FloatTensor(df[CONFIG.feature_cols].values).to(accelerator)
        self.labels = torch.FloatTensor(df[CONFIG.target_col].values).to(accelerator)
        self.weights = torch.FloatTensor(df[CONFIG.label_col].values).to(accelerator)
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        x = self.features[idx]
        y = self.labels[idx]
        w = self.weights[idx]
        return x, y, w


class DataModule(LightningDataModule):
    def __init__(self, train_df, batch_size, valid_df=None, accelerator='cpu'):
        super().__init__()
        self.df = train_df
        self.batch_size = batch_size
        self.dates = self.df['date_id'].unique()
        self.accelerator = accelerator
        self.train_dataset = None
        self.valid_df = None
        if valid_df is not None:
            self.valid_df = valid_df
        self.val_dataset = None

    def setup(self, fold=0, N_fold=5, stage=None):
        # Split dataset
        #selected_dates = [date for ii, date in enumerate(self.dates) if ii % N_fold != fold]
        selected_dates = self.dates[self.dates % N_fold != 4].tolist()
        print(selected_dates)
        df_train = self.df.loc[self.df['date_id'].isin(selected_dates)]
        #df_train = df
        self.train_dataset = CustomDataset(df_train, self.accelerator)
        if self.valid_df is not None:
            df_valid = self.valid_df
            self.val_dataset = CustomDataset(df_valid, self.accelerator)

    def train_dataloader(self, n_workers=0):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=n_workers)

    def val_dataloader(self, n_workers=0):
        return DataLoader(self.val_dataset, batch_size=self.batch_size, shuffle=False, num_workers=n_workers)

In [17]:
# Custom R2 metric for validation
def r2_val(y_true, y_pred, sample_weight):
    r2 = 1 - np.average((y_pred - y_true) ** 2, weights=sample_weight) / (np.average((y_true) ** 2, weights=sample_weight) + 1e-38)
    return r2


class NN(LightningModule):
    def __init__(self, input_dim, hidden_dims, dropouts, lr, weight_decay):
        super().__init__()
        self.save_hyperparameters()
        layers = []
        in_dim = input_dim
        for i, hidden_dim in enumerate(hidden_dims):
            layers.append(nn.BatchNorm1d(in_dim))
            if i > 0:
                layers.append(nn.SiLU())
            if i < len(dropouts):
                layers.append(nn.Dropout(dropouts[i]))
            layers.append(nn.Linear(in_dim, hidden_dim))
            # layers.append(nn.ReLU())
            in_dim = hidden_dim
        layers.append(nn.Linear(in_dim, 1)) 
        layers.append(nn.Tanh())
        self.model = nn.Sequential(*layers)
        self.lr = lr
        self.weight_decay = weight_decay
        self.validation_step_outputs = []

    def forward(self, x):
        return 5 * self.model(x).squeeze(-1)  

    def training_step(self, batch):
        x, y, w = batch
        y_hat = self(x)
        loss = F.mse_loss(y_hat, y, reduction='none') * w  #
        loss = loss.mean()
        self.log('train_loss', loss, on_step=False, on_epoch=True, batch_size=x.size(0))
        return loss

    def validation_step(self, batch):
        x, y, w = batch
        y_hat = self(x)
        loss = F.mse_loss(y_hat, y, reduction='none') * w
        loss = loss.mean()
        self.log('val_loss', loss, on_step=False, on_epoch=True, batch_size=x.size(0))
        self.validation_step_outputs.append((y_hat, y, w))
        return loss

    def on_validation_epoch_end(self):
        """Calculate validation WRMSE at the end of the epoch."""
        y = torch.cat([x[1] for x in self.validation_step_outputs]).cpu().numpy()
        if self.trainer.sanity_checking:
            prob = torch.cat([x[0] for x in self.validation_step_outputs]).cpu().numpy()
        else:
            prob = torch.cat([x[0] for x in self.validation_step_outputs]).cpu().numpy()
            weights = torch.cat([x[2] for x in self.validation_step_outputs]).cpu().numpy()
            # r2_val
            val_r_square = r2_val(y, prob, weights)
            self.log("val_r_square", val_r_square, prog_bar=True, on_step=False, on_epoch=True)
        self.validation_step_outputs.clear()

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr, weight_decay=self.weight_decay)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5,
                                                               verbose=True)
        return {
            'optimizer': optimizer,
            'lr_scheduler': {
                'scheduler': scheduler,
                'monitor': 'val_loss',
            }
        }

    def on_train_epoch_end(self):
        if self.trainer.sanity_checking:
            return
        epoch = self.trainer.current_epoch
        metrics = {k: v.item() if isinstance(v, torch.Tensor) else v for k, v in self.trainer.logged_metrics.items()}
        formatted_metrics = {k: f"{v:.5f}" for k, v in metrics.items()}
        print(f"Epoch {epoch}: {formatted_metrics}")

In [18]:
args = my_args

# checking device
device = torch.device(f'cuda:{args.gpuid}' if torch.cuda.is_available() and args.usegpu else 'cpu')
accelerator = 'gpu' if torch.cuda.is_available() and args.usegpu else 'cpu'
loader_device = 'cpu'
print(accelerator)

# Initialize Data Module

df[CONFIG.feature_cols] = df[CONFIG.feature_cols].fillna(method = 'ffill').fillna(0)
valid[CONFIG.feature_cols] = valid[CONFIG.feature_cols].fillna(method = 'ffill').fillna(0)

#df[CONFIG.feature_cols] = df.groupby("symbol_id")[CONFIG.feature_cols].transform(lambda x : x.interpolate())
data_module = DataModule(df, batch_size=args.bs, valid_df=valid, accelerator=loader_device)

gpu


In [19]:
pl.seed_everything(args.seed)
for fold in range(args.N_fold):
    data_module.setup(fold, args.N_fold)
    # Obtain input dimension
    input_dim = data_module.train_dataset.features.shape[1]
    # Initialize Model
    model = NN(
        input_dim=input_dim,
        hidden_dims=args.n_hidden,
        dropouts=args.dropouts,
        lr=args.lr,
        weight_decay=args.weight_decay
    )
    logger = None
    # Initialize Callbacks
    early_stopping = EarlyStopping('val_loss', patience=args.patience, mode='min', verbose=False)
    checkpoint_callback = ModelCheckpoint(monitor='val_loss', mode='min', save_top_k=1, verbose=False, filename=f"./models/nn_{fold}.model") 
    timer = Timer()
    # Initialize Trainer
    trainer = Trainer(
        max_epochs=args.max_epochs,
        accelerator=accelerator,
        devices=[args.gpuid] if args.usegpu else 'auto',
        logger=logger,
        callbacks=[early_stopping, checkpoint_callback, timer],
        enable_progress_bar=True
    )
    # Start Training
    trainer.fit(model, data_module.train_dataloader(n_workers=0), data_module.val_dataloader(n_workers=0))
    # You can find trained best model in your local path
    print(f'Fold-{fold} Training completed in {timer.time_elapsed("train"):.2f}s')

Seed set to 42


[1401, 1402, 1403, 1405, 1406, 1407, 1408, 1410, 1411, 1412, 1413, 1415, 1416, 1417, 1418, 1420, 1421, 1422, 1423, 1425, 1426, 1427, 1428, 1430, 1431, 1432, 1433, 1435, 1436, 1437, 1438, 1440, 1441, 1442, 1443, 1445, 1446, 1447, 1448, 1450, 1451, 1452, 1453, 1455, 1456, 1457, 1458, 1460, 1461, 1462, 1463, 1465, 1466, 1467, 1468, 1470, 1471, 1472, 1473, 1475, 1476, 1477, 1478, 1480, 1481, 1482, 1483, 1485, 1486, 1487, 1488, 1490, 1491, 1492, 1493, 1495, 1496, 1497, 1498, 1500, 1501, 1502, 1503, 1505, 1506, 1507, 1508, 1510, 1511, 1512, 1513, 1515, 1516, 1517, 1518, 1520, 1521, 1522, 1523, 1525, 1526, 1527, 1528, 1530, 1531, 1532, 1533, 1535, 1536, 1537, 1538, 1540, 1541, 1542, 1543, 1545, 1546, 1547, 1548, 1550, 1551, 1552, 1553, 1555, 1556, 1557, 1558, 1560, 1561, 1562, 1563, 1565, 1566, 1567, 1568, 1570, 1571, 1572, 1573, 1575, 1576, 1577, 1578, 1580, 1581, 1582, 1583, 1585, 1586, 1587, 1588, 1590, 1591, 1592, 1593, 1595, 1596, 1597, 1598, 1600, 1601, 1602, 1603, 1605, 1606, 1607, 160

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 438 K  | train
---------------------------------------------
438 K     Trainable params
0         Non-trainable params
438 K     Total params
1.754     Total estimated model params size (MB)
13        Modules in train mode
0         Modules in eval mode


Sanity Checking: |                                                                               | 0/? [00:00<…

Training: |                                                                                      | 0/? [00:00<…

Validation: |                                                                                    | 0/? [00:00<…

Epoch 0: {'val_loss': '1.07464', 'val_r_square': '0.00486', 'train_loss': '1.56295'}


Validation: |                                                                                    | 0/? [00:00<…

Epoch 1: {'val_loss': '1.07207', 'val_r_square': '0.00725', 'train_loss': '1.51289'}


Validation: |                                                                                    | 0/? [00:00<…

Epoch 2: {'val_loss': '1.07334', 'val_r_square': '0.00607', 'train_loss': '1.51163'}


Validation: |                                                                                    | 0/? [00:00<…

Epoch 3: {'val_loss': '1.07352', 'val_r_square': '0.00590', 'train_loss': '1.51263'}


Validation: |                                                                                    | 0/? [00:00<…

Epoch 4: {'val_loss': '1.07190', 'val_r_square': '0.00741', 'train_loss': '1.51293'}


Validation: |                                                                                    | 0/? [00:00<…

Epoch 5: {'val_loss': '1.07384', 'val_r_square': '0.00561', 'train_loss': '1.51278'}


Validation: |                                                                                    | 0/? [00:00<…

Epoch 6: {'val_loss': '1.07347', 'val_r_square': '0.00594', 'train_loss': '1.51195'}


Validation: |                                                                                    | 0/? [00:00<…

Epoch 7: {'val_loss': '1.07384', 'val_r_square': '0.00561', 'train_loss': '1.51156'}


Validation: |                                                                                    | 0/? [00:00<…

Epoch 8: {'val_loss': '1.07241', 'val_r_square': '0.00693', 'train_loss': '1.51072'}


Validation: |                                                                                    | 0/? [00:00<…

Epoch 9: {'val_loss': '1.07194', 'val_r_square': '0.00737', 'train_loss': '1.51035'}


Validation: |                                                                                    | 0/? [00:00<…

Epoch 10: {'val_loss': '1.07358', 'val_r_square': '0.00585', 'train_loss': '1.51007'}
Epoch 00011: reducing learning rate of group 0 to 5.0000e-04.


Validation: |                                                                                    | 0/? [00:00<…

Epoch 11: {'val_loss': '1.07143', 'val_r_square': '0.00784', 'train_loss': '1.50784'}


Validation: |                                                                                    | 0/? [00:00<…

Epoch 12: {'val_loss': '1.07097', 'val_r_square': '0.00826', 'train_loss': '1.50706'}


Validation: |                                                                                    | 0/? [00:00<…

Epoch 13: {'val_loss': '1.07145', 'val_r_square': '0.00782', 'train_loss': '1.50708'}


Validation: |                                                                                    | 0/? [00:00<…

Epoch 14: {'val_loss': '1.07163', 'val_r_square': '0.00765', 'train_loss': '1.50657'}


Validation: |                                                                                    | 0/? [00:00<…

Epoch 15: {'val_loss': '1.07000', 'val_r_square': '0.00917', 'train_loss': '1.50622'}


Validation: |                                                                                    | 0/? [00:00<…

Epoch 16: {'val_loss': '1.07065', 'val_r_square': '0.00856', 'train_loss': '1.50629'}


Validation: |                                                                                    | 0/? [00:00<…

Epoch 17: {'val_loss': '1.06963', 'val_r_square': '0.00951', 'train_loss': '1.50607'}


Validation: |                                                                                    | 0/? [00:00<…

Epoch 18: {'val_loss': '1.07043', 'val_r_square': '0.00876', 'train_loss': '1.50596'}


Validation: |                                                                                    | 0/? [00:00<…

Epoch 19: {'val_loss': '1.06969', 'val_r_square': '0.00945', 'train_loss': '1.50601'}


Validation: |                                                                                    | 0/? [00:00<…

Epoch 20: {'val_loss': '1.07013', 'val_r_square': '0.00904', 'train_loss': '1.50566'}


Validation: |                                                                                    | 0/? [00:00<…

Epoch 21: {'val_loss': '1.07041', 'val_r_square': '0.00878', 'train_loss': '1.50543'}


Validation: |                                                                                    | 0/? [00:00<…

Epoch 22: {'val_loss': '1.07057', 'val_r_square': '0.00863', 'train_loss': '1.50532'}


Validation: |                                                                                    | 0/? [00:00<…

Epoch 23: {'val_loss': '1.06877', 'val_r_square': '0.01030', 'train_loss': '1.50513'}


Validation: |                                                                                    | 0/? [00:00<…

Epoch 24: {'val_loss': '1.06921', 'val_r_square': '0.00989', 'train_loss': '1.50492'}


Validation: |                                                                                    | 0/? [00:00<…

Epoch 25: {'val_loss': '1.07013', 'val_r_square': '0.00904', 'train_loss': '1.50486'}


Validation: |                                                                                    | 0/? [00:00<…

Epoch 26: {'val_loss': '1.07114', 'val_r_square': '0.00810', 'train_loss': '1.50452'}



Detected KeyboardInterrupt, attempting graceful shutdown ...


NameError: name 'exit' is not defined