In [1]:
import random
from pathlib import Path

import numpy as np
import pandas as pd
import rootutils
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
from sklearn.metrics import r2_score
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

rootutils.setup_root(Path.cwd(), indicator=".project-root", pythonpath=True)

from src.trainer import Trainer  # noqa: E402
from src.utils import EarlyStopping  # noqa: E402

device = "cuda" if torch.cuda.is_available() else "cpu"
torch.set_float32_matmul_precision("high")
torch.set_num_threads(1)

In [2]:
min_std = 1e-8
train_val_split = (0.8, 0.2)

In [3]:
def seed_everything(seed: int) -> None:
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

In [4]:
seed_everything(42)

In [5]:
columns = pd.read_csv("../data/train.csv", nrows=1).columns.to_list()
features = columns[1:557]
targets = columns[557:]

In [6]:
class XScaler:
    def __init__(self, min_std=1e-8):
        self.mean: np.ndarray
        self.std: np.ndarray
        self.min_std = min_std

    def fit(self, X: np.ndarray) -> None:
        self.mean = X.mean(axis=0)
        self.std = np.maximum(X.std(axis=0), self.min_std)

    def transform(self, X: np.ndarray) -> np.ndarray:
        X = (X - self.mean.reshape(1, -1)) / self.std.reshape(1, -1)
        return X

In [7]:
class YScaler:
    def __init__(self, min_std=1e-8):
        self.mean: np.ndarray
        self.s: np.ndarray
        self.min_std = min_std

    def fit(self, y: np.ndarray):
        self.mean = y.mean(axis=0)
        self.s = np.maximum(np.sqrt((y * y).mean(axis=0)), self.min_std)

    def transform(self, y: np.ndarray) -> np.ndarray:
        y = (y - self.mean.reshape(1, -1)) / self.s.reshape(1, -1)
        return y

    def inverse_transform(self, y: np.ndarray) -> np.ndarray:
        # override constant columns
        for i in range(self.s.shape[0]):
            if self.s[i] < self.min_std * 1.1:
                y[:, i] = 0
        # undo y scaling
        y = y * self.s.reshape(1, -1) + self.mean.reshape(1, -1)
        return y

In [8]:
def read_data(data_dir: Path, n_rows=10_000) -> tuple[np.ndarray, ...]:
    df = pd.read_csv(
        data_dir.joinpath("train.csv"), nrows=n_rows, usecols=list(range(1, 925))
    ).astype("float32")

    nunique_targets = df[features].nunique().to_numpy()

    weights = pd.read_csv(
        data_dir.joinpath("sample_submission.csv"), nrows=1, usecols=list(range(1, 369))
    ).astype("float32")

    weights = weights.to_numpy().reshape(1, -1)

    X = df[features].to_numpy()
    y = df[targets].to_numpy() * weights

    train_size = int(len(X) * train_val_split[0])

    X_train, y_train = X[:train_size, :], y[:train_size, :]

    X_val, y_val = X[train_size:, :], y[train_size:, :]

    return X_train, y_train, X_val, y_val, nunique_targets

In [9]:
X_train, y_train, X_val, y_val, nunique_targets = read_data(
    Path("../data/"), n_rows=1_000_000
)

In [10]:
xscaler = XScaler()
xscaler.fit(X_train)
X_train = xscaler.transform(X_train)
X_val = xscaler.transform(X_val)

In [11]:
y_mean = np.absolute(y_train).mean(axis=0)

In [12]:
class WaveBlock(nn.Module):
    def __init__(self, in_channels, out_channels, dilation_rates, kernel_size):
        super().__init__()
        self.num_rates = dilation_rates
        self.convs = nn.ModuleList()
        self.filter_convs = nn.ModuleList()
        self.gate_convs = nn.ModuleList()

        self.convs.append(nn.Conv1d(in_channels, out_channels, kernel_size=1))
        dilation_rates = [2**i for i in range(dilation_rates)]
        for dilation_rate in dilation_rates:
            self.filter_convs.append(
                nn.Conv1d(
                    out_channels,
                    out_channels,
                    kernel_size=kernel_size,
                    padding=int((dilation_rate * (kernel_size - 1)) / 2),
                    dilation=dilation_rate,
                )
            )
            self.gate_convs.append(
                nn.Conv1d(
                    out_channels,
                    out_channels,
                    kernel_size=kernel_size,
                    padding=int((dilation_rate * (kernel_size - 1)) / 2),
                    dilation=dilation_rate,
                )
            )
            self.convs.append(nn.Conv1d(out_channels, out_channels, kernel_size=1))

    def forward(self, x):
        x = self.convs[0](x)
        res = x
        for i in range(self.num_rates):
            x = torch.tanh(self.filter_convs[i](x)) * torch.sigmoid(
                self.gate_convs[i](x)
            )
            x = self.convs[i + 1](x)
            res = res + x
        return res

In [23]:
class WaveNet(nn.Module):
    def __init__(self, output_size, inch=9, kernel_size=3):
        super().__init__()
        self.wave_block1 = WaveBlock(inch, 16, 12, kernel_size)
        self.wave_block2 = WaveBlock(16, 32, 8, kernel_size)
        self.wave_block3 = WaveBlock(32, 64, 4, kernel_size)
        self.wave_block4 = WaveBlock(64, 128, 1, kernel_size)
        self.lstm = nn.LSTM(128, 64, num_layers=2, batch_first=True, bidirectional=True)
        self.fc_in = nn.Linear(60 * 128 + 16, 1024)
        self.fc_out = nn.Linear(1024, output_size)
        self._reinitialize()

    def _reinitialize(self):
        """
        Tensorflow/Keras-like initialization
        """
        for name, p in self.named_parameters():
            if "lstm" in name:
                if "weight_ih" in name:
                    nn.init.xavier_uniform_(p.data)
                elif "weight_hh" in name:
                    nn.init.orthogonal_(p.data)
                elif "bias_ih" in name:
                    p.data.fill_(0)
                    # Set forget-gate bias to 1
                    n = p.size(0)
                    p.data[(n // 4) : (n // 2)].fill_(1)
                elif "bias_hh" in name:
                    p.data.fill_(0)
            elif "fc" in name:
                if "weight" in name:
                    nn.init.xavier_uniform_(p.data)
                elif "bias" in name:
                    p.data.fill_(0)

    def forward(self, x):
        x_seq = torch.cat((x[:, :360], x[:, -180:]), dim=1).view(x.shape[0], 9, 60)
        x_scalar = x[:, 360:376]

        x = self.wave_block1(x_seq)
        x = self.wave_block2(x)
        x = self.wave_block3(x)
        x = self.wave_block4(x)
        x = x.permute(0, 2, 1)
        x, _ = self.lstm(x)
        x = x.reshape(x.shape[0], -1)
        x = torch.cat((x, x_scalar), dim=1)
        x = nn.functional.silu(self.fc_in(x))
        x = nn.functional.dropout(x, p=0.2)
        x = self.fc_out(x)
        return x

In [24]:
class FFNN(nn.Module):
    def __init__(self, input_size, hidden_sizes, output_size):
        super().__init__()

        # Initialize the layers
        layers = []
        previous_size = input_size
        for hidden_size in hidden_sizes:
            layers.append(nn.Linear(previous_size, hidden_size))
            layers.append(nn.LayerNorm(hidden_size))  # Normalization layer
            layers.append(nn.LeakyReLU(inplace=True))  # Activation
            layers.append(nn.Dropout(p=0.1))  # Dropout for regularization
            previous_size = hidden_size

        # Output layer - no dropout, no activation function
        layers.append(nn.Linear(previous_size, output_size))

        # Register all layers
        self.layers = nn.Sequential(*layers)

    def forward(self, x):
        return self.layers(x)

In [25]:
class NumpyDataset(Dataset):
    def __init__(self, x, y):
        """
        Initialize with NumPy arrays.
        """
        assert (
            x.shape[0] == y.shape[0]
        ), "Features and labels must have the same number of samples"
        self.x = x
        self.y = y

    def __len__(self):
        """
        Total number of samples.
        """
        return self.x.shape[0]

    def __getitem__(self, index):
        """
        Generate one sample of data.
        """
        # Convert the data to tensors when requested
        return torch.from_numpy(self.x[index]), torch.from_numpy(self.y[index])

In [26]:
train_dataset = NumpyDataset(X_train, y_train)
val_dataset = NumpyDataset(X_val, y_val)

# train_dataset, val_dataset = torch.utils.data.random_split(
#     dataset, train_val_split, generator=torch.Generator().manual_seed(42)
# )

batch_size = 128
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=10,
    generator=torch.Generator().manual_seed(42),
)
val_loader = DataLoader(
    val_dataset, batch_size=batch_size, shuffle=False, num_workers=10
)

input_size = X_train.shape[1]
output_size = y_train.shape[1]

hidden_size = input_size + output_size

model = WaveNet(output_size)

# model = FFNN(
#     input_size,
#     [3 * hidden_size, 2 * hidden_size, hidden_size, 2 * hidden_size, 3 * hidden_size],
#     output_size,
# )

criterion = nn.MSELoss()  # Using MSE for regression
optimizer = optim.Adam(model.parameters(), lr=1e-3)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", factor=0.1, patience=5
)


def preprocessor(
    y_pred: np.ndarray,
    y_true: np.ndarray,
    # y_mean: np.ndarray = y_mean,
    # nunique: np.ndarray = nunique_targets,
) -> tuple[np.ndarray, np.ndarray]:
    scores = r2_score(y_true, y_pred, multioutput="raw_values")

    for idx, score in enumerate(scores):  # type: ignore
        if score <= 0:
            # if score <= 0 or (nunique[idx] == 1 and y_mean[idx] <= 1e-3):
            y_pred[:, idx] = 0

    return y_pred, y_true


early_stopping = EarlyStopping(
    patience=10, verbose=True, delta=0.0, on_each_epoch=False
)

In [27]:
trainer = Trainer(
    model=model,
    loss_func=criterion,
    optimizer=optimizer,
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=100,
    score_funcs={"r2": r2_score},
    device=device,
    checkpoint_dir=Path("../logs/"),
    postprocessor=preprocessor,
    early_stopping=early_stopping,
    lr_scheduler=scheduler,
)
results = trainer.train()

Epoch: 1/100

INFO:/projects/kaggle/LEAP/src/utils.py:
Validation loss decreased (inf --> 0.406775).  Saving model ...


Epoch: 2/100

INFO:/projects/kaggle/LEAP/src/utils.py:
Validation loss decreased (0.406775 --> 0.380290).  Saving model ...


Epoch: 3/100

INFO:/projects/kaggle/LEAP/src/utils.py:
Validation loss decreased (0.380290 --> 0.369800).  Saving model ...


Epoch: 4/100

INFO:/projects/kaggle/LEAP/src/utils.py:
EarlyStopping counter: 1 out of 10


Epoch: 5/100

INFO:/projects/kaggle/LEAP/src/utils.py:
EarlyStopping counter: 2 out of 10


Epoch: 6/100

INFO:/projects/kaggle/LEAP/src/utils.py:
Validation loss decreased (0.369800 --> 0.362186).  Saving model ...


Epoch: 7/100

INFO:/projects/kaggle/LEAP/src/utils.py:
Validation loss decreased (0.362186 --> 0.357999).  Saving model ...


Epoch: 8/100

INFO:/projects/kaggle/LEAP/src/utils.py:
EarlyStopping counter: 1 out of 10


Epoch: 9/100

INFO:/projects/kaggle/LEAP/src/utils.py:
EarlyStopping counter: 2 out of 10


Epoch: 10/100

INFO:/projects/kaggle/LEAP/src/utils.py:
Validation loss decreased (0.357999 --> 0.357835).  Saving model ...


Epoch: 11/100

INFO:/projects/kaggle/LEAP/src/utils.py:
EarlyStopping counter: 1 out of 10


Epoch: 12/100

INFO:/projects/kaggle/LEAP/src/utils.py:
EarlyStopping counter: 2 out of 10


Epoch: 13/100

INFO:/projects/kaggle/LEAP/src/utils.py:
EarlyStopping counter: 3 out of 10


Epoch: 14/100

INFO:/projects/kaggle/LEAP/src/utils.py:
EarlyStopping counter: 4 out of 10


Epoch: 15/100

INFO:/projects/kaggle/LEAP/src/utils.py:
EarlyStopping counter: 5 out of 10


Epoch: 16/100

INFO:/projects/kaggle/LEAP/src/utils.py:
EarlyStopping counter: 6 out of 10


Epoch: 17/100

INFO:/projects/kaggle/LEAP/src/utils.py:
Validation loss decreased (0.357835 --> 0.349194).  Saving model ...


Epoch: 18/100

INFO:/projects/kaggle/LEAP/src/utils.py:
EarlyStopping counter: 1 out of 10


Epoch: 19/100

INFO:/projects/kaggle/LEAP/src/utils.py:
EarlyStopping counter: 2 out of 10


Epoch: 20/100

INFO:/projects/kaggle/LEAP/src/utils.py:
EarlyStopping counter: 3 out of 10


Epoch: 21/100

INFO:/projects/kaggle/LEAP/src/utils.py:
EarlyStopping counter: 4 out of 10


Epoch: 22/100

INFO:/projects/kaggle/LEAP/src/utils.py:
EarlyStopping counter: 5 out of 10


Epoch: 23/100

INFO:/projects/kaggle/LEAP/src/utils.py:
EarlyStopping counter: 6 out of 10


Epoch: 24/100

INFO:/projects/kaggle/LEAP/src/utils.py:
EarlyStopping counter: 7 out of 10


Epoch: 25/100

INFO:/projects/kaggle/LEAP/src/utils.py:
EarlyStopping counter: 8 out of 10


Epoch: 26/100

INFO:/projects/kaggle/LEAP/src/utils.py:
EarlyStopping counter: 9 out of 10


Epoch: 27/100

INFO:/projects/kaggle/LEAP/src/utils.py:
EarlyStopping counter: 10 out of 10




INFO:/projects/kaggle/LEAP/src/trainer.py:Early stopping


In [28]:
def eval(ckpt_dir: Path) -> tuple[np.ndarray, np.ndarray]:
    state_dict = torch.load(f"{ckpt_dir.joinpath(model.__class__.__name__)}_best.ckpt")[
        "model_state_dict"
    ]
    model.to(device)
    model.load_state_dict(state_dict)
    model.eval()

    preds = []
    targets = []
    with torch.no_grad():
        for inputs, labels in tqdm(val_loader):
            preds.append(model(inputs.to(device)).detach().cpu().numpy())
            targets.append(labels.detach().cpu().numpy())
    preds = np.concatenate(preds)
    targets = np.concatenate(targets)

    return preds, targets

In [29]:
val_preds, val_targets = eval(ckpt_dir=Path("../logs/2024-05-09_20-55-17"))

100%|██████████| 1563/1563 [00:09<00:00, 168.92it/s]


In [30]:
r2_score(val_targets, val_preds)

-5.887160960117868e+40

In [31]:
val_r2_scores = r2_score(val_targets, val_preds, multioutput="raw_values")

In [32]:
for idx, score in enumerate(val_r2_scores):  # type: ignore
    if score <= 0:
        val_preds[:, idx] = 0

In [33]:
r2_score(val_targets, val_preds)

0.5984239527794785

In [34]:
def predict(ckpt_dir: Path, filename: str) -> np.ndarray:
    state_dict = torch.load(f"{ckpt_dir.joinpath(model.__class__.__name__)}_best.ckpt")[
        "model_state_dict"
    ]
    model.to(device)
    model.load_state_dict(state_dict)
    model.eval()
    X = pd.read_csv(filename)[features].astype("float32").to_numpy()

    X = xscaler.transform(X)

    test_dataset = NumpyDataset(X, np.zeros((X.shape[0], output_size)))

    test_loader = DataLoader(
        test_dataset, batch_size=batch_size, shuffle=False, drop_last=False
    )

    preds = []
    with torch.no_grad():
        for inputs, _ in tqdm(test_loader):
            preds.append(model(inputs.to(device)).detach().cpu().numpy())
    preds = np.concatenate(preds)

    return preds

In [35]:
preds = predict(
    ckpt_dir=Path("../logs/2024-05-09_20-55-17"), filename="../data/test.csv"
)

100%|██████████| 4883/4883 [00:23<00:00, 210.22it/s]


In [36]:
def postprocessing(
    data_dir: Path,
    y: np.ndarray,
    val_r2_scores: np.ndarray,
    # y_mean: np.ndarray = y_mean,
    # nunique: np.ndarray = nunique_targets,
) -> np.ndarray:
    weights = (
        pd.read_csv(
            data_dir.joinpath("sample_submission.csv"),
            nrows=1,
            usecols=list(range(1, 369)),
        )
        .astype("float32")
        .to_numpy()
        .reshape(1, -1)
    )

    for idx, weight in enumerate(weights[0]):
        if weight > 0:
            continue
        else:
            y[:, idx] = 0

    for idx, score in enumerate(val_r2_scores):  # type:ignore
        if score <= 0:
        # if score <= 0 or (nunique[idx] == 1 and y_mean[idx] <= 1e-3):
            y[:, idx] = 0

    return y

In [37]:
preds = postprocessing(
    data_dir=Path("../data/"),
    y=preds,
    val_r2_scores=val_r2_scores,  # type:ignore
)

In [38]:
def submit(data_dir: Path, y: np.ndarray) -> None:
    samples_submission = pd.read_csv(data_dir.joinpath("sample_submission.csv"))
    samples_submission.iloc[:, 1:] = y
    samples_submission.to_parquet(
        data_dir.joinpath("submission_wavenet_lstm_10_percent_of_data.parquet"), index=False
    )

In [39]:
submit(Path("../data/"), preds)