In [1]:
!pip install -qq pytorch_tabnet

In [48]:

from pytorch_tabnet.tab_model import TabNetRegressor
from tqdm import tqdm
from itertools import product
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GroupKFold
from sklearn.impute import SimpleImputer
import warnings

In [49]:

# Globally silence every warning category for the remainder of the session.
# NOTE(review): this is a blanket filter, so deprecation and convergence
# warnings are hidden as well.
warnings.filterwarnings(action="ignore")


In [50]:
# Offsets used to spell out the lagged column names:
# hour indices 0..5 and minute marks 00, 05, ..., 55.
hours = range(6)
minutes = range(0, 60, 5)

In [51]:
# Column roles used throughout the notebook:
#   target_col - regression target taken from the training frame
#   group_col  - grouping key handed to the cross-validation splitter
#   date_col   - timestamp column the hour-of-day features are derived from
target_col, group_col, date_col = 'bg+1-00', 'p_num', 'time'

In [52]:
# Number of lagged readings kept per signal. With `hours`/`minutes` as defined
# above, the first 12 (hour, minute) pairs are hour index 0, minutes 00..55.
N_LAGS = 12


def _lag_columns(prefix, n_lags=N_LAGS):
    """Return the first ``n_lags`` lagged column names for one signal.

    Names follow the pattern ``{prefix}-{hour}-{minute:02d}`` and are generated
    in ``product(hours, minutes)`` order (hour-major), then truncated.

    Parameters
    ----------
    prefix : str
        Signal name that starts every column (e.g. ``'bg'``).
    n_lags : int, optional
        How many names to keep from the front of the sequence.

    Returns
    -------
    list[str]
        Column names, e.g. ``['bg-0-00', 'bg-0-05', ...]``.
    """
    return [f'{prefix}-{h}-{m:02d}' for h, m in product(hours, minutes)][:n_lags]


# One list of lagged column names per signal family. `product` comes from the
# notebook's import cell, so it is not re-imported here.
bg_cols = _lag_columns('bg')
insu_cols = _lag_columns('insulin')
carb_cols = _lag_columns('carbs')
hr_cols = _lag_columns('hr')
step_cols = _lag_columns('steps')
cals_cols = _lag_columns('cals')


In [53]:
# Model inputs: every signal family's lag columns, concatenated in a fixed
# order into one fresh list (the per-family lists are left untouched).
feature_cols = [
    *bg_cols,
    *insu_cols,
    *carb_cols,
    *hr_cols,
    *step_cols,
    *cals_cols,
]


In [54]:
# Load the three competition files. Train and test are indexed by `id` and
# have their `time` column parsed as datetimes; the sample submission is only
# indexed by `id`.
df_train = pd.read_csv(
    'Dataset/train.csv/train.csv',
    parse_dates=['time'],
    index_col='id',
)
df_test = pd.read_csv(
    'Dataset/test (1).csv',
    parse_dates=['time'],
    index_col='id',
)
df_subm = pd.read_csv(
    'Dataset/sample_submission.csv',
    index_col='id',
)

In [55]:
# Replace every ':' in the column labels with '-' so the headers line up with
# the generated names in the *_cols lists (e.g. 'bg-0:00' -> 'bg-0-00').
for frame in (df_train, df_test):
    frame.columns = frame.columns.str.replace(':', '-')

In [56]:
# Fill gaps inside each signal's window by interpolating across that signal's
# columns (axis=1), i.e. within a single row. Every signal family is handled
# on its own so values never bleed from one signal into another, and the same
# treatment is applied to the training and the test frame.
# NOTE(review): with pandas' default fill direction, gaps at the start of a
# row's window may remain NaN and fall through to the imputer — confirm this
# is intended.
signal_families = (bg_cols, insu_cols, carb_cols, hr_cols, step_cols, cals_cols)
for colset in signal_families:
    for frame in (df_train, df_test):
        frame[colset] = frame[colset].interpolate(axis=1)

In [57]:
# Fill whatever is still missing after interpolation with per-column means.
# The statistics are learned from the training frame only and then reused
# unchanged for the test frame.
imputer = SimpleImputer(strategy='mean')
imputer.fit(df_train[feature_cols])
df_train[feature_cols] = imputer.transform(df_train[feature_cols])
df_test[feature_cols] = imputer.transform(df_test[feature_cols])

In [58]:
# Encode the hour of day cyclically: hour h is mapped to the angle pi*h/12
# (a full turn over 24 hours) and stored as its sine and cosine, so that
# 23:00 and 00:00 end up close together. Applied to both frames.
for frame in (df_train, df_test):
    hour_angle = np.pi * frame[date_col].dt.hour / 12
    frame['sin_hour'] = np.sin(hour_angle)
    frame['cos_hour'] = np.cos(hour_angle)

In [59]:
# Register the cyclical hour encodings as model inputs.
# The membership check keeps this cell idempotent: a plain `extend` would add
# the two names again on every re-run and feed duplicated columns to the model.
# The list is still mutated in place, so existing references stay valid.
for extra_col in ('sin_hour', 'cos_hour'):
    if extra_col not in feature_cols:
        feature_cols.append(extra_col)

In [60]:
# For each signal family, collect the positions its columns occupy inside
# `feature_cols`. The result is one list of integer indices per family, in
# family order; columns that belong to no family are not listed anywhere.
grouped_features = []
for colset in (bg_cols, insu_cols, carb_cols, hr_cols, step_cols, cals_cols):
    positions = [idx for idx, col in enumerate(feature_cols) if col in colset]
    grouped_features.append(positions)

In [61]:
# Cross-validation setup.
# `groups` carries the grouping key, so rows sharing a key never end up on
# both sides of a train/validation split.
cv = GroupKFold(n_splits=5)
groups = df_train[group_col]

# Feature matrix and target. The target is selected with a list so it stays a
# single-column DataFrame (2-D once converted with `.to_numpy()`).
X = df_train[feature_cols]
y = df_train[[target_col]]

In [62]:
# Start the prediction column at zero; the training loop below adds each
# fold's share of the test predictions onto it.
df_subm['bg+1:00'] = 0.0

In [42]:
# Grouped K-fold training of a default TabNet regressor (splits are grouped by
# `groups`, i.e. the `p_num` column). One model is fitted per fold and the test
# predictions of all folds are averaged.
#
# The average is accumulated in a local array and written to the submission
# frame once at the end, which makes the cell safe to re-run: accumulating
# with `+=` directly into `df_subm` would stack predictions on every re-run.
n_folds = cv.get_n_splits()                 # derive the divisor from the splitter
X_test = df_test[feature_cols].to_numpy()   # loop-invariant, so built once
test_pred = np.zeros(len(X_test))

for idx_train, idx_valid in cv.split(X, y, groups=groups):
    # Split data into training and validation sets
    X_train, y_train = X.iloc[idx_train].to_numpy(), y.iloc[idx_train].to_numpy()
    X_valid, y_valid = X.iloc[idx_valid].to_numpy(), y.iloc[idx_valid].to_numpy()

    # Initialize and train the TabNet Regressor; the held-out fold is passed as
    # the evaluation set and RMSE is reported on it during training.
    reg = TabNetRegressor(grouped_features=grouped_features)
    reg.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric=['rmse'],
        batch_size=1024
    )

    # predict() returns one column per target; take the single target column
    # and add this fold's equal share of the ensemble average.
    test_pred += reg.predict(X_test)[:, 0] / n_folds

df_subm['bg+1:00'] = test_pred
    

epoch 0  | loss: 6.77065 | val_0_rmse: 3.02622 |  0:00:18s
epoch 1  | loss: 4.16207 | val_0_rmse: 2.27287 |  0:00:38s
epoch 2  | loss: 4.08278 | val_0_rmse: 2.20317 |  0:00:55s
epoch 3  | loss: 4.01434 | val_0_rmse: 2.23264 |  0:01:13s
epoch 4  | loss: 3.97    | val_0_rmse: 2.19876 |  0:01:30s
epoch 5  | loss: 3.96122 | val_0_rmse: 2.23899 |  0:01:46s
epoch 6  | loss: 3.92669 | val_0_rmse: 2.189   |  0:02:04s
epoch 7  | loss: 3.88666 | val_0_rmse: 2.18074 |  0:02:25s
epoch 8  | loss: 3.86387 | val_0_rmse: 2.21623 |  0:02:42s
epoch 9  | loss: 3.85774 | val_0_rmse: 2.19852 |  0:03:00s
epoch 10 | loss: 3.86054 | val_0_rmse: 2.22196 |  0:03:19s
epoch 11 | loss: 3.86224 | val_0_rmse: 2.23303 |  0:03:37s
epoch 12 | loss: 3.83942 | val_0_rmse: 2.1881  |  0:03:58s
epoch 13 | loss: 3.81797 | val_0_rmse: 2.18083 |  0:04:15s
epoch 14 | loss: 3.82019 | val_0_rmse: 2.17728 |  0:04:33s
epoch 15 | loss: 3.8251  | val_0_rmse: 2.17673 |  0:04:51s
epoch 16 | loss: 3.79224 | val_0_rmse: 2.17644 |  0:05:1

In [43]:
# Write the averaged fold predictions (index `id` included) to disk.
df_subm.to_csv(path_or_buf='submissionfinal.csv')

TabNet with Custom Hyperparameters

In [None]:
# Import torch directly. It was previously pulled in through a private optuna
# module, which only works by accident and breaks when optuna's internals move.
import torch

# Train and evaluate model
# Same grouped K-fold scheme as the baseline, but with explicit TabNet
# hyperparameters and early stopping.
#
# The fold average is built in a fresh local array and then *assigned* to the
# submission column. Adding onto the existing column would stack these
# predictions on top of whatever an earlier training cell already stored
# there, roughly doubling every value in the exported file.
n_folds = cv.get_n_splits()
X_test = df_test[feature_cols].to_numpy()   # loop-invariant, so built once
test_pred = np.zeros(len(X_test))

for idx_train, idx_valid in cv.split(X, y, groups=groups):
    X_train, y_train = X.iloc[idx_train].to_numpy(), y.iloc[idx_train].to_numpy()
    X_valid, y_valid = X.iloc[idx_valid].to_numpy(), y.iloc[idx_valid].to_numpy()

    reg = TabNetRegressor(
        n_d=32,                 # width of the decision (prediction) layer
        n_a=32,                 # width of the attention embedding
        n_steps=5,              # number of sequential decision steps
        gamma=1.5,              # feature re-use coefficient in the masks
        lambda_sparse=1e-3,     # weight of the sparsity regulariser
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=1e-3),
    )

    # Stop once validation RMSE has not improved for `patience` epochs,
    # and after `max_epochs` at the latest.
    reg.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric=['rmse'],
        batch_size=1024,
        max_epochs=50,
        patience=10,
    )

    # Single target column; each fold contributes an equal share.
    test_pred += reg.predict(X_test)[:, 0] / n_folds

df_subm['bg+1:00'] = test_pred

In [None]:
# Export the current contents of the submission frame (index `id` included).
df_subm.to_csv(path_or_buf='submission3.csv')


In [None]:
# NOTE(review): this binds optuna's `integration.shap` submodule to the name
# `shap`, not the standalone SHAP package. `Explainer` and `summary_plot`
# below are SHAP-package APIs, so this cell presumably needs `import shap`
# instead — confirm and fix before relying on it.
from optuna.integration import shap

# Feature importance analysis with SHAP
# `reg` is whichever model the most recently executed training loop left
# behind (its last fold), and `X` is the full training feature frame.
# NOTE(review): verify that Explainer accepts a TabNetRegressor directly (it
# may need a prediction callable plus background data), and that explaining
# every training row is affordable.
explainer = shap.Explainer(reg)
shap_values = explainer(X)
shap.summary_plot(shap_values, feature_names=feature_cols)

In [None]:
# Visualize feature importances
plt.figure(figsize=(12, 8))
sns.barplot(
    y=feature_cols[:20],
    x=reg.feature_importances_[:20]
)
plt.title("Feature Importance")
plt.show()