In [1]:
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor
from pytorch_tabnet.pretraining import TabNetPretrainer
import torch
import numpy as np, pandas as pd, os
from sklearn.model_selection import cross_val_score, StratifiedKFold
import xgboost as xgb
import plotly.express as px, seaborn as sns, matplotlib.pyplot as plt
sns.set_style('darkgrid')
from sklearn.metrics import make_scorer, cohen_kappa_score
path = '../input/child-mind-institute-problematic-internet-use/'

train = pd.read_csv(path + 'train.csv', index_col = 'id')
print("The train data has the shape: ",train.shape)
test = pd.read_csv(path + 'test.csv', index_col = 'id')
print("The test data has the shape: ",test.shape)
print("")
print("Total number of missing training values: ", train.isna().sum().sum())
train_cat_columns = train.select_dtypes(exclude = 'number').columns

for season in train_cat_columns:
    train[season] = train[season].replace({'Spring':1, 'Summer':2, 'Fall':3, 'Winter':4})
PCIAT_cols = [val for val in train.columns[train.columns.str.contains('PCIAT')]]
PCIAT_cols.remove('PCIAT-PCIAT_Total')
train = train.drop(columns = PCIAT_cols)
train = train.dropna(subset='sii')




The train data has the shape:  (3960, 81)
The test data has the shape:  (20, 58)

Total number of missing training values:  131717


  train[season] = train[season].replace({'Spring':1, 'Summer':2, 'Fall':3, 'Winter':4})


In [2]:
def stratified_split_data(data, ratio=0.7):
    train_model = []
    train_val = []

    # Loop over each unique category in 'sii'
    for sii_value in data['sii'].unique():
        subset = data[data['sii'] == sii_value]  # Filter by current sii value
        
        # Shuffle the subset for randomness
        subset = subset.sample(frac=1).reset_index(drop=True)
        
        # Split the data for this sii category
        split_index = round(len(subset) * ratio)
        train_model.append(subset[:split_index])
        train_val.append(subset[split_index:])

        print(f"sii = {sii_value} | Train: {len(train_model[-1])} | Val: {len(train_val[-1])}")

    # Concatenate all sii-category splits
    train_model = pd.concat(train_model).reset_index(drop=True)
    train_val = pd.concat(train_val).reset_index(drop=True)

    return train_model, train_val


def convert(scores):
    scores = np.array(scores)*1.3
    bins = np.zeros_like(scores)
    bins[scores <= 30] = 0
    bins[(scores > 30) & (scores < 50)] = 1
    bins[(scores >= 50) & (scores < 80)] = 2
    bins[scores >= 80] = 3
    return bins
def quadratic_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')



In [3]:
from pytorch_tabnet.metrics import Metric
class QWK(Metric):
    def __init__(self):
        self._name = "QWK"
        self._maximize = True
    def __call__(self, y_true, y_score):
        return quadratic_kappa(convert(y_true),convert(y_score[:, 1]))

In [4]:
unsupervised_model = TabNetPretrainer(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    mask_type="sparsemax" # "sparsemax"
)



In [5]:
from sklearn.impute import SimpleImputer

train_model,train_val=stratified_split_data(train, ratio=0.8)

y_train = train_model["PCIAT-PCIAT_Total"]
X_train = train_model.drop(columns = ['PCIAT-PCIAT_Total','sii'])
y_val = train_val["PCIAT-PCIAT_Total"]
X_val = train_val.drop(columns = ['PCIAT-PCIAT_Total','sii'])
def median_impute(x):
    return SimpleImputer(strategy='median').fit_transform(x)


sii = 2.0 | Train: 302 | Val: 76
sii = 0.0 | Train: 1275 | Val: 319
sii = 1.0 | Train: 584 | Val: 146
sii = 3.0 | Train: 27 | Val: 7


In [6]:
np.reshape(y_train,(-1,1)).shape

(2188, 1)

In [7]:

clf = TabNetRegressor()  #TabNetRegressor()
clf.fit(
    median_impute(X_train), np.reshape(y_train,(-1,1)),
    eval_set=[(median_impute(X_val), np.reshape(y_val,(-1,1)))],
    eval_name=['valid'],
            eval_metric=['mse'],
            max_epochs=500,
            patience=50,
            batch_size=1024,
            virtual_batch_size=128,
            num_workers=0,
            drop_last=False,
)





epoch 0  | loss: 1237.13913| valid_mse: 1659.55098|  0:00:00s
epoch 1  | loss: 1179.59896| valid_mse: 1028.94551|  0:00:00s
epoch 2  | loss: 1129.89179| valid_mse: 782.12074|  0:00:00s
epoch 3  | loss: 1085.97889| valid_mse: 720.3843|  0:00:00s
epoch 4  | loss: 1037.51923| valid_mse: 676.68419|  0:00:01s
epoch 5  | loss: 982.19774| valid_mse: 612.06983|  0:00:01s
epoch 6  | loss: 931.9682| valid_mse: 566.72749|  0:00:01s
epoch 7  | loss: 864.46088| valid_mse: 550.96457|  0:00:01s
epoch 8  | loss: 797.74312| valid_mse: 555.01002|  0:00:02s
epoch 9  | loss: 726.29408| valid_mse: 635.13141|  0:00:02s
epoch 10 | loss: 652.84161| valid_mse: 944.15218|  0:00:02s
epoch 11 | loss: 579.32155| valid_mse: 881.0662|  0:00:02s
epoch 12 | loss: 514.98127| valid_mse: 639.03158|  0:00:02s
epoch 13 | loss: 451.65722| valid_mse: 558.10742|  0:00:02s
epoch 14 | loss: 401.26043| valid_mse: 612.91549|  0:00:03s
epoch 15 | loss: 364.4476| valid_mse: 830.60864|  0:00:03s
epoch 16 | loss: 347.36505| valid_mse



In [8]:
# Make predictions on test data for each category
test_cat = test.select_dtypes(exclude = 'number').columns

for season in test_cat:
    test[season] = test[season].replace({'Spring':1, 'Summer':2, 'Fall':3, 'Winter':4})

preds = clf.predict(median_impute(test))

  test[season] = test[season].replace({'Spring':1, 'Summer':2, 'Fall':3, 'Winter':4})


In [9]:
preds.ravel()

array([36.845295, 24.542248, 37.02198 , 25.365894, 42.692505, 26.043064,
       31.108494, 28.537746, 39.68969 , 43.56147 , 36.026424, 36.937553,
       40.967224, 35.411396, 41.697468, 19.15818 ,  8.224137, 30.65409 ,
       33.231113, 35.063198], dtype=float32)

In [10]:
preds = pd.Series(convert(preds.ravel()), dtype='float32')
preds.index = test.index
preds.to_csv('submission.csv')
preds

id
00008ff9    1.0
000fd460    1.0
00105258    1.0
00115b9f    1.0
0016bb22    2.0
001f3379    1.0
0038ba98    1.0
0068a485    1.0
0069fbed    2.0
0083e397    2.0
0087dd65    1.0
00abe655    1.0
00ae59c9    2.0
00af6387    1.0
00bd4359    2.0
00c0cd71    0.0
00d56d4b    0.0
00d9913d    1.0
00e6167c    1.0
00ebc35d    1.0
dtype: float32