In [1]:
import numpy as np, pandas as pd, os
from sklearn.model_selection import cross_val_score, StratifiedKFold
import xgboost as xgb
import plotly.express as px, seaborn as sns, matplotlib.pyplot as plt
sns.set_style('darkgrid')
from sklearn.metrics import make_scorer, cohen_kappa_score
path = '../input/child-mind-institute-problematic-internet-use/'
sample = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/sample_submission.csv')
train = pd.read_csv(path + 'train.csv', index_col = 'id')
print("The train data has the shape: ",train.shape)
test = pd.read_csv(path + 'test.csv', index_col = 'id')
print("The test data has the shape: ",test.shape)
print("")
print("Total number of missing training values: ", train.isna().sum().sum())
train_cat_columns = train.select_dtypes(exclude = 'number').columns

for season in train_cat_columns:
    train[season] = train[season].replace({'Spring':1, 'Summer':2, 'Fall':3, 'Winter':4})
PCIAT_cols = [val for val in train.columns[train.columns.str.contains('PCIAT')]]
PCIAT_cols.remove('PCIAT-PCIAT_Total')
train = train.drop(columns = PCIAT_cols)
train = train.dropna(subset='sii')

The train data has the shape:  (3960, 81)
The test data has the shape:  (20, 58)

Total number of missing training values:  131717


  train[season] = train[season].replace({'Spring':1, 'Summer':2, 'Fall':3, 'Winter':4})


In [2]:
def stratified_split_data(data, ratio=0.7,seed=42):
    train_model = []
    train_val = []

    # Loop over each unique category in 'sii'
    for sii_value in data['sii'].unique():
        subset = data[data['sii'] == sii_value]  # Filter by current sii value
        
        # Shuffle the subset for randomness
        subset = subset.sample(frac=1, random_state=seed).reset_index(drop=True)
        
        # Split the data for this sii category
        split_index = round(len(subset) * ratio)
        train_model.append(subset[:split_index])
        train_val.append(subset[split_index:])

        print(f"sii = {sii_value} | Train: {len(train_model[-1])} | Val: {len(train_val[-1])}")

    # Concatenate all sii-category splits
    train_model = pd.concat(train_model).reset_index(drop=True)
    train_val = pd.concat(train_val).reset_index(drop=True)

    return train_model, train_val


def convert(scores):
    scores = np.array(scores)*1.3
    bins = np.zeros_like(scores)
    bins[scores <= 30] = 0
    bins[(scores > 30) & (scores < 50)] = 1
    bins[(scores >= 50) & (scores < 80)] = 2
    bins[scores >= 80] = 3
    return bins
def quadratic_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from torch.utils.data import Dataset, DataLoader

# Define a custom dataset class that handles sample IDs from the index
class CustomDataset(Dataset):
    def __init__(self, data, targets):
        df = pd.DataFrame(data)
        df = df.fillna(0.0)
        self.data = torch.tensor(df.values, dtype=torch.float32)
        self.targets = torch.tensor(targets, dtype=torch.float32)
        self.sample_ids = data.index  # Use the index as sample IDs
        self.mask=np.array(data.isna().astype(int))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.targets[idx], self.sample_ids[idx],self.mask[idx]

# Simple Neural Network Model
class SimpleNN(nn.Module):
    def __init__(self, input_size):
        super(SimpleNN, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(input_size, 64),
            nn.BatchNorm1d(64),  # Add batch normalization
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.BatchNorm1d(32),  # Add batch normalization
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid()  # Output between 0 and 1
        )

    def forward(self, x):
        return self.network(x) * 100
def train_XG_mask_nns(data, target_df, num_epochs=200,num_models=3, batch_size=64, lr=0.0025):
    models={}
    losses=np.array([])
    

    # Loss function (assuming regression task, modify for classification)
    criterion = nn.MSELoss()

    # Store predictions and sample IDs for mapping later
    all_predictions = {}
    # Extract targets matching the sample IDs in separated_data
    matching_targets = target_df.loc[data.index]  # Ensure target_df index aligns with data.index
    
    # Create dataset and data loader
    dataset = CustomDataset(data, matching_targets.values)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Initialize model, optimizer, and store them
    for ith_model in range(num_models):
        print(f"\nTraining Model {ith_model}...")
        input_size = data.shape[1]
        model = SimpleNN(input_size)
        optimizer = optim.Adam(model.parameters(), lr=lr)
        if not losses.size== 0:
            # Create a new dataset using the previous losses as targets
            dataset = CustomDataset(data, losses)
            dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
            losses=np.array([])
    # Training loop
        for epoch in range(num_epochs):
            model.train()
            for inputs, target, ids,mask in dataloader:
                optimizer.zero_grad()
                output = model(inputs).squeeze()
                loss = criterion(output, target)
                loss= loss * (1-mask).float().mean(dim=1)   # Mask out NaN positions
                loss=loss.mean()
                loss.backward()
                optimizer.step()
            #if epoch % 50==0:
                #print(ith_model)
                #print('Epoch'+str(epoch))
                #print(loss)

        model.eval()
        for inputs, target, ids,mask in dataloader:
            optimizer.zero_grad()
            output = model(inputs).squeeze()
            loss = target-output
            loss= loss * (1-mask).float().mean(dim=1)  # Mask out NaN positions
            losses=np.append(losses,loss.detach().numpy())

        models[ith_model] = model
        
 

    return models
def run_XG_mask_nns(test_data, models,batch_size=32):
    output_list=np.array([])
    index_name=[]

    # Initialize model, optimizer, and store them
    for ith_model in range(len(models)):
        model=models[ith_model]
        model.eval()
        #print(f"\nrunning Model...")
        test_inputs = torch.tensor(test_data.values, dtype=torch.float32)
        test_inputs = torch.nan_to_num(test_inputs, nan=0.0)
    # Training loop
        with torch.no_grad():
            output = model(test_inputs).squeeze()
        if output_list.size==0:
            output_list=output
        else:
            output_list=output_list+output
    all_predictions = pd.DataFrame({'Sample_ID':  test_data.index, 'Prediction': convert(output_list)})
        
 

    return all_predictions
    

In [4]:

train_model,train_val=stratified_split_data(train, ratio=0.8)
targets_train = train_model["PCIAT-PCIAT_Total"]
run_model_mask_nn_train = train_model.drop(columns = ['PCIAT-PCIAT_Total','sii'])
targets_val = train_val["PCIAT-PCIAT_Total"]
run_model_mask_nn_val = train_val.drop(columns = ['PCIAT-PCIAT_Total','sii'])
models= train_XG_mask_nns(run_model_mask_nn_train, targets_train,num_models=3,num_epochs=200,lr=0.0025)

output_train=run_XG_mask_nns(run_model_mask_nn_train,models)
output_val=run_XG_mask_nns(run_model_mask_nn_val,models)
train_score=quadratic_kappa(convert(targets_train[output_train['Sample_ID'].values].values),output_train['Prediction'].values)
val_score=quadratic_kappa(convert(targets_val[output_val['Sample_ID'].values].values),output_val['Prediction'].values)

sii = 2.0 | Train: 302 | Val: 76
sii = 0.0 | Train: 1275 | Val: 319
sii = 1.0 | Train: 584 | Val: 146
sii = 3.0 | Train: 27 | Val: 7

Training Model 0...

Training Model 1...

Training Model 2...


In [5]:
print(train_score)
print(val_score)

0.523230099261728
0.3854210051367486


In [6]:
# Make predictions on test data for each category
test_cat = test.select_dtypes(exclude = 'number').columns

for season in test_cat:
    test[season] = test[season].replace({'Spring':1, 'Summer':2, 'Fall':3, 'Winter':4})

test_predictions = run_XG_mask_nns(test, models)

preds = pd.Series(test_predictions['Prediction'].values.ravel(), dtype='float32')
preds.index = test.index
preds.to_csv('submission.csv')

  test[season] = test[season].replace({'Spring':1, 'Summer':2, 'Fall':3, 'Winter':4})


In [7]:
np.round(preds)

id
00008ff9    1.0
000fd460    0.0
00105258    1.0
00115b9f    1.0
0016bb22    0.0
001f3379    1.0
0038ba98    0.0
0068a485    0.0
0069fbed    1.0
0083e397    0.0
0087dd65    0.0
00abe655    1.0
00ae59c9    2.0
00af6387    0.0
00bd4359    1.0
00c0cd71    0.0
00d56d4b    0.0
00d9913d    0.0
00e6167c    1.0
00ebc35d    1.0
dtype: float32

In [8]:
import numpy as np
import pandas as pd
import os
import re
from sklearn.base import clone
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold
from scipy.optimize import minimize
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import polars as pl
import polars.selectors as cs
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator, FormatStrFormatter, PercentFormatter
import seaborn as sns

from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from keras.models import Model
from keras.layers import Input, Dense
from keras.optimizers import Adam
import torch
import torch.nn as nn
import torch.optim as optim

from colorama import Fore, Style
from IPython.display import clear_output
import warnings
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None
from sklearn.base import BaseEstimator, RegressorMixin

In [9]:
class TrainXGMaskNNSRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, num_epochs=200, num_models=3, batch_size=64, lr=0.0025):
        self.num_epochs = num_epochs
        self.num_models = num_models
        self.batch_size = batch_size
        self.lr = lr
        self.models = None  # This will store trained models after fitting

    def fit(self, X, y):
        # Ensure input data is in DataFrame format for compatibility
        data = pd.DataFrame(X)
        target_df = pd.Series(y, index=data.index)
        
        # Train models using the original train_XG_mask_nns function
        self.models = train_XG_mask_nns(data, target_df, num_epochs=self.num_epochs, 
                                        num_models=self.num_models, batch_size=self.batch_size, lr=self.lr)
        return self

    def predict(self, X):
        # Convert test data to DataFrame if not already
        test_data = pd.DataFrame(X)
        
        # Use the run_XG_mask_nns function for prediction, which averages model outputs
        predictions_df = run_XG_mask_nns(test_data, self.models, batch_size=self.batch_size)
        
        # Extract and return predictions as a NumPy array
        return convert(predictions_df['Prediction'].values)

Tab Net


In [10]:
!pip -q install /kaggle/input/pytorchtabnet/pytorch_tabnet-4.1.0-py3-none-any.whl

In [11]:
SEED=42
n_splits = 5

In [12]:
from pytorch_tabnet.tab_model import TabNetRegressor
import torch

if np.any(np.isinf(train)):
    train = train.replace([np.inf, -np.inf], np.nan)

def quadratic_weighted_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

def threshold_Rounder(oof_non_rounded, thresholds):
    return np.where(oof_non_rounded < thresholds[0], 0,
                    np.where(oof_non_rounded < thresholds[1], 1,
                             np.where(oof_non_rounded < thresholds[2], 2, 3)))

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    rounded_p = threshold_Rounder(oof_non_rounded, thresholds)
    return -quadratic_weighted_kappa(y_true, rounded_p)
def TrainML(model_class, test_data):
    X = train.drop(['sii'], axis=1)
    X= X.drop(['PCIAT-PCIAT_Total'],axis=1)
    y = train['sii']
    print(X.shape)
    print(y.shape)
    print(test_data.shape)
    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    
    train_S = []
    test_S = []
    
    oof_non_rounded = np.zeros(len(y), dtype=float) 
    oof_rounded = np.zeros(len(y), dtype=int) 
    test_preds = np.zeros((len(test_data), n_splits))

    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        model = clone(model_class)
        print(model)
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        oof_non_rounded[test_idx] = y_val_pred
        y_val_pred_rounded = y_val_pred.round(0).astype(int)
        oof_rounded[test_idx] = y_val_pred_rounded

        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)

        train_S.append(train_kappa)
        test_S.append(val_kappa)
        
        test_preds[:, fold] = model.predict(test_data)
        
        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)

    print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")

    KappaOPtimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded), 
                              method='Nelder-Mead')
    assert KappaOPtimizer.success, "Optimization did not converge."
    
    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOPtimizer.x)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    tpm = test_preds.mean(axis=1)
    tpTuned = threshold_Rounder(tpm, KappaOPtimizer.x)
    
    submission = pd.DataFrame({
        'id': sample['id'],
        'sii': tpTuned
    })

    return submission

In [13]:
# Model parameters for LightGBM
Params = {
    'learning_rate': 0.046,
    'max_depth': 12,
    'num_leaves': 478,
    'min_data_in_leaf': 13,
    'feature_fraction': 0.893,
    'bagging_fraction': 0.784,
    'bagging_freq': 4,
    'lambda_l1': 10,  # Increased from 6.59
    'lambda_l2': 0.01,  # Increased from 2.68e-06
    'device': 'gpu'

}


# XGBoost parameters
XGB_Params = {
    'learning_rate': 0.05,
    'max_depth': 6,
    'n_estimators': 200,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 1,  # Increased from 0.1
    'reg_lambda': 5,  # Increased from 1
    'random_state': SEED,
    'tree_method': 'gpu_hist',

}


CatBoost_Params = {
    'learning_rate': 0.05,
    'depth': 6,
    'iterations': 200,
    'random_seed': SEED,
    'verbose': 0,
    'l2_leaf_reg': 10,  # Increase this value
    'task_type': 'GPU'

}

In [14]:
# New: TabNet

from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from pytorch_tabnet.callbacks import Callback
import os
import torch
from pytorch_tabnet.callbacks import Callback

class TabNetWrapper(BaseEstimator, RegressorMixin):
    def __init__(self, **kwargs):
        self.model = TabNetRegressor(**kwargs)
        self.kwargs = kwargs
        self.imputer = SimpleImputer(strategy='median')
        self.best_model_path = 'best_tabnet_model.pt'
        
    def fit(self, X, y):
        # Handle missing values
        
        X_imputed = self.imputer.fit_transform(X)
        
        if hasattr(y, 'values'):
            y = y.values
            
        # Create internal validation set
        X_train, X_valid, y_train, y_valid = train_test_split(
            X_imputed, 
            y, 
            test_size=0.2,
            random_state=42
        )
        
        # Train TabNet model
        history = self.model.fit(
            X_train=X_train,
            y_train=y_train.reshape(-1, 1),
            eval_set=[(X_valid, y_valid.reshape(-1, 1))],
            eval_name=['valid'],
            eval_metric=['mse'],
            max_epochs=500,
            patience=50,
            batch_size=1024,
            virtual_batch_size=128,
            num_workers=0,
            drop_last=False,
            callbacks=[
                TabNetPretrainedModelCheckpoint(
                    filepath=self.best_model_path,
                    monitor='valid_mse',
                    mode='min',
                    save_best_only=True,
                    verbose=True
                )
            ]
        )
        
        # Load the best model
        if os.path.exists(self.best_model_path):
            self.model.load_model(self.best_model_path)
            os.remove(self.best_model_path)  # Remove temporary file
        
        return self
    
    def predict(self, X):
        X_imputed = self.imputer.transform(X)
        return self.model.predict(X_imputed).flatten()
    
    def __deepcopy__(self, memo):
        # Add deepcopy support for scikit-learn
        cls = self.__class__
        result = cls.__new__(cls)
        memo[id(self)] = result
        for k, v in self.__dict__.items():
            setattr(result, k, deepcopy(v, memo))
        return result

# TabNet hyperparameters
TabNet_Params = {
    'n_d': 64,              # Width of the decision prediction layer
    'n_a': 64,              # Width of the attention embedding for each step
    'n_steps': 5,           # Number of steps in the architecture
    'gamma': 1.5,           # Coefficient for feature selection regularization
    'n_independent': 2,     # Number of independent GLU layer in each GLU block
    'n_shared': 2,          # Number of shared GLU layer in each GLU block
    'lambda_sparse': 1e-4,  # Sparsity regularization
    'optimizer_fn': torch.optim.Adam,
    'optimizer_params': dict(lr=2e-2, weight_decay=1e-5),
    'mask_type': 'entmax',
    'scheduler_params': dict(mode="min", patience=10, min_lr=1e-5, factor=0.5),
    'scheduler_fn': torch.optim.lr_scheduler.ReduceLROnPlateau,
    'verbose': 1,
    'device_name': 'cuda' if torch.cuda.is_available() else 'cpu'
}

class TabNetPretrainedModelCheckpoint(Callback):
    def __init__(self, filepath, monitor='val_loss', mode='min', 
                 save_best_only=True, verbose=1):
        super().__init__()  # Initialize parent class
        self.filepath = filepath
        self.monitor = monitor
        self.mode = mode
        self.save_best_only = save_best_only
        self.verbose = verbose
        self.best = float('inf') if mode == 'min' else -float('inf')
        
    def on_train_begin(self, logs=None):
        self.model = self.trainer  # Use trainer itself as model
        
    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        current = logs.get(self.monitor)
        if current is None:
            return
        
        # Check if current metric is better than best
        if (self.mode == 'min' and current < self.best) or \
           (self.mode == 'max' and current > self.best):
            if self.verbose:
                print(f'\nEpoch {epoch}: {self.monitor} improved from {self.best:.4f} to {current:.4f}')
            self.best = current
            if self.save_best_only:
                self.model.save_model(self.filepath)  # Save the entire model

In [15]:
# Create model instances
Light = LGBMRegressor(**Params, random_state=SEED, verbose=-1, n_estimators=300)
XGB_Model = XGBRegressor(**XGB_Params)
CatBoost_Model = CatBoostRegressor(**CatBoost_Params)
TabNet_Model = TabNetWrapper(**TabNet_Params) # New
train_xg_regressor = TrainXGMaskNNSRegressor(num_epochs=200, num_models=3, batch_size=64, lr=0.0025)

In [16]:
voting_model = VotingRegressor(estimators=[
    ('lightgbm', Light),
    ('xgboost', XGB_Model),
    ('catboost', CatBoost_Model),
    ('tabnet', TabNet_Model),
    ('train_xg', train_xg_regressor)
])

Submission1 = TrainML(voting_model, test)

Submission1

Training Folds: 100%|██████████| 5/5 [08:05<00:00, 97.03s/it]

Mean Train QWK --> 0.6472
Mean Validation QWK ---> 0.3549





----> || Optimized QWK SCORE :: [36m[1m 0.463[0m


Unnamed: 0,id,sii
0,00008ff9,1
1,000fd460,0
2,00105258,0
3,00115b9f,0
4,0016bb22,1
5,001f3379,1
6,0038ba98,0
7,0068a485,0
8,0069fbed,1
9,0083e397,1
