In [1]:
import os
import random
import warnings
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

import lightgbm as lgb
import numpy as np
import optuna
import pandas as pd
from scipy.optimize import minimize
from sklearn.decomposition import PCA
from sklearn.ensemble import VotingRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OrdinalEncoder
from tqdm import tqdm

warnings.filterwarnings('ignore')


# Single seed shared by the CV splitter and the models.
SEED = 42

# Scorer for scikit-learn's cross-validation helpers: quadratic weighted
# Cohen's kappa, where a larger value is better.
KAPPA_SCORER = make_scorer(cohen_kappa_score, weights='quadratic', greater_is_better=True)

In [2]:
def lgb_objective(trial):
    """Optuna objective: mean 5-fold quadratic weighted kappa of a candidate model.

    Reads the module-level `df_train`, `feature_cols` and `target_col`.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean of the per-fold `KAPPA_SCORER` values (to be maximised).
    """
    # Settings that are not part of the search.
    fixed = {
        'objective': 'l2',
        'verbosity': -1,
        'n_iter': 200,
        'random_state': SEED,
        'boosting_type': 'gbdt',
    }

    # Search space; the suggest calls stay in their original order.
    tuned = {
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-3, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-3, 10.0, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 1e-2, 1e-1, log=True),
        'max_depth': trial.suggest_int('max_depth', 4, 8),
        'num_leaves': trial.suggest_int('num_leaves', 16, 256),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 100),
    }

    features = df_train[feature_cols]
    target = df_train[target_col]
    folds = StratifiedKFold(5, shuffle=True, random_state=SEED)
    candidate = CustomLGBMRegressor(**fixed, **tuned)

    fold_scores = cross_val_score(
        estimator=candidate,
        X=features,
        y=target,
        cv=folds,
        scoring=KAPPA_SCORER,
    )
    return np.mean(fold_scores)

In [3]:
def process_file(filename, dirname):
    """Summarise one time-series partition as a flat statistics vector.

    Reads `<dirname>/<filename>/part-0.parquet`, discards the `step` column
    and flattens the `describe()` table of the remaining columns.

    Args:
        filename: Partition directory name; the text after the first '='
            is returned as the identifier.
        dirname: Directory that contains the partition directories.

    Returns:
        Tuple `(stats, identifier)` where `stats` is a 1-D array.
    """
    parquet_path = os.path.join(dirname, filename, 'part-0.parquet')
    series = pd.read_parquet(parquet_path).drop(columns='step')
    summary = series.describe().values.reshape(-1)
    identifier = filename.split('=')[1]
    return summary, identifier

In [4]:
def load_time_series(dirname):
    """Build one row of summary statistics per time-series partition.

    Every entry of `dirname` is handed to `process_file` on a thread pool.

    Args:
        dirname: Directory whose entries are the partition directories.

    Returns:
        DataFrame with columns `stat_0 .. stat_{k-1}` plus `id`, one row per
        entry. An empty directory yields an empty frame that has only the
        `id` column, so a later merge on `id` still works.
    """
    # os.listdir order is arbitrary; sort so the row order is reproducible.
    ids = sorted(os.listdir(dirname))

    # Nothing to summarise: zip(*results) / stats[0] below would raise.
    if not ids:
        return pd.DataFrame({'id': pd.Series(dtype='object')})

    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))

    stats, indexes = zip(*results)
    # Column count is taken from the first partition's statistics vector.
    df = pd.DataFrame(stats, columns=[f"stat_{i}" for i in range(len(stats[0]))])
    df['id'] = indexes

    return df

In [5]:
def quadratic_weighted_kappa(estimator, X, y_true):
    """Score an estimator by quadratic weighted kappa on rounded predictions.

    Args:
        estimator: Fitted object exposing `predict(X)`.
        X: Features passed to `estimator.predict`.
        y_true: Reference labels.

    Returns:
        `cohen_kappa_score` between `y_true` and the rounded predictions.
    """
    rounded = estimator.predict(X).round()
    return cohen_kappa_score(y_true, rounded, weights='quadratic')

In [6]:
def threshold_rounder(y_pred, thresholds):
    """Map continuous predictions to the labels 0-3 using three cut points.

    The first cut point a prediction falls below decides its label; values
    that are below none of them (including NaN) get label 3.

    Args:
        y_pred: Array of continuous predictions.
        thresholds: Sequence whose first three items are the cut points.

    Returns:
        Integer array of labels with the shape of `y_pred`.
    """
    below = [y_pred < thresholds[k] for k in range(3)]
    # np.select takes the first matching condition, like the nested np.where.
    return np.select(below, [0, 1, 2], default=3)

In [7]:
def eval_preds(thresholds, y_true, y_pred):
    """Loss for threshold search: negated quadratic weighted kappa.

    Args:
        thresholds: Cut points passed to `threshold_rounder`.
        y_true: Reference labels.
        y_pred: Continuous predictions to be discretised.

    Returns:
        Minus the kappa score, so that a minimiser maximises kappa.
    """
    labels = threshold_rounder(y_pred, thresholds)
    return -cohen_kappa_score(y_true, labels, weights='quadratic')

In [8]:
class CustomLGBMRegressor(lgb.LGBMRegressor):
    '''
    LightGBM regressor that emits discrete labels.

    After the underlying regressor is fitted, three cut points are tuned
    with Nelder-Mead on the training-set predictions so that
    `threshold_rounder` maximises quadratic weighted kappa.
    Main goal is preventing overfit on validation data.
    '''
    def fit(self, X, y, **kwargs):
        '''
        Fit the regressor, then tune the rounding thresholds.

        Parameters
        ----------
        X, y : training features and labels.
        **kwargs : forwarded to `LGBMRegressor.fit` only.

        Returns
        -------
        self, as the scikit-learn estimator contract expects.
        '''
        super().fit(X, y, **kwargs)
        # Continuous predictions on the training data. Fit-only keyword
        # arguments are deliberately not forwarded to predict.
        y_pred = super().predict(X)

        # Result of the threshold search, starting from the midpoints
        # between adjacent labels; the tuned cut points are in `.x`.
        self.optimizer = minimize(
            eval_preds,
            x0=[0.5, 1.5, 2.5],
            args=(y, y_pred),
            method='Nelder-Mead',
        )
        return self

    def predict(self, X, **kwargs):
        '''
        Predict discrete labels (0-3) with the tuned thresholds.

        Requires a prior `fit`, which sets `self.optimizer`.
        '''
        y_pred = super().predict(X, **kwargs)
        y_pred = threshold_rounder(y_pred, self.optimizer.x)
        return y_pred

In [9]:
# Directory holding train.csv, test.csv, sample_submission.csv and the
# series_*.parquet folders read below. Absolute path: edit when running
# outside this environment.
root = Path('/kaggle/input/child-mind-institute-problematic-internet-use')

### Tabular Data

In [10]:
# Train/test tables keep `id` as a regular column for the later merge;
# the submission template is indexed by `id` straight away.
df_train, df_test = (pd.read_csv(root / name) for name in ('train.csv', 'test.csv'))
df_subm = pd.read_csv(root / 'sample_submission.csv', index_col='id')

### Time Series Data

In [11]:
# Per-id summary statistics for both splits (train first, then test).
ts_train, ts_test = (
    load_time_series(root / folder)
    for folder in ("series_train.parquet", "series_test.parquet")
)

# Every generated column except the join key is a time-series feature.
time_series_cols = ts_train.columns.drop("id").tolist()

100%|██████████| 996/996 [01:38<00:00, 10.14it/s]
100%|██████████| 2/2 [00:00<00:00,  7.93it/s]


In [12]:
# Number of rows (one per partition directory) in the training time-series frame.
len(ts_train)

996

### Merge Operation

In [13]:
# Left join keeps every tabular row; rows with no matching time series
# get NaN in the stat_* columns. `id` then becomes the index.
df_train = df_train.merge(ts_train, how="left", on='id').set_index('id')
df_test = df_test.merge(ts_test, how="left", on='id').set_index('id')

### Global Variables

In [14]:
# Season columns: ordinal-encoded and cast to `category` further down.
cat_cols = ['Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season', 'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season', 'PAQ_A-Season', 'PAQ_C-Season', 'SDS-Season', 'PreInt_EduHx-Season']
# Numeric columns that are mean-imputed further down.
num_cols = ['Basic_Demos-Age', 'Basic_Demos-Sex', 'CGAS-CGAS_Score', 'Physical-BMI', 'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference', 'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP', 'Fitness_Endurance-Max_Stage', 'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND', 'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU', 'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR', 'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM', 'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num', 'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM', 'BIA-BIA_TBW', 'PAQ_A-PAQ_A_Total', 'PAQ_C-PAQ_C_Total', 'SDS-SDS_Total_Raw', 'SDS-SDS_Total_T', 'PreInt_EduHx-computerinternet_hoursday']
# Tabular model inputs. Unlike num_cols this list omits 'Basic_Demos-Age',
# 'Physical-Height' and 'Physical-Weight', and it includes 'BIA-BIA_BMI',
# which is not in num_cols and therefore is not imputed.
# NOTE(review): confirm these differences are intentional.
tabular_cols = ['Basic_Demos-Enroll_Season', 'Basic_Demos-Sex', 'CGAS-Season', 'CGAS-CGAS_Score', 'Physical-Season', 'Physical-BMI',  'Physical-Waist_Circumference', 'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP', 'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage', 'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec', 'FGC-Season', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND', 'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU', 'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR', 'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 'BIA-Season', 'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI', 'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM', 'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num', 'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM', 'BIA-BIA_TBW', 'PAQ_A-Season', 'PAQ_A-PAQ_A_Total', 'PAQ_C-Season', 'PAQ_C-PAQ_C_Total', 'SDS-Season', 'SDS-SDS_Total_Raw', 'SDS-SDS_Total_T', 'PreInt_EduHx-Season', 'PreInt_EduHx-computerinternet_hoursday']
# Label column.
target_col = 'sii'

# Model inputs = tabular features + time-series summary statistics.
feature_cols = tabular_cols + time_series_cols
# The time-series statistics are imputed together with the numeric columns.
num_cols = num_cols + time_series_cols

### Drop Rows with Missing Targets

In [15]:
# Keep only the rows whose label is present.
df_train = df_train[df_train[target_col].notna()]

### Numeric Value Imputing

In [16]:
# Column means are learned from the training rows only and then reused
# unchanged for the test rows.
imputer = SimpleImputer(strategy='mean')
imputer.fit(df_train[num_cols])

df_train[num_cols] = imputer.transform(df_train[num_cols])
df_test[num_cols] = imputer.transform(df_test[num_cols])

### Category Encoding

In [17]:
# Integer codes for the season columns: -1 marks a value not seen during
# fitting, -2 marks a missing value.
encoder_options = dict(
    dtype=np.int32,
    handle_unknown='use_encoded_value',
    unknown_value=-1,
    encoded_missing_value=-2,
)
encoder = OrdinalEncoder(**encoder_options)
encoder.fit(df_train[cat_cols])

# Same two steps for both splits: encode, then cast to pandas `category`.
for frame in (df_train, df_test):
    frame[cat_cols] = encoder.transform(frame[cat_cols])
    frame[cat_cols] = frame[cat_cols].astype('category')

### Optuna - Hyperparameter Tuning

In [18]:
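# Hyperparameter search is disabled so the notebook runs without repeating it;
# the values hard-coded in the next cell are used instead.
# Uncomment the two lines below to re-run the search.
# study = optuna.create_study(direction='maximize', study_name='Regressor')
# study.optimize(lgb_objective, n_trials=30, show_progress_bar=True)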

### Tuned Hyperparameters

In [19]:
# Hyperparameters used for every model below; the seed is passed separately.
params = dict(
    objective='l2',
    verbosity=-1,
    n_iter=200,
    lambda_l1=0.005116829730239727,
    lambda_l2=0.0011520776712645852,
    learning_rate=0.02376367323636638,
    max_depth=5,
    num_leaves=207,
    colsample_bytree=0.7759862336963801,
    colsample_bynode=0.5110355095943208,
    bagging_fraction=0.5485770314992224,
    bagging_freq=7,
    min_data_in_leaf=78,
)

# Single-seed model used for the cross-validation estimate.
model = CustomLGBMRegressor(random_state=SEED, **params)

### Feature Engineering and Cross Validation

In [20]:
# Replace Physical-BMI with its ratio to the bio-impedance BMI.
# The transform is applied to BOTH splits: 'Physical-BMI' is a model feature,
# so changing it for the training rows only would feed the model a different
# quantity at prediction time.
# NOTE: this overwrites the column, so run the cell once per kernel session;
# a second run would divide again.
for frame in (df_train, df_test):
    frame['Physical-BMI'] = frame['Physical-BMI'] / frame['BIA-BIA_BMI']

In [21]:
# First principal component of weight, height and age.
# The projection is fitted on the training rows and reused for the test rows,
# so the column exists, with the same meaning, in both frames.
# NOTE(review): 'PCA_Physical' is not listed in feature_cols, so the models
# below do not use it — confirm whether it was meant to be added.
pca_cols = ['Physical-Weight', 'Physical-Height', 'Basic_Demos-Age']
data = df_train[pca_cols].values
pca = PCA(n_components=1)
# fit_transform / transform return shape (n, 1); flatten to a 1-D column.
df_train['PCA_Physical'] = pca.fit_transform(data).ravel()
df_test['PCA_Physical'] = pca.transform(df_test[pca_cols].values).ravel()

In [22]:
# 5-fold stratified estimate of quadratic weighted kappa for the
# single-seed model.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
X, y = df_train[feature_cols], df_train[target_col]

val_scores = cross_val_score(estimator=model, X=X, y=y, scoring=KAPPA_SCORER, cv=cv)

print(f'kappa score: {np.mean(val_scores):.4f}')

kappa score: 0.4221


### Seed Ensembling

In [23]:
# Ten copies of the same configuration that differ only in their seed
# (12, 22, ..., 102), named lgb_0 .. lgb_9; their predictions are averaged.
model = VotingRegressor([
    (f'lgb_{position}', CustomLGBMRegressor(**params, random_state=seed))
    for position, seed in enumerate(range(12, 103, 10))
])

### Training

In [24]:
# Fit the ensemble on all labelled rows.
X, y = df_train[feature_cols], df_train[target_col]

model.fit(X, y)

### Prediction

In [25]:
# The ensemble returns the mean of the per-model integer labels.
# Key the predictions by the test ids and align them to the submission
# template by id rather than by row position, so a different row order in
# the two files cannot silently mis-assign labels.
test_pred = pd.Series(model.predict(df_test[feature_cols]), index=df_test.index)

# Round the averaged labels and store them as integers. A template id with
# no prediction becomes NaN and makes the cast fail loudly.
df_subm[target_col] = test_pred.reindex(df_subm.index).round().astype(int)

df_subm.to_csv('submission.csv')