In [1]:
!pip -q install /kaggle/input/pytorchtabnet/pytorch_tabnet-4.1.0-py3-none-any.whl

In [2]:
# Import libraries
import gc
import os
import shap
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timedelta

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer 
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import ConfusionMatrixDisplay, cohen_kappa_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import (
    RandomForestRegressor, HistGradientBoostingRegressor, VotingRegressor,
    ExtraTreesRegressor
)

import ydf
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

from scipy.optimize import minimize 

import torch
import torch.nn as nn
import torch.optim as optim

from colorama import Fore, Style
from IPython.display import clear_output

# Tabnet
from pytorch_tabnet.tab_model import TabNetRegressor
from sklearn.base import BaseEstimator, RegressorMixin
from pytorch_tabnet.callbacks import Callback

import warnings
warnings.filterwarnings('ignore')

In [3]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Covers Python's ``random`` module, the ``PYTHONHASHSEED`` environment
    variable, NumPy's global generator and PyTorch (CPU and all CUDA devices),
    and configures cuDNN for deterministic behaviour.

    Parameters
    ----------
    seed : int
        Seed value applied to every generator.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different algorithms from
    # run to run, which contradicts the deterministic setting above.
    torch.backends.cudnn.benchmark = False
    
seed_everything(42)

In [4]:
def hour_category_vectorized(hours):
    """Label every hour value with a coarse part-of-day bucket.

    Values in [0, 8) become 'h1', values in [8, 16) become 'h2' and anything
    else keeps the fallback label 'h3'.

    Parameters
    ----------
    hours : numpy.ndarray
        Array of hour values.

    Returns
    -------
    numpy.ndarray
        Object-dtype array shaped like ``hours`` holding the labels.
    """
    in_first_block = (hours >= 0) & (hours < 8)
    in_second_block = (hours >= 8) & (hours < 16)

    # Start with the fallback everywhere, then overwrite the two bounded ranges.
    labels = np.full_like(hours, 'h3', dtype=object)
    labels[in_first_block] = 'h1'
    labels[in_second_block] = 'h2'
    return labels

def advanced_ts_features(df):
    """Summarise a time series per part-of-day bucket as a single wide row.

    Side effect: adds the columns 'hour', 'hour_category' and 'abs_action'
    to ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide 'time_of_day', 'X', 'Y', 'Z', 'enmo' and 'light'.
        NOTE(review): 'time_of_day' is divided by 1e9 before being treated as
        seconds, so it is presumably in nanoseconds; confirm against the data.

    Returns
    -------
    pandas.DataFrame
        One row whose columns are named '<signal>_<stat>_<hour_category>'
        with stat in {'mean', 'std'}.
    """
    seconds_in_day = (df['time_of_day'] // 1e9) % 86400
    df['hour'] = (seconds_in_day // 3600).astype(np.int64)
    df['hour_category'] = hour_category_vectorized(df['hour'].values)
    # Euclidean norm of the three accelerometer axes.
    df['abs_action'] = np.sqrt(df['X']**2 + df['Y']**2 + df['Z']**2)

    signals = ['abs_action', 'enmo', 'light']
    stats = df.groupby('hour_category')[signals].agg(['mean', 'std'])
    stats.columns = ['_'.join(pair) for pair in stats.columns]

    # Flatten the (statistic, bucket) grid into one row.
    wide = stats.unstack().to_frame().T
    wide.columns = ['_'.join(map(str, key)) for key in wide.columns]
    return wide

In [5]:
def process_parquet_file(filename, dirname):
    """Turn one participant's parquet time series into a one-row feature frame.

    Parameters
    ----------
    filename : str
        Sub-folder name of the form '<key>=<id>'; the part after '=' is
        stored in the 'id' column.
    dirname : str
        Folder that contains ``filename``.

    Returns
    -------
    pandas.DataFrame
        One row holding the flattened ``describe()`` statistics, the
        part-of-day features and the 'id'.
    """
    parquet_path = os.path.join(dirname, filename, 'part-0.parquet')
    df = pd.read_parquet(parquet_path)

    # describe() has to run first: advanced_ts_features adds helper columns
    # to df, which would otherwise show up in the summary statistics.
    summary = df.describe().unstack()
    summary.index = ['_'.join(key) for key in summary.index]
    summary_row = summary.to_frame().T

    bucket_row = advanced_ts_features(df)

    features_df = pd.concat([summary_row, bucket_row], axis=1)
    features_df['id'] = filename.split('=')[1]

    # Release the raw series promptly; many files are processed concurrently.
    del df
    gc.collect()

    return features_df

def load_time_series(dirname):
    """Build the feature table for every time-series folder under ``dirname``.

    Each directory entry is processed by ``process_parquet_file`` on a thread
    pool, with a progress bar, and the one-row results are concatenated.

    Parameters
    ----------
    dirname : str
        Folder whose entries are per-participant parquet folders.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``dirname``.
    """
    filenames = os.listdir(dirname)

    def _featurise(filename):
        return process_parquet_file(filename, dirname)

    with ThreadPoolExecutor() as executor:
        progress = tqdm(executor.map(_featurise, filenames), total=len(filenames))
        results = list(progress)

    gc.collect()
    return pd.concat(results)

In [6]:
# Root folder of the competition data (Kaggle input mount).
INPUT_DIR = '/kaggle/input/child-mind-institute-problematic-internet-use'

# Tabular train / test tables.
train_df = pd.read_csv(f'{INPUT_DIR}/train.csv')
test_df = pd.read_csv(f'{INPUT_DIR}/test.csv')

# One feature row per time-series folder (slow for train: see the progress
# bar in the cell output).
train_ts = load_time_series(f'{INPUT_DIR}/series_train.parquet')
test_ts = load_time_series(f'{INPUT_DIR}/series_test.parquet')

# Left merge keeps every tabular row; ids without a time series get NaN
# in the time-series feature columns.
train_df = train_df.merge(train_ts, on=['id'], how='left')
test_df = test_df.merge(test_ts, on=['id'], how='left')

100%|██████████| 996/996 [08:25<00:00,  1.97it/s]
100%|██████████| 2/2 [00:01<00:00,  1.95it/s]


In [7]:
# Snapshot of the merged tables taken before any cleaning or feature
# engineering, so the expensive loading cell does not have to be re-run.
train_df_bkp = train_df.copy()
test_df_bkp = test_df.copy()

In [8]:
## Fix issue with incorrect target labels
# Names of the 20 individual questionnaire item columns.
PCIAT_cols = [f'PCIAT-PCIAT_{item:02d}' for item in range(1, 21)]

def recalculate_sii(row):
    """Re-derive the ``sii`` class from the PCIAT total, allowing for missing items.

    Every missing item is assumed to be worth at most 5 points. A class is
    returned only when both the recorded total and the highest total that the
    missing items could produce fall in the same band; otherwise the label is
    ambiguous and NaN is returned.

    Parameters
    ----------
    row : pandas.Series
        Must contain 'PCIAT-PCIAT_Total' and every column in ``PCIAT_cols``.

    Returns
    -------
    int or float
        0, 1, 2 or 3, or ``np.nan`` when the total is missing or ambiguous.
    """
    total = row['PCIAT-PCIAT_Total']
    if pd.isna(total):
        return np.nan

    max_possible = total + row[PCIAT_cols].isna().sum() * 5

    # (label, lowest total, highest total) for the three bounded bands.
    bounded_bands = ((0, float('-inf'), 30), (1, 31, 49), (2, 50, 79))
    for label, low, high in bounded_bands:
        if low <= total <= high and max_possible <= high:
            return label

    if total >= 80 and max_possible >= 80:
        return 3
    return np.nan

# Overwrite the provided target with the row-wise recalculated label.
train_df.loc[:, ['sii']] = train_df.apply(recalculate_sii, axis=1)

# Remove rows where target is missing
train_df = train_df[train_df['sii'].notna()].reset_index(drop=True)

In [9]:
def clean_data(data):
    """Blank out implausible measurements and drop the season columns.

    Side effect: the implausible values are set to NaN on ``data`` itself;
    the season columns are removed only from the returned frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Physical-Weight', 'Physical-BMI', 'BIA-BIA_Fat' and
        'BIA-BIA_FMI'.

    Returns
    -------
    pandas.DataFrame
        ``data`` without the columns whose name contains 'Season' (columns
        that also contain 'PCIAT' are kept).
    """
    # A weight of zero is invalid, and so is the BMI derived from it.
    zero_weight = data['Physical-Weight'] == 0
    data.loc[zero_weight, ['Physical-Weight', 'Physical-BMI']] = np.nan

    # Negative fat readings are invalid, together with the matching FMI.
    negative_fat = data['BIA-BIA_Fat'] < 0
    data.loc[negative_fat, ['BIA-BIA_Fat', 'BIA-BIA_FMI']] = np.nan

    season_cols = [
        name for name in data.columns
        if 'Season' in name and 'PCIAT' not in name
    ]
    return data.drop(season_cols, axis=1)

def feature_engineering(data):
    """Add pairwise product and ratio features.

    Side effect: the new columns are written onto ``data`` itself. Only the
    returned frame has infinite values (from division by zero) replaced by NaN.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain every source column named in the tables below.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` including the new columns, with +/-inf mapped to NaN.
    """
    # (new column, left factor, right factor)
    products = (
        ('age_sds_raw', 'Basic_Demos-Age', 'SDS-SDS_Total_Raw'),
        ('bfp_bmr', 'BIA-BIA_Fat', 'BIA-BIA_BMR'),
        ('bfp_dee', 'BIA-BIA_Fat', 'BIA-BIA_DEE'),
        ('bmi_age', 'Physical-BMI', 'Basic_Demos-Age'),
        ('bmi_internet_hours', 'Physical-BMI', 'PreInt_EduHx-computerinternet_hoursday'),
        ('bmi_phr', 'Physical-BMI', 'Physical-HeartRate'),
        ('internet_hours_age', 'PreInt_EduHx-computerinternet_hoursday', 'Basic_Demos-Age'),
        ('waist_weight', 'Physical-Waist_Circumference', 'Physical-Weight'),
        ('weight_sds_total', 'Physical-Weight', 'SDS-SDS_Total_T'),
    )
    # (new column, numerator, denominator)
    ratios = (
        ('bfp_bmi', 'BIA-BIA_Fat', 'BIA-BIA_BMI'),
        ('bmr_weight', 'BIA-BIA_BMR', 'Physical-Weight'),
        ('dee_weight', 'BIA-BIA_DEE', 'Physical-Weight'),
        ('ffmi_bfp', 'BIA-BIA_FFMI', 'BIA-BIA_Fat'),
        ('fmi_bfp', 'BIA-BIA_FMI', 'BIA-BIA_Fat'),
        ('hydration_status', 'BIA-BIA_TBW', 'Physical-Weight'),
        ('icw_tbw', 'BIA-BIA_ICW', 'BIA-BIA_TBW'),
        ('lst_tbw', 'BIA-BIA_LST', 'BIA-BIA_TBW'),
        ('muscle_to_fat', 'BIA-BIA_SMM', 'BIA-BIA_FMI'),
        ('smm_height', 'BIA-BIA_SMM', 'Physical-Height'),
    )

    for name, left, right in products:
        data[name] = data[left] * data[right]
    for name, numerator, denominator in ratios:
        data[name] = data[numerator] / data[denominator]

    return data.replace({np.inf: np.nan, -np.inf: np.nan})

# Apply the same cleaning and feature engineering to train and test.
train_df = clean_data(train_df)
test_df = clean_data(test_df)

train_df = feature_engineering(train_df)
test_df = feature_engineering(test_df)

# Columns that exist only in train cannot be used at prediction time.
# Sorting makes the drop list deterministic (set order is arbitrary).
columns_not_in_test = list(set(train_df.columns) - set(test_df.columns))
columns_not_in_test = sorted(columns_not_in_test)
columns_not_in_test.remove('sii') # Keep the target: take 'sii' out of the drop list

train_df = train_df.drop(columns_not_in_test, axis=1)

In [10]:
# Shared random seed for the CV splitter and the models configured below.
SEED = 42
# Number of stratified cross-validation folds used by the training helpers.
n_splits = 5

def quadratic_weighted_kappa(y_true, y_pred):
    """Return Cohen's kappa between two label vectors using quadratic weights."""
    kappa = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return kappa

def threshold_Rounder(oof_non_rounded, thresholds):
    """Convert continuous predictions to classes 0-3 using three cut points.

    The cut points are tested in order: a value below ``thresholds[0]`` is 0,
    otherwise below ``thresholds[1]`` is 1, otherwise below ``thresholds[2]``
    is 2, and everything else (including NaN) is 3.

    Parameters
    ----------
    oof_non_rounded : numpy.ndarray
        Continuous predictions.
    thresholds : sequence of float
        At least three cut points; only the first three are used.

    Returns
    -------
    numpy.ndarray
        Integer class per prediction.
    """
    first_cut = thresholds[0]
    second_cut = thresholds[1]
    third_cut = thresholds[2]

    # Build from the innermost decision outwards so the earlier cut points
    # keep priority over the later ones.
    two_or_three = np.where(oof_non_rounded < third_cut, 2, 3)
    one_or_higher = np.where(oof_non_rounded < second_cut, 1, two_or_three)
    return np.where(oof_non_rounded < first_cut, 0, one_or_higher)

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    """Objective for the threshold search: negated QWK of the thresholded predictions.

    The sign is flipped because the optimiser minimises while a higher kappa
    is better.
    """
    labels = threshold_Rounder(oof_non_rounded, thresholds)
    score = quadratic_weighted_kappa(y_true, labels)
    return -score

def TrainML(model_class, train_df, test_data):
    """Cross-validate a regressor on ``sii`` and build thresholded test predictions.

    A fresh clone of ``model_class`` is fitted on each stratified fold (the
    number of folds and the shuffle seed come from the module-level
    ``n_splits`` and ``SEED``). The out-of-fold predictions are then used to
    tune three cut points with Nelder-Mead, maximising quadratic weighted
    kappa, and the same cut points are applied to the fold-averaged test
    predictions.

    Parameters
    ----------
    model_class : estimator
        Unfitted scikit-learn compatible regressor; it is cloned per fold.
    train_df : pandas.DataFrame
        Training table with 'id', 'sii' and the feature columns.
    test_data : pandas.DataFrame
        Test table with 'id' and the same feature columns.

    Returns
    -------
    submission : pandas.DataFrame
        'id' (taken from ``test_data``) and the predicted 'sii' class.
    oof_tuned : numpy.ndarray
        Out-of-fold predictions after applying the tuned cut points.
    y : pandas.Series
        True labels, aligned with ``oof_tuned``.

    Raises
    ------
    AssertionError
        If the threshold optimisation does not report success.
    """
    X = train_df.drop(['id', 'sii'], axis=1)
    y = train_df['sii']

    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

    train_S = []
    test_S = []

    oof_non_rounded = np.zeros(len(y), dtype=float)
    test_preds = np.zeros((len(test_data), n_splits))

    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        model = clone(model_class)
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        oof_non_rounded[test_idx] = y_val_pred
        y_val_pred_rounded = y_val_pred.round(0).astype(int)

        # Per-fold scores use plain rounding; tuned cut points come later.
        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)

        train_S.append(train_kappa)
        test_S.append(val_kappa)

        # Select columns by name so the test features match the training order.
        test_preds[:, fold] = model.predict(test_data[X.columns])

        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)

    print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")

    KappaOPtimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.49, 2.5], args=(y, oof_non_rounded),
                              method='Nelder-Mead')
    assert KappaOPtimizer.success, "Optimization did not converge."
    thresholds = KappaOPtimizer.x

    oof_tuned = threshold_Rounder(oof_non_rounded, thresholds)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    tpm = test_preds.mean(axis=1)
    tpTuned = threshold_Rounder(tpm, thresholds)

    # Take the ids from the frame that was actually predicted on, not from
    # the notebook-level ``test_df`` global.
    submission = pd.DataFrame({
        'id': test_data['id'],
        'sii': tpTuned
    })

    return submission, oof_tuned, y

In [11]:
# Registry of model runs: run name -> {'test': submission frame,
# 'oof': tuned out-of-fold predictions, 'y': true labels}.
track = {}

In [12]:
# LightGBM on the first dataset (train_df / test_df, missing values left as-is).
lgbm_params = {
    "subsample_freq": 4,
    "subsample": 0.5,
    "num_leaves": 128,
    "n_estimators": 512,
    "max_depth": 12,
    "max_bin": 31,
    "learning_rate": 0.01,
    "lambda_l2": 1,
    "lambda_l1": 10,
    "feature_fraction": 1.0,
    'deterministic': True,
    'force_col_wise': True,
    'random_state': SEED,
    'verbose': -1
}

d1_lgbm_submission, oof_tuned, y = TrainML(LGBMRegressor(**lgbm_params), train_df, test_df)
track['d1_lgbm_submission'] = {
    'test': d1_lgbm_submission,
    'oof': oof_tuned,
    'y': y
}

# The results now live in `track`; drop the temporaries so they cannot leak
# into later cells.
del oof_tuned, y

Training Folds: 100%|██████████| 5/5 [00:06<00:00,  1.29s/it]

Mean Train QWK --> 0.5625
Mean Validation QWK ---> 0.4027





----> || Optimized QWK SCORE :: [36m[1m 0.466[0m


In [14]:
# XGBoost parameters (first dataset: train_df / test_df)
xgb_params = {
    "subsample": 0.7,
    "reg_lambda": 10,
    "reg_alpha": 10,
    "n_estimators": 256,
    "min_child_weight": 5,
    "max_depth": 10,
    "max_bin": 127,
    "learning_rate": 0.05,
    "gamma": 0.1,
    "colsample_bytree": 0.6,
    "objective": "reg:squarederror",
    "tree_method": "hist",
    "random_state": SEED,
    "verbosity": 0
}

d1_xgb_submission, oof_tuned, y = TrainML(XGBRegressor(**xgb_params), train_df, test_df)
track['d1_xgb_submission'] = {
    'test': d1_xgb_submission,
    'oof': oof_tuned,
    'y': y
}

# Results are stored in `track`; drop the temporaries.
del oof_tuned, y

Training Folds: 100%|██████████| 5/5 [00:19<00:00,  3.85s/it]

Mean Train QWK --> 0.7605
Mean Validation QWK ---> 0.4131





----> || Optimized QWK SCORE :: [36m[1m 0.452[0m


In [16]:
# CatBoost on the first dataset (train_df / test_df).
cat_params = {
    'learning_rate': 0.05,
    'depth': 8,
    'iterations': 128,
    'random_seed': SEED,
    'verbose': 0,
    'l2_leaf_reg': 10,  # NOTE(review): the author's note said "Increase this value" - confirm 10 is the intended setting
    'boost_from_average': False

}

d1_cat_submission, oof_tuned, y = TrainML(CatBoostRegressor(**cat_params), train_df, test_df)
track['d1_cat_submission'] = {
    'test': d1_cat_submission,
    'oof': oof_tuned,
    'y': y
}

# Results are stored in `track`; drop the temporaries.
del oof_tuned, y

Training Folds: 100%|██████████| 5/5 [00:43<00:00,  8.76s/it]

Mean Train QWK --> 0.5412
Mean Validation QWK ---> 0.3802





----> || Optimized QWK SCORE :: [36m[1m 0.456[0m


In [17]:
def TrainMLYDF(learner, train_df, test_data):
    """Cross-validate a YDF learner on ``sii`` and build thresholded test predictions.

    Mirrors ``TrainML`` but trains Yggdrasil Decision Forests models, which
    take the label as a column of the training frame. The number of folds and
    the shuffle seed come from the module-level ``n_splits`` and ``SEED``.

    Parameters
    ----------
    learner : str
        'GBT' for gradient boosted trees or 'RF' for a random forest.
    train_df : pandas.DataFrame
        Training table with 'id', 'sii' and the feature columns.
    test_data : pandas.DataFrame
        Test table with 'id' and the same feature columns.

    Returns
    -------
    submission : pandas.DataFrame
        'id' (taken from ``test_data``) and the predicted 'sii' class.
    oof_tuned : numpy.ndarray
        Out-of-fold predictions after applying the tuned cut points.
    y : pandas.Series
        True labels, aligned with ``oof_tuned``.

    Raises
    ------
    ValueError
        If ``learner`` is neither 'GBT' nor 'RF'.
    AssertionError
        If the threshold optimisation does not report success.
    """
    # Fail early instead of hitting an unbound ``model`` inside the fold loop.
    if learner not in ('GBT', 'RF'):
        raise ValueError(f"Unknown learner {learner!r}; expected 'GBT' or 'RF'.")

    X = train_df.drop(['id', 'sii'], axis=1)
    y = train_df['sii']

    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

    train_S = []
    test_S = []

    oof_non_rounded = np.zeros(len(y), dtype=float)
    test_preds = np.zeros((len(test_data), n_splits))

    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        # Explicit copy: the label column is added to this frame below.
        X_train, X_val = X.iloc[train_idx].copy(), X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        X_train['sii'] = y_train

        if learner == 'GBT':
            model = ydf.GradientBoostedTreesLearner(
                label="sii", task=ydf.Task.REGRESSION, subsample=0.7,
                max_depth=6, use_hessian_gain=True,
                growing_strategy="BEST_FIRST_GLOBAL",
                num_trees=600
            ).train(X_train)
        else:
            model = ydf.RandomForestLearner(
                label="sii", task=ydf.Task.REGRESSION, num_trees=600
            ).train(X_train)

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        oof_non_rounded[test_idx] = y_val_pred
        y_val_pred_rounded = y_val_pred.round(0).astype(int)

        # Per-fold scores use plain rounding; tuned cut points come later.
        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)

        train_S.append(train_kappa)
        test_S.append(val_kappa)

        test_preds[:, fold] = model.predict(test_data[X.columns])

        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)

    print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")

    KappaOPtimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.49, 2.5], args=(y, oof_non_rounded),
                              method='Nelder-Mead')
    assert KappaOPtimizer.success, "Optimization did not converge."
    thresholds = KappaOPtimizer.x

    oof_tuned = threshold_Rounder(oof_non_rounded, thresholds)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    tpm = test_preds.mean(axis=1)
    tpTuned = threshold_Rounder(tpm, thresholds)

    # Take the ids from the frame that was actually predicted on, not from
    # the notebook-level ``test_df`` global.
    submission = pd.DataFrame({
        'id': test_data['id'],
        'sii': tpTuned
    })

    return submission, oof_tuned, y

In [18]:
# YDF gradient boosted trees on the first dataset (train_df / test_df).
d1_ydf_gbt_submission, oof_tuned, y = TrainMLYDF('GBT', train_df, test_df)
track['d1_ydf_gbt_submission'] = {
    'test': d1_ydf_gbt_submission,
    'oof': oof_tuned,
    'y': y
}

# Results are stored in `track`; drop the temporaries.
del oof_tuned, y

Training Folds: 100%|██████████| 5/5 [00:26<00:00,  5.37s/it]

Mean Train QWK --> 0.6812
Mean Validation QWK ---> 0.3845





----> || Optimized QWK SCORE :: [36m[1m 0.443[0m


In [19]:
# YDF random forest on the first dataset (train_df / test_df).
d1_ydf_rf_submission, oof_tuned, y = TrainMLYDF('RF', train_df, test_df)
track['d1_ydf_rf_submission'] = {
    'test': d1_ydf_rf_submission,
    'oof': oof_tuned,
    'y': y
}

# Results are stored in `track`; drop the temporaries.
del oof_tuned, y

Training Folds: 100%|██████████| 5/5 [00:59<00:00, 11.90s/it]

Mean Train QWK --> 0.7910
Mean Validation QWK ---> 0.3821





----> || Optimized QWK SCORE :: [36m[1m 0.457[0m


In [20]:
# Histogram-based gradient boosting on the first dataset (train_df / test_df).
hgb = HistGradientBoostingRegressor(
    max_bins=255,
    learning_rate=0.1,
    max_depth=6,
    random_state=SEED  # shared notebook seed instead of a repeated literal
)

d1_hgb_submission, oof_tuned, y = TrainML(hgb, train_df, test_df)
track['d1_hgb_submission'] = {
    'test': d1_hgb_submission,
    'oof': oof_tuned,
    'y': y
}

# Results are stored in `track`; drop the temporaries.
del oof_tuned, y

Training Folds: 100%|██████████| 5/5 [00:06<00:00,  1.35s/it]

Mean Train QWK --> 0.8207
Mean Validation QWK ---> 0.4065





----> || Optimized QWK SCORE :: [36m[1m 0.451[0m


In [21]:
# Side-by-side view of the first-dataset predictions, one 'sii' column per
# model, aligned on the participant id.
d1_submission = pd.concat([
    d1_lgbm_submission.set_index('id'),
    d1_xgb_submission.set_index('id'),
    d1_cat_submission.set_index('id'),
    d1_ydf_gbt_submission.set_index('id'),
    d1_ydf_rf_submission.set_index('id'),
    d1_hgb_submission.set_index('id')
], axis=1)

In [22]:
# Inspect where the first-dataset models agree and disagree.
d1_submission

Unnamed: 0_level_0,sii,sii,sii,sii,sii,sii
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
00008ff9,0,1,0,1,1,1
000fd460,0,0,0,0,0,0
00105258,0,0,0,0,0,1
00115b9f,0,0,0,0,0,0
0016bb22,0,1,0,1,1,1
001f3379,1,1,1,1,1,1
0038ba98,0,0,1,1,0,0
0068a485,0,0,0,0,1,0
0069fbed,1,2,1,1,1,1
0083e397,0,1,0,1,1,0


In [23]:
# Second dataset: the same features with missing values filled by an
# iterative imputer that uses linear regression as its estimator.
# NOTE(review): the imputer is fitted on every training row before
# TrainML splits the data, so validation folds contribute to the imputation
# model and the reported validation scores may be optimistic.
imputer_1 = IterativeImputer(LinearRegression(), random_state=SEED)
X, y = train_df.set_index('id').drop(['sii'], axis=1), train_df['sii']
X = pd.DataFrame(
    imputer_1.fit_transform(X), index=X.index, columns=X.columns
)
# Restore 'id' as a column; `y` still carries train_df's positional index,
# which matches the fresh index created by reset_index().
train_df2 = X.copy().reset_index()
train_df2['sii'] = y

test_df2 = test_df.set_index('id')

In [24]:
# Impute the test features with the imputer fitted on the training data
# (transform only, no refit), then restore 'id' as a column.
test_df2 = pd.DataFrame(
    imputer_1.transform(test_df2), index=test_df2.index, columns=test_df2.columns
)
test_df2 = test_df2.reset_index()

In [25]:
# Linear regression on the second (imputed) dataset.
d2_lr_submission, oof_tuned, y = TrainML(LinearRegression(), train_df2, test_df2)

track['d2_lr_submission'] = {
    'test': d2_lr_submission,
    'oof': oof_tuned,
    'y': y
}

# Results are stored in `track`; drop the temporaries.
del oof_tuned, y

Training Folds: 100%|██████████| 5/5 [00:00<00:00,  6.10it/s]

Mean Train QWK --> 0.4464
Mean Validation QWK ---> 0.3653





----> || Optimized QWK SCORE :: [36m[1m 0.440[0m


In [26]:
# Random forest on the second (imputed) dataset.
rf_params = {
    'n_estimators': 1024,
    'max_depth': 12,
    'min_samples_leaf': 8,
    'max_features': 0.5,
    'random_state': SEED,  # shared notebook seed instead of a repeated literal
    'n_jobs': -1,
    'verbose': 0
}
rf_model = RandomForestRegressor(**rf_params)
d2_rf_submission, oof_tuned, y = TrainML(rf_model, train_df2, test_df2)

track['d2_rf_submission'] = {
    'test': d2_rf_submission,
    'oof': oof_tuned,
    'y': y
}

# Results are stored in `track`; drop the temporaries.
del oof_tuned, y

Training Folds: 100%|██████████| 5/5 [03:00<00:00, 36.19s/it]

Mean Train QWK --> 0.7211
Mean Validation QWK ---> 0.3770





----> || Optimized QWK SCORE :: [36m[1m 0.451[0m


In [27]:
# Extra-trees on the second (imputed) dataset.
xt_params = {
    'n_estimators': 1024,
    'max_depth': 6,
    #'min_samples_leaf': 8,
    'max_features': 0.8,
    'random_state': SEED,  # shared notebook seed instead of a repeated literal
    'n_jobs': -1,
    'verbose': 0
}

xt_model = ExtraTreesRegressor(**xt_params)
d2_xt_submission, oof_tuned, y = TrainML(xt_model, train_df2, test_df2)

track['d2_xt_submission'] = {
    'test': d2_xt_submission,
    'oof': oof_tuned,
    'y': y
}

# Results are stored in `track`; drop the temporaries.
del oof_tuned, y

Training Folds: 100%|██████████| 5/5 [00:29<00:00,  5.96s/it]

Mean Train QWK --> 0.5156
Mean Validation QWK ---> 0.3738





----> || Optimized QWK SCORE :: [36m[1m 0.452[0m


In [28]:
# LightGBM on the second (imputed) dataset. Compared with lgbm_params this
# uses more trees (2048 vs 512) and samples half of the features per tree.
lgbm_params2 = {
    "subsample_freq": 4,
    "subsample": 0.5,
    "num_leaves": 128,
    "n_estimators": 2048,
    "max_depth": 12,
    "max_bin": 31,
    "learning_rate": 0.01,
    "lambda_l2": 1,
    "lambda_l1": 10,
    "feature_fraction": 0.5,
    'deterministic': True,
    'force_col_wise': True,
    'random_state': SEED,
    'verbose': -1
}
d2_lgbm_submission, oof_tuned, y = TrainML(LGBMRegressor(**lgbm_params2), train_df2, test_df2)

track['d2_lgbm_submission'] = {
    'test': d2_lgbm_submission,
    'oof': oof_tuned,
    'y': y
}

# Results are stored in `track`; drop the temporaries.
del oof_tuned, y

Training Folds: 100%|██████████| 5/5 [00:12<00:00,  2.58s/it]

Mean Train QWK --> 0.7728
Mean Validation QWK ---> 0.3992





----> || Optimized QWK SCORE :: [36m[1m 0.460[0m


In [29]:
# XGBoost on the second (imputed) dataset. Compared with xgb_params this
# uses fewer trees (128 vs 256) and a smaller row subsample (0.5 vs 0.7).
xgb_params2  = {
    "subsample": 0.5,
    "reg_lambda": 10,
    "reg_alpha": 10,
    "n_estimators": 128,
    "min_child_weight": 5,
    "max_depth": 10,
    "max_bin": 127,
    "learning_rate": 0.05,
    "gamma": 0.1,
    "colsample_bytree": 0.6,
    "objective": "reg:squarederror",
    "tree_method": "hist",
    "random_state": SEED,
    "verbosity": 0
}

d2_xgb_submission, oof_tuned, y = TrainML(XGBRegressor(**xgb_params2), train_df2, test_df2)
track['d2_xgb_submission'] = {
    'test': d2_xgb_submission,
    'oof': oof_tuned,
    'y': y
}

# Results are stored in `track`; drop the temporaries.
del oof_tuned, y

Training Folds: 100%|██████████| 5/5 [00:08<00:00,  1.61s/it]

Mean Train QWK --> 0.6870
Mean Validation QWK ---> 0.3899





----> || Optimized QWK SCORE :: [36m[1m 0.451[0m


In [236]:
# Spot-check the LightGBM predictions on the imputed dataset.
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours; re-run the notebook top to bottom before sharing.
d2_lgbm_submission

Unnamed: 0,id,sii
0,00008ff9,1
1,000fd460,0
2,00105258,0
3,00115b9f,0
4,0016bb22,1
5,001f3379,1
6,0038ba98,0
7,0068a485,0
8,0069fbed,2
9,0083e397,1


In [30]:
# YDF gradient boosted trees on the second (imputed) dataset.
d2_ydf_gbt_submission, oof_tuned, y = TrainMLYDF('GBT', train_df2, test_df2)
track['d2_ydf_gbt_submission'] = {
    'test': d2_ydf_gbt_submission,
    'oof': oof_tuned,
    'y': y
}

del oof_tuned, y


# YDF random forest on the second (imputed) dataset. TrainMLYDF calls
# clear_output per fold, so only this second run's scores stay visible below.
d2_ydf_rf_submission, oof_tuned, y = TrainMLYDF('RF', train_df2, test_df2)
track['d2_ydf_rf_submission'] = {
    'test': d2_ydf_rf_submission,
    'oof': oof_tuned,
    'y': y
}

del oof_tuned, y

Training Folds: 100%|██████████| 5/5 [01:03<00:00, 12.77s/it]

Mean Train QWK --> 0.8478
Mean Validation QWK ---> 0.3730





----> || Optimized QWK SCORE :: [36m[1m 0.452[0m


In [78]:
# Container for the ensemble: the id column is added next, then one
# prediction column per tracked model run.
final = pd.DataFrame()

In [79]:
# Seed the ensemble table with the test ids that every submission is keyed on.
final['id'] = test_df['id']

In [80]:
# Sanity check: only the 'id' column should exist at this point.
final.head()

Unnamed: 0,id
0,00008ff9
1,000fd460
2,00105258
3,00115b9f
4,0016bb22


In [81]:
# Collect every tracked model's test prediction as a column named after the run.
# Restart from the id column so that re-running this cell does not merge a
# second copy of each prediction (which would create *_x / *_y columns).
final = final[['id']]

for key, item in track.items():
    # After the first run the column is already named `key`; rename is then a no-op.
    item['test'] = item['test'].rename(columns={
        'sii': key
    })

    final = final.merge(
        item['test'],
        on='id',
        how='left'
    )

In [82]:
from statistics import mode

# Majority vote: for each id, take the most common class across all
# prediction columns currently in `final`.
final['sii'] = final.set_index('id').apply(mode, axis=1).values

In [83]:
# Keep only the columns written to the submission file.
final = final[['id', 'sii']]

In [84]:
# Final ensemble predictions, one row per test id.
final

Unnamed: 0,id,sii
0,00008ff9,1
1,000fd460,0
2,00105258,0
3,00115b9f,0
4,0016bb22,1
5,001f3379,1
6,0038ba98,0
7,0068a485,0
8,0069fbed,2
9,0083e397,1


In [85]:
# Write the submission file; index=False keeps the pandas row index out of the CSV.
final.to_csv('submission.csv', index=False)