In [25]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from scipy.optimize import minimize
import os
from tqdm import tqdm
from IPython.display import clear_output
from concurrent.futures import ThreadPoolExecutor
from sklearn.preprocessing import StandardScaler, LabelEncoder
import h2o
from h2o.automl import H2OAutoML
from sklearn.base import clone
from sklearn.metrics import *
from colorama import Fore, Style

# Random seed passed as `random_state` to the StratifiedKFold split inside TrainML.
SEED = 42
# Number of stratified cross-validation folds used by TrainML.
n_splits = 5

In [26]:
# --- Read the competition CSV files ------------------------------------------
train_df = pd.read_csv('child-mind-institute-problematic-internet-use/train.csv')
test_df = pd.read_csv('child-mind-institute-problematic-internet-use/test.csv')
sample = pd.read_csv('child-mind-institute-problematic-internet-use/sample_submission.csv')

# Keep the test ids before any column is removed.
ids = test_df['id']

# PCIAT questionnaire columns: the season, the 20 individual items (01..20)
# and the total. They are removed from the training frame only.
TARGET_COLS = (
    ["PCIAT-Season"]
    + [f"PCIAT-PCIAT_{item:02d}" for item in range(1, 21)]
    + ["PCIAT-PCIAT_Total"]
)

# Season-of-assessment columns, removed from both frames.
SEASON_COLS = [
    "Basic_Demos-Enroll_Season",
    "CGAS-Season",
    "Physical-Season",
    "Fitness_Endurance-Season",
    "FGC-Season",
    "BIA-Season",
    "PAQ_A-Season",
    "PAQ_C-Season",
    "SDS-Season",
    "PreInt_EduHx-Season",
]

train_df = train_df.drop(columns=TARGET_COLS).drop(columns=SEASON_COLS)
test_df = test_df.drop(columns=SEASON_COLS)

In [27]:
def process_file(filename, dirname):
    """Summarise one time-series recording.

    Reads ``<dirname>/<filename>/part-0.parquet``, discards the ``step``
    column and flattens the ``DataFrame.describe()`` table of the remaining
    columns into a 1-D array.

    Parameters
    ----------
    filename : str
        Directory entry name containing ``=``; the text after the first ``=``
        is returned as the identifier.
    dirname : str
        Directory that contains ``filename``.

    Returns
    -------
    tuple
        ``(flattened describe() values, identifier string)``.
    """
    parquet_path = os.path.join(dirname, filename, 'part-0.parquet')
    readings = pd.read_parquet(parquet_path).drop(columns='step')

    # describe() rows (count, mean, std, ...) are flattened row by row.
    flat_stats = readings.describe().values.reshape(-1)
    series_id = filename.split('=')[1]
    return flat_stats, series_id

def load_time_series(dirname) -> pd.DataFrame:
    """Build a feature table from every entry found in ``dirname``.

    Each directory entry is summarised by ``process_file`` on a thread pool.
    The flattened statistics become columns ``Stat_0 .. Stat_{k-1}`` and the
    identifier returned by ``process_file`` is stored in an ``id`` column.

    Parameters
    ----------
    dirname : str
        Directory whose entries are passed one by one to ``process_file``.

    Returns
    -------
    pd.DataFrame
        One row per directory entry: the ``Stat_*`` columns followed by ``id``.
    """
    entries = os.listdir(dirname)

    def _summarise(entry):
        return process_file(entry, dirname)

    # executor.map yields results in input order, so tqdm tracks completion in that order.
    with ThreadPoolExecutor() as pool:
        outcomes = list(tqdm(pool.map(_summarise, entries), total=len(entries)))

    stat_rows, series_ids = zip(*outcomes)

    # Column count is taken from the first row.
    feature_names = [f"Stat_{k}" for k in range(len(stat_rows[0]))]
    features = pd.DataFrame(stat_rows, columns=feature_names)
    features['id'] = series_ids

    return features

# Summary-statistic features for the train and test recordings.
train_ts = load_time_series('child-mind-institute-problematic-internet-use/series_train.parquet')
test_ts = load_time_series('child-mind-institute-problematic-internet-use/series_test.parquet')

# Names of the statistic columns: every column except the "id" key.
time_series_cols = list(train_ts.columns)
time_series_cols.remove("id")

100%|██████████| 996/996 [00:57<00:00, 17.30it/s]
100%|██████████| 2/2 [00:00<00:00,  8.17it/s]


In [28]:
# Display the time-series feature table (Stat_* columns plus the id column).
train_ts

Unnamed: 0,Stat_0,Stat_1,Stat_2,Stat_3,Stat_4,Stat_5,Stat_6,Stat_7,Stat_8,Stat_9,...,Stat_87,Stat_88,Stat_89,Stat_90,Stat_91,Stat_92,Stat_93,Stat_94,Stat_95,id
0,269335.0,269335.0,269335.0,269335.0,269335.0,269335.0,269335.0,269335.0,269335.0,269335.0,...,4.568309,89.673332,0.0,2659.666748,4179.0,8.639500e+13,7.0,1.0,63.0,0d01bbf2
1,412332.0,412332.0,412332.0,412332.0,412332.0,412332.0,412332.0,412332.0,412332.0,412332.0,...,3.006919,89.322289,1.0,2648.000000,4181.0,8.639500e+13,7.0,3.0,37.0,cefdb7fe
2,384228.0,384228.0,384228.0,384228.0,384228.0,384228.0,384228.0,384228.0,384228.0,384228.0,...,4.491224,88.801147,1.0,1157.250000,4152.0,8.639500e+13,7.0,1.0,47.0,58391429
3,311959.0,311959.0,311959.0,311959.0,311959.0,311959.0,311959.0,311959.0,311959.0,311959.0,...,4.054967,89.521629,0.0,2648.500000,4181.0,8.639500e+13,7.0,3.0,67.0,2ca2206f
4,377160.0,377160.0,377160.0,377160.0,377160.0,377160.0,377160.0,377160.0,377160.0,377160.0,...,5.087605,89.960457,1.0,2408.199951,4133.0,8.639500e+13,7.0,1.0,31.0,19455336
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
991,424140.0,424140.0,424140.0,424140.0,424140.0,424140.0,424140.0,424140.0,424140.0,424140.0,...,3.816627,89.300941,1.0,2216.800049,4192.0,8.639500e+13,7.0,2.0,46.0,43a7386d
992,72533.0,72533.0,72533.0,72533.0,72533.0,72533.0,72533.0,72533.0,72533.0,72533.0,...,2.631471,89.823715,0.0,2255.500000,4171.0,8.639500e+13,7.0,1.0,149.0,2840643b
993,401964.0,401964.0,401964.0,401964.0,401964.0,401964.0,401964.0,401964.0,401964.0,401964.0,...,4.914551,89.513603,1.0,2561.399902,4184.0,8.639500e+13,7.0,3.0,115.0,1b329556
994,401880.0,401880.0,401880.0,401880.0,401880.0,401880.0,401880.0,401880.0,401880.0,401880.0,...,5.241471,89.762779,1.0,2623.000000,4187.0,8.639500e+13,7.0,2.0,85.0,62b873a2


In [29]:
# Left-join the time-series statistics onto the tabular rows by id, then drop
# the key. Training rows with no 'sii' label are discarded.
train = (
    train_df.merge(train_ts, how="left", on='id')
    .drop(columns='id')
    .dropna(subset=['sii'])
)
test = test_df.merge(test_ts, how="left", on='id').drop(columns='id')

In [30]:
# Summary statistics of the merged training table (rows without 'sii' already removed).
train.describe()

Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,...,Stat_86,Stat_87,Stat_88,Stat_89,Stat_90,Stat_91,Stat_92,Stat_93,Stat_94,Stat_95
count,2736.0,2736.0,2342.0,2527.0,2530.0,2572.0,483.0,2478.0,2486.0,2478.0,...,996.0,996.0,996.0,996.0,996.0,996.0,996.0,996.0,996.0,996.0
mean,10.23867,0.364401,65.159266,19.125667,55.897051,87.839592,26.625259,69.755044,81.823411,117.127926,...,1.875645,3.674156,88.888246,0.650602,2335.652809,4180.122239,86289800000000.0,6.933735,2.615462,78.834337
std,3.427982,0.48135,11.81105,4.909305,7.394938,43.353079,5.230213,13.790203,13.769816,17.221707,...,0.911899,1.459785,3.313411,0.477019,898.044846,93.299368,898497100000.0,0.504877,1.158635,86.447984
min,5.0,0.0,25.0,0.0,36.0,0.0,19.0,11.0,27.0,49.0,...,0.167161,0.140138,11.565893,0.0,26.5,3996.0,69805000000000.0,2.0,1.0,-119.0
25%,8.0,0.0,59.0,15.773447,50.05,57.2,23.0,61.0,73.0,107.0,...,1.230632,2.790156,88.972979,0.0,2512.600098,4170.0,86395000000000.0,7.0,2.0,30.0
50%,10.0,0.0,65.0,17.81901,55.0,75.8,26.0,68.0,81.0,114.0,...,1.740934,3.806256,89.377281,1.0,2613.625,4180.0,86395000000000.0,7.0,3.0,51.0
75%,12.0,1.0,75.0,21.172311,61.75,111.45,29.0,76.0,91.0,125.0,...,2.234637,4.593709,89.651743,1.0,2637.0,4187.0,86395000000000.0,7.0,4.0,90.0
max,22.0,1.0,95.0,46.102914,78.5,315.0,50.0,179.0,138.0,203.0,...,8.125557,11.3262,89.98114,1.0,20445.5,6000.0,86395000000000.0,7.0,4.0,748.0


In [31]:
def preprocess_data(df,train_data=False):
    """Impute missing values in ``df`` in place and return it.

    Numeric columns are filled with their own median. Object columns are
    filled with their most frequent value. The statistics are computed from
    the frame that is passed in, so calling this separately on train and test
    imputes each with its own medians/modes.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to impute. It is modified in place.
    train_data : bool, optional
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, after imputation. Columns that contain no
        non-missing value at all are left untouched, because there is no
        median or mode to fill with.
    """
    # Fill numeric NaNs with the column median.
    num_cols = df.select_dtypes(include=np.number).columns
    df[num_cols] = df[num_cols].fillna(df[num_cols].median())

    # Fill categorical (object) NaNs with the most frequent value.
    cat_cols = df.select_dtypes(include='object').columns
    for col in cat_cols:
        modes = df[col].mode()
        # mode() is empty for an all-NaN column; indexing it would raise.
        if modes.empty:
            continue
        df[col] = df[col].fillna(modes.iloc[0])

    return df

# Impute each frame, then re-wrap the result in a DataFrame.
train, test = (pd.DataFrame(preprocess_data(frame)) for frame in (train, test))

In [32]:
def feature_engineering(df):
    """Append derived physical and fitness features to ``df`` in place.

    Added columns, in this order:

    * ``Physical-BMI_Height_Ratio`` and ``Physical-Weight_Height_Ratio``:
      BMI and weight divided by height.
    * ``Physical-<X>_Squared`` for BMI, Height, Waist_Circumference, HeartRate.
    * ``FitnessGram_Total``: ``Fitness_Endurance-Time_Mins`` plus the seven
      ``FGC-FGC_*`` columns.
    * ``FitnessGram_ZoneTotal``: ``Fitness_Endurance-Max_Stage`` plus the
      seven ``FGC-FGC_*_Zone`` columns.

    Returns the same ``df`` object.
    """
    height = df['Physical-Height']

    # Ratios relative to height.
    for stem in ('BMI', 'Weight'):
        df[f'Physical-{stem}_Height_Ratio'] = df[f'Physical-{stem}'] / height

    # Squared terms.
    for stem in ('BMI', 'Height', 'Waist_Circumference', 'HeartRate'):
        df[f'Physical-{stem}_Squared'] = df[f'Physical-{stem}'] ** 2

    fgc_tests = ('CU', 'GSND', 'GSD', 'PU', 'SRL', 'SRR', 'TL')

    # Accumulate strictly left to right so the floating-point result is
    # the same as a chained `a + b + c + ...` expression.
    raw_total = df['Fitness_Endurance-Time_Mins']
    for test_name in fgc_tests:
        raw_total = raw_total + df[f'FGC-FGC_{test_name}']
    df['FitnessGram_Total'] = raw_total

    zone_total = df['Fitness_Endurance-Max_Stage']
    for test_name in fgc_tests:
        zone_total = zone_total + df[f'FGC-FGC_{test_name}_Zone']
    df['FitnessGram_ZoneTotal'] = zone_total

    return df

# Add the derived features to both frames (train first, then test).
train, test = feature_engineering(train), feature_engineering(test)

In [33]:
# Peek at the first rows after imputation and feature engineering.
train.head()

Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,...,Stat_94,Stat_95,Physical-BMI_Height_Ratio,Physical-Weight_Height_Ratio,Physical-BMI_Squared,Physical-Height_Squared,Physical-Waist_Circumference_Squared,Physical-HeartRate_Squared,FitnessGram_Total,FitnessGram_ZoneTotal
0,5,0,51.0,16.877316,46.0,50.8,26.0,68.0,81.0,114.0,...,3.0,51.0,0.366898,1.104348,284.843785,2116.0,676.0,6561.0,66.2,10.0
1,9,0,65.0,14.03559,48.0,46.0,22.0,75.0,70.0,122.0,...,3.0,51.0,0.292408,0.958333,196.997795,2304.0,484.0,4900.0,80.2,11.0
2,10,1,71.0,16.648696,56.5,75.6,26.0,65.0,94.0,117.0,...,3.0,51.0,0.294667,1.338053,277.17908,3192.25,676.0,8836.0,83.9,12.0
3,9,0,71.0,18.292347,56.0,81.6,26.0,60.0,97.0,117.0,...,3.0,85.0,0.326649,1.457143,334.609957,3136.0,676.0,9409.0,93.2,12.0
5,13,1,50.0,22.279952,59.5,112.2,26.0,60.0,73.0,102.0,...,3.0,91.0,0.374453,1.885714,496.39626,3540.25,676.0,5329.0,88.4,11.0


In [35]:
def quadratic_weighted_kappa(y_true, y_pred):
    """Return Cohen's kappa between ``y_true`` and ``y_pred`` with quadratic weights."""
    kappa = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return kappa

def threshold_Rounder(oof_non_rounded, thresholds):
    """Map continuous predictions to the classes 0..3 using three cut points.

    A value gets class ``k`` for the first ``k`` in (0, 1, 2) with
    ``value < thresholds[k]``, and class 3 when no comparison holds.
    The cut points are tested in the given order, so they do not have
    to be sorted.
    """
    below_cut = [
        oof_non_rounded < thresholds[0],
        oof_non_rounded < thresholds[1],
        oof_non_rounded < thresholds[2],
    ]
    # np.select takes the choice of the first condition that is True.
    return np.select(below_cut, [0, 1, 2], default=3)

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    """Objective for the threshold search: the negated quadratic weighted kappa.

    The continuous predictions are binned with ``thresholds`` and scored
    against ``y_true``. The sign is flipped so that a minimiser maximises kappa.
    """
    binned = threshold_Rounder(oof_non_rounded, thresholds)
    score = quadratic_weighted_kappa(y_true, binned)
    return -score

def _h2o_predict_flat(model, frame):
    """Predict on a pandas frame with an H2O model and return a flat NumPy array."""
    predictions = model.predict(h2o.H2OFrame(frame))
    return predictions.as_data_frame().values.reshape(-1)


def TrainML(models_class, test_data, train_data):
    """Score an already-trained H2O model per CV fold, tune thresholds, build a submission.

    Parameters
    ----------
    models_class : H2O model
        A fitted model exposing ``predict(H2OFrame)``. Despite the name, this is
        a model instance, not a class.
    test_data : pd.DataFrame
        Feature frame to predict for the submission.
    train_data : pd.DataFrame
        Training frame containing the ``sii`` target column.

    Returns
    -------
    tuple
        ``(submission, KappaOPtimizer)``. ``submission`` is a DataFrame with
        columns ``id`` (taken from the global ``sample``) and ``sii``.
        ``KappaOPtimizer`` is the ``scipy.optimize.minimize`` result whose
        ``x`` holds the tuned thresholds.

    Raises
    ------
    AssertionError
        If the Nelder-Mead threshold search does not report success.

    Notes
    -----
    NOTE(review): the model is not refit inside the fold loop. When
    ``models_class`` was trained on ``train_data`` itself (as the AutoML cell
    in this notebook does), the "validation" rows were seen during training.
    The reported validation QWK and the tuned thresholds are then optimistic
    and are not true out-of-fold estimates.
    """
    X = train_data.drop(['sii'], axis=1)
    y = train_data['sii']

    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

    train_S = []
    test_S = []

    oof_non_rounded = np.zeros(len(y), dtype=float)
    test_preds = np.zeros((len(test_data), n_splits))

    # The model is the same in every fold, so the test prediction is
    # loop-invariant: compute it once and reuse it for each fold column.
    test_pred = _h2o_predict_flat(models_class, test_data)

    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X,y), desc="Training Folds", total = n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        # Use the model passed in, not a notebook-level global.
        y_train_pred = _h2o_predict_flat(models_class, X_train)
        y_val_pred = _h2o_predict_flat(models_class, X_val)

        oof_non_rounded[test_idx] = y_val_pred
        y_val_pred_rounded = y_val_pred.round(0).astype(int)

        # `.round(0)` is a call; subscripting the method (`round[0]`) raises TypeError.
        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)

        train_S.append(train_kappa)
        test_S.append(val_kappa)

        test_preds[:, fold] = test_pred

        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)

    print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")

    # Search the three cut points that maximise QWK on the collected predictions.
    KappaOPtimizer = minimize(evaluate_predictions, x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded), method='Nelder-Mead') # Nelder-Mead | # Powell
    assert KappaOPtimizer.success, "Optimization did not converge."

    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOPtimizer.x)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    # Average the per-fold test columns, then bin with the tuned thresholds.
    tpm = test_preds.mean(axis=1)
    tpTuned = threshold_Rounder(tpm, KappaOPtimizer.x)

    submission = pd.DataFrame({
        'id': sample['id'],
        'sii': tpTuned
    })

    return submission,KappaOPtimizer
        

In [None]:
# Initialise H2O and upload the engineered training frame (target 'sii' included).
h2o.init()
train_data = h2o.H2OFrame(train)

# AutoML search capped at 5400 seconds; the best model is read from `aml` later.
aml = H2OAutoML(seed=5, max_runtime_secs=5400)
aml.train(training_frame=train_data, y='sii')

In [None]:
# Keep a handle on the AutoML leaderboard (assigned only; this cell displays nothing).
leaderboard = aml.leaderboard

In [None]:
# Take the AutoML leader, score it fold by fold and build the submission.
best_model = aml.leader
Submission, KappaOPtimizer = TrainML(
    models_class=best_model,
    test_data=test,
    train_data=train,
)
# Tuned cut points found by the threshold search.
print(KappaOPtimizer.x)

In [None]:
# Write the submission file without the DataFrame index column.
Submission.to_csv('submission.csv', index=False)

In [None]:
# Class distribution of the predicted 'sii' values (printed because it is not the cell's last expression).
print(Submission['sii'].value_counts())
# First 20 submission rows, shown as the cell output.
Submission.head(20)