In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import polars as pl
import seaborn as sns

import gc

import warnings
warnings.filterwarnings("ignore")

from lightgbm import LGBMClassifier
from lightgbm import Booster

from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

In [2]:
# Input locations (Kaggle dataset mounts).
# One pre-processed event table is stored per level group (0-4, 5-12, 13-22).
TRAIN_0_4_PATH = "/kaggle/input/prepareddata/train_0_4.pkl"
TRAIN_5_12_PATH = "/kaggle/input/prepareddata/train_5_12.pkl"
TRAIN_13_22_PATH = "/kaggle/input/prepareddata/train_13_22.pkl"
# Competition labels: one row per "<session>_q<question>" identifier.
TRAIN_LABELS_PATH = "/kaggle/input/predict-student-performance-from-game-play/train_labels.csv"

In [3]:
# Load the three per-level-group event tables.
# NOTE(review): unpickling can execute arbitrary code; only load these files
# from a dataset you trust.
# NOTE(review): the frames are presumably produced by the same transformations
# as createPreparedFeatures in this notebook -- confirm against the notebook
# that generated the pickles.
train_raw_0_4 = pd.read_pickle(TRAIN_0_4_PATH)
train_raw_5_12 = pd.read_pickle(TRAIN_5_12_PATH)
train_raw_13_22 = pd.read_pickle(TRAIN_13_22_PATH)

In [4]:
targets = pd.read_csv(TRAIN_LABELS_PATH)

# Each label id is split on '_': the text before the first underscore is the
# integer session, and the text after the last underscore (minus its leading
# character) is the integer question number.
targets['session'] = targets['session_id'].map(
    lambda label_id: int(label_id.partition('_')[0])
)
targets['q'] = targets['session_id'].map(
    lambda label_id: int(label_id.rpartition('_')[2][1:])
)

In [5]:
# Numeric event columns that receive per-session summary statistics.
NUM_COLS = [
    'level',
    'page',
    'hover_duration',
    'room_coor_x',
    'screen_coor_x',
    'room_coor_y',
    'screen_coor_y',
    'delta_absolute_coor_x',
    'delta_absolute_coor_y',
    'absolute_coor_x',
    'absolute_coor_y',
    'radius_absolute_coor',
    'tangent_absolute_coor',
    'delta_radius_absolute_coor',
    'delta_tangent_absolute_coor',
    'elapsed_time_diff',
]

# Categorical event columns that receive a per-session distinct-value count.
CAT_COLS = ['full_event_name', "text", "event_name"]

# Distinct full_event_name values observed in each level group's training
# frame; they determine which per-event feature columns are generated.
FULL_EVENT_NAME = {
    group_key: raw_frame["full_event_name"].unique().tolist()
    for group_key, raw_frame in (
        ("0_4", train_raw_0_4),
        ("5_12", train_raw_5_12),
        ("13_22", train_raw_13_22),
    )
}

# Levels that get their own elapsed-time aggregates in each level group.
# NOTE(review): level 0 is not listed for '0_4' -- confirm this is intentional.
SUB_LEVELS = {
    '0_4': list(range(1, 5)),
    '5_12': list(range(5, 13)),
    '13_22': list(range(13, 23)),
}

In [6]:
def createDeltaElapsedTimeFeatures(df:pd.DataFrame) -> pd.DataFrame:
    """Replace ``elapsed_time`` with the gap to the previous event.

    The gap is computed within each ``session_id`` when that column is
    present, so the first event of a session gets 0 instead of a difference
    against the last event of the preceding session. Without a ``session_id``
    column the difference is taken over the whole frame.

    Parameters
    ----------
    df : pd.DataFrame
        Event rows with an ``elapsed_time`` column, already in the order in
        which gaps should be measured.

    Returns
    -------
    pd.DataFrame
        A new frame without ``elapsed_time`` and with ``elapsed_time_diff``
        appended as the last column. The input frame is not modified.
    """
    if 'session_id' in df.columns:
        # sort=False: group order is irrelevant, results stay aligned to df's rows.
        gaps = df.groupby('session_id', sort=False)['elapsed_time'].diff(1)
    else:
        gaps = df['elapsed_time'].diff(1)

    # drop-then-assign keeps the original column order (new column last)
    # without writing into the caller's frame.
    return df.drop(columns='elapsed_time').assign(elapsed_time_diff=gaps.fillna(0))

def agg_by_elapsed_time_diff(col_name, iter_col, feature_suffix):
    """Build polars aggregation expressions keyed on the values of one column.

    For every value ``c`` in ``iter_col`` the result contains a row count
    (``{c}_{col_name}_{suffix}``) followed by the std, mean, sum, median and
    max of ``elapsed_time_diff`` restricted to rows where ``col_name == c``
    (``{c}_{col_name}_ET_<stat>_{suffix}``). Expressions are grouped by
    statistic: all counts first, then all stds, and so on.

    Parameters
    ----------
    col_name : str
        Column whose values select the rows for each feature.
    iter_col : iterable
        Values of ``col_name`` to generate features for.
    feature_suffix : str
        Suffix appended to every generated feature name.

    Returns
    -------
    list
        Polars expressions ready to be passed to ``agg``.
    """
    def _is_value(value):
        # Row predicate: col_name equals the given value.
        return pl.col(col_name) == value

    exprs = []

    # Occurrence counts, one per value.
    for value in iter_col:
        exprs.append(
            pl.col(col_name)
            .filter(_is_value(value))
            .count()
            .alias(f"{value}_{col_name}_{feature_suffix}")
        )

    # Elapsed-time statistics, statistic-major to keep the column order stable.
    for stat in ("std", "mean", "sum", "median", "max"):
        for value in iter_col:
            gaps = pl.col("elapsed_time_diff").filter(_is_value(value))
            exprs.append(
                getattr(gaps, stat)().alias(f"{value}_{col_name}_ET_{stat}_{feature_suffix}")
            )

    return exprs

def feature_engineer_pl(x, group, feature_suffix):
    """Aggregate event rows into one feature row per session using polars.

    Features produced, in order:
      * distinct non-null value counts for every column in ``CAT_COLS``;
      * mean, std, min, max, median, 25% and 75% quantiles for every column
        in ``NUM_COLS``;
      * per-``full_event_name`` and per-``level`` counts and elapsed-time
        statistics from ``agg_by_elapsed_time_diff``.

    Parameters
    ----------
    x : pd.DataFrame
        Event rows containing ``session_id`` and the columns listed above.
    group : str
        Key into ``FULL_EVENT_NAME`` and ``SUB_LEVELS`` ("0_4", "5_12", "13_22").
    feature_suffix : str
        Suffix appended to every generated feature name.

    Returns
    -------
    pd.DataFrame
        One row per ``session_id``, sorted by ``session_id``.
    """
    full_event_names = FULL_EVENT_NAME[group]
    levels = SUB_LEVELS[group]

    x = pl.from_pandas(x)

    aggs = [
        pl.col(c).drop_nulls().n_unique().alias(f"{c}_unique_{feature_suffix}")
        for c in CAT_COLS
    ]

    # (feature label, expression builder); the order fixes the column order.
    numeric_stats = [
        ("mean", lambda expr: expr.mean()),
        ("std", lambda expr: expr.std()),
        ("min", lambda expr: expr.min()),
        ("max", lambda expr: expr.max()),
        ("median", lambda expr: expr.median()),
        ("q25", lambda expr: expr.quantile(0.25)),
        ("q75", lambda expr: expr.quantile(0.75)),
    ]
    for label, build in numeric_stats:
        aggs.extend(
            build(pl.col(c)).alias(f"{c}_{label}_{feature_suffix}")
            for c in NUM_COLS
        )

    aggs.extend(agg_by_elapsed_time_diff("full_event_name", full_event_names, feature_suffix))
    aggs.extend(agg_by_elapsed_time_diff("level", levels, feature_suffix))

    # polars renamed DataFrame.groupby to group_by and later removed the old
    # name; use whichever the installed version provides.
    group_by = x.group_by if hasattr(x, "group_by") else x.groupby
    df = group_by(['session_id'], maintain_order=True).agg(aggs).sort("session_id")
    return df.to_pandas()

def createTimeFeatures(df):
    """Add ``month``, ``day`` and ``hour`` columns parsed from ``session_id``.

    The values are read from fixed character positions of the stringified
    session id: characters 2-3 (plus one) give ``month``, 4-5 give ``day``
    and 6-7 give ``hour``. All three are stored as ``uint8``.

    The frame is modified in place and also returned.
    """
    session_digits = df["session_id"].apply(str)

    # (new column, character span inside the id, value added after parsing)
    fields = (
        ("month", slice(2, 4), 1),
        ("day", slice(4, 6), 0),
        ("hour", slice(6, 8), 0),
    )
    for column, span, offset in fields:
        # Bind span/offset as defaults so each lambda keeps its own values.
        parsed = session_digits.apply(
            lambda digits, span=span, offset=offset: int(digits[span]) + offset
        )
        df[column] = parsed.astype(np.uint8)

    return df

In [7]:
def pipeline(df:pd.DataFrame, grp) -> pd.DataFrame:
    """Turn raw event rows of one level group into per-session features.

    Steps: convert ``elapsed_time`` into event-to-event gaps, aggregate the
    events per session (``grp`` is used both as the level-group key and as
    the feature-name suffix), add the time-of-session columns, and index the
    result by ``session_id``.
    """
    with_time_gaps = createDeltaElapsedTimeFeatures(df)
    session_features = feature_engineer_pl(with_time_gaps, group=grp, feature_suffix=grp)
    session_features = createTimeFeatures(session_features)
    return session_features.set_index('session_id')

In [8]:
# Per-session feature tables, one per level group (0-4, 5-12, 13-22).
df1, df2, df3 = (
    pipeline(raw_frame, group_key)
    for raw_frame, group_key in (
        (train_raw_0_4, "0_4"),
        (train_raw_5_12, "5_12"),
        (train_raw_13_22, "13_22"),
    )
)

In [9]:
# Distinct sessions present in the first level group's feature table.
ALL_USERS = df1.index.unique()
n_users = len(ALL_USERS)
print('We will train with', n_users, 'users info')

We will train with 23562 users info


In [10]:
# Hyper-parameters shared by every per-question, per-fold LGBMClassifier.
lgb_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'n_estimators': 400,
    'learning_rate': 0.02,
    'max_depth': 4,
    # NOTE(review): LightGBM documents `alpha` as a huber/quantile-regression
    # parameter; L1 regularisation is `reg_alpha` / `lambda_l1`. Confirm which
    # one was intended -- with a binary objective this value likely has no effect.
    'alpha': 8,
    # NOTE(review): LightGBM documents that row subsampling only takes effect
    # when `subsample_freq` (bagging_freq) is non-zero; it is not set here. Confirm.
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'random_state': 42
}

In [11]:
# Read the selected feature names: the first 18 lines of the file are consumed,
# one per question, and each line is split on whitespace into a list of names.
with open("/kaggle/input/features-boruta/BORUTA_FEATURES.csv", "r") as features_file:
    FEATURES_Q = [features_file.readline().split() for _ in range(18)]

In [12]:
# Train one LightGBM model per (question, fold) and save each booster to disk.
# Only the training part of every K-fold split is used, so the folds act as an
# ensemble of models fitted on different subsets rather than as validation.
n_splits = 5
kf = KFold(n_splits=n_splits)

for q in range(1, 19):
    print(q)
    FEATURES = FEATURES_Q[q-1]

    # Questions 1-3 use the 0-4 features, 4-13 the 5-12 features and
    # 14-18 the 13-22 features.
    if q <= 3:
        df = df1
    elif q <= 13:
        df = df2
    else:
        df = df3

    # Labels for this question indexed by session; computed once per question
    # instead of once per fold.
    labels_q = targets.loc[targets.q == q].set_index('session')['correct']

    for fold, (train_idx, _) in enumerate(kf.split(df)):
        df_train = df.iloc[train_idx]

        # Select labels by session id in the same order as the feature rows.
        # LightGBM pairs X and y by position, so relying on `targets` happening
        # to be ordered like `df` would silently mislabel rows otherwise.
        # A session without a label raises KeyError here.
        train_y = labels_q.loc[df_train.index]

        clf = LGBMClassifier(**lgb_params)
        # `verbose` is no longer accepted by fit() in recent LightGBM releases
        # and only affected evaluation logging, which is unused without eval_set.
        clf.fit(df_train[FEATURES].astype('float32'), train_y)

        clf.booster_.save_model(f'LGBM_question{q}_fold{fold}.lgb')

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18


In [13]:
import jo_wilder_310 as jo_wilder
# Create the competition environment and its test iterator.
# If creation fails, the fallback resets the module's "already called" flag
# and the environment class state, then creates the environment again.
# The fallback reads `env`, so it only works when `env` already exists from an
# earlier execution of this cell; on a fresh kernel a failure in the first
# attempt surfaces as a NameError from the except branch.
try:
    env = jo_wilder.make_env()
    iter_test = env.iter_test()
    print("env made!")
except Exception:
    # `Exception` rather than a bare except so that KeyboardInterrupt and
    # SystemExit are not swallowed and turned into a retry.
    jo_wilder.make_env.__called__ = False
    type(env)._state = type(type(env)._state).__dict__['INIT']
    env = jo_wilder.make_env()
    iter_test = env.iter_test()
    print("env re made!")

env made!


In [14]:
# Reload the saved boosters: models_list[q - 1][fold] is the model trained for
# question q on the given fold (18 questions x 5 folds).
models_list = []
for q in range(1, 19):
    fold_models = []
    for fold in range(5):
        fold_models.append(
            Booster(model_file = f"/kaggle/working/LGBM_question{q}_fold{fold}.lgb")
        )
    models_list.append(fold_models)

In [15]:
def createPreparedFeatures(df:pd.DataFrame) -> pd.DataFrame:
    """Derive the event-level columns consumed by the feature pipeline.

    Rows are sorted by ``session_id`` then ``elapsed_time`` and the following
    columns are added (in this order): ``full_event_name``, the absolute
    coordinates (room + screen), their row-to-row deltas, the radius and
    tangent of the absolute position, and the radius and tangent of the
    delta. ``page`` has missing values mapped to 0 with existing values
    shifted up by one, and missing ``hover_duration`` becomes 0.

    The tangent columns are plain divisions with no guard for a zero
    denominator, so they can contain inf or NaN.

    Returns a new, sorted frame; the input frame is left as it was.
    """
    out = df.sort_values(by=['session_id', 'elapsed_time'])

    abs_x = out["room_coor_x"] + out["screen_coor_x"]
    abs_y = out["room_coor_y"] + out["screen_coor_y"]

    # Differences run over the whole sorted frame; the first row has no
    # predecessor and gets 0.
    step_x = abs_x.diff(1).fillna(0)
    step_y = abs_y.diff(1).fillna(0)

    # Insertion order below fixes the order of the new columns.
    derived = {
        "full_event_name": out["name"].astype("str") + "_" + out["event_name"].astype("str"),
        "absolute_coor_x": abs_x,
        "absolute_coor_y": abs_y,
        "delta_absolute_coor_x": step_x,
        "delta_absolute_coor_y": step_y,
        "radius_absolute_coor": np.sqrt(abs_x.pow(2) + abs_y.pow(2)),
        "tangent_absolute_coor": abs_y.div(abs_x),
        "delta_radius_absolute_coor": np.sqrt(step_x.pow(2) + step_y.pow(2)),
        "delta_tangent_absolute_coor": step_y.div(step_x),
        "page": out["page"].fillna(-1) + 1,
        "hover_duration": out["hover_duration"].fillna(0),
    }
    for column, values in derived.items():
        out[column] = values

    return out

In [16]:
# Half-open question ranges [a, b) answered after each level group, and the
# mapping from the API's level-group label to the key used by the pipeline.
limits = {'0-4':(1,4), '5-12':(4,14), '13-22':(14,19)}
grps = {'0-4':'0_4', '5-12':'5_12', '13-22':'13_22'}

# Not used in this cell; kept in case another cell reads it.
count = 0
# Probability above which a question is predicted as answered correctly.
thresh = 0.62

# Every sample_submission frame received from the API, in arrival order.
samples = []

for (test, sample_submission) in iter_test:
    samples.append(sample_submission)
    grp = test.level_group.values[0]
    a, b = limits[grp]

    # Same preparation and aggregation as used for the training features.
    test = createPreparedFeatures(test)
    test = pipeline(test, grps[grp])

    for q in range(a, b):
        FEATURES = FEATURES_Q[q-1]

        # Cast once and reuse the same matrix for every fold model.
        X = test[FEATURES].astype(np.float32)
        fold_preds = [model.predict(X) for model in models_list[q-1]]
        pred = sum(fold_preds) / len(fold_preds)

        # Match the exact "_q<q>" suffix; a substring test on "q1" would also
        # select q10..q18 whenever those rows are present.
        mask = sample_submission.session_id.str.endswith(f'_q{q}')

        # .item() extracts the single prediction explicitly (and raises if the
        # batch unexpectedly holds more than one session) instead of relying on
        # implicit array-to-scalar conversion.
        sample_submission.loc[mask, 'correct'] = int(pred.item() > thresh)
    env.predict(sample_submission)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


In [17]:
# Sanity check: reload the written submission file, print its shape and show
# up to the first 60 rows.
df = pd.read_csv('submission.csv')
print(df.shape)
df.head(n=60)

(54, 2)


Unnamed: 0,session_id,correct
0,20090109393214576_q1,0
1,20090109393214576_q2,1
2,20090109393214576_q3,1
3,20090109393214576_q4,1
4,20090109393214576_q5,0
5,20090109393214576_q6,1
6,20090109393214576_q7,1
7,20090109393214576_q8,0
8,20090109393214576_q9,1
9,20090109393214576_q10,0
