# Student Performance from Game Play Using TensorFlow Decision Forests

## Import the Required Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from xgboost import XGBClassifier
from sklearn.model_selection import KFold, GroupKFold
from sklearn.metrics import f1_score

## Load the Dataset

In [2]:
# Reference: https://www.kaggle.com/competitions/predict-student-performance-from-game-play/discussion/384359
dtypes={
    'elapsed_time':np.int32,
    'event_name':'category',
    'name':'category',
    'level':np.uint8,
    
    'room_coor_x':np.float32,
    'room_coor_y':np.float32,
    'screen_coor_x':np.float32,
    'screen_coor_y':np.float32,
    
    'hover_duration':np.float32,
    
    'text':'category',
    'fqid':'category',
    'room_fqid':'category',
    'text_fqid':'category',

    'fullscreen': np.int32,
    'hq': np.int32,
    'music': np.int32,

    'level_group':'category'}

dataset_df = pd.read_csv('/kaggle/input/predict-student-performance-from-game-play/train.csv', dtype=dtypes)

## Load the labels

In [3]:
labels = pd.read_csv('/kaggle/input/predict-student-performance-from-game-play/train_labels.csv')

In [4]:
labels['session'] = labels.session_id.apply(lambda x: int(x.split('_')[0]) )
labels['q'] = labels.session_id.apply(lambda x: int(x.split('_')[-1][1:]) )

# Prepare the dataset

In [5]:
CATEGORICAL = ['event_name', 'name', 'fqid', 'room_fqid', 'text_fqid']
# NUMERICAL = ['page', 'room_coor_x', 'room_coor_y', 'screen_coor_x', 'screen_coor_y',
#             'hover_duration', 'elapsed_time_diff']

NUMERICAL = ['page', 'location_x_diff', 'location_y_diff','hover_duration']

OPTIONAL = ['music', 'fullscreen', 'hq']

# Lots..
NAMES = dataset_df['name'].unique()
EVENTS = dataset_df['event_name'].unique()

TEXT_LISTS = dataset_df['text'].unique()
FQID_LISTS = dataset_df['fqid'].unique()

In [6]:
# df.fillna(df.mean()), df.where(pd.notnull(df), df.mean(), axis='columns')
def feature_engineer(dataset_df):
    dataset_df['elapsed_time_diff'] = (
        (dataset_df['elapsed_time'] - dataset_df['elapsed_time'].shift(1))
        .fillna(0)
        .clip(lower=0, upper=1e9)
    )
    dataset_df['location_x_diff'] = (
        (dataset_df['room_coor_x'] - dataset_df['room_coor_x'].shift(1))
        .abs()
    )
    dataset_df['location_y_diff'] = (
        (dataset_df['room_coor_y'] - dataset_df['room_coor_y'].shift(1))
        .abs()
    )
    
    dfs = []
    
    # NUMERICAL
    for c in NUMERICAL:
        tmp = dataset_df.groupby(['session_id','level_group'])[c].agg('mean')
        tmp.name = tmp.name + '_mean'
        dfs.append(tmp)
    for c in NUMERICAL:
        tmp = dataset_df.groupby(['session_id','level_group'])[c].agg('std')
        tmp.name = tmp.name + '_std'
        dfs.append(tmp)
    for c in NUMERICAL:
        tmp = dataset_df.groupby(['session_id','level_group'])[c].agg('sum')
        tmp.name = tmp.name + '_sum'
        dfs.append(tmp)
    for c in NUMERICAL:
        tmp = dataset_df.groupby(['session_id','level_group'])[c].agg('max')
        tmp.name = tmp.name + '_max'
        dfs.append(tmp)
    for c in NUMERICAL:
        tmp = dataset_df.groupby(['session_id','level_group'])[c].agg('min')
        tmp.name = tmp.name + '_min'
        dfs.append(tmp)
    
    # EVNETS
    for c in EVENTS: 
        dataset_df[c] = (dataset_df.event_name == c).astype('int8')
    for c in EVENTS: 
        tmp = dataset_df[dataset_df[c] == 1].groupby(['session_id', 'level_group'])[c].sum()
        tmp.name = c + '_count'
        dfs.append(tmp)
    for c in EVENTS: 
        tmp = dataset_df[dataset_df[c] == 1].groupby(['session_id', 'level_group'])[c].std()
        tmp.name = c + '_std'
        dfs.append(tmp)
    for c in EVENTS: 
        tmp = dataset_df[dataset_df[c] == 1].groupby(['session_id', 'level_group'])[c].mean()
        tmp.name = c + '_mean'
        dfs.append(tmp)
    for c in EVENTS: 
        tmp = dataset_df[dataset_df[c] == 1].groupby(['session_id', 'level_group'])[c].max()
        tmp.name = c + '_max'
        dfs.append(tmp)
    for c in EVENTS: 
        tmp = dataset_df[dataset_df[c] == 1].groupby(['session_id', 'level_group'])[c].min()
        tmp.name = c + '_min'
        dfs.append(tmp)
        
    for c in EVENTS:
        tmp = dataset_df[dataset_df[c] == 1].groupby(['session_id', 'level_group'])['elapsed_time_diff'].agg('mean')
        tmp.name = c + '_time_average'
        dfs.append(tmp)
        
        
    #OPTIONALS
#     for c in OPTIONAL:
#         tmp = dataset_df.groupby(['session_id','level_group'])[c].agg('mean')
#         tmp.name = tmp.name + '_mean'
#         dfs.append(tmp)
#     for c in OPTIONAL:
#         tmp = dataset_df.groupby(['session_id','level_group'])[c].agg('std')
#         tmp.name = tmp.name + '_std'
#         dfs.append(tmp)
#     for c in OPTIONAL:
#         tmp = dataset_df.groupby(['session_id','level_group'])[c].agg('sum')
#         tmp.name = tmp.name + '_sum'
#         dfs.append(tmp)
#     for c in OPTIONAL:
#         tmp = dataset_df.groupby(['session_id','level_group'])[c].agg('min')
#         tmp.name = tmp.name + '_min'
#         dfs.append(tmp)
#     for c in OPTIONAL:
#         tmp = dataset_df.groupby(['session_id','level_group'])[c].agg('max')
#         tmp.name = tmp.name + '_max'
#         dfs.append(tmp)   
        
    dataset_df = pd.concat(dfs,axis=1)
    dataset_df = dataset_df.fillna(-1)
    dataset_df = dataset_df.reset_index()
    dataset_df = dataset_df.set_index('session_id')
    return dataset_df

In [7]:
dataset_df = feature_engineer(dataset_df)

In [8]:
dataset_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 70686 entries, 20090312431273200 to 22100221145014656
Data columns (total 87 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   level_group                      70686 non-null  object 
 1   page_mean                        70686 non-null  float64
 2   location_x_diff_mean             70686 non-null  float32
 3   location_y_diff_mean             70686 non-null  float32
 4   hover_duration_mean              70686 non-null  float32
 5   page_std                         70686 non-null  float64
 6   location_x_diff_std              70686 non-null  float64
 7   location_y_diff_std              70686 non-null  float64
 8   hover_duration_std               70686 non-null  float64
 9   page_sum                         70686 non-null  float64
 10  location_x_diff_sum              70686 non-null  float32
 11  location_y_diff_sum              70686 non-null  flo

In [9]:
# corr = dataset_df.corr()

# plt.figure(figsize=(15,15)) 
# sns.heatmap(corr, annot=True, cmap='Blues')
# plt.title('Correlation Heatmap')
# plt.show()

In [10]:
FEATURES = [c for c in dataset_df.columns if c != 'level_group']
ALL_USERS = dataset_df.index.unique()

gkf = GroupKFold(n_splits=5)
oof = pd.DataFrame(data=np.zeros((len(ALL_USERS),18)), index=ALL_USERS)
models = {}
evaluation_dict ={}

In [11]:
# ALL_USERS

In [12]:
for i, (train_index, test_index) in enumerate(gkf.split(X=dataset_df, groups=dataset_df.index)):
    print('-'*25)
    print('    <<<  Fold ',i+1, ' >>>')
    print('-'*25)
    
    # default: 0.684 ~ 0.685
    xgb_params = {
        'objective' : 'binary:logistic',
        'eval_metric':'logloss',
        'learning_rate': 0.02,
        'max_depth': 5,
        'n_estimators': 1000,
        'early_stopping_rounds': 50,
        'tree_method':'hist',
        'subsample':0.8,
        'colsample_bytree': 0.4,
        'use_label_encoder' : False
    }

    xgb_params_1 = {
        'objective' : 'binary:logistic',
        'eval_metric':'logloss',
        'learning_rate': 0.01,
        'max_depth': 5,
        'n_estimators': 1000,
        'early_stopping_rounds': 50,
        'tree_method':'hist',
        'subsample':0.5,
        'colsample_bytree': 1,
        'use_label_encoder' : False,
        
        ### Parameters Tuning.. 
        'gamma': 5,
        'min_child_weight': 10,
        'colsample_bytree': 0.5
    }
    
    xgb_params_2 = {
        'booster': 'gbtree',
        'tree_method': 'hist',
        'objective': 'binary:logistic',
        'eval_metric':'logloss',
        'learning_rate': 0.02,
        'alpha': 8,
        'max_depth': 4,
        'subsample':0.8,
        'colsample_bytree': 0.5,
        'seed': 42
    }
    
    for q_no in range(1,19):

        # Select level group for the question based on the q_no.
        if q_no<=3: grp = '0-4'
        elif q_no<=13: grp = '5-12'
        elif q_no<=22: grp = '13-22'


        # TRAIN DATA
        train_x = dataset_df.iloc[train_index]
        train_x = train_x.loc[train_x.level_group == grp]
        train_users = train_x.index.values

        train_y = labels.loc[labels.q==q_no].set_index('session').loc[train_users]
        
        # VALID DATA
        valid_x = dataset_df.iloc[test_index]
        valid_x = valid_x.loc[valid_x.level_group == grp]
        valid_users = valid_x.index.values
        valid_y = labels.loc[labels.q==q_no].set_index('session').loc[valid_users]

        # We will now create the Gradient Boosted Trees Model with default settings. 
        # By default the model is set to train for a classification task.
        xgb = XGBClassifier(**xgb_params)
        xgb.fit(train_x[FEATURES].astype('float32'), train_y['correct'],
                eval_set=[ (valid_x[FEATURES].astype('float32'), valid_y['correct']) ],
                verbose=0)
        print(f'{q_no}({xgb.best_ntree_limit}), ',end='')
        
        # SAVE MODEL, PREDICT VALID OOF
        models[f'{grp}_{q_no}'] = xgb
        oof.loc[valid_users, q_no-1] = xgb.predict_proba(valid_x[FEATURES].astype('float32'))[:,1]  
    print()

-------------------------
    <<<  Fold  1  >>>
-------------------------
1(512), 2(366), 3(351), 4(350), 5(351), 6(271), 7(254), 8(240), 9(419), 10(575), 11(248), 12(511), 13(393), 14(359), 15(646), 16(191), 17(134), 18(341), 
-------------------------
    <<<  Fold  2  >>>
-------------------------
1(453), 2(358), 3(438), 4(452), 5(275), 6(370), 7(261), 8(172), 9(306), 10(281), 11(370), 12(278), 13(307), 14(363), 15(583), 16(265), 17(335), 18(354), 
-------------------------
    <<<  Fold  3  >>>
-------------------------
1(443), 2(334), 3(364), 4(310), 5(372), 6(211), 7(314), 8(177), 9(368), 10(296), 11(322), 12(189), 13(258), 14(423), 15(431), 16(237), 17(286), 18(393), 
-------------------------
    <<<  Fold  4  >>>
-------------------------
1(364), 2(325), 3(328), 4(429), 5(369), 6(336), 7(372), 8(285), 9(427), 10(336), 11(301), 12(253), 13(317), 14(321), 15(397), 16(207), 17(138), 18(468), 
-------------------------
    <<<  Fold  5  >>>
-------------------------
1(560), 2(348)

In [13]:
true = oof.copy()
for k in range(18):
    # GET TRUE LABELS
    tmp = labels.loc[labels.q == k+1].set_index('session').loc[ALL_USERS]
    true[k] = tmp.correct.values

In [14]:
# FIND BEST THRESHOLD TO CONVERT PROBS INTO 1s AND 0s
scores = []; thresholds = []
best_score = 0; best_threshold = 0

for threshold in np.arange(0.4,0.81,0.01):
    print(f'{threshold:.02f}, ',end='')
    preds = (oof.values.reshape((-1))>threshold).astype('int')
    m = f1_score(true.values.reshape((-1)), preds, average='macro')   
    scores.append(m)
    thresholds.append(threshold)
    if m>best_score:
        best_score = m
        best_threshold = threshold

0.40, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.50, 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.60, 0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67, 0.68, 0.69, 0.70, 0.71, 0.72, 0.73, 0.74, 0.75, 0.76, 0.77, 0.78, 0.79, 0.80, 

In [15]:
for k in range(18):
        
    # COMPUTE F1 SCORE PER QUESTION
    m = f1_score(true[k].values, (oof[k].values>best_threshold).astype('int'), average='macro')
    print(f'Q{k}: F1 =',m)
    
# COMPUTE F1 SCORE OVERALL
m = f1_score(true.values.reshape((-1)), (oof.values.reshape((-1))>best_threshold).astype('int'), average='macro')
print('==> Overall F1 =',m)

Q0: F1 = 0.6605540452030019
Q1: F1 = 0.498457009461083
Q2: F1 = 0.5092449749215908
Q3: F1 = 0.649985755748627
Q4: F1 = 0.5942457199551855
Q5: F1 = 0.6156240547042747
Q6: F1 = 0.5982360760359031
Q7: F1 = 0.5466537144682196
Q8: F1 = 0.6086523696929441
Q9: F1 = 0.5384789825842065
Q10: F1 = 0.5996771868102845
Q11: F1 = 0.5051779634002194
Q12: F1 = 0.45114813188153485
Q13: F1 = 0.6187648251715543
Q14: F1 = 0.5457819916460864
Q15: F1 = 0.47308277784152536
Q16: F1 = 0.5505850488933345
Q17: F1 = 0.49396030134730734
==> Overall F1 = 0.6863881059405351


In [16]:
# Reference
# https://www.kaggle.com/code/philculliton/basic-submission-demo
# https://www.kaggle.com/code/cdeotte/random-forest-baseline-0-664/notebook

import jo_wilder
env = jo_wilder.make_env()
iter_test = env.iter_test()

limits = {'0-4':(1,4), '5-12':(4,14), '13-22':(14,19)}

for (test, sample_submission) in iter_test:
    # FEATURE ENGINEER TEST DATA
    df = feature_engineer(test)
    # INFER TEST DATA
    grp = test.level_group.values[0]
    a,b = limits[grp]
    for t in range(a,b):
        xgb = models[f'{grp}_{t}']
        p = xgb.predict_proba(df[FEATURES].astype('float32'))[0,1]
        mask = sample_submission.session_id.str.contains(f'q{t}')
        sample_submission.loc[mask,'correct'] = int( p > best_threshold )
    
    env.predict(sample_submission)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
