# Student Performance from Game Play Using TensorFlow Decision Forests

## Import the Required Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from xgboost import XGBClassifier
from sklearn.model_selection import KFold, GroupKFold
from sklearn.metrics import f1_score

## Load the Dataset

In [2]:
# Reference: https://www.kaggle.com/competitions/predict-student-performance-from-game-play/discussion/384359
dtypes={
    'elapsed_time':np.int32,
    'event_name':'category',
    'name':'category',
    'level':np.uint8,
    'room_coor_x':np.float32,
    'room_coor_y':np.float32,
    'screen_coor_x':np.float32,
    'screen_coor_y':np.float32,
    'hover_duration':np.float32,
    'text':'category',
    'fqid':'category',
    'room_fqid':'category',
    'text_fqid':'category',
    'fullscreen':'category',
    'hq':'category',
    'music':'category',
    'level_group':'category'}

dataset_df = pd.read_csv('/kaggle/input/predict-student-performance-from-game-play/train.csv', dtype=dtypes)

## Load the labels

In [3]:
labels = pd.read_csv('/kaggle/input/predict-student-performance-from-game-play/train_labels.csv')

In [4]:
labels['session'] = labels.session_id.apply(lambda x: int(x.split('_')[0]) )
labels['q'] = labels.session_id.apply(lambda x: int(x.split('_')[-1][1:]) )

# Prepare the dataset

In [5]:
CATEGORICAL = ['event_name', 'name', 'fqid', 'room_fqid', 'text_fqid']
NUMERICAL = ['level', 'page', 'hover_duration', 'location_x_diff', 'location_y_diff']

EVENTS = dataset_df['event_name'].unique()
NAMES = dataset_df['name'].unique()

DIFFS = ['location_x_diff', 'location_y_diff']

# Except: 'music', 'fullscreen', 'hq'

In [6]:
def feature_engineer(dataset_df):
    dataset_df['elapsed_time_diff'] = (
        (dataset_df['elapsed_time'] - dataset_df['elapsed_time'].shift(1))
        .fillna(0)
        .clip(lower=0, upper=1e9)
    )
    dataset_df['location_x_diff'] = (
        (dataset_df['room_coor_x'] - dataset_df['room_coor_x'].shift(1))
        .abs()
    )
    dataset_df['location_y_diff'] = (
        (dataset_df['room_coor_y'] - dataset_df['room_coor_y'].shift(1))
        .abs()
    )
    
    dfs = []
    
    for c in NUMERICAL:
        tmp = dataset_df.groupby(['session_id','level_group'])[c].agg('mean')
        tmp.name = tmp.name + '_mean'
        dfs.append(tmp)
    for c in NUMERICAL:
        tmp = dataset_df.groupby(['session_id','level_group'])[c].agg('std')
        tmp.name = tmp.name + '_std'
        dfs.append(tmp)
        
    for c in DIFFS:
        tmp = dataset_df.groupby(['session_id', 'level_group'])[c].sum()
        tmp.name = c + '_sum'
        dfs.append(tmp)
        
    for c in EVENTS: 
        dataset_df[c] = (dataset_df.event_name == c).astype('int8')
    for c in EVENTS: 
        tmp = dataset_df[dataset_df[c] == 1].groupby(['session_id', 'level_group'])[c].sum()
        tmp.name = c + '_count'
        dfs.append(tmp)

    dataset_df = pd.concat(dfs,axis=1)
    dataset_df = dataset_df.fillna(-1)
    dataset_df = dataset_df.reset_index()
    dataset_df = dataset_df.set_index('session_id')
    return dataset_df

In [7]:
dataset_df = feature_engineer(dataset_df)

In [8]:
# corr = dataset_df.corr()

# plt.figure(figsize=(15,15)) 
# sns.heatmap(corr, annot=True, cmap='Blues')
# plt.title('Correlation Heatmap')
# plt.show()

In [9]:
FEATURES = [c for c in dataset_df.columns if c != 'level_group']
ALL_USERS = dataset_df.index.unique()

gkf = GroupKFold(n_splits=5)
oof = pd.DataFrame(data=np.zeros((len(ALL_USERS),18)), index=ALL_USERS)
models = {}
evaluation_dict ={}

In [10]:
ALL_USERS

Int64Index([20090312431273200, 20090312433251036, 20090312455206810,
            20090313091715820, 20090313571836404, 20090314035813970,
            20090314121766812, 20090314221187252, 20090314363702160,
            20090314441803444,
            ...
            22100213081672770, 22100213133089136, 22100215032067016,
            22100215190998610, 22100215241104530, 22100215342220508,
            22100215460321130, 22100217104993650, 22100219442786200,
            22100221145014656],
           dtype='int64', name='session_id', length=23562)

In [11]:
for i, (train_index, test_index) in enumerate(gkf.split(X=dataset_df, groups=dataset_df.index)):
    print('-'*25)
    print('    <<<  Fold ',i+1, ' >>>')
    print('-'*25)
    
    xgb_params = {
        'objective' : 'binary:logistic',
        'eval_metric':'logloss',
        'learning_rate': 0.01,
        'max_depth': 4,
        'n_estimators': 1500,
        'early_stopping_rounds': 50,
        'tree_method':'hist',
        'subsample':0.6,
        'colsample_bytree': 0.5,
        'use_label_encoder' : False
    }
    
    for q_no in range(1,19):

        # Select level group for the question based on the q_no.
        if q_no<=3: grp = '0-4'
        elif q_no<=13: grp = '5-12'
        elif q_no<=22: grp = '13-22'


        # TRAIN DATA
        train_x = dataset_df.iloc[train_index]
        train_x = train_x.loc[train_x.level_group == grp]
        train_users = train_x.index.values
        train_y = labels.loc[labels.q==q_no].set_index('session').loc[train_users]
        
        # VALID DATA
        valid_x = dataset_df.iloc[test_index]
        valid_x = valid_x.loc[valid_x.level_group == grp]
        valid_users = valid_x.index.values
        valid_y = labels.loc[labels.q==q_no].set_index('session').loc[valid_users]

        # We will now create the Gradient Boosted Trees Model with default settings. 
        # By default the model is set to train for a classification task.
        xgb = XGBClassifier(**xgb_params)
        xgb.fit(train_x[FEATURES].astype('float32'), train_y['correct'],
                eval_set=[ (valid_x[FEATURES].astype('float32'), valid_y['correct']) ],
                verbose=0)
        print(f'{q_no}({xgb.best_ntree_limit}), ',end='')
        
        # SAVE MODEL, PREDICT VALID OOF
        models[f'{grp}_{q_no}'] = xgb
        oof.loc[valid_users, q_no-1] = xgb.predict_proba(valid_x[FEATURES].astype('float32'))[:,1]  
    print()

-------------------------
    <<<  Fold  1  >>>
-------------------------
1(530), 2(564), 3(671), 4(764), 5(591), 6(527), 7(385), 8(422), 9(793), 10(603), 11(346), 12(672), 13(664), 14(837), 15(904), 16(308), 17(272), 18(503), 
-------------------------
    <<<  Fold  2  >>>
-------------------------
1(781), 2(741), 3(566), 4(762), 5(704), 6(586), 7(599), 8(271), 9(585), 10(569), 11(495), 12(474), 13(390), 14(998), 15(807), 16(553), 17(459), 18(556), 
-------------------------
    <<<  Fold  3  >>>
-------------------------
1(662), 2(654), 3(542), 4(784), 5(563), 6(540), 7(617), 8(350), 9(478), 10(619), 11(464), 12(419), 13(471), 14(835), 15(716), 16(321), 17(583), 18(618), 
-------------------------
    <<<  Fold  4  >>>
-------------------------
1(630), 2(613), 3(494), 4(902), 5(503), 6(581), 7(488), 8(339), 9(511), 10(697), 11(462), 12(498), 13(639), 14(762), 15(664), 16(364), 17(341), 18(644), 
-------------------------
    <<<  Fold  5  >>>
-------------------------
1(756), 2(662)

In [12]:
true = oof.copy()
for k in range(18):
    # GET TRUE LABELS
    tmp = labels.loc[labels.q == k+1].set_index('session').loc[ALL_USERS]
    true[k] = tmp.correct.values

In [13]:
# FIND BEST THRESHOLD TO CONVERT PROBS INTO 1s AND 0s
scores = []; thresholds = []
best_score = 0; best_threshold = 0

for threshold in np.arange(0.4,0.81,0.01):
    print(f'{threshold:.02f}, ',end='')
    preds = (oof.values.reshape((-1))>threshold).astype('int')
    m = f1_score(true.values.reshape((-1)), preds, average='macro')   
    scores.append(m)
    thresholds.append(threshold)
    if m>best_score:
        best_score = m
        best_threshold = threshold

0.40, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.50, 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.60, 0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67, 0.68, 0.69, 0.70, 0.71, 0.72, 0.73, 0.74, 0.75, 0.76, 0.77, 0.78, 0.79, 0.80, 

In [14]:
for k in range(18):
        
    # COMPUTE F1 SCORE PER QUESTION
    m = f1_score(true[k].values, (oof[k].values>best_threshold).astype('int'), average='macro')
    print(f'Q{k}: F1 =',m)
    
# COMPUTE F1 SCORE OVERALL
m = f1_score(true.values.reshape((-1)), (oof.values.reshape((-1))>best_threshold).astype('int'), average='macro')
print('==> Overall F1 =',m)

Q0: F1 = 0.598848270044749
Q1: F1 = 0.49463795470144134
Q2: F1 = 0.49111776892454156
Q3: F1 = 0.621941167226645
Q4: F1 = 0.5809118253917721
Q5: F1 = 0.606781114426069
Q6: F1 = 0.5904274556688486
Q7: F1 = 0.5358736725422728
Q8: F1 = 0.6007794381046897
Q9: F1 = 0.5212829163796026
Q10: F1 = 0.5881483852721858
Q11: F1 = 0.4959530862295987
Q12: F1 = 0.43102374577850194
Q13: F1 = 0.6077286849763348
Q14: F1 = 0.49787188894124296
Q15: F1 = 0.4648349760943598
Q16: F1 = 0.5287923082440193
Q17: F1 = 0.4898325664485556
==> Overall F1 = 0.678910546029193


In [15]:
# Reference
# https://www.kaggle.com/code/philculliton/basic-submission-demo
# https://www.kaggle.com/code/cdeotte/random-forest-baseline-0-664/notebook

import jo_wilder
env = jo_wilder.make_env()
iter_test = env.iter_test()

limits = {'0-4':(1,4), '5-12':(4,14), '13-22':(14,19)}

for (test, sample_submission) in iter_test:
    # FEATURE ENGINEER TEST DATA
    df = feature_engineer(test)
    # INFER TEST DATA
    grp = test.level_group.values[0]
    a,b = limits[grp]
    for t in range(a,b):
        xgb = models[f'{grp}_{t}']
        p = xgb.predict_proba(df[FEATURES].astype('float32'))[0,1]
        mask = sample_submission.session_id.str.contains(f'q{t}')
        sample_submission.loc[mask,'correct'] = int( p > best_threshold )
    
    env.predict(sample_submission)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
