# Student Performance from Game Play Using TensorFlow Decision Forests

## Import the Required Libraries

In [1]:
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow_decision_forests as tfdf

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from xgboost import XGBClassifier
from sklearn.model_selection import KFold, GroupKFold
from sklearn.metrics import f1_score

## Load the Dataset

In [2]:
# Reference: https://www.kaggle.com/competitions/predict-student-performance-from-game-play/discussion/384359
dtypes={
    'elapsed_time':np.int32,
    'event_name':'category',
    'name':'category',
    'level':np.uint8,
    'room_coor_x':np.float32,
    'room_coor_y':np.float32,
    'screen_coor_x':np.float32,
    'screen_coor_y':np.float32,
    'hover_duration':np.float32,
    'text':'category',
    'fqid':'category',
    'room_fqid':'category',
    'text_fqid':'category',
    'fullscreen':'category',
    'hq':'category',
    'music':'category',
    'level_group':'category'}

dataset_df = pd.read_csv('/kaggle/input/predict-student-performance-from-game-play/train.csv', dtype=dtypes)

## Load the labels

In [3]:
labels = pd.read_csv('/kaggle/input/predict-student-performance-from-game-play/train_labels.csv')

In [4]:
labels['session'] = labels.session_id.apply(lambda x: int(x.split('_')[0]) )
labels['q'] = labels.session_id.apply(lambda x: int(x.split('_')[-1][1:]) )

# Prepare the dataset

In [5]:
# CATEGORICAL = ['event_name', 'name','fqid', 'room_fqid', 'text_fqid']
# NUMERICAL = ['elapsed_time','level','page','room_coor_x', 'room_coor_y', 
#         'screen_coor_x', 'screen_coor_y', 'hover_duration']
# # EVENTS = dataset_df['event_name'].unique()

In [6]:
# # Reference: https://www.kaggle.com/code/cdeotte/random-forest-baseline-0-664/notebook

# def feature_engineer(dataset_df):
#     dfs = []
#     for c in CATEGORICAL:
#         tmp = dataset_df.groupby(['session_id','level_group'])[c].agg('nunique')
#         tmp.name = tmp.name + '_nunique'
#         dfs.append(tmp)
#     for c in NUMERICAL:
#         tmp = dataset_df.groupby(['session_id','level_group'])[c].agg('mean')
#         dfs.append(tmp)
#     for c in NUMERICAL:
#         tmp = dataset_df.groupby(['session_id','level_group'])[c].agg('std')
#         tmp.name = tmp.name + '_std'
#         dfs.append(tmp)
# #     for c in EVENTS: 
# #         dataset_df[c] = (dataset_df.event_name == c).astype('int8')
# #     for c in EVENTS:
# #         tmp = dataset_df[dataset_df[c] == 1].groupby(['session_id', 'level_group'])['elapsed_time_diff'].agg('mean')
# #         tmp.name = c + '_time_average'
# #         dfs.append(tmp)
#     dataset_df = pd.concat(dfs,axis=1)
#     dataset_df = dataset_df.fillna(-1)
#     dataset_df = dataset_df.reset_index()
#     dataset_df = dataset_df.set_index('session_id')
#     return dataset_df

In [7]:
# dataset_df = feature_engineer(dataset_df)
# print("Full prepared dataset shape is {}".format(dataset_df.shape))

In [8]:
tmp = pd.read_csv('/kaggle/input/predict-student-performance-from-game-play/train.csv',usecols=[0])
tmp = tmp.groupby('session_id').session_id.agg('count')

PIECES = 10
CHUNK = int( np.ceil(len(tmp)/PIECES) ) #--> 2357 records

reads = [] #--> prepare empty list
skips = [0] #-->start add 0 because at the first time no need to skip record
for k in range(PIECES): #--> k = 0 - 9
    a = k*CHUNK #--> 0 * 2357 = 0 this for start from record session_id 0
    b = (k+1)*CHUNK #--> 1 * 2357 = 2357 this for end record session_id at 2357
    if b>len(tmp): b=len(tmp) #--> check b (2357) > 2357 or not this try to make b always increase step = 2357
    r = tmp.iloc[a:b].sum() #--> slide by range of records
    reads.append(r) #--> append number of records each time
    skips.append(skips[-1]+r) # append cummulative each time

print(reads)

[2684191, 2631991, 2638304, 2657670, 2644229, 2629801, 2596616, 2602258, 2619995, 2591891]


In [9]:
df_train_resize = pd.read_csv('/kaggle/input/predict-student-performance-from-game-play/train.csv', nrows=reads[0])
print('Train size of first piece:', df_train_resize.shape )
df_train_resize.head()

Train size of first piece: (2684191, 20)


Unnamed: 0,session_id,index,elapsed_time,event_name,name,level,page,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,hover_duration,text,fqid,room_fqid,text_fqid,fullscreen,hq,music,level_group
0,20090312431273200,0,0,cutscene_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,undefined,intro,tunic.historicalsociety.closet,tunic.historicalsociety.closet.intro,0,0,1,0-4
1,20090312431273200,1,1323,person_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,"Whatcha doing over there, Jo?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
2,20090312431273200,2,831,person_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,Just talking to Teddy.,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
3,20090312431273200,3,1147,person_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,I gotta run to my meeting!,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
4,20090312431273200,4,1863,person_click,basic,0,,-412.991405,-159.314686,381.0,494.0,,"Can I come, Gramps?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4


In [10]:
import gc
gc.collect()
df_train_resize.head()

Unnamed: 0,session_id,index,elapsed_time,event_name,name,level,page,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,hover_duration,text,fqid,room_fqid,text_fqid,fullscreen,hq,music,level_group
0,20090312431273200,0,0,cutscene_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,undefined,intro,tunic.historicalsociety.closet,tunic.historicalsociety.closet.intro,0,0,1,0-4
1,20090312431273200,1,1323,person_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,"Whatcha doing over there, Jo?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
2,20090312431273200,2,831,person_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,Just talking to Teddy.,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
3,20090312431273200,3,1147,person_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,I gotta run to my meeting!,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
4,20090312431273200,4,1863,person_click,basic,0,,-412.991405,-159.314686,381.0,494.0,,"Can I come, Gramps?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4


In [11]:
def feature_engineer(train):
    dfs = []
    tmp = train.groupby(['session_id', 'level_group']).elapsed_time.agg({'min', 'max'})
    tmp['time_diff'] = tmp['max']-tmp['min']
    dfs.append(tmp['time_diff'])
    for c in new_features_columns:
        tmp[f'sum_{c}'] = train.groupby(['session_id', 'level_group'])[c].agg('sum')
        dfs.append(tmp[f'sum_{c}'])
    tmp = train.groupby(['session_id', 'level_group']).event_comb.agg('nunique')
    tmp.name = tmp.name + '_nunique'
    dfs.append(tmp)

    df = pd.concat(dfs, axis=1)
    df = df.fillna(-1) # fill na with -1 
    df = df.reset_index()
    df = df.set_index('session_id')
    return df

In [12]:
def cutscene_click(data):
    if data == 'cutscene_click':
        return True
    return False
def person_click(data):
    if data == 'person_click':
        return True
    return False
def navigate_click(data):
    if data == 'navigate_click':
        return True
    return False
def observation_click(data):
    if data == 'observation_click':
        return True
    return False
def notification_click(data):
    if data == 'notification_click':
        return True
    return False
def object_click(data):
    if data == 'object_click':
        return True
    return False
def object_hover(data):
    if data == 'object_hover':
        return True
    return False
def map_hover(data):
    if data == 'map_hover':
        return True
    return False
def map_click(data):
    if data == 'map_click':
        return True
    return False
def checkpoint(data):
    if data == 'checkpoint':
        return True
    return False
def notebook_click(data):
    if data == 'notebook_click':
        return True
    return False

In [13]:
# Change data from quanlitative to quantitative by use dummies
dummies = pd.get_dummies(df_train_resize['event_name'])

In [14]:
dummies.columns

Index(['checkpoint', 'cutscene_click', 'map_click', 'map_hover',
       'navigate_click', 'notebook_click', 'notification_click',
       'object_click', 'object_hover', 'observation_click', 'person_click'],
      dtype='object')

In [15]:
df_train_resize = pd.concat([df_train_resize, dummies], axis=1) # set axis = 1 for concat in columns

In [16]:
new_features_columns = ['cutscene_click',
'person_click',
'navigate_click',
'observation_click',
'notification_click',
'object_click',
'object_hover',
'map_hover',
'map_click',
'checkpoint',
'notebook_click']

In [17]:
all_pieces = []
print(f'Processing train as {PIECES} pieces to avoid memory error...')
for k in range(PIECES):
    print(k, ', ',end='')
    SKIPS = 0
    if k>0: SKIPS = range(1, skips[k]+1) #--> for skiprows parameter start at 1 and if need use range must + 1 because this will exclude end range number
    train = pd.read_csv('/kaggle/input/predict-student-performance-from-game-play/train.csv'
    , nrows=reads[k], skiprows=SKIPS).sort_values(['session_id', 'elapsed_time'])
    train['event_comb'] = train['event_name'] + '_' + train['name']
    df_dummies = pd.get_dummies(train['event_name'])
    room_fqid_encode = pd.get_dummies(train['room_fqid']).rename(columns=lambda x: x.replace('.','_'))
    event_comb_encode = pd.get_dummies(train['event_comb']).rename(columns=lambda x: x.replace('.','_'))
    train = pd.concat([train, df_dummies, room_fqid_encode, event_comb_encode], axis=1)
    
    train['cutscene_click'] = train['event_name'].apply(lambda x: cutscene_click(x))
    train['person_click'] = train['event_name'].apply(lambda x: person_click(x))
    train['navigate_click'] = train['event_name'].apply(lambda x: navigate_click(x))
    train['observation_click'] = train['event_name'].apply(lambda x: observation_click(x))
    train['notification_click'] = train['event_name'].apply(lambda x: notification_click(x))
    train['object_click'] = train['event_name'].apply(lambda x: object_click(x))
    train['object_hover'] = train['event_name'].apply(lambda x: object_hover(x))
    train['map_hover'] = train['event_name'].apply(lambda x: map_hover(x))
    train['map_click'] = train['event_name'].apply(lambda x: map_click(x))
    train['checkpoint'] = train['event_name'].apply(lambda x: checkpoint(x))
    train['notebook_click'] = train['event_name'].apply(lambda x: notebook_click(x))
    
    df = feature_engineer(train)
    all_pieces.append(df)
    
# Concatenate all pieces
print('\n')
del train; gc.collect()
df_tr = pd.concat(all_pieces, axis=0)
print('Shape of all train data after feature engineering:', df_tr.shape)
df_tr.head()

Processing train as 10 pieces to avoid memory error...
0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 

Shape of all train data after feature engineering: (70686, 14)


Unnamed: 0_level_0,level_group,time_diff,sum_cutscene_click,sum_person_click,sum_navigate_click,sum_observation_click,sum_notification_click,sum_object_click,sum_object_hover,sum_map_hover,sum_map_click,sum_checkpoint,sum_notebook_click,event_comb_nunique
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
20090312431273200,0-4,194860,28,22,81,4,8,11,4,4,2,1,0,12
20090312431273200,13-22,435947,60,123,170,3,10,20,13,14,6,1,0,12
20090312431273200,5-12,277750,12,104,103,1,9,28,21,9,8,1,0,13
20090312433251036,0-4,233752,36,18,49,2,5,15,5,3,3,1,2,14
20090312433251036,13-22,2638851,65,145,637,5,14,83,66,186,45,1,50,19


In [18]:
df_tr.shape

(70686, 14)

In [19]:
# Group k Fold split data
ALL_USERS = df_tr.index.unique()

gkf = GroupKFold(n_splits=5)
oof = pd.DataFrame(data=np.zeros((len(ALL_USERS), 18)), index=ALL_USERS) # set all users to 18 columns as each question & set index to session_id
model = {}

In [20]:
oof.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
20090312431273200,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20090312433251036,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20090312455206810,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20090313091715820,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20090313571836404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
FEATURES = [c for c in df_tr.columns if c != 'level_group']

In [22]:
final_list = []
for i, (train_idex, test_index) in enumerate(gkf.split(X= df_tr, groups=df_tr.index)): # we already define index (sesion_id) for group to split here.
    # Here we know which fold contains which index, then use index to slice data in each fold.
    print('-'*25)
    print('--- Fold', i+1)
    print('-'*25)

    xgb_params = {
        'objective' : 'binary:logistic',
        'eval_metric' : 'logloss',
        'learning_rate' : 0.05,
        'max_depth' : 4,
        'n_estimators' : 1000,
        'early_stopping_rounds' : 50,
        'tree_method' : 'hist',
        'subsample' : 0.8,
        'colsample_bytree' : 0.4,
        'use_label_encoder' : False
    }

    # Iterate thru questions 1 thru 18
    # Create data set for train each question
    for t in range(1,19):
        print(t, ',', end= ' ')

        # Use this train data with these questions
        # clarify each question to assign group for slice data in df
        if t<=3: grp = '0-4'
        elif t<=13: grp = '5-12'
        elif t<=22: grp = '13-22'

        # Train data
        train_x = df_tr.iloc[train_idex] # get train from index in each fold
        train_x = train_x.loc[train_x.level_group == grp] # get train only each question that match group, slice this again because data in level session_id & level_grop
        train_users = train_x.index.values # get all users train in each fold
        train_y = labels.loc[labels.q==t].set_index('session').loc[train_users] # get train variable y in each question.
        

        # Valid data
        valid_x = df_tr.iloc[test_index]
        valid_x = valid_x.loc[valid_x.level_group == grp]
        valid_users = valid_x.index.values
        valid_y = labels.loc[labels.q==t].set_index('session').loc[valid_users]
        
#         Train XGBosst
        clf = XGBClassifier(**xgb_params)
        clf.fit(train_x[FEATURES].astype('float32'), train_y['correct'],
            eval_set=[(valid_x[FEATURES].astype('float32'), valid_y['correct'].astype('float32'))],
            verbose=0
        )
        print(f'{t}({clf.best_ntree_limit}), ', end='')
        

        # Save model
        model[f'{grp}_{t}'] = clf # Set model name
        oof.loc[valid_users, t-1] = clf.predict_proba(valid_x[FEATURES].astype('float32'))[:,1] # Assign predict value to oof table in each question
        # above out put is list such as [0.45779951 0.54220049] position 1 (class 0) is 0.45 position 2 (class 1) is 0.54 total is 1
        # we use [:, 1] because we predict correct = 1 is class 1 then set to [:, 1] instead of [:, 0]
print()

-------------------------
--- Fold 1
-------------------------
1 , 1(198), 2 , 2(103), 3 , 3(215), 4 , 4(206), 5 , 5(112), 6 , 6(114), 7 , 7(92), 8 , 8(104), 9 , 9(109), 10 , 10(120), 11 , 11(92), 12 , 12(194), 13 , 13(185), 14 , 14(156), 15 , 15(131), 16 , 16(61), 17 , 17(53), 18 , 18(122), -------------------------
--- Fold 2
-------------------------
1 , 1(178), 2 , 2(174), 3 , 3(147), 4 , 4(195), 5 , 5(88), 6 , 6(107), 7 , 7(114), 8 , 8(69), 9 , 9(122), 10 , 10(178), 11 , 11(119), 12 , 12(131), 13 , 13(137), 14 , 14(146), 15 , 15(242), 16 , 16(98), 17 , 17(71), 18 , 18(113), -------------------------
--- Fold 3
-------------------------
1 , 1(179), 2 , 2(222), 3 , 3(118), 4 , 4(136), 5 , 5(142), 6 , 6(110), 7 , 7(146), 8 , 8(110), 9 , 9(95), 10 , 10(137), 11 , 11(75), 12 , 12(74), 13 , 13(79), 14 , 14(170), 15 , 15(188), 16 , 16(68), 17 , 17(103), 18 , 18(106), -------------------------
--- Fold 4
-------------------------
1 , 1(142), 2 , 2(170), 3 , 3(108), 4 , 4(172), 5 , 5(125),

In [23]:
def assign_group(num):
    if num <= 3:
        return '0-4'
    if num <= 13:
        return '5-12'
    if num <= 22:
        return '13-22'

In [24]:
df_labels_with_group = labels.copy()
df_labels_with_group['level_group'] = labels['q'].apply(lambda x: assign_group(x))

In [25]:
df_labels_with_group.shape

(424116, 5)

In [26]:
final_df = df_labels_with_group.merge(df_tr.reset_index(), left_on=['session', 'level_group'], right_on=['session_id', 'level_group'])

In [27]:
final_df.head()

Unnamed: 0,session_id_x,correct,session,q,level_group,session_id_y,time_diff,sum_cutscene_click,sum_person_click,sum_navigate_click,sum_observation_click,sum_notification_click,sum_object_click,sum_object_hover,sum_map_hover,sum_map_click,sum_checkpoint,sum_notebook_click,event_comb_nunique
0,20090312431273200_q1,1,20090312431273200,1,0-4,20090312431273200,194860,28,22,81,4,8,11,4,4,2,1,0,12
1,20090312431273200_q2,1,20090312431273200,2,0-4,20090312431273200,194860,28,22,81,4,8,11,4,4,2,1,0,12
2,20090312431273200_q3,1,20090312431273200,3,0-4,20090312431273200,194860,28,22,81,4,8,11,4,4,2,1,0,12
3,20090312433251036_q1,0,20090312433251036,1,0-4,20090312433251036,233752,36,18,49,2,5,15,5,3,3,1,2,14
4,20090312433251036_q2,1,20090312433251036,2,0-4,20090312433251036,233752,36,18,49,2,5,15,5,3,3,1,2,14


In [28]:
# Get feature that important
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
features = [i for i in final_df.columns if i not in ['session_id_x', 'level_group', 'session_id_y', 'correct', 'session', 'question_num']]
fs = SelectKBest(score_func=f_classif, k=6)
fs.fit_transform(final_df[features], final_df['correct'])
cols_idxs = fs.get_support(indices=True)
features_df_new = final_df[features].iloc[:,cols_idxs]
features_df_new.head()

Unnamed: 0,sum_person_click,sum_navigate_click,sum_object_click,sum_object_hover,sum_map_hover,sum_map_click
0,22,81,11,4,4,2
1,22,81,11,4,4,2
2,22,81,11,4,4,2
3,18,49,15,5,3,3
4,18,49,15,5,3,3


In [29]:
# Put true labels into dataframe with 18 columns
actual = oof.copy()
actual.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
20090312431273200,0.814023,0.992358,0.96248,0.80396,0.59888,0.843232,0.808693,0.658109,0.752208,0.589045,0.752998,0.871474,0.385429,0.800808,0.601601,0.80769,0.772309,0.983241
20090312433251036,0.653625,0.984896,0.951486,0.843884,0.499431,0.678912,0.722279,0.600522,0.714446,0.417441,0.549464,0.839775,0.174527,0.230233,0.100218,0.618196,0.596327,0.861682
20090312455206810,0.780547,0.984707,0.952548,0.886103,0.731598,0.86397,0.875717,0.73929,0.881353,0.659811,0.809511,0.910012,0.560343,0.812406,0.412442,0.816572,0.757811,0.938821
20090313091715820,0.609834,0.980248,0.935278,0.83228,0.569337,0.796561,0.758157,0.588645,0.733141,0.512893,0.656029,0.867912,0.219691,0.781502,0.546614,0.743575,0.714701,0.971547
20090313571836404,0.88807,0.997666,0.976217,0.935257,0.764201,0.931746,0.886661,0.773056,0.890347,0.696738,0.785812,0.927202,0.477464,0.799701,0.458151,0.765469,0.78287,0.978846


In [30]:
for k in range(18):
    # Get actual labels
    tmp = labels.loc[labels.q==k+1].set_index('session').loc[ALL_USERS]
    actual[k] = tmp.correct.values

In [31]:
actual.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
20090312431273200,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1
20090312433251036,0,1,1,1,0,1,1,0,1,0,0,1,0,1,0,1,0,1
20090312455206810,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1
20090313091715820,0,1,1,1,1,0,1,1,1,0,0,1,0,1,0,1,1,1
20090313571836404,1,1,1,1,1,1,1,1,1,1,1,0,1,0,1,1,1,1


In [32]:
from sklearn.metrics import f1_score
# Find best threshold to convert probs into 1s and 0s
scores = []; thresholds = []
best_score = 0; best_threshold = 0

for threshold in np.arange(0.4, 0.81, 0.01): # np.arange(start, end(but not include), step) 
    # but why use range at 0.4 - 0.8? --> i assume that proper range to identify that good or bad threshold
    print(f'{threshold:.02f}, ', end='')
    preds = (oof.values.reshape((-1))>threshold).astype('int') #get all pred values and check if > threshold then true and covert true to int vice versa with false
    m = f1_score(actual.values.reshape((-1)).astype('int'), preds, average='macro') # make sure that actual & predict same type if not will cause error 
    scores.append(m) # collect data because this use for plot
    thresholds.append(threshold) # collect data because this use for plot
    # this part below just collect best score & threshold
    if m>best_score:
        best_score = m 
        best_threshold = threshold

0.40, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.50, 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.60, 0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67, 0.68, 0.69, 0.70, 0.71, 0.72, 0.73, 0.74, 0.75, 0.76, 0.77, 0.78, 0.79, 0.80, 

In [33]:
# import matplotlib.pylab as plt
# # Plot Threshold vs F1 score
# plt.figure(figsize=(20,5))
# plt.plot(thresholds, scores, '-o', color='blue') # plot all threshold & scores
# plt.scatter([best_threshold], [best_score], color='blue', s=300, alpha=1) # make best point are more bigger
# plt.xlabel('Threshold', size = 14)
# plt.ylabel('Validation F1 score', size = 14)
# plt.title(f'Threshold vs. F1_score with best F1_score = {best_score:.3f} at Best Threshold = {best_threshold:.3f}', size = 18)
# plt.show()

In [34]:
print('When using optimal threshold...')
for k in range(18):
    # Compute F1 score per question
    m = f1_score(actual[k].values, (oof[k].values>best_threshold).astype('int'), average='macro') # Use best_threshold as threshold we choose
    # if values > best_threshold we will set as true or have prob to be correct in question.
    print(f'Q{k}: F1 = {m:.3f}')
 
# Compute F1 score overall
m = f1_score(actual.values.reshape((-1)).astype('int'), (oof.values.reshape((-1))>best_threshold).astype('int'), average='macro')
print(f'Overall F1 Score: {m:.3f}')

When using optimal threshold...
Q0: F1 = 0.604
Q1: F1 = 0.497
Q2: F1 = 0.491
Q3: F1 = 0.609
Q4: F1 = 0.569
Q5: F1 = 0.604
Q6: F1 = 0.584
Q7: F1 = 0.537
Q8: F1 = 0.595
Q9: F1 = 0.511
Q10: F1 = 0.591
Q11: F1 = 0.497
Q12: F1 = 0.433
Q13: F1 = 0.604
Q14: F1 = 0.499
Q15: F1 = 0.462
Q16: F1 = 0.525
Q17: F1 = 0.491
Overall F1 Score: 0.677


### Training

In [36]:
VALID_USER_LIST = valid_x.index.unique()
prediction_df = pd.DataFrame(data=np.zeros((len(VALID_USER_LIST),18)), index=VALID_USER_LIST)

models = {}
evaluation_dict ={}

In [1]:
for q_no in range(1,19):

    # Select level group for the question based on the q_no.
    if q_no<=3: grp = '0-4'
    elif q_no<=13: grp = '5-12'
    elif q_no<=22: grp = '13-22'
    print("### q_no", q_no, "grp", grp)
    
        
    # Filter the rows in the datasets based on the selected level group. 
    train_df = train_x.loc[train_x.level_group == grp]
    train_users = train_df.index.values
    valid_df = valid_x.loc[valid_x.level_group == grp]
    valid_users = valid_df.index.values

    # Select the labels for the related q_no.
    train_labels = labels.loc[labels.q==q_no].set_index('session').loc[train_users]
    valid_labels = labels.loc[labels.q==q_no].set_index('session').loc[valid_users]

    # Add the label to the filtered datasets.
    train_df["correct"] = train_labels["correct"]
    valid_df["correct"] = valid_labels["correct"]

    # There's one more step required before we can train the model. 
    # We need to convert the datatset from Pandas format (pd.DataFrame)
    # into TensorFlow Datasets format (tf.data.Dataset).
    # TensorFlow Datasets is a high performance data loading library 
    # which is helpful when training neural networks with accelerators like GPUs and TPUs.
    # We are omitting `level_group`, since it is not needed for training anymore.
    train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_df.loc[:, train_df.columns != 'level_group'], label="correct")
    valid_ds = tfdf.keras.pd_dataframe_to_tf_dataset(valid_df.loc[:, valid_df.columns != 'level_group'], label="correct")

    # By default the model is set to train for a classification task.
    rfm = tfdf.keras.RandomForestModel(max_depth=15, num_trees=100)
    rfm.compile(metrics=["accuracy"])

    # Train the model.
    rfm.fit(x=train_ds)

    # Store the model
    models[f'{grp}_{q_no}'] = rfm

    # Evaluate the trained model on the validation dataset and store the 
    # evaluation accuracy in the `evaluation_dict`.
    inspector = rfm.make_inspector()
    inspector.evaluation()
    evaluation = rfm.evaluate(x=valid_ds,return_dict=True)
    evaluation_dict[q_no] = evaluation["accuracy"]         

    # Use the trained model to make predictions on the validation dataset and 
    # store the predicted values in the `prediction_df` dataframe.
    predict = rfm.predict(x=valid_ds)
    prediction_df.loc[valid_users, q_no-1] = predict.flatten()

### q_no 1 grp 0-4


NameError: name 'train_x' is not defined

### Threshold-Moving for Imbalanced Classification

In [35]:
# Create a dataframe of required size:
# (no: of users in validation set x no: of questions) initialized to zero values
# to store true values of the label `correct`. 
true_df = pd.DataFrame(data=np.zeros((len(VALID_USER_LIST),18)), index=VALID_USER_LIST)
for i in range(18):
    # Get the true labels.
    tmp = labels.loc[labels.q == i+1].set_index('session').loc[VALID_USER_LIST]
    true_df[i] = tmp.correct.values

max_score = 0; best_threshold = 0

# Loop through threshold values from 0.4 to 0.8 and select the threshold with 
# the highest `F1 score`.
for threshold in np.arange(0.4,0.8,0.01):
    metric = tfa.metrics.F1Score(num_classes=2,average="macro",threshold=threshold)
    y_true = tf.one_hot(true_df.values.reshape((-1)), depth=2)
    y_pred = tf.one_hot((prediction_df.values.reshape((-1))>threshold).astype('int'), depth=2)
    metric.update_state(y_true, y_pred)
    f1_score = metric.result().numpy()
    if f1_score > max_score:
        max_score = f1_score
        best_threshold = threshold
        
print("Best threshold ", best_threshold, "\tF1 score ", max_score)

NameError: name 'VALID_USER_LIST' is not defined

### Submission

In [None]:
# Reference
# https://www.kaggle.com/code/philculliton/basic-submission-demo
# https://www.kaggle.com/code/cdeotte/random-forest-baseline-0-664/notebook


import jo_wilder
env = jo_wilder.make_env()
iter_test = env.iter_test()

limits = {'0-4':(1,4), '5-12':(4,14), '13-22':(14,19)}

for (test, sample_submission) in iter_test:
    test_df = feature_engineer(test)
    grp = test_df.level_group.values[0]
    a,b = limits[grp]
    for t in range(a,b):
        gbtm = models[f'{grp}_{t}']
        test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_df.loc[:, test_df.columns != 'level_group'])
        predictions = gbtm.predict(test_ds)
        mask = sample_submission.session_id.str.contains(f'q{t}')
        n_predictions = (predictions > best_threshold).astype(int)
        sample_submission.loc[mask,'correct'] = n_predictions.flatten()
    
    env.predict(sample_submission)

In [None]:
! head submission.csv