# Student Performance from Game Play Using TensorFlow Decision Forests

## Import the Required Libraries

In [1]:
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow_decision_forests as tfdf

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
print("TensorFlow Decision Forests v" + tfdf.__version__)
print("TensorFlow Addons v" + tfa.__version__)
print("TensorFlow v" + tf.__version__)

TensorFlow Decision Forests v1.2.0
TensorFlow Addons v0.19.0
TensorFlow v2.11.0


## Load the Dataset

In [3]:
# Reference: https://www.kaggle.com/competitions/predict-student-performance-from-game-play/discussion/384359
dtypes={
    'elapsed_time':np.int32,
    'event_name':'category',
    'name':'category',
    'level':np.uint8,
    'room_coor_x':np.float32,
    'room_coor_y':np.float32,
    'screen_coor_x':np.float32,
    'screen_coor_y':np.float32,
    'hover_duration':np.float32,
    'text':'category',
    'fqid':'category',
    'room_fqid':'category',
    'text_fqid':'category',
    'fullscreen':'category',
    'hq':'category',
    'music':'category',
    'level_group':'category'}

dataset_df = pd.read_csv('/kaggle/input/predict-student-performance-from-game-play/train.csv', dtype=dtypes)
print("Full train dataset shape is {}".format(dataset_df.shape))

Full train dataset shape is (26296946, 20)


In [4]:
# Display the first 5 examples
dataset_df.head(5)

Unnamed: 0,session_id,index,elapsed_time,event_name,name,level,page,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,hover_duration,text,fqid,room_fqid,text_fqid,fullscreen,hq,music,level_group
0,20090312431273200,0,0,cutscene_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,undefined,intro,tunic.historicalsociety.closet,tunic.historicalsociety.closet.intro,0,0,1,0-4
1,20090312431273200,1,1323,person_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,"Whatcha doing over there, Jo?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
2,20090312431273200,2,831,person_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,Just talking to Teddy.,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
3,20090312431273200,3,1147,person_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,I gotta run to my meeting!,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
4,20090312431273200,4,1863,person_click,basic,0,,-412.991394,-159.314682,381.0,494.0,,"Can I come, Gramps?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4


In [5]:
dataset_df.event_name.value_counts()

navigate_click        11326433
person_click           6052853
cutscene_click         2703035
object_click           2198211
object_hover           1057085
map_hover               945159
notification_click      649001
notebook_click          564544
map_click               517242
observation_click       212355
checkpoint               71028
Name: event_name, dtype: int64

In [6]:
# x = dataset_df['room_coor_x']
# y = dataset_df['room_coor_y']
# plt.hexbin(x, y, gridsize=40)
# plt.show()

## Load the labels

In [7]:
labels = pd.read_csv('/kaggle/input/predict-student-performance-from-game-play/train_labels.csv')

In [8]:
labels['session'] = labels.session_id.apply(lambda x: int(x.split('_')[0]) )
labels['q'] = labels.session_id.apply(lambda x: int(x.split('_')[-1][1:]) )

In [9]:
# Display the first 5 examples
labels.head(5)

Unnamed: 0,session_id,correct,session,q
0,20090312431273200_q1,1,20090312431273200,1
1,20090312433251036_q1,0,20090312433251036,1
2,20090312455206810_q1,1,20090312455206810,1
3,20090313091715820_q1,0,20090313091715820,1
4,20090313571836404_q1,1,20090313571836404,1


# Prepare the dataset

In [10]:
CATEGORICAL = ['event_name', 'name','fqid', 'room_fqid', 'text_fqid']
NUMERICAL = ['elapsed_time','level','page','room_coor_x', 'room_coor_y', 
        'screen_coor_x', 'screen_coor_y', 'hover_duration']

In [11]:
# Reference: https://www.kaggle.com/code/cdeotte/random-forest-baseline-0-664/notebook

def feature_engineer(dataset_df):
    dfs = []
    for c in CATEGORICAL:
        tmp = dataset_df.groupby(['session_id','level_group'])[c].agg('nunique')
        tmp.name = tmp.name + '_nunique'
        dfs.append(tmp)
    for c in NUMERICAL:
        tmp = dataset_df.groupby(['session_id','level_group'])[c].agg('mean')
        dfs.append(tmp)
    for c in NUMERICAL:
        tmp = dataset_df.groupby(['session_id','level_group'])[c].agg('std')
        tmp.name = tmp.name + '_std'
        dfs.append(tmp)
    dataset_df = pd.concat(dfs,axis=1)
    dataset_df = dataset_df.fillna(-1)
    dataset_df = dataset_df.reset_index()
    dataset_df = dataset_df.set_index('session_id')
    return dataset_df

In [12]:
dataset_df = feature_engineer(dataset_df)
print("Full prepared dataset shape is {}".format(dataset_df.shape))

Full prepared dataset shape is (70686, 22)


Our feature engineered dataset is composed of 22 columns and 70686 entries.

In [13]:
# Group k Fold split data
from sklearn.model_selection import KFold, GroupKFold
import numpy as np
ALL_USERS = dataset_df.index.unique()
gkf = GroupKFold(n_splits=5)
oof = pd.DataFrame(data=np.zeros((len(ALL_USERS), 18)), index=ALL_USERS) # set all users to 18 columns as each question & set index to session_id
model = {}

In [14]:
oof.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
20090312431273200,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20090312433251036,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20090312455206810,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20090313091715820,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20090313571836404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Prepared dataset

In [15]:
# split the dataset into training and testing datasets

def split_dataset(dataset, test_ratio=0.20):
    USER_LIST = dataset_df.index.unique()
    split = int(len(USER_LIST) * (1 - 0.20))
    return dataset.loc[USER_LIST[:split]], dataset.loc[USER_LIST[split:]]

train_x, valid_x = split_dataset(dataset_df)
print("{} examples in training, {} examples in testing.".format(
    len(train_x), len(valid_x)))

56547 examples in training, 14139 examples in testing.


## Select a Model

- RandomForestModel
- GradientBoostedTreesModel
- CartModel
- DistributedGradientBoostedTreesModel

In [16]:
tfdf.keras.get_all_models()

[tensorflow_decision_forests.keras.RandomForestModel,
 tensorflow_decision_forests.keras.GradientBoostedTreesModel,
 tensorflow_decision_forests.keras.CartModel,
 tensorflow_decision_forests.keras.DistributedGradientBoostedTreesModel]

# Training

In [17]:
# Fetch the unique list of user sessions in the validation dataset. We assigned 
# `session_id` as the index of our feature engineered dataset. Hence fetching 
# the unique values in the index column will give us a list of users in the 
# validation set.
VALID_USER_LIST = valid_x.index.unique()

# Create a dataframe for storing the predictions of each question for all users
# in the validation set.
# For this, the required size of the data frame is: 
# (no: of users in validation set  x no of questions).
# We will initialize all the predicted values in the data frame to zero.
# The dataframe's index column is the user `session_id`s. 
prediction_df = pd.DataFrame(data=np.zeros((len(VALID_USER_LIST),18)), index=VALID_USER_LIST)

# Create an empty dictionary to store the models created for each question.
models = {}

# Create an empty dictionary to store the evaluation score for each question.
evaluation_dict ={}

In [18]:
# Iterate through questions 1 to 18 to train models for each question, evaluate
# the trained model and store the predicted values.
for q_no in range(1,19):

    # Select level group for the question based on the q_no.
    if q_no<=3: grp = '0-4'
    elif q_no<=13: grp = '5-12'
    elif q_no<=22: grp = '13-22'
    print("### q_no", q_no, "grp", grp)
    
        
    # Filter the rows in the datasets based on the selected level group. 
    train_df = train_x.loc[train_x.level_group == grp]
    train_users = train_df.index.values
    valid_df = valid_x.loc[valid_x.level_group == grp]
    valid_users = valid_df.index.values

    # Select the labels for the related q_no.
    train_labels = labels.loc[labels.q==q_no].set_index('session').loc[train_users]
    valid_labels = labels.loc[labels.q==q_no].set_index('session').loc[valid_users]

    # Add the label to the filtered datasets.
    train_df["correct"] = train_labels["correct"]
    valid_df["correct"] = valid_labels["correct"]

    # There's one more step required before we can train the model. 
    # We need to convert the datatset from Pandas format (pd.DataFrame)
    # into TensorFlow Datasets format (tf.data.Dataset).
    # TensorFlow Datasets is a high performance data loading library 
    # which is helpful when training neural networks with accelerators like GPUs and TPUs.
    # We are omitting `level_group`, since it is not needed for training anymore.
    train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_df.loc[:, train_df.columns != 'level_group'], label="correct")
    valid_ds = tfdf.keras.pd_dataframe_to_tf_dataset(valid_df.loc[:, valid_df.columns != 'level_group'], label="correct")

    # By default the model is set to train for a classification task.
    rfm = tfdf.keras.RandomForestModel(max_depth=15, num_trees=100)
    rfm.compile(metrics=["accuracy"])

    # Train the model.
    rfm.fit(x=train_ds)

    # Store the model
    models[f'{grp}_{q_no}'] = rfm

    # Evaluate the trained model on the validation dataset and store the 
    # evaluation accuracy in the `evaluation_dict`.
    inspector = rfm.make_inspector()
    inspector.evaluation()
    evaluation = rfm.evaluate(x=valid_ds,return_dict=True)
    evaluation_dict[q_no] = evaluation["accuracy"]         

    # Use the trained model to make predictions on the validation dataset and 
    # store the predicted values in the `prediction_df` dataframe.
    predict = rfm.predict(x=valid_ds)
    prediction_df.loc[valid_users, q_no-1] = predict.flatten()

### q_no 1 grp 0-4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Use /tmp/tmp0pn_cd_v as temporary training directory
Reading training dataset...
Training dataset read in 0:00:09.157936. Found 18849 examples.
Training model...


[INFO 2023-05-12T13:20:17.361970577+00:00 kernel.cc:1214] Loading model from path /tmp/tmp0pn_cd_v/model/ with prefix 7e905226336149b5


Model trained in 0:00:06.660663
Compiling model...


[INFO 2023-05-12T13:20:18.062831669+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 163502 node(s), and 21 input feature(s).
[INFO 2023-05-12T13:20:18.063388535+00:00 abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 2023-05-12T13:20:18.064710609+00:00 kernel.cc:1046] Use fast generic engine


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: could not get source code
Model compiled.
### q_no 2 grp 0-4
Use /tmp/tmp8nc9pxi4 as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.870485. Found 18849 examples.
Training model...


[INFO 2023-05-12T13:20:27.393974229+00:00 kernel.cc:1214] Loading model from path /tmp/tmp8nc9pxi4/model/ with prefix 4a803b6015834dd8


Model trained in 0:00:03.986365
Compiling model...


[INFO 2023-05-12T13:20:27.606605556+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 50820 node(s), and 21 input feature(s).
[INFO 2023-05-12T13:20:27.606977075+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 3 grp 0-4
Use /tmp/tmpvc76ma2y as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.844749. Found 18849 examples.
Training model...


[INFO 2023-05-12T13:20:34.104865355+00:00 kernel.cc:1214] Loading model from path /tmp/tmpvc76ma2y/model/ with prefix ccfabccd026c4d71


Model trained in 0:00:05.014725
Compiling model...


[INFO 2023-05-12T13:20:34.506463495+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 96322 node(s), and 21 input feature(s).
[INFO 2023-05-12T13:20:34.506523389+00:00 abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 2023-05-12T13:20:34.506553641+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 4 grp 5-12
Use /tmp/tmpnedb52q0 as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.803023. Found 18849 examples.
Training model...


[INFO 2023-05-12T13:20:41.587148104+00:00 kernel.cc:1214] Loading model from path /tmp/tmpnedb52q0/model/ with prefix 42ae8b79b5884537


Model trained in 0:00:05.984504
Compiling model...


[INFO 2023-05-12T13:20:42.341385199+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 175970 node(s), and 21 input feature(s).
[INFO 2023-05-12T13:20:42.34146218+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 5 grp 5-12
Use /tmp/tmpysmayj06 as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.779734. Found 18849 examples.
Training model...


[INFO 2023-05-12T13:20:50.088155164+00:00 kernel.cc:1214] Loading model from path /tmp/tmpysmayj06/model/ with prefix d9055fa487d54ba6


Model trained in 0:00:06.815797
Compiling model...


[INFO 2023-05-12T13:20:50.937434581+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 197486 node(s), and 21 input feature(s).
[INFO 2023-05-12T13:20:50.937497971+00:00 abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 2023-05-12T13:20:50.937529843+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 6 grp 5-12
Use /tmp/tmps5auzip4 as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.838875. Found 18849 examples.
Training model...


[INFO 2023-05-12T13:20:58.242172455+00:00 kernel.cc:1214] Loading model from path /tmp/tmps5auzip4/model/ with prefix ecb32b088fdb432c


Model trained in 0:00:06.078132
Compiling model...


[INFO 2023-05-12T13:20:58.960382842+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 174286 node(s), and 21 input feature(s).
[INFO 2023-05-12T13:20:58.960460276+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 7 grp 5-12
Use /tmp/tmprho0o_p9 as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.803465. Found 18849 examples.
Training model...


[INFO 2023-05-12T13:21:09.902320912+00:00 kernel.cc:1214] Loading model from path /tmp/tmprho0o_p9/model/ with prefix 7742946bae0a497d


Model trained in 0:00:06.244315
Compiling model...


[INFO 2023-05-12T13:21:10.654015663+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 180944 node(s), and 21 input feature(s).
[INFO 2023-05-12T13:21:10.654115531+00:00 abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 2023-05-12T13:21:10.654155574+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 8 grp 5-12
Use /tmp/tmpc3fu6dlp as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.853254. Found 18849 examples.
Training model...


[INFO 2023-05-12T13:21:21.530218856+00:00 kernel.cc:1214] Loading model from path /tmp/tmpc3fu6dlp/model/ with prefix ca1ee2347a114965


Model trained in 0:00:06.292519
Compiling model...


[INFO 2023-05-12T13:21:22.139177443+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 148534 node(s), and 21 input feature(s).
[INFO 2023-05-12T13:21:22.139505172+00:00 abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 2023-05-12T13:21:22.139540054+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 9 grp 5-12
Use /tmp/tmp8rmqliwp as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.853673. Found 18849 examples.
Training model...


[INFO 2023-05-12T13:21:29.567360331+00:00 kernel.cc:1214] Loading model from path /tmp/tmp8rmqliwp/model/ with prefix 4faf9e3f9af348d9


Model trained in 0:00:06.231668
Compiling model...


[INFO 2023-05-12T13:21:30.290432111+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 172382 node(s), and 21 input feature(s).
[INFO 2023-05-12T13:21:30.29068956+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 10 grp 5-12
Use /tmp/tmp6rwcxxtw as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.857873. Found 18849 examples.
Training model...


[INFO 2023-05-12T13:21:41.123542425+00:00 kernel.cc:1214] Loading model from path /tmp/tmp6rwcxxtw/model/ with prefix 607c264a4f4a41ff


Model trained in 0:00:06.403152
Compiling model...


[INFO 2023-05-12T13:21:41.876610801+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 182310 node(s), and 21 input feature(s).
[INFO 2023-05-12T13:21:41.876676492+00:00 abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 2023-05-12T13:21:41.876707351+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 11 grp 5-12
Use /tmp/tmp8azl_i_r as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.833282. Found 18849 examples.
Training model...


[INFO 2023-05-12T13:21:49.226231433+00:00 kernel.cc:1214] Loading model from path /tmp/tmp8azl_i_r/model/ with prefix b03286fc8a0f40f6


Model trained in 0:00:06.108903
Compiling model...


[INFO 2023-05-12T13:21:49.886209777+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 157792 node(s), and 21 input feature(s).
[INFO 2023-05-12T13:21:49.886325517+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 12 grp 5-12
Use /tmp/tmpfww4lz0l as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.796881. Found 18849 examples.
Training model...


[INFO 2023-05-12T13:21:56.732225519+00:00 kernel.cc:1214] Loading model from path /tmp/tmpfww4lz0l/model/ with prefix aa9d6f6d79484c64


Model trained in 0:00:05.542440
Compiling model...


[INFO 2023-05-12T13:21:57.291829563+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 131780 node(s), and 21 input feature(s).
[INFO 2023-05-12T13:21:57.291917344+00:00 abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 2023-05-12T13:21:57.291959454+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 13 grp 5-12
Use /tmp/tmpvr6gfke3 as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.820401. Found 18849 examples.
Training model...


[INFO 2023-05-12T13:22:04.505137891+00:00 kernel.cc:1214] Loading model from path /tmp/tmpvr6gfke3/model/ with prefix 4af669807c214452


Model trained in 0:00:05.884319
Compiling model...


[INFO 2023-05-12T13:22:05.173260011+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 158546 node(s), and 21 input feature(s).
[INFO 2023-05-12T13:22:05.173335533+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 14 grp 13-22
Use /tmp/tmpfj6rn1e_ as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.752295. Found 18849 examples.
Training model...


[INFO 2023-05-12T13:22:12.286659035+00:00 kernel.cc:1214] Loading model from path /tmp/tmpfj6rn1e_/model/ with prefix 35c58a962cda47a9


Model trained in 0:00:06.162026
Compiling model...


[INFO 2023-05-12T13:22:13.035159991+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 184954 node(s), and 21 input feature(s).
[INFO 2023-05-12T13:22:13.035221585+00:00 abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 2023-05-12T13:22:13.035251445+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 15 grp 13-22
Use /tmp/tmpck6jb_e2 as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.757092. Found 18849 examples.
Training model...


[INFO 2023-05-12T13:22:20.57311848+00:00 kernel.cc:1214] Loading model from path /tmp/tmpck6jb_e2/model/ with prefix 80aba21ccdeb43c9


Model trained in 0:00:06.227841
Compiling model...


[INFO 2023-05-12T13:22:21.357119805+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 195254 node(s), and 21 input feature(s).
[INFO 2023-05-12T13:22:21.357613545+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 16 grp 13-22
Use /tmp/tmpxm10iygr as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.732361. Found 18849 examples.
Training model...


[INFO 2023-05-12T13:22:31.135670597+00:00 kernel.cc:1214] Loading model from path /tmp/tmpxm10iygr/model/ with prefix 44e12f0a063a4851


Model trained in 0:00:05.146455
Compiling model...


[INFO 2023-05-12T13:22:31.57177078+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 109178 node(s), and 21 input feature(s).
[INFO 2023-05-12T13:22:31.571861443+00:00 abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 2023-05-12T13:22:31.57189283+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 17 grp 13-22
Use /tmp/tmprqyixrca as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.771528. Found 18849 examples.
Training model...


[INFO 2023-05-12T13:22:42.913089443+00:00 kernel.cc:1214] Loading model from path /tmp/tmprqyixrca/model/ with prefix e9ded384ac744d37


Model trained in 0:00:05.659689
Compiling model...


[INFO 2023-05-12T13:22:43.46779351+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 138226 node(s), and 21 input feature(s).
[INFO 2023-05-12T13:22:43.46785803+00:00 abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 2023-05-12T13:22:43.467887346+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 18 grp 13-22
Use /tmp/tmpibu4xj33 as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.748622. Found 18849 examples.
Training model...


[INFO 2023-05-12T13:22:53.491835231+00:00 kernel.cc:1214] Loading model from path /tmp/tmpibu4xj33/model/ with prefix 09133086a5af44a7


Model trained in 0:00:04.779959
Compiling model...


[INFO 2023-05-12T13:22:53.863975673+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 88932 node(s), and 21 input feature(s).
[INFO 2023-05-12T13:22:53.864071483+00:00 abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 2023-05-12T13:22:53.864111386+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.


# Inspect the Accuracy of the models.

In [19]:
for name, value in evaluation_dict.items():
  print(f"question {name}: accuracy {value:.4f}")

print("\nAverage accuracy", sum(evaluation_dict.values())/18)

question 1: accuracy 0.7307
question 2: accuracy 0.9756
question 3: accuracy 0.9351
question 4: accuracy 0.7942
question 5: accuracy 0.6270
question 6: accuracy 0.7902
question 7: accuracy 0.7460
question 8: accuracy 0.6374
question 9: accuracy 0.7664
question 10: accuracy 0.6017
question 11: accuracy 0.6535
question 12: accuracy 0.8697
question 13: accuracy 0.7208
question 14: accuracy 0.7327
question 15: accuracy 0.5992
question 16: accuracy 0.7490
question 17: accuracy 0.7036
question 18: accuracy 0.9516

Average accuracy 0.754685617155499


# Visualize the model

In [20]:
tfdf.model_plotter.plot_model_in_colab(models['0-4_1'], tree_idx=0, max_depth=3)

# Variable importances

Variable importances generally indicate how much a feature contributes to the model predictions or quality. There are several ways to identify important features using TensorFlow Decision Forests. Let us pick one model from models dict and inspect it.

Let us list the available Variable Importances for Decision Trees:

In [21]:
inspector = models['0-4_1'].make_inspector()

print(f"Available variable importances:")
for importance in inspector.variable_importances().keys():
  print("\t", importance)

Available variable importances:
	 NUM_NODES
	 SUM_SCORE
	 NUM_AS_ROOT
	 INV_MEAN_MIN_DEPTH


In [22]:
# Each line is: (feature name, (index of the feature), importance score)
inspector.variable_importances()["NUM_AS_ROOT"]

[("room_fqid_nunique" (1; #16), 24.0),
 ("page" (1; #10), 17.0),
 ("page_std" (1; #11), 15.0),
 ("name_nunique" (1; #9), 13.0),
 ("elapsed_time" (1; #1), 12.0),
 ("elapsed_time_std" (1; #2), 9.0),
 ("event_name_nunique" (1; #3), 4.0),
 ("level" (1; #7), 4.0),
 ("fqid_nunique" (1; #4), 1.0),
 ("text_fqid_nunique" (1; #21), 1.0)]

# Threshold-Moving for Imbalanced Classification

Since the values of the column `correct` is fairly imbalanced, using the default threshold of `0.5` to map the predictions into classes 0 or 1 can result in poor performance. 
In such cases, to improve performance we will calculate the `F1 score` for a certain range of thresholds and try to find the best threshold aka, threshold with highest `F1 score`. Then we will use this threshold to map the predicted probabilities to class labels 0 or 1.

Please note that we are using `F1 score` since it is a better metric than `accuracy` to evaluate problems with class imbalance.

In [23]:
# Create a dataframe of required size:
# (no: of users in validation set x no: of questions) initialized to zero values
# to store true values of the label `correct`. 
true_df = pd.DataFrame(data=np.zeros((len(VALID_USER_LIST),18)), index=VALID_USER_LIST)
for i in range(18):
    # Get the true labels.
    tmp = labels.loc[labels.q == i+1].set_index('session').loc[VALID_USER_LIST]
    true_df[i] = tmp.correct.values

max_score = 0; best_threshold = 0

# Loop through threshold values from 0.4 to 0.8 and select the threshold with 
# the highest `F1 score`.
for threshold in np.arange(0.4,0.8,0.01):
    metric = tfa.metrics.F1Score(num_classes=2,average="macro",threshold=threshold)
    y_true = tf.one_hot(true_df.values.reshape((-1)), depth=2)
    y_pred = tf.one_hot((prediction_df.values.reshape((-1))>threshold).astype('int'), depth=2)
    metric.update_state(y_true, y_pred)
    f1_score = metric.result().numpy()
    if f1_score > max_score:
        max_score = f1_score
        best_threshold = threshold
        
print("Best threshold ", best_threshold, "\tF1 score ", max_score)

Best threshold  0.7000000000000003 	F1 score  0.6714021


# Submission

Here you'll use the `best_threshold` calculate in the previous cell

In [24]:
# Reference
# https://www.kaggle.com/code/philculliton/basic-submission-demo
# https://www.kaggle.com/code/cdeotte/random-forest-baseline-0-664/notebook


import jo_wilder
env = jo_wilder.make_env()
iter_test = env.iter_test()

limits = {'0-4':(1,4), '5-12':(4,14), '13-22':(14,19)}

for (test, sample_submission) in iter_test:
    test_df = feature_engineer(test)
    grp = test_df.level_group.values[0]
    a,b = limits[grp]
    for t in range(a,b):
        gbtm = models[f'{grp}_{t}']
        test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_df.loc[:, test_df.columns != 'level_group'])
        predictions = gbtm.predict(test_ds)
        mask = sample_submission.session_id.str.contains(f'q{t}')
        n_predictions = (predictions > best_threshold).astype(int)
        sample_submission.loc[mask,'correct'] = n_predictions.flatten()
    
    env.predict(sample_submission)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


In [25]:
! head submission.csv

session_id,correct
20090109393214576_q1,1
20090109393214576_q2,1
20090109393214576_q3,1
20090109393214576_q4,1
20090109393214576_q5,0
20090109393214576_q6,1
20090109393214576_q7,1
20090109393214576_q8,1
20090109393214576_q9,1
