# Student Performance from Game Play Using TensorFlow Decision Forests

## Import the Required Libraries

In [1]:
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow_decision_forests as tfdf

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
print("TensorFlow Decision Forests v" + tfdf.__version__)
print("TensorFlow Addons v" + tfa.__version__)
print("TensorFlow v" + tf.__version__)

TensorFlow Decision Forests v1.2.0
TensorFlow Addons v0.19.0
TensorFlow v2.11.0


## Load the Dataset

In [3]:
# Reference: https://www.kaggle.com/competitions/predict-student-performance-from-game-play/discussion/384359
dtypes={
    'elapsed_time':np.int32,
    'event_name':'category',
    'name':'category',
    'level':np.uint8,
    'room_coor_x':np.float32,
    'room_coor_y':np.float32,
    'screen_coor_x':np.float32,
    'screen_coor_y':np.float32,
    'hover_duration':np.float32,
    'text':'category',
    'fqid':'category',
    'room_fqid':'category',
    'text_fqid':'category',
    'fullscreen':'category',
    'hq':'category',
    'music':'category',
    'level_group':'category'}

dataset_df = pd.read_csv('/kaggle/input/predict-student-performance-from-game-play/train.csv', dtype=dtypes)
print("Full train dataset shape is {}".format(dataset_df.shape))

Full train dataset shape is (26296946, 20)


In [4]:
# Display the first 5 examples
dataset_df.head(5)

Unnamed: 0,session_id,index,elapsed_time,event_name,name,level,page,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,hover_duration,text,fqid,room_fqid,text_fqid,fullscreen,hq,music,level_group
0,20090312431273200,0,0,cutscene_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,undefined,intro,tunic.historicalsociety.closet,tunic.historicalsociety.closet.intro,0,0,1,0-4
1,20090312431273200,1,1323,person_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,"Whatcha doing over there, Jo?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
2,20090312431273200,2,831,person_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,Just talking to Teddy.,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
3,20090312431273200,3,1147,person_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,I gotta run to my meeting!,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
4,20090312431273200,4,1863,person_click,basic,0,,-412.991394,-159.314682,381.0,494.0,,"Can I come, Gramps?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4


In [5]:
dataset_df.event_name.value_counts()

navigate_click        11326433
person_click           6052853
cutscene_click         2703035
object_click           2198211
object_hover           1057085
map_hover               945159
notification_click      649001
notebook_click          564544
map_click               517242
observation_click       212355
checkpoint               71028
Name: event_name, dtype: int64

In [6]:
# x = dataset_df['room_coor_x']
# y = dataset_df['room_coor_y']
# plt.hexbin(x, y, gridsize=40)
# plt.show()

## Load the labels

In [7]:
labels = pd.read_csv('/kaggle/input/predict-student-performance-from-game-play/train_labels.csv')

In [8]:
labels['session'] = labels.session_id.apply(lambda x: int(x.split('_')[0]) )
labels['q'] = labels.session_id.apply(lambda x: int(x.split('_')[-1][1:]) )

In [9]:
# Display the first 5 examples
labels.head(5)

Unnamed: 0,session_id,correct,session,q
0,20090312431273200_q1,1,20090312431273200,1
1,20090312433251036_q1,0,20090312433251036,1
2,20090312455206810_q1,1,20090312455206810,1
3,20090313091715820_q1,0,20090313091715820,1
4,20090313571836404_q1,1,20090313571836404,1


# Prepare the dataset

In [10]:
CATEGORICAL = ['event_name', 'name','fqid', 'room_fqid', 'text_fqid']
NUMERICAL = ['elapsed_time','level','page','room_coor_x', 'room_coor_y', 
        'screen_coor_x', 'screen_coor_y', 'hover_duration']

In [11]:
# Reference: https://www.kaggle.com/code/cdeotte/random-forest-baseline-0-664/notebook

def feature_engineer(dataset_df):
    dfs = []
    for c in CATEGORICAL:
        tmp = dataset_df.groupby(['session_id','level_group'])[c].agg('nunique')
        tmp.name = tmp.name + '_nunique'
        dfs.append(tmp)
    for c in NUMERICAL:
        tmp = dataset_df.groupby(['session_id','level_group'])[c].agg('mean')
        dfs.append(tmp)
    for c in NUMERICAL:
        tmp = dataset_df.groupby(['session_id','level_group'])[c].agg('std')
        tmp.name = tmp.name + '_std'
        dfs.append(tmp)
    dataset_df = pd.concat(dfs,axis=1)
    dataset_df = dataset_df.fillna(-1)
    dataset_df = dataset_df.reset_index()
    dataset_df = dataset_df.set_index('session_id')
    return dataset_df

In [12]:
dataset_df = feature_engineer(dataset_df)
print("Full prepared dataset shape is {}".format(dataset_df.shape))

Full prepared dataset shape is (70686, 22)


Our feature engineered dataset is composed of 22 columns and 70686 entries.

In [13]:
# Group k Fold split data
from sklearn.model_selection import KFold, GroupKFold
import numpy as np
ALL_USERS = dataset_df.index.unique()
gkf = GroupKFold(n_splits=5)
oof = pd.DataFrame(data=np.zeros((len(ALL_USERS), 18)), index=ALL_USERS) # set all users to 18 columns as each question & set index to session_id
model = {}

In [14]:
oof.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
20090312431273200,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20090312433251036,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20090312455206810,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20090313091715820,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20090313571836404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Prepared dataset

In [15]:
# split the dataset into training and testing datasets

def split_dataset(dataset, test_ratio=0.20):
    USER_LIST = dataset_df.index.unique()
    split = int(len(USER_LIST) * (1 - 0.20))
    return dataset.loc[USER_LIST[:split]], dataset.loc[USER_LIST[split:]]

train_x, valid_x = split_dataset(dataset_df)
print("{} examples in training, {} examples in testing.".format(
    len(train_x), len(valid_x)))

56547 examples in training, 14139 examples in testing.


## Select a Model

- RandomForestModel
- GradientBoostedTreesModel
- CartModel
- DistributedGradientBoostedTreesModel

In [16]:
tfdf.keras.get_all_models()

[tensorflow_decision_forests.keras.RandomForestModel,
 tensorflow_decision_forests.keras.GradientBoostedTreesModel,
 tensorflow_decision_forests.keras.CartModel,
 tensorflow_decision_forests.keras.DistributedGradientBoostedTreesModel]

# Training

In [17]:
# Fetch the unique list of user sessions in the validation dataset. We assigned 
# `session_id` as the index of our feature engineered dataset. Hence fetching 
# the unique values in the index column will give us a list of users in the 
# validation set.
VALID_USER_LIST = valid_x.index.unique()

# Create a dataframe for storing the predictions of each question for all users
# in the validation set.
# For this, the required size of the data frame is: 
# (no: of users in validation set  x no of questions).
# We will initialize all the predicted values in the data frame to zero.
# The dataframe's index column is the user `session_id`s. 
prediction_df = pd.DataFrame(data=np.zeros((len(VALID_USER_LIST),18)), index=VALID_USER_LIST)

# Create an empty dictionary to store the models created for each question.
models = {}

# Create an empty dictionary to store the evaluation score for each question.
evaluation_dict ={}

In [18]:
# Iterate through questions 1 to 18 to train models for each question, evaluate
# the trained model and store the predicted values.
for q_no in range(1,19):

    # Select level group for the question based on the q_no.
    if q_no<=3: grp = '0-4'
    elif q_no<=13: grp = '5-12'
    elif q_no<=22: grp = '13-22'
    print("### q_no", q_no, "grp", grp)
    
        
    # Filter the rows in the datasets based on the selected level group. 
    train_df = train_x.loc[train_x.level_group == grp]
    train_users = train_df.index.values
    valid_df = valid_x.loc[valid_x.level_group == grp]
    valid_users = valid_df.index.values

    # Select the labels for the related q_no.
    train_labels = labels.loc[labels.q==q_no].set_index('session').loc[train_users]
    valid_labels = labels.loc[labels.q==q_no].set_index('session').loc[valid_users]

    # Add the label to the filtered datasets.
    train_df["correct"] = train_labels["correct"]
    valid_df["correct"] = valid_labels["correct"]

    # There's one more step required before we can train the model. 
    # We need to convert the datatset from Pandas format (pd.DataFrame)
    # into TensorFlow Datasets format (tf.data.Dataset).
    # TensorFlow Datasets is a high performance data loading library 
    # which is helpful when training neural networks with accelerators like GPUs and TPUs.
    # We are omitting `level_group`, since it is not needed for training anymore.
    train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_df.loc[:, train_df.columns != 'level_group'], label="correct")
    valid_ds = tfdf.keras.pd_dataframe_to_tf_dataset(valid_df.loc[:, valid_df.columns != 'level_group'], label="correct")

    # By default the model is set to train for a classification task.
    rfm = tfdf.keras.RandomForestModel(max_depth=15, num_trees=100)
    rfm.compile(metrics=["accuracy"])

    # Train the model.
    rfm.fit(x=train_ds)

    # Store the model
    models[f'{grp}_{q_no}'] = rfm

    # Evaluate the trained model on the validation dataset and store the 
    # evaluation accuracy in the `evaluation_dict`.
    inspector = rfm.make_inspector()
    inspector.evaluation()
    evaluation = rfm.evaluate(x=valid_ds,return_dict=True)
    evaluation_dict[q_no] = evaluation["accuracy"]         

    # Use the trained model to make predictions on the validation dataset and 
    # store the predicted values in the `prediction_df` dataframe.
    predict = rfm.predict(x=valid_ds)
    prediction_df.loc[valid_users, q_no-1] = predict.flatten()

### q_no 1 grp 0-4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Use /tmp/tmphk4qnmcv as temporary training directory
Reading training dataset...
Training dataset read in 0:00:08.422129. Found 18849 examples.
Training model...


[INFO 2023-05-12T13:29:08.856744192+00:00 kernel.cc:1214] Loading model from path /tmp/tmphk4qnmcv/model/ with prefix 0dcbe9434c75417e


Model trained in 0:00:05.881873
Compiling model...


[INFO 2023-05-12T13:29:09.509300169+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 163502 node(s), and 21 input feature(s).
[INFO 2023-05-12T13:29:09.509762929+00:00 abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 2023-05-12T13:29:09.510748293+00:00 kernel.cc:1046] Use fast generic engine


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: could not get source code
Model compiled.
### q_no 2 grp 0-4
Use /tmp/tmpv_44zyfz as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.675423. Found 18849 examples.
Training model...


[INFO 2023-05-12T13:29:17.696473789+00:00 kernel.cc:1214] Loading model from path /tmp/tmpv_44zyfz/model/ with prefix 0691e015473f4780
[INFO 2023-05-12T13:29:17.892487173+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 50820 node(s), and 21 input feature(s).
[INFO 2023-05-12T13:29:17.892863125+00:00 kernel.cc:1046] Use fast generic engine


Model trained in 0:00:03.664529
Compiling model...
Model compiled.
### q_no 3 grp 0-4
Use /tmp/tmpcgk1flja as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.674221. Found 18849 examples.
Training model...


[INFO 2023-05-12T13:29:23.674588663+00:00 kernel.cc:1214] Loading model from path /tmp/tmpcgk1flja/model/ with prefix ba29de1e2d7f4f42


Model trained in 0:00:04.608425
Compiling model...


[INFO 2023-05-12T13:29:24.04839237+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 96322 node(s), and 21 input feature(s).
[INFO 2023-05-12T13:29:24.048445579+00:00 abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 2023-05-12T13:29:24.048474133+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 4 grp 5-12
Use /tmp/tmpvhver0ur as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.668865. Found 18849 examples.
Training model...


[INFO 2023-05-12T13:29:35.802018457+00:00 kernel.cc:1214] Loading model from path /tmp/tmpvhver0ur/model/ with prefix 804639862ef7466a


Model trained in 0:00:05.889655
Compiling model...


[INFO 2023-05-12T13:29:36.498714773+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 175970 node(s), and 21 input feature(s).
[INFO 2023-05-12T13:29:36.498775576+00:00 abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 2023-05-12T13:29:36.498803986+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 5 grp 5-12
Use /tmp/tmpw8q8tf1h as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.680368. Found 18849 examples.
Training model...


[INFO 2023-05-12T13:29:46.982698195+00:00 kernel.cc:1214] Loading model from path /tmp/tmpw8q8tf1h/model/ with prefix 4cd808b60e404c0e


Model trained in 0:00:05.931626
Compiling model...


[INFO 2023-05-12T13:29:47.760009369+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 197486 node(s), and 21 input feature(s).
[INFO 2023-05-12T13:29:47.760138268+00:00 abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 2023-05-12T13:29:47.760167759+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 6 grp 5-12
Use /tmp/tmp1n5d7tea as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.674675. Found 18849 examples.
Training model...


[INFO 2023-05-12T13:29:57.969298409+00:00 kernel.cc:1214] Loading model from path /tmp/tmp1n5d7tea/model/ with prefix ef4ea6bca78c40c8


Model trained in 0:00:05.601917
Compiling model...


[INFO 2023-05-12T13:29:58.65271164+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 174286 node(s), and 21 input feature(s).
[INFO 2023-05-12T13:29:58.653260334+00:00 abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 2023-05-12T13:29:58.653596098+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 7 grp 5-12
Use /tmp/tmpb72enu37 as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.692711. Found 18849 examples.
Training model...


[INFO 2023-05-12T13:30:05.300726658+00:00 kernel.cc:1214] Loading model from path /tmp/tmpb72enu37/model/ with prefix daba7d81a15f46b1


Model trained in 0:00:05.756103
Compiling model...


[INFO 2023-05-12T13:30:06.064726663+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 180944 node(s), and 21 input feature(s).
[INFO 2023-05-12T13:30:06.064805204+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 8 grp 5-12
Use /tmp/tmpchf6rup9 as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.664060. Found 18849 examples.
Training model...


[INFO 2023-05-12T13:30:16.440320985+00:00 kernel.cc:1214] Loading model from path /tmp/tmpchf6rup9/model/ with prefix 0c0e606c58a242e0


Model trained in 0:00:05.488264
Compiling model...


[INFO 2023-05-12T13:30:17.01907429+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 148534 node(s), and 21 input feature(s).
[INFO 2023-05-12T13:30:17.019286581+00:00 abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 2023-05-12T13:30:17.019328644+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 9 grp 5-12
Use /tmp/tmpijyg1ook as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.665424. Found 18849 examples.
Training model...


[INFO 2023-05-12T13:30:23.519590109+00:00 kernel.cc:1214] Loading model from path /tmp/tmpijyg1ook/model/ with prefix 09e682da9a3a475c


Model trained in 0:00:05.621674
Compiling model...


[INFO 2023-05-12T13:30:24.191265486+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 172382 node(s), and 21 input feature(s).
[INFO 2023-05-12T13:30:24.191346122+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 10 grp 5-12
Use /tmp/tmpb7wc002h as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.661357. Found 18849 examples.
Training model...


[INFO 2023-05-12T13:30:30.863172026+00:00 kernel.cc:1214] Loading model from path /tmp/tmpb7wc002h/model/ with prefix e7d43bb77be346dc


Model trained in 0:00:05.780089
Compiling model...


[INFO 2023-05-12T13:30:31.572789815+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 182310 node(s), and 21 input feature(s).
[INFO 2023-05-12T13:30:31.573046197+00:00 abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 2023-05-12T13:30:31.573075952+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 11 grp 5-12
Use /tmp/tmp52uyt7uy as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.668562. Found 18849 examples.
Training model...


[INFO 2023-05-12T13:30:38.421243126+00:00 kernel.cc:1214] Loading model from path /tmp/tmp52uyt7uy/model/ with prefix 533836a1563246a9


Model trained in 0:00:05.822600
Compiling model...


[INFO 2023-05-12T13:30:39.033285141+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 157792 node(s), and 21 input feature(s).
[INFO 2023-05-12T13:30:39.033820775+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 12 grp 5-12
Use /tmp/tmppqnpl9eu as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.685330. Found 18849 examples.
Training model...


[INFO 2023-05-12T13:30:45.30687431+00:00 kernel.cc:1214] Loading model from path /tmp/tmppqnpl9eu/model/ with prefix 4af2deb524f24add


Model trained in 0:00:05.199882
Compiling model...


[INFO 2023-05-12T13:30:45.820152545+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 131780 node(s), and 21 input feature(s).
[INFO 2023-05-12T13:30:45.820578502+00:00 abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 2023-05-12T13:30:45.820650983+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 13 grp 5-12
Use /tmp/tmp_esim17d as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.681537. Found 18849 examples.
Training model...


[INFO 2023-05-12T13:30:52.340315955+00:00 kernel.cc:1214] Loading model from path /tmp/tmp_esim17d/model/ with prefix 8f1baf65583041cd


Model trained in 0:00:05.521629
Compiling model...


[INFO 2023-05-12T13:30:52.957714644+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 158546 node(s), and 21 input feature(s).
[INFO 2023-05-12T13:30:52.957796454+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 14 grp 13-22
Use /tmp/tmp3413p30r as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.981331. Found 18849 examples.
Training model...


[INFO 2023-05-12T13:31:04.106003402+00:00 kernel.cc:1214] Loading model from path /tmp/tmp3413p30r/model/ with prefix de1157c758894b64


Model trained in 0:00:05.843521
Compiling model...


[INFO 2023-05-12T13:31:04.825671961+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 184954 node(s), and 21 input feature(s).
[INFO 2023-05-12T13:31:04.825766996+00:00 abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 2023-05-12T13:31:04.825808047+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 15 grp 13-22
Use /tmp/tmpo38b6qyb as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.678959. Found 18849 examples.
Training model...


[INFO 2023-05-12T13:31:11.869795748+00:00 kernel.cc:1214] Loading model from path /tmp/tmpo38b6qyb/model/ with prefix e61db06adcfc4b75


Model trained in 0:00:06.176293
Compiling model...


[INFO 2023-05-12T13:31:12.627897155+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 195254 node(s), and 21 input feature(s).
[INFO 2023-05-12T13:31:12.627959149+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 16 grp 13-22
Use /tmp/tmpva33_xjh as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.692939. Found 18849 examples.
Training model...


[INFO 2023-05-12T13:31:22.804676009+00:00 kernel.cc:1214] Loading model from path /tmp/tmpva33_xjh/model/ with prefix 0c25d9b505404bec


Model trained in 0:00:05.500332
Compiling model...


[INFO 2023-05-12T13:31:23.286410657+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 109178 node(s), and 21 input feature(s).
[INFO 2023-05-12T13:31:23.286465242+00:00 abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 2023-05-12T13:31:23.286493077+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 17 grp 13-22
Use /tmp/tmpkfjpav96 as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.715399. Found 18849 examples.
Training model...


[INFO 2023-05-12T13:31:34.486667262+00:00 kernel.cc:1214] Loading model from path /tmp/tmpkfjpav96/model/ with prefix 28d1ae2d319c47a9


Model trained in 0:00:05.982221
Compiling model...


[INFO 2023-05-12T13:31:35.046563856+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 138226 node(s), and 21 input feature(s).
[INFO 2023-05-12T13:31:35.047319753+00:00 abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 2023-05-12T13:31:35.047429169+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 18 grp 13-22
Use /tmp/tmp6tvkcc56 as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.686207. Found 18849 examples.
Training model...


[INFO 2023-05-12T13:31:40.816370327+00:00 kernel.cc:1214] Loading model from path /tmp/tmp6tvkcc56/model/ with prefix ed37e6ba4a6a4749


Model trained in 0:00:04.538869
Compiling model...


[INFO 2023-05-12T13:31:41.192387729+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 88932 node(s), and 21 input feature(s).
[INFO 2023-05-12T13:31:41.19245616+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.


# Inspect the Accuracy of the models.

In [19]:
for name, value in evaluation_dict.items():
  print(f"question {name}: accuracy {value:.4f}")

print("\nAverage accuracy", sum(evaluation_dict.values())/18)

question 1: accuracy 0.7307
question 2: accuracy 0.9756
question 3: accuracy 0.9351
question 4: accuracy 0.7942
question 5: accuracy 0.6270
question 6: accuracy 0.7902
question 7: accuracy 0.7460
question 8: accuracy 0.6374
question 9: accuracy 0.7664
question 10: accuracy 0.6017
question 11: accuracy 0.6535
question 12: accuracy 0.8697
question 13: accuracy 0.7208
question 14: accuracy 0.7327
question 15: accuracy 0.5992
question 16: accuracy 0.7490
question 17: accuracy 0.7036
question 18: accuracy 0.9516

Average accuracy 0.754685617155499


# Visualize the model

In [20]:
# tfdf.model_plotter.plot_model_in_colab(models['0-4_1'], tree_idx=0, max_depth=3)

# Variable importances

Variable importances generally indicate how much a feature contributes to the model predictions or quality. There are several ways to identify important features using TensorFlow Decision Forests. Let us pick one model from models dict and inspect it.

Let us list the available Variable Importances for Decision Trees:

In [21]:
inspector = models['0-4_1'].make_inspector()

print(f"Available variable importances:")
for importance in inspector.variable_importances().keys():
  print("\t", importance)

Available variable importances:
	 INV_MEAN_MIN_DEPTH
	 NUM_NODES
	 NUM_AS_ROOT
	 SUM_SCORE


In [22]:
# Each line is: (feature name, (index of the feature), importance score)
inspector.variable_importances()["NUM_AS_ROOT"]

[("room_fqid_nunique" (1; #16), 24.0),
 ("page" (1; #10), 17.0),
 ("page_std" (1; #11), 15.0),
 ("name_nunique" (1; #9), 13.0),
 ("elapsed_time" (1; #1), 12.0),
 ("elapsed_time_std" (1; #2), 9.0),
 ("event_name_nunique" (1; #3), 4.0),
 ("level" (1; #7), 4.0),
 ("fqid_nunique" (1; #4), 1.0),
 ("text_fqid_nunique" (1; #21), 1.0)]

# Threshold-Moving for Imbalanced Classification

In [23]:
# Create a dataframe of required size:
# (no: of users in validation set x no: of questions) initialized to zero values
# to store true values of the label `correct`. 
true_df = pd.DataFrame(data=np.zeros((len(VALID_USER_LIST),18)), index=VALID_USER_LIST)
for i in range(18):
    # Get the true labels.
    tmp = labels.loc[labels.q == i+1].set_index('session').loc[VALID_USER_LIST]
    true_df[i] = tmp.correct.values

max_score = 0; best_threshold = 0

# Loop through threshold values from 0.4 to 0.8 and select the threshold with 
# the highest `F1 score`.
for threshold in np.arange(0.4,0.8,0.01):
    metric = tfa.metrics.F1Score(num_classes=2,average="macro",threshold=threshold)
    y_true = tf.one_hot(true_df.values.reshape((-1)), depth=2)
    y_pred = tf.one_hot((prediction_df.values.reshape((-1))>threshold).astype('int'), depth=2)
    metric.update_state(y_true, y_pred)
    f1_score = metric.result().numpy()
    if f1_score > max_score:
        max_score = f1_score
        best_threshold = threshold
        
print("Best threshold ", best_threshold, "\tF1 score ", max_score)

Best threshold  0.7000000000000003 	F1 score  0.6714021


# Submission

Here you'll use the `best_threshold` calculate in the previous cell

In [24]:
# Reference
# https://www.kaggle.com/code/philculliton/basic-submission-demo
# https://www.kaggle.com/code/cdeotte/random-forest-baseline-0-664/notebook


import jo_wilder
env = jo_wilder.make_env()
iter_test = env.iter_test()

limits = {'0-4':(1,4), '5-12':(4,14), '13-22':(14,19)}

for (test, sample_submission) in iter_test:
    test_df = feature_engineer(test)
    grp = test_df.level_group.values[0]
    a,b = limits[grp]
    for t in range(a,b):
        gbtm = models[f'{grp}_{t}']
        test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_df.loc[:, test_df.columns != 'level_group'])
        predictions = gbtm.predict(test_ds)
        mask = sample_submission.session_id.str.contains(f'q{t}')
        n_predictions = (predictions > best_threshold).astype(int)
        sample_submission.loc[mask,'correct'] = n_predictions.flatten()
    
    env.predict(sample_submission)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


In [25]:
! head submission.csv

session_id,correct
20090109393214576_q1,1
20090109393214576_q2,1
20090109393214576_q3,1
20090109393214576_q4,1
20090109393214576_q5,0
20090109393214576_q6,1
20090109393214576_q7,1
20090109393214576_q8,1
20090109393214576_q9,1
