# Student Performance from Game Play Using TensorFlow Decision Forests

## Import the Required Libraries

In [1]:
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow_decision_forests as tfdf

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GroupKFold, KFold

## Load the Dataset

In [2]:
# Reference: https://www.kaggle.com/competitions/predict-student-performance-from-game-play/discussion/384359
# Explicit dtypes keep the large event log compact in memory: narrow numeric
# types for measurements, pandas categoricals for repeated string/flag columns.
dtypes = {
    'elapsed_time': np.int32,
    'level': np.uint8,
}
dtypes.update({
    column: np.float32
    for column in ('room_coor_x', 'room_coor_y',
                   'screen_coor_x', 'screen_coor_y',
                   'hover_duration')
})
dtypes.update({
    column: 'category'
    for column in ('event_name', 'name', 'text', 'fqid', 'room_fqid',
                   'text_fqid', 'fullscreen', 'hq', 'music', 'level_group')
})

dataset_df = pd.read_csv(
    '/kaggle/input/predict-student-performance-from-game-play/train.csv',
    dtype=dtypes,
)
print(f"Full train dataset shape is {dataset_df.shape}")

Full train dataset shape is (26296946, 20)


In [3]:
# Preview the first 5 rows of the raw event log to sanity-check the parsed columns.
dataset_df.head(5)

Unnamed: 0,session_id,index,elapsed_time,event_name,name,level,page,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,hover_duration,text,fqid,room_fqid,text_fqid,fullscreen,hq,music,level_group
0,20090312431273200,0,0,cutscene_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,undefined,intro,tunic.historicalsociety.closet,tunic.historicalsociety.closet.intro,0,0,1,0-4
1,20090312431273200,1,1323,person_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,"Whatcha doing over there, Jo?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
2,20090312431273200,2,831,person_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,Just talking to Teddy.,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
3,20090312431273200,3,1147,person_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,I gotta run to my meeting!,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
4,20090312431273200,4,1863,person_click,basic,0,,-412.991394,-159.314682,381.0,494.0,,"Can I come, Gramps?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4


In [4]:
# Count how many rows of the raw log belong to each event type.
dataset_df.event_name.value_counts()

navigate_click        11326433
person_click           6052853
cutscene_click         2703035
object_click           2198211
object_hover           1057085
map_hover               945159
notification_click      649001
notebook_click          564544
map_click               517242
observation_click       212355
checkpoint               71028
Name: event_name, dtype: int64

In [5]:
# x = dataset_df['room_coor_x']
# y = dataset_df['room_coor_y']
# plt.hexbin(x, y, gridsize=40)
# plt.show()

## Load the labels

In [6]:
# Load the per-question labels: one row per `<session>_q<question number>` id
# with a binary `correct` column.
labels = pd.read_csv('/kaggle/input/predict-student-performance-from-game-play/train_labels.csv')

In [7]:
# Split each '<session>_q<question>' id into an integer session id and an
# integer question number using vectorised string operations.
labels['session'] = labels.session_id.str.split('_').str[0].astype('int64')
labels['q'] = labels.session_id.str.split('_').str[-1].str[1:].astype('int64')

In [8]:
# Preview the first 5 label rows, including the derived `session` and `q` columns.
labels.head(5)

Unnamed: 0,session_id,correct,session,q
0,20090312431273200_q1,1,20090312431273200,1
1,20090312433251036_q1,0,20090312433251036,1
2,20090312455206810_q1,1,20090312455206810,1
3,20090313091715820_q1,0,20090313091715820,1
4,20090313571836404_q1,1,20090313571836404,1


# Prepare the dataset

In [9]:
# Columns summarised per (session_id, level_group) by their number of distinct values.
CATEGORICAL = ['event_name', 'name','fqid']
# Columns summarised per (session_id, level_group) by their mean and standard deviation.
NUMERICAL = ['level', 'hover_duration']

In [10]:
# Reference: https://www.kaggle.com/code/cdeotte/random-forest-baseline-0-664/notebook

def feature_engineer(dataset_df, categorical=None, numerical=None):
    """Aggregate the event log into one feature row per (session_id, level_group).

    Args:
        dataset_df: Event-level DataFrame containing `session_id`,
            `level_group` and every column listed in `categorical` and
            `numerical`.
        categorical: Columns summarised by their number of distinct values;
            each yields an output column named `<column>_nunique`.
            Defaults to the module-level `CATEGORICAL` list.
        numerical: Columns summarised by their mean (output column keeps the
            original name) and standard deviation (`<column>_std`).
            Defaults to the module-level `NUMERICAL` list.

    Returns:
        A DataFrame indexed by `session_id` with a `level_group` column
        followed by the aggregate columns. Aggregates that come out as NaN
        (e.g. the std of a single row, or the mean of an all-missing column)
        are replaced by -1.
    """
    if categorical is None:
        categorical = CATEGORICAL
    if numerical is None:
        numerical = NUMERICAL

    # Build the groupby once and reuse it for every aggregate instead of
    # regrouping the full event log for each feature column.
    grouped = dataset_df.groupby(['session_id','level_group'])

    features = [grouped[c].agg('nunique').rename(f'{c}_nunique') for c in categorical]
    features += [grouped[c].agg('mean') for c in numerical]
    features += [grouped[c].agg('std').rename(f'{c}_std') for c in numerical]

    features_df = pd.concat(features, axis=1)
    # -1 marks "no value"; every real aggregate produced above is >= 0 only if
    # the inputs are non-negative, so treat it purely as a missing-value sentinel.
    features_df = features_df.fillna(-1)
    # Keep `level_group` as a regular column and index rows by session only.
    features_df = features_df.reset_index().set_index('session_id')
    return features_df

In [11]:
# Collapse the raw event log into one feature row per (session_id, level_group).
# NOTE: this rebinds `dataset_df`, so re-running this cell requires reloading
# the raw CSV first.
dataset_df = feature_engineer(dataset_df)
print(f"Full prepared dataset shape is {dataset_df.shape}")

Full prepared dataset shape is (70686, 8)


Our feature engineered dataset is composed of 8 columns and 70686 entries.

In [12]:
# Group k Fold split data
# (GroupKFold and numpy are imported in the notebook's top import cell.)
ALL_USERS = dataset_df.index.unique()
gkf = GroupKFold(n_splits=5)
# Out-of-fold frame: one row per session (indexed by session_id) and one
# zero-initialised column per question (18 questions).
oof = pd.DataFrame(data=np.zeros((len(ALL_USERS), 18)), index=ALL_USERS)
# NOTE(review): `gkf` and `model` are not referenced by the later cells visible
# in this notebook — confirm whether the k-fold experiment is still needed.
model = {}

In [13]:
# Preview the zero-initialised out-of-fold frame (rows: sessions, columns: questions).
oof.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
20090312431273200,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20090312433251036,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20090312455206810,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20090313091715820,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20090313571836404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Prepared dataset

In [14]:
# split the dataset into training and testing datasets

def split_dataset(dataset, test_ratio=0.20):
    """Split a session-indexed frame into training and held-out parts.

    The split is made on the unique index values (sessions), in their order of
    first appearance, so every row belonging to a session lands on the same
    side of the split.

    Args:
        dataset: DataFrame whose index identifies the session of each row.
        test_ratio: Fraction of the unique sessions placed in the second
            (held-out) frame.

    Returns:
        A `(train, test)` tuple of DataFrames.
    """
    # Derive the sessions from the frame that was passed in, not from a global.
    user_list = dataset.index.unique()
    split = int(len(user_list) * (1 - test_ratio))
    return dataset.loc[user_list[:split]], dataset.loc[user_list[split:]]

# Hold out the last 20% of sessions (the function's default ratio) for validation.
train_x, valid_x = split_dataset(dataset_df)
print(f"{len(train_x)} examples in training, {len(valid_x)} examples in testing.")

56547 examples in training, 14139 examples in testing.


## Select a Model

- RandomForestModel
- GradientBoostedTreesModel
- CartModel
- DistributedGradientBoostedTreesModel

In [15]:
# List the model classes exposed by the installed TensorFlow Decision Forests.
tfdf.keras.get_all_models()

[tensorflow_decision_forests.keras.RandomForestModel,
 tensorflow_decision_forests.keras.GradientBoostedTreesModel,
 tensorflow_decision_forests.keras.CartModel,
 tensorflow_decision_forests.keras.DistributedGradientBoostedTreesModel]

# Training

In [16]:
# `session_id` is the index of the feature engineered dataset, so the unique
# index values of the validation split are exactly the validation sessions.
VALID_USER_LIST = valid_x.index.unique()

# Validation predictions: one row per validation session (indexed by
# `session_id`) and one column per question (18), all initialised to zero.
prediction_df = pd.DataFrame(0.0, index=VALID_USER_LIST, columns=range(18))

# Trained model for each question, filled in by the training loop.
models = dict()

# Validation accuracy for each question, filled in by the training loop.
evaluation_dict = dict()

In [17]:
# Iterate through questions 1 to 18 to train one model per question, evaluate
# the trained model and store the predicted values.
for q_no in range(1,19):

    # Select level group for the question based on the q_no.
    if q_no <= 3:
        grp = '0-4'
    elif q_no <= 13:
        grp = '5-12'
    else:
        grp = '13-22'
    print("### q_no", q_no, "grp", grp)

    # Filter the rows in the datasets based on the selected level group.
    # `.copy()` gives independent frames, so adding the label column below does
    # not write to a slice of train_x / valid_x (avoids SettingWithCopyWarning).
    train_df = train_x.loc[train_x.level_group == grp].copy()
    valid_df = valid_x.loc[valid_x.level_group == grp].copy()
    train_users = train_df.index.values
    valid_users = valid_df.index.values

    # Select the labels for the related q_no once, indexed by session so they
    # can be looked up for both splits.
    q_labels = labels.loc[labels.q == q_no].set_index('session')["correct"]

    # Add the label to the filtered datasets (aligned on the session index).
    train_df["correct"] = q_labels.loc[train_users]
    valid_df["correct"] = q_labels.loc[valid_users]

    # There's one more step required before we can train the model.
    # We need to convert the dataset from Pandas format (pd.DataFrame)
    # into TensorFlow Datasets format (tf.data.Dataset).
    # We are omitting `level_group`, since it is not needed for training anymore.
    train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_df.drop(columns='level_group'), label="correct")
    valid_ds = tfdf.keras.pd_dataframe_to_tf_dataset(valid_df.drop(columns='level_group'), label="correct")

    # By default the model is set to train for a classification task.
    rfm = tfdf.keras.RandomForestModel(max_depth=15, num_trees=100)
    rfm.compile(metrics=["accuracy"])

    # Train the model.
    rfm.fit(x=train_ds)

    # Store the model under the '<level group>_<question>' key used at
    # submission time.
    models[f'{grp}_{q_no}'] = rfm

    # Evaluate the trained model on the validation dataset and store the
    # evaluation accuracy in the `evaluation_dict`.
    evaluation = rfm.evaluate(x=valid_ds,return_dict=True)
    evaluation_dict[q_no] = evaluation["accuracy"]

    # Use the trained model to make predictions on the validation dataset and
    # store the predicted values in the `prediction_df` dataframe
    # (column q_no-1 because the columns are zero-based).
    predict = rfm.predict(x=valid_ds)
    prediction_df.loc[valid_users, q_no-1] = predict.flatten()

### q_no 1 grp 0-4
Use /tmp/tmpak3hdzaf as temporary training directory


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Reading training dataset...
Training dataset read in 0:00:07.827994. Found 18849 examples.
Training model...


[INFO 2023-05-14T14:21:46.094806826+00:00 kernel.cc:1214] Loading model from path /tmp/tmpak3hdzaf/model/ with prefix 37b68287e46f4c88


Model trained in 0:00:03.977500
Compiling model...


[INFO 2023-05-14T14:21:46.684138791+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 145728 node(s), and 7 input feature(s).
[INFO 2023-05-14T14:21:46.684192119+00:00 abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 2023-05-14T14:21:46.68493589+00:00 kernel.cc:1046] Use fast generic engine


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: could not get source code
Model compiled.
### q_no 2 grp 0-4
Use /tmp/tmplaaldp5s as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.566027. Found 18849 examples.
Training model...


[INFO 2023-05-14T14:21:56.584146888+00:00 kernel.cc:1214] Loading model from path /tmp/tmplaaldp5s/model/ with prefix 5ac8566dcafe4087
[INFO 2023-05-14T14:21:56.781404843+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 51904 node(s), and 7 input feature(s).
[INFO 2023-05-14T14:21:56.781514488+00:00 abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 2023-05-14T14:21:56.781541563+00:00 kernel.cc:1046] Use fast generic engine


Model trained in 0:00:02.480469
Compiling model...
Model compiled.
### q_no 3 grp 0-4
Use /tmp/tmpq0fl4zvt as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.336538. Found 18849 examples.
Training model...


[INFO 2023-05-14T14:22:02.946745229+00:00 kernel.cc:1214] Loading model from path /tmp/tmpq0fl4zvt/model/ with prefix f2a1a6a8662b4511


Model trained in 0:00:03.208819
Compiling model...
Model compiled.


[INFO 2023-05-14T14:22:03.304987977+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 91036 node(s), and 7 input feature(s).
[INFO 2023-05-14T14:22:03.305048636+00:00 kernel.cc:1046] Use fast generic engine


### q_no 4 grp 5-12
Use /tmp/tmpdmi0yipd as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.319712. Found 18849 examples.
Training model...


[INFO 2023-05-14T14:22:07.683504615+00:00 kernel.cc:1214] Loading model from path /tmp/tmpdmi0yipd/model/ with prefix 6ce78c98e2b549c1


Model trained in 0:00:04.089105
Compiling model...


[INFO 2023-05-14T14:22:08.298358558+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 164566 node(s), and 7 input feature(s).
[INFO 2023-05-14T14:22:08.298412791+00:00 abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 2023-05-14T14:22:08.298449419+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 5 grp 5-12
Use /tmp/tmp1ueoc8ao as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.315749. Found 18849 examples.
Training model...


[INFO 2023-05-14T14:22:13.267842075+00:00 kernel.cc:1214] Loading model from path /tmp/tmp1ueoc8ao/model/ with prefix 1f0298e677914142


Model trained in 0:00:03.905894
Compiling model...


[INFO 2023-05-14T14:22:13.963453531+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 174722 node(s), and 7 input feature(s).
[INFO 2023-05-14T14:22:13.963531539+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 6 grp 5-12
Use /tmp/tmphx6b1rqm as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.327270. Found 18849 examples.
Training model...


[INFO 2023-05-14T14:22:18.136476808+00:00 kernel.cc:1214] Loading model from path /tmp/tmphx6b1rqm/model/ with prefix e924ba2658a24685


Model trained in 0:00:03.754682
Compiling model...


[INFO 2023-05-14T14:22:18.787818801+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 164974 node(s), and 7 input feature(s).
[INFO 2023-05-14T14:22:18.787874514+00:00 abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 2023-05-14T14:22:18.787900585+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 7 grp 5-12
Use /tmp/tmpbgu6d3lu as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.332664. Found 18849 examples.
Training model...


[INFO 2023-05-14T14:22:24.026213465+00:00 kernel.cc:1214] Loading model from path /tmp/tmpbgu6d3lu/model/ with prefix 8a20e1d4b3154c03


Model trained in 0:00:03.831452
Compiling model...


[INFO 2023-05-14T14:22:24.672324521+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 160228 node(s), and 7 input feature(s).
[INFO 2023-05-14T14:22:24.672388435+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 8 grp 5-12
Use /tmp/tmpm3vv7g7d as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.329994. Found 18849 examples.
Training model...


[INFO 2023-05-14T14:22:28.641529203+00:00 kernel.cc:1214] Loading model from path /tmp/tmpm3vv7g7d/model/ with prefix e57a4066c66c40c2


Model trained in 0:00:03.550609
Compiling model...
Model compiled.


[INFO 2023-05-14T14:22:29.153131986+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 131774 node(s), and 7 input feature(s).
[INFO 2023-05-14T14:22:29.153187529+00:00 abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 2023-05-14T14:22:29.153213589+00:00 kernel.cc:1046] Use fast generic engine


### q_no 9 grp 5-12
Use /tmp/tmpx6rzkbz8 as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.325631. Found 18849 examples.
Training model...


[INFO 2023-05-14T14:22:34.557564457+00:00 kernel.cc:1214] Loading model from path /tmp/tmpx6rzkbz8/model/ with prefix 092c871416904a59


Model trained in 0:00:03.824778
Compiling model...


[INFO 2023-05-14T14:22:35.189846678+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 161216 node(s), and 7 input feature(s).
[INFO 2023-05-14T14:22:35.189906497+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 10 grp 5-12
Use /tmp/tmp9tgv4y6n as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.338845. Found 18849 examples.
Training model...


[INFO 2023-05-14T14:22:39.699750517+00:00 kernel.cc:1214] Loading model from path /tmp/tmp9tgv4y6n/model/ with prefix b1a81bbf49e14eaf


Model trained in 0:00:04.105667
Compiling model...


[INFO 2023-05-14T14:22:40.3149395+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 156556 node(s), and 7 input feature(s).
[INFO 2023-05-14T14:22:40.314997102+00:00 abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 2023-05-14T14:22:40.315023741+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 11 grp 5-12
Use /tmp/tmpha_ust3l as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.326423. Found 18849 examples.
Training model...


[INFO 2023-05-14T14:22:44.197400702+00:00 kernel.cc:1214] Loading model from path /tmp/tmpha_ust3l/model/ with prefix 694e556e57274a28


Model trained in 0:00:03.484896
Compiling model...
Model compiled.


[INFO 2023-05-14T14:22:44.705061279+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 129900 node(s), and 7 input feature(s).
[INFO 2023-05-14T14:22:44.70512139+00:00 kernel.cc:1046] Use fast generic engine


### q_no 12 grp 5-12
Use /tmp/tmp3_oibxyt as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.331785. Found 18849 examples.
Training model...


[INFO 2023-05-14T14:22:48.772424302+00:00 kernel.cc:1214] Loading model from path /tmp/tmp3_oibxyt/model/ with prefix 24b96d9b2c0c4f9d


Model trained in 0:00:03.518737
Compiling model...


[INFO 2023-05-14T14:22:49.266826155+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 125994 node(s), and 7 input feature(s).
[INFO 2023-05-14T14:22:49.266888617+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 13 grp 5-12
Use /tmp/tmp25e7cxy_ as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.329872. Found 18849 examples.
Training model...


[INFO 2023-05-14T14:22:54.716591339+00:00 kernel.cc:1214] Loading model from path /tmp/tmp25e7cxy_/model/ with prefix 2ec43036d1144c8f


Model trained in 0:00:03.690657
Compiling model...
Model compiled.


[INFO 2023-05-14T14:22:55.290696937+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 148218 node(s), and 7 input feature(s).
[INFO 2023-05-14T14:22:55.290751243+00:00 abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 2023-05-14T14:22:55.290777993+00:00 kernel.cc:1046] Use fast generic engine


### q_no 14 grp 13-22
Use /tmp/tmp_nqj7ru4 as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.330310. Found 18849 examples.
Training model...


[INFO 2023-05-14T14:22:59.551683102+00:00 kernel.cc:1214] Loading model from path /tmp/tmp_nqj7ru4/model/ with prefix 420df92c1b3441ef


Model trained in 0:00:03.833282
Compiling model...


[INFO 2023-05-14T14:23:00.164878495+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 155906 node(s), and 7 input feature(s).
[INFO 2023-05-14T14:23:00.164937046+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 15 grp 13-22
Use /tmp/tmp_nzizt58 as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.354397. Found 18849 examples.
Training model...


[INFO 2023-05-14T14:23:04.396953894+00:00 kernel.cc:1214] Loading model from path /tmp/tmp_nzizt58/model/ with prefix 21063291663b41ae


Model trained in 0:00:03.753807
Compiling model...


[INFO 2023-05-14T14:23:04.988263817+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 149654 node(s), and 7 input feature(s).
[INFO 2023-05-14T14:23:04.98833091+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 16 grp 13-22
Use /tmp/tmpyvhhz1ef as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.343473. Found 18849 examples.
Training model...


[INFO 2023-05-14T14:23:08.835981418+00:00 kernel.cc:1214] Loading model from path /tmp/tmpyvhhz1ef/model/ with prefix 24a3189a716b47b6


Model trained in 0:00:03.173248
Compiling model...
Model compiled.


[INFO 2023-05-14T14:23:09.185709837+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 86548 node(s), and 7 input feature(s).
[INFO 2023-05-14T14:23:09.18581452+00:00 abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 2023-05-14T14:23:09.185854288+00:00 kernel.cc:1046] Use fast generic engine


### q_no 17 grp 13-22
Use /tmp/tmpoy5iwoa7 as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.320594. Found 18849 examples.
Training model...


[INFO 2023-05-14T14:23:14.826124267+00:00 kernel.cc:1214] Loading model from path /tmp/tmpoy5iwoa7/model/ with prefix 437d889f36874b71


Model trained in 0:00:03.558785
Compiling model...
Model compiled.


[INFO 2023-05-14T14:23:15.340338662+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 134100 node(s), and 7 input feature(s).
[INFO 2023-05-14T14:23:15.340416787+00:00 kernel.cc:1046] Use fast generic engine


### q_no 18 grp 13-22
Use /tmp/tmpy5v39885 as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.322273. Found 18849 examples.
Training model...


[INFO 2023-05-14T14:23:20.282821718+00:00 kernel.cc:1214] Loading model from path /tmp/tmpy5v39885/model/ with prefix 6d4c791b7f514341


Model trained in 0:00:03.011148
Compiling model...
Model compiled.


[INFO 2023-05-14T14:23:20.584864787+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 82658 node(s), and 7 input feature(s).
[INFO 2023-05-14T14:23:20.584929684+00:00 abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 2023-05-14T14:23:20.58495402+00:00 kernel.cc:1046] Use fast generic engine




# Inspect the Accuracy of the models.

In [18]:
# Print the validation accuracy of each question's model.
for name, value in evaluation_dict.items():
  print(f"question {name}: accuracy {value:.4f}")

# Average over the models that were actually evaluated rather than a
# hard-coded question count; max(..., 1) keeps an empty dict from dividing by zero.
print("\nAverage accuracy", sum(evaluation_dict.values())/max(len(evaluation_dict), 1))

question 1: accuracy 0.7282
question 2: accuracy 0.9756
question 3: accuracy 0.9353
question 4: accuracy 0.7914
question 5: accuracy 0.6015
question 6: accuracy 0.7819
question 7: accuracy 0.7411
question 8: accuracy 0.6308
question 9: accuracy 0.7621
question 10: accuracy 0.5886
question 11: accuracy 0.6478
question 12: accuracy 0.8699
question 13: accuracy 0.7167
question 14: accuracy 0.7295
question 15: accuracy 0.5799
question 16: accuracy 0.7475
question 17: accuracy 0.7021
question 18: accuracy 0.9516

Average accuracy 0.7489803598986732


# Visualize the model

In [19]:
# tfdf.model_plotter.plot_model_in_colab(models['0-4_1'], tree_idx=0, max_depth=3)

# Variable importances

Variable importances generally indicate how much a feature contributes to the model predictions or quality. There are several ways to identify important features using TensorFlow Decision Forests. Let us pick one model from models dict and inspect it.

Let us list the available Variable Importances for Decision Trees:

In [20]:
# Inspect the model stored for question 1 (level group 0-4).
inspector = models['0-4_1'].make_inspector()

print("Available variable importances:")
# Iterating the returned mapping yields the importance names (its keys).
for importance in inspector.variable_importances():
  print("\t", importance)

Available variable importances:
	 SUM_SCORE
	 INV_MEAN_MIN_DEPTH
	 NUM_NODES
	 NUM_AS_ROOT


In [21]:
# Each line is: (feature name, (index of the feature), importance score)
# NOTE(review): NUM_AS_ROOT presumably counts the trees whose root split uses
# the feature (the scores shown below sum to the 100 trees trained) — confirm
# against the TF-DF documentation.
inspector.variable_importances()["NUM_AS_ROOT"]

[("name_nunique" (1; #7), 48.0),
 ("event_name_nunique" (1; #1), 34.0),
 ("level" (1; #5), 15.0),
 ("fqid_nunique" (1; #2), 3.0)]

# Threshold-Moving for Imbalanced Classification

In [22]:
# Create a dataframe of required size:
# (no: of users in validation set x no: of questions) initialized to zero values
# to store true values of the label `correct`.
true_df = pd.DataFrame(data=np.zeros((len(VALID_USER_LIST),18)), index=VALID_USER_LIST)
for i in range(18):
    # Get the true labels of question i+1, ordered like VALID_USER_LIST.
    tmp = labels.loc[labels.q == i+1].set_index('session').loc[VALID_USER_LIST]
    true_df[i] = tmp.correct.values

# The one-hot encoded true labels and the flattened predictions do not depend
# on the threshold, so compute them once outside the search loop.
y_true = tf.one_hot(true_df.values.reshape((-1)), depth=2)
flat_predictions = prediction_df.values.reshape((-1))

max_score = 0
best_threshold = 0

# Loop through threshold values starting at 0.4 in steps of 0.01 (stopping
# before 0.8) and select the threshold with the highest macro `F1 score`.
for threshold in np.arange(0.4,0.8,0.01):
    metric = tfa.metrics.F1Score(num_classes=2,average="macro",threshold=threshold)
    # Binarise the predicted values at the candidate threshold.
    y_pred = tf.one_hot((flat_predictions>threshold).astype('int'), depth=2)
    metric.update_state(y_true, y_pred)
    f1_score = metric.result().numpy()
    if f1_score > max_score:
        max_score = f1_score
        best_threshold = threshold

print("Best threshold ", best_threshold, "\tF1 score ", max_score)

Best threshold  0.7600000000000003 	F1 score  0.6571636


# Submission

Here you'll use the `best_threshold` calculated in the previous cell.

In [23]:
# Reference
# https://www.kaggle.com/code/philculliton/basic-submission-demo
# https://www.kaggle.com/code/cdeotte/random-forest-baseline-0-664/notebook


# `jo_wilder` is imported here rather than in the top import cell so the rest
# of the notebook does not depend on it.
import jo_wilder
env = jo_wilder.make_env()
iter_test = env.iter_test()

# Question numbers covered by each level group, as (first, last + 1).
limits = {'0-4':(1,4), '5-12':(4,14), '13-22':(14,19)}

for (test, sample_submission) in iter_test:
    test_df = feature_engineer(test)
    grp = test_df.level_group.values[0]
    # The features are the same for every question of the group, so build the
    # TensorFlow dataset once per batch instead of once per question.
    test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_df.loc[:, test_df.columns != 'level_group'])
    first_q, end_q = limits[grp]
    for t in range(first_q, end_q):
        question_model = models[f'{grp}_{t}']
        predictions = question_model.predict(test_ds)
        # Match the exact '_q<t>' suffix: a substring test for 'q1' would also
        # select q10..q18.
        mask = sample_submission.session_id.str.endswith(f'_q{t}')
        # Binarise the predicted values with the threshold tuned on validation.
        n_predictions = (predictions > best_threshold).astype(int)
        sample_submission.loc[mask,'correct'] = n_predictions.flatten()

    env.predict(sample_submission)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


In [24]:
! head submission.csv

session_id,correct
20090109393214576_q1,1
20090109393214576_q2,1
20090109393214576_q3,1
20090109393214576_q4,1
20090109393214576_q5,1
20090109393214576_q6,1
20090109393214576_q7,1
20090109393214576_q8,0
20090109393214576_q9,1
