# Student Performance from Game Play Using TensorFlow Decision Forests

## Import the Required Libraries

In [1]:
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow_decision_forests as tfdf

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from xgboost import XGBClassifier
from sklearn.model_selection import KFold, GroupKFold
from sklearn.metrics import f1_score

## Load the Dataset

In [2]:
# Reference: https://www.kaggle.com/competitions/predict-student-performance-from-game-play/discussion/384359
dtypes={
    'elapsed_time':np.int32,
    'event_name':'category',
    'name':'category',
    'level':np.uint8,
    'room_coor_x':np.float32,
    'room_coor_y':np.float32,
    'screen_coor_x':np.float32,
    'screen_coor_y':np.float32,
    'hover_duration':np.float32,
    'text':'category',
    'fqid':'category',
    'room_fqid':'category',
    'text_fqid':'category',
    'fullscreen':'category',
    'hq':'category',
    'music':'category',
    'level_group':'category'}

dataset_df = pd.read_csv('/kaggle/input/predict-student-performance-from-game-play/train.csv', dtype=dtypes)

In [3]:
dataset_df.event_name.value_counts()

navigate_click        11326433
person_click           6052853
cutscene_click         2703035
object_click           2198211
object_hover           1057085
map_hover               945159
notification_click      649001
notebook_click          564544
map_click               517242
observation_click       212355
checkpoint               71028
Name: event_name, dtype: int64

## Load the labels

In [4]:
labels = pd.read_csv('/kaggle/input/predict-student-performance-from-game-play/train_labels.csv')

In [5]:
labels['session'] = labels.session_id.apply(lambda x: int(x.split('_')[0]) )
labels['q'] = labels.session_id.apply(lambda x: int(x.split('_')[-1][1:]) )

# Prepare the dataset

In [6]:
CATEGORICAL = ['event_name', 'name','fqid', 'room_fqid', 'text_fqid']
NUMERICAL = ['elapsed_time','level','page','room_coor_x', 'room_coor_y', 
        'screen_coor_x', 'screen_coor_y', 'hover_duration']
# EVENTS = dataset_df['event_name'].unique()

In [7]:
# Reference: https://www.kaggle.com/code/cdeotte/random-forest-baseline-0-664/notebook

def feature_engineer(dataset_df):
    dfs = []
    for c in CATEGORICAL:
        tmp = dataset_df.groupby(['session_id','level_group'])[c].agg('nunique')
        tmp.name = tmp.name + '_nunique'
        dfs.append(tmp)
    for c in NUMERICAL:
        tmp = dataset_df.groupby(['session_id','level_group'])[c].agg('mean')
        dfs.append(tmp)
    for c in NUMERICAL:
        tmp = dataset_df.groupby(['session_id','level_group'])[c].agg('std')
        tmp.name = tmp.name + '_std'
        dfs.append(tmp)
#     for c in EVENTS: 
#         dataset_df[c] = (dataset_df.event_name == c).astype('int8')
#     for c in EVENTS:
#         tmp = dataset_df[dataset_df[c] == 1].groupby(['session_id', 'level_group'])['elapsed_time_diff'].agg('mean')
#         tmp.name = c + '_time_average'
#         dfs.append(tmp)
    dataset_df = pd.concat(dfs,axis=1)
    dataset_df = dataset_df.fillna(-1)
    dataset_df = dataset_df.reset_index()
    dataset_df = dataset_df.set_index('session_id')
    return dataset_df

In [8]:
dataset_df = feature_engineer(dataset_df)
print("Full prepared dataset shape is {}".format(dataset_df.shape))
#print(dataset_df)

Full prepared dataset shape is (70686, 22)


Our feature engineered dataset is composed of 22 columns and 70686 entries.

In [9]:
# Group k Fold split data
FEATURES = [c for c in dataset_df.columns if c != 'level_group']
ALL_USERS = dataset_df.index.unique()

gkf = GroupKFold(n_splits=5)
oof = pd.DataFrame(data=np.zeros((len(ALL_USERS),18)), index=ALL_USERS)
model = {}
evaluation_dict ={}

# oof.head()

In [10]:
labels.head()

Unnamed: 0,session_id,correct,session,q
0,20090312431273200_q1,1,20090312431273200,1
1,20090312433251036_q1,0,20090312433251036,1
2,20090312455206810_q1,1,20090312455206810,1
3,20090313091715820_q1,0,20090313091715820,1
4,20090313571836404_q1,1,20090313571836404,1


In [11]:
# Compute CV score with 5 group k fold
final_list = []
for i, (train_idex, test_index) in enumerate(gkf.split(X= dataset_df, groups=dataset_df.index)): # we already define index (sesion_id) for group to split here.
    # Here we know which fold contains which index, then use index to slice data in each fold.
    print('-'*25)
    print('--- Fold', i+1)
    print('-'*25)

    xgb_params = {
        'objective' : 'binary:logistic',
        'eval_metric' : 'logloss',
        'learning_rate' : 0.05,
        'max_depth' : 4,
        'n_estimators' : 1000,
        'early_stopping_rounds' : 50,
        'tree_method' : 'hist',
        'subsample' : 0.8,
        'colsample_bytree' : 0.4,
        'use_label_encoder' : False
    }

    # Iterate thru questions 1 thru 18
    # Create data set for train each question
    for t in range(1,19):
        print(t, ',', end= ' ')

        # Use this train data with these questions
        # clarify each question to assign group for slice data in df
        if t<=3: grp = '0-4'
        elif t<=13: grp = '5-12'
        elif t<=22: grp = '13-22'

        # Train data
        train_x = dataset_df.iloc[train_idex] # get train from index in each fold
        train_x = train_x.loc[train_x.level_group == grp] # get train only each question that match group, slice this again because data in level session_id & level_grop
        train_users = train_x.index.values # get all users train in each fold
        train_y = labels.loc[labels.q==t].set_index('session').loc[train_users] # get train variable y in each question.
        

        # Valid data
        valid_x = dataset_df.iloc[test_index]
        valid_x = valid_x.loc[valid_x.level_group == grp]
        valid_users = valid_x.index.values
        valid_y = labels.loc[labels.q==t].set_index('session').loc[valid_users]
        

        # Train Model
        # Train XGBosst
        clf = XGBClassifier(**xgb_params)
        clf.fit(train_x[FEATURES].astype('float32'), train_y['correct'],
            eval_set=[(valid_x[FEATURES].astype('float32'), valid_y['correct'].astype('float32'))],
            verbose=0
        )
        print(f'{t}({clf.best_ntree_limit}), ', end='')
        

        # Save model
        model[f'{grp}_{t}'] = clf # Set model name
        oof.loc[valid_users, t-1] = clf.predict_proba(valid_x[FEATURES].astype('float32'))[:,1] # Assign predict value to oof table in each question
        # above out put is list such as [0.45779951 0.54220049] position 1 (class 0) is 0.45 position 2 (class 1) is 0.54 total is 1
        # we use [:, 1] because we predict correct = 1 is class 1 then set to [:, 1] instead of [:, 0]
print()

-------------------------
--- Fold 1
-------------------------
1 , 1(175), 2 , 2(100), 3 , 3(144), 4 , 4(211), 5 , 5(163), 6 , 6(128), 7 , 7(81), 8 , 8(67), 9 , 9(239), 10 , 10(166), 11 , 11(62), 12 , 12(125), 13 , 13(114), 14 , 14(243), 15 , 15(269), 16 , 16(65), 17 , 17(60), 18 , 18(103), -------------------------
--- Fold 2
-------------------------
1 , 1(174), 2 , 2(151), 3 , 3(114), 4 , 4(129), 5 , 5(108), 6 , 6(142), 7 , 7(147), 8 , 8(72), 9 , 9(173), 10 , 10(110), 11 , 11(120), 12 , 12(140), 13 , 13(70), 14 , 14(108), 15 , 15(237), 16 , 16(115), 17 , 17(111), 18 , 18(152), -------------------------
--- Fold 3
-------------------------
1 , 1(191), 2 , 2(114), 3 , 3(119), 4 , 4(208), 5 , 5(155), 6 , 6(101), 7 , 7(119), 8 , 8(83), 9 , 9(100), 10 , 10(127), 11 , 11(73), 12 , 12(73), 13 , 13(114), 14 , 14(146), 15 , 15(256), 16 , 16(70), 17 , 17(89), 18 , 18(166), -------------------------
--- Fold 4
-------------------------
1 , 1(186), 2 , 2(103), 3 , 3(107), 4 , 4(190), 5 , 5(115)

## Prepared dataset

In [12]:
# split the dataset into training and testing datasets

def split_dataset(dataset, test_ratio=0.20):
    USER_LIST = dataset_df.index.unique()
    split = int(len(USER_LIST) * (1 - 0.20))
    return dataset.loc[USER_LIST[:split]], dataset.loc[USER_LIST[split:]]

train_x, valid_x = split_dataset(dataset_df)
print("{} examples in training, {} examples in testing.".format(
    len(train_x), len(valid_x)))

56547 examples in training, 14139 examples in testing.


## Select a Model

- RandomForestModel
- GradientBoostedTreesModel
- CartModel
- DistributedGradientBoostedTreesModel

In [13]:
tfdf.keras.get_all_models()

[tensorflow_decision_forests.keras.RandomForestModel,
 tensorflow_decision_forests.keras.GradientBoostedTreesModel,
 tensorflow_decision_forests.keras.CartModel,
 tensorflow_decision_forests.keras.DistributedGradientBoostedTreesModel]

# Training

In [14]:
# Fetch the unique list of user sessions in the validation dataset. We assigned 
# `session_id` as the index of our feature engineered dataset. Hence fetching 
# the unique values in the index column will give us a list of users in the 
# validation set.
VALID_USER_LIST = valid_x.index.unique()

# Create a dataframe for storing the predictions of each question for all users
# in the validation set.
# For this, the required size of the data frame is: 
# (no: of users in validation set  x no of questions).
# We will initialize all the predicted values in the data frame to zero.
# The dataframe's index column is the user `session_id`s. 
prediction_df = pd.DataFrame(data=np.zeros((len(VALID_USER_LIST),18)), index=VALID_USER_LIST)

# Create an empty dictionary to store the models created for each question.
models = {}

# Create an empty dictionary to store the evaluation score for each question.
evaluation_dict ={}

In [15]:
# Iterate through questions 1 to 18 to train models for each question, evaluate
# the trained model and store the predicted values.
for q_no in range(1,19):

    # Select level group for the question based on the q_no.
    if q_no<=3: grp = '0-4'
    elif q_no<=13: grp = '5-12'
    elif q_no<=22: grp = '13-22'
    print("### q_no", q_no, "grp", grp)
    
        
    # Filter the rows in the datasets based on the selected level group. 
    train_df = train_x.loc[train_x.level_group == grp]
    train_users = train_df.index.values
    valid_df = valid_x.loc[valid_x.level_group == grp]
    valid_users = valid_df.index.values

    # Select the labels for the related q_no.
    train_labels = labels.loc[labels.q==q_no].set_index('session').loc[train_users]
    valid_labels = labels.loc[labels.q==q_no].set_index('session').loc[valid_users]

    # Add the label to the filtered datasets.
    train_df["correct"] = train_labels["correct"]
    valid_df["correct"] = valid_labels["correct"]

    # There's one more step required before we can train the model. 
    # We need to convert the datatset from Pandas format (pd.DataFrame)
    # into TensorFlow Datasets format (tf.data.Dataset).
    # TensorFlow Datasets is a high performance data loading library 
    # which is helpful when training neural networks with accelerators like GPUs and TPUs.
    # We are omitting `level_group`, since it is not needed for training anymore.
    train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_df.loc[:, train_df.columns != 'level_group'], label="correct")
    valid_ds = tfdf.keras.pd_dataframe_to_tf_dataset(valid_df.loc[:, valid_df.columns != 'level_group'], label="correct")

    # By default the model is set to train for a classification task.
    rfm = tfdf.keras.RandomForestModel(max_depth=15, num_trees=100)
    rfm.compile(metrics=["accuracy"])

    # Train the model.
    rfm.fit(x=train_ds)

    # Store the model
    models[f'{grp}_{q_no}'] = rfm

    # Evaluate the trained model on the validation dataset and store the 
    # evaluation accuracy in the `evaluation_dict`.
    inspector = rfm.make_inspector()
    inspector.evaluation()
    evaluation = rfm.evaluate(x=valid_ds,return_dict=True)
    evaluation_dict[q_no] = evaluation["accuracy"]         

    # Use the trained model to make predictions on the validation dataset and 
    # store the predicted values in the `prediction_df` dataframe.
    predict = rfm.predict(x=valid_ds)
    prediction_df.loc[valid_users, q_no-1] = predict.flatten()

### q_no 1 grp 0-4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Use /tmp/tmpdm3agzwm as temporary training directory
Reading training dataset...
Training dataset read in 0:00:06.868485. Found 18849 examples.
Training model...


[INFO 2023-05-16T13:05:26.558051753+00:00 kernel.cc:1214] Loading model from path /tmp/tmpdm3agzwm/model/ with prefix 9f97b9f0ddb54bdc


Model trained in 0:00:05.979862
Compiling model...


[INFO 2023-05-16T13:05:27.221497218+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 163502 node(s), and 21 input feature(s).
[INFO 2023-05-16T13:05:27.221681817+00:00 abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 2023-05-16T13:05:27.22259551+00:00 kernel.cc:1046] Use fast generic engine


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: could not get source code
Model compiled.
### q_no 2 grp 0-4
Use /tmp/tmp6ikrecv6 as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.625013. Found 18849 examples.
Training model...
Model trained in 0:00:03.706321


[INFO 2023-05-16T13:05:34.684457132+00:00 kernel.cc:1214] Loading model from path /tmp/tmp6ikrecv6/model/ with prefix a4eb066ad23c48f7
[INFO 2023-05-16T13:05:34.868233127+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 50820 node(s), and 21 input feature(s).
[INFO 2023-05-16T13:05:34.868479968+00:00 kernel.cc:1046] Use fast generic engine


Compiling model...
Model compiled.
### q_no 3 grp 0-4
Use /tmp/tmpt2a1wb6o as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.631135. Found 18849 examples.
Training model...


[INFO 2023-05-16T13:05:41.480314309+00:00 kernel.cc:1214] Loading model from path /tmp/tmpt2a1wb6o/model/ with prefix b010f4d10b804e01


Model trained in 0:00:04.688807
Compiling model...


[INFO 2023-05-16T13:05:41.836430754+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 96322 node(s), and 21 input feature(s).
[INFO 2023-05-16T13:05:41.836713424+00:00 abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 2023-05-16T13:05:41.836762673+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 4 grp 5-12
Use /tmp/tmp820sr6q1 as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.682603. Found 18849 examples.
Training model...


[INFO 2023-05-16T13:05:53.625043252+00:00 kernel.cc:1214] Loading model from path /tmp/tmp820sr6q1/model/ with prefix 8d97965fb01448fe


Model trained in 0:00:05.891651
Compiling model...


[INFO 2023-05-16T13:05:54.299103956+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 175970 node(s), and 21 input feature(s).
[INFO 2023-05-16T13:05:54.299175957+00:00 abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 2023-05-16T13:05:54.299233101+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 5 grp 5-12
Use /tmp/tmpgn076f7k as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.639706. Found 18849 examples.
Training model...


[INFO 2023-05-16T13:06:01.07292359+00:00 kernel.cc:1214] Loading model from path /tmp/tmpgn076f7k/model/ with prefix e441398ce57a46e2


Model trained in 0:00:05.985843
Compiling model...


[INFO 2023-05-16T13:06:01.817959507+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 197486 node(s), and 21 input feature(s).
[INFO 2023-05-16T13:06:01.81802818+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 6 grp 5-12
Use /tmp/tmpgoojbm1t as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.643690. Found 18849 examples.
Training model...


[INFO 2023-05-16T13:06:12.101676086+00:00 kernel.cc:1214] Loading model from path /tmp/tmpgoojbm1t/model/ with prefix 1dab2ac08dc04942


Model trained in 0:00:05.740921
Compiling model...


[INFO 2023-05-16T13:06:12.752358803+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 174286 node(s), and 21 input feature(s).
[INFO 2023-05-16T13:06:12.752418435+00:00 abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 2023-05-16T13:06:12.752460062+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 7 grp 5-12
Use /tmp/tmpretwm83g as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.642347. Found 18849 examples.
Training model...


[INFO 2023-05-16T13:06:19.491333531+00:00 kernel.cc:1214] Loading model from path /tmp/tmpretwm83g/model/ with prefix 5ab41d41772e4d92


Model trained in 0:00:06.054299
Compiling model...


[INFO 2023-05-16T13:06:20.33346114+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 180944 node(s), and 21 input feature(s).
[INFO 2023-05-16T13:06:20.333637446+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 8 grp 5-12
Use /tmp/tmpmppkp5qo as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.865917. Found 18849 examples.
Training model...


[INFO 2023-05-16T13:06:28.528719612+00:00 kernel.cc:1214] Loading model from path /tmp/tmpmppkp5qo/model/ with prefix a7255c36de5643e4


Model trained in 0:00:06.443461
Compiling model...


[INFO 2023-05-16T13:06:29.137574998+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 148534 node(s), and 21 input feature(s).
[INFO 2023-05-16T13:06:29.137765632+00:00 abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 2023-05-16T13:06:29.137802467+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 9 grp 5-12
Use /tmp/tmpyc2v06ob as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.720117. Found 18849 examples.
Training model...


[INFO 2023-05-16T13:06:36.366427807+00:00 kernel.cc:1214] Loading model from path /tmp/tmpyc2v06ob/model/ with prefix 1ba2d9bf7de0487d


Model trained in 0:00:06.173126
Compiling model...


[INFO 2023-05-16T13:06:37.029328169+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 172382 node(s), and 21 input feature(s).
[INFO 2023-05-16T13:06:37.029517512+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 10 grp 5-12
Use /tmp/tmpyv2he38h as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.661088. Found 18849 examples.
Training model...


[INFO 2023-05-16T13:06:47.306710997+00:00 kernel.cc:1214] Loading model from path /tmp/tmpyv2he38h/model/ with prefix e39c3d2f8fb04e59


Model trained in 0:00:05.995116
Compiling model...


[INFO 2023-05-16T13:06:48.003027756+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 182310 node(s), and 21 input feature(s).
[INFO 2023-05-16T13:06:48.003093513+00:00 abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 2023-05-16T13:06:48.003123947+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 11 grp 5-12
Use /tmp/tmpqglpp1vg as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.692137. Found 18849 examples.
Training model...


[INFO 2023-05-16T13:06:58.410316139+00:00 kernel.cc:1214] Loading model from path /tmp/tmpqglpp1vg/model/ with prefix f057bbb5ee4d492b


Model trained in 0:00:05.732655
Compiling model...


[INFO 2023-05-16T13:06:59.019598312+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 157792 node(s), and 21 input feature(s).
[INFO 2023-05-16T13:06:59.019678705+00:00 abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 2023-05-16T13:06:59.019710348+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 12 grp 5-12
Use /tmp/tmp2r9pf4e9 as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.718082. Found 18849 examples.
Training model...


[INFO 2023-05-16T13:07:05.533103993+00:00 kernel.cc:1214] Loading model from path /tmp/tmp2r9pf4e9/model/ with prefix 1845527db2094c44


Model trained in 0:00:05.381096
Compiling model...


[INFO 2023-05-16T13:07:06.030411834+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 131780 node(s), and 21 input feature(s).
[INFO 2023-05-16T13:07:06.030916317+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 13 grp 5-12
Use /tmp/tmprc0qbrhe as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.667128. Found 18849 examples.
Training model...


[INFO 2023-05-16T13:07:12.649328982+00:00 kernel.cc:1214] Loading model from path /tmp/tmprc0qbrhe/model/ with prefix 68d72bd994a04acd


Model trained in 0:00:05.699602
Compiling model...


[INFO 2023-05-16T13:07:13.270938885+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 158546 node(s), and 21 input feature(s).
[INFO 2023-05-16T13:07:13.270998668+00:00 abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 2023-05-16T13:07:13.271028413+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 14 grp 13-22
Use /tmp/tmpz9bgteje as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.665265. Found 18849 examples.
Training model...


[INFO 2023-05-16T13:07:24.124683791+00:00 kernel.cc:1214] Loading model from path /tmp/tmpz9bgteje/model/ with prefix 8d95c62ad4cf47af


Model trained in 0:00:06.108671
Compiling model...


[INFO 2023-05-16T13:07:24.86994953+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 184954 node(s), and 21 input feature(s).
[INFO 2023-05-16T13:07:24.87023732+00:00 abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 2023-05-16T13:07:24.870419374+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 15 grp 13-22
Use /tmp/tmpbx1f9abl as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.676253. Found 18849 examples.
Training model...


[INFO 2023-05-16T13:07:31.731609958+00:00 kernel.cc:1214] Loading model from path /tmp/tmpbx1f9abl/model/ with prefix 5ae46ac7f3fe4918


Model trained in 0:00:06.025169
Compiling model...


[INFO 2023-05-16T13:07:32.48677489+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 195254 node(s), and 21 input feature(s).
[INFO 2023-05-16T13:07:32.486847609+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 16 grp 13-22
Use /tmp/tmpygkt4sd6 as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.666304. Found 18849 examples.
Training model...


[INFO 2023-05-16T13:07:42.285691608+00:00 kernel.cc:1214] Loading model from path /tmp/tmpygkt4sd6/model/ with prefix a352032c0f724e8d


Model trained in 0:00:05.019262
Compiling model...


[INFO 2023-05-16T13:07:42.693344538+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 109178 node(s), and 21 input feature(s).
[INFO 2023-05-16T13:07:42.693503826+00:00 abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 2023-05-16T13:07:42.693535472+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 17 grp 13-22
Use /tmp/tmprf_m246i as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.658047. Found 18849 examples.
Training model...


[INFO 2023-05-16T13:07:49.245768003+00:00 kernel.cc:1214] Loading model from path /tmp/tmprf_m246i/model/ with prefix dd60152a92f94ece


Model trained in 0:00:05.479447
Compiling model...


[INFO 2023-05-16T13:07:49.759043094+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 138226 node(s), and 21 input feature(s).
[INFO 2023-05-16T13:07:49.759157505+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.
### q_no 18 grp 13-22
Use /tmp/tmp1hppt3ld as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.694890. Found 18849 examples.
Training model...


[INFO 2023-05-16T13:07:59.789164374+00:00 kernel.cc:1214] Loading model from path /tmp/tmp1hppt3ld/model/ with prefix dfe659bbb3cc436f


Model trained in 0:00:04.595550
Compiling model...


[INFO 2023-05-16T13:08:00.122097685+00:00 decision_forest.cc:661] Model loaded with 100 root(s), 88932 node(s), and 21 input feature(s).
[INFO 2023-05-16T13:08:00.122216942+00:00 abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 2023-05-16T13:08:00.122257514+00:00 kernel.cc:1046] Use fast generic engine


Model compiled.


# Inspect the Accuracy of the models.

In [16]:
for name, value in evaluation_dict.items():
  print(f"question {name}: accuracy {value:.4f}")

print("\nAverage accuracy", sum(evaluation_dict.values())/18)

question 1: accuracy 0.7307
question 2: accuracy 0.9756
question 3: accuracy 0.9351
question 4: accuracy 0.7942
question 5: accuracy 0.6270
question 6: accuracy 0.7902
question 7: accuracy 0.7460
question 8: accuracy 0.6374
question 9: accuracy 0.7664
question 10: accuracy 0.6017
question 11: accuracy 0.6535
question 12: accuracy 0.8697
question 13: accuracy 0.7208
question 14: accuracy 0.7327
question 15: accuracy 0.5992
question 16: accuracy 0.7490
question 17: accuracy 0.7036
question 18: accuracy 0.9516

Average accuracy 0.754685617155499


# Visualize the model

In [17]:
# tfdf.model_plotter.plot_model_in_colab(models['0-4_1'], tree_idx=0, max_depth=3)

# Variable importances

Variable importances generally indicate how much a feature contributes to the model predictions or quality. There are several ways to identify important features using TensorFlow Decision Forests. Let us pick one model from models dict and inspect it.

Let us list the available Variable Importances for Decision Trees:

In [18]:
inspector = models['0-4_1'].make_inspector()

print(f"Available variable importances:")
for importance in inspector.variable_importances().keys():
  print("\t", importance)

Available variable importances:
	 NUM_AS_ROOT
	 SUM_SCORE
	 INV_MEAN_MIN_DEPTH
	 NUM_NODES


In [19]:
# Each line is: (feature name, (index of the feature), importance score)
inspector.variable_importances()["NUM_AS_ROOT"]

[("room_fqid_nunique" (1; #16), 24.0),
 ("page" (1; #10), 17.0),
 ("page_std" (1; #11), 15.0),
 ("name_nunique" (1; #9), 13.0),
 ("elapsed_time" (1; #1), 12.0),
 ("elapsed_time_std" (1; #2), 9.0),
 ("event_name_nunique" (1; #3), 4.0),
 ("level" (1; #7), 4.0),
 ("fqid_nunique" (1; #4), 1.0),
 ("text_fqid_nunique" (1; #21), 1.0)]

# Threshold-Moving for Imbalanced Classification

In [20]:
# Create a dataframe of required size:
# (no: of users in validation set x no: of questions) initialized to zero values
# to store true values of the label `correct`. 
true_df = pd.DataFrame(data=np.zeros((len(VALID_USER_LIST),18)), index=VALID_USER_LIST)
for i in range(18):
    # Get the true labels.
    tmp = labels.loc[labels.q == i+1].set_index('session').loc[VALID_USER_LIST]
    true_df[i] = tmp.correct.values

max_score = 0; best_threshold = 0

# Loop through threshold values from 0.4 to 0.8 and select the threshold with 
# the highest `F1 score`.
for threshold in np.arange(0.4,0.8,0.01):
    metric = tfa.metrics.F1Score(num_classes=2,average="macro",threshold=threshold)
    y_true = tf.one_hot(true_df.values.reshape((-1)), depth=2)
    y_pred = tf.one_hot((prediction_df.values.reshape((-1))>threshold).astype('int'), depth=2)
    metric.update_state(y_true, y_pred)
    f1_score = metric.result().numpy()
    if f1_score > max_score:
        max_score = f1_score
        best_threshold = threshold
        
print("Best threshold ", best_threshold, "\tF1 score ", max_score)

Best threshold  0.7000000000000003 	F1 score  0.6714021


# Submission

Here you'll use the `best_threshold` calculate in the previous cell

In [21]:
# Reference
# https://www.kaggle.com/code/philculliton/basic-submission-demo
# https://www.kaggle.com/code/cdeotte/random-forest-baseline-0-664/notebook


import jo_wilder
env = jo_wilder.make_env()
iter_test = env.iter_test()

limits = {'0-4':(1,4), '5-12':(4,14), '13-22':(14,19)}

for (test, sample_submission) in iter_test:
    test_df = feature_engineer(test)
    grp = test_df.level_group.values[0]
    a,b = limits[grp]
    for t in range(a,b):
        gbtm = models[f'{grp}_{t}']
        test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_df.loc[:, test_df.columns != 'level_group'])
        predictions = gbtm.predict(test_ds)
        mask = sample_submission.session_id.str.contains(f'q{t}')
        n_predictions = (predictions > best_threshold).astype(int)
        sample_submission.loc[mask,'correct'] = n_predictions.flatten()
    
    env.predict(sample_submission)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


In [22]:
! head submission.csv

session_id,correct
20090109393214576_q1,1
20090109393214576_q2,1
20090109393214576_q3,1
20090109393214576_q4,1
20090109393214576_q5,0
20090109393214576_q6,1
20090109393214576_q7,1
20090109393214576_q8,1
20090109393214576_q9,1
