In [1]:
#import necessary packages
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import tree
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
import time

# Data Loading and Visualization(Data Preprosessing)

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
trlabels = pd.read_csv("train_labels.csv")

sample = pd.read_csv("sample_submission.csv")

MemoryError: Unable to allocate 1.37 GiB for an array with shape (7, 26296946) and data type object

In [None]:
train

# Feature Engineering

In [None]:
# A distinction is made between categorical (string), numerical (floats or integers) and 
# event (string).
CATS = ['event_name', 'fqid', 'room_fqid', 'text']
NUMS = ['elapsed_time','level','page','room_coor_x', 'room_coor_y', 
        'screen_coor_x', 'screen_coor_y', 'hover_duration']

# https://www.kaggle.com/code/kimtaehun/lightgbm-baseline-with-aggregated-log-data
EVENTS = ['navigate_click','person_click','cutscene_click','object_click',
          'map_hover','notification_click','map_click','observation_click',
          'checkpoint']

In [None]:
# Based on the different data defined above we engineer features that are grouped by, 
# and aggregated per unique session id and level group. Thus, resulting in a single row 
# for every session id containing the aggregated value of every feature.
# Note, that the questions within one session are grouped per level group,
# i.e. the 18 questions are distributed over 22 different levels, which makes up a 
# complete session.
def feature_engineer(train):   
    dfs = []
    for c in CATS:
        #Define unique number of categorical data per level group within an unique session
        tmp = train.groupby(['session_id','level_group'])[c].agg('nunique')
        tmp.name = tmp.name + '_nunique'
        dfs.append(tmp)
    for c in NUMS:
        #Define the mean of the numerical data per level group within an unique session
        tmp = train.groupby(['session_id','level_group'])[c].agg('mean')        
        tmp.name = tmp.name + '_mean'
        dfs.append(tmp)
    for c in NUMS:
        #Define the std of the numerical data per level group within an unique session
        tmp = train.groupby(['session_id','level_group'])[c].agg('std')         
        tmp.name = tmp.name + '_std'
        dfs.append(tmp)
    for c in EVENTS: 
        train[c] = (train.event_name == c).astype('int8')
    for c in EVENTS + ['elapsed_time']:
        #Define the sum of the occurences of an event per level group within an unique session
        tmp = train.groupby(['session_id','level_group'])[c].agg('sum')         
        tmp.name = tmp.name + '_sum'
        dfs.append(tmp)
    train = train.drop(EVENTS,axis=1)
        
    df = pd.concat(dfs,axis=1)
    df = df.fillna(-1)
    df = df.reset_index()
    df = df.set_index('session_id')
    return df

In [None]:
# create the features per piece of training data and display the head
df = feature_engineer(train)
df.head()

## Separateing level_groups and training models

In [None]:
gkf = GroupKFold(n_splits=5)

# Tip: try different kfold variations
# kf = KFold(n_splits=5) # example if you would like to try regular KFold

# oof == out of fold (predictions) collected in a dataframe
# this means all predictions on test subset in each fold are collected
# in the oof dataframe
oof = pd.DataFrame(data=np.zeros((len(ALL_USERS),18)), index=ALL_USERS) 
models = {}

# COMPUTE CV SCORE WITH 5 GROUP K FOLD
for i, (train_index, test_index) in enumerate(gkf.split(X=df, groups=df.index)):
    print('#'*25)
    print('### Fold',i+1)
    print('#'*25)
    
    # Hyperparameters --> Tip: (automatic) tuning, e.g. GridSearchCV, RandomizedSearhCV,
    # or better yet: Bayesian Optimization
    model_params = {#this bit is likely to chage given different models that will be implemented}
    
    # ITERATE THRU QUESTIONS 1 THRU 18
    for t in range(1,19):
        
        # USE THIS TRAIN DATA WITH THESE QUESTIONS
        if t<=3: grp = '0-4'
        elif t<=13: grp = '5-12'
        elif t<=22: grp = '13-22'
            
        # TRAIN DATA
        train_x = df.iloc[train_index]
        train_x = train_x.loc[train_x.level_group == grp]
        train_users = train_x.index.values
        train_y = targets.loc[targets.q==t].set_index('session').loc[train_users]
        
        # VALID DATA
        valid_x = df.iloc[test_index]
        valid_x = valid_x.loc[valid_x.level_group == grp]
        valid_users = valid_x.index.values
        valid_y = targets.loc[targets.q==t].set_index('session').loc[valid_users]
        
        # TRAIN MODEL        
        clf =  DecisionTreeClassifier(**model_params)
        clf.fit(train_x[FEATURES].astype('float32'), train_y['correct'],
                eval_set=[ (valid_x[FEATURES].astype('float32'), valid_y['correct']) ],
                verbose=0)
        print(f'{t}({clf.best_ntree_limit}), ',end='')
        
        # SAVE MODEL, PREDICT VALID OOF
        models[f'{grp}_{t}'] = clf
        oof.loc[valid_users, t-1] = clf.predict_proba(valid_x[FEATURES].astype('float32'))[:,1]
        
    print()
