In [73]:
#import necessary packages
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold, GroupKFold

from sklearn.metrics import f1_score
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
import time

In [3]:
# Load the four competition CSV files from the working directory.
train, test, trlabels, sample = (
    pd.read_csv(csv_path)
    for csv_path in ("train.csv", "test.csv", "train_labels.csv", "sample_submission.csv")
)

In [4]:
# session_id in the label file is split on '_': the part before the underscore
# is the numeric session, the part after it is a one-character prefix followed
# by the question number.
trlabels['session'] = trlabels['session_id'].str.split('_').str[0].astype('int64')
trlabels['question'] = trlabels['session_id'].str.split('_').str[1].str[1:].astype('int64')

In [5]:
# Categorical features (summarised by their number of distinct values)
CATS = ['event_name', 'name','fqid', 'room_fqid', 'text_fqid']

# Numerical features (summarised by mean and standard deviation)
NUMS = ['elapsed_time','level','page','room_coor_x', 'room_coor_y', 
        'screen_coor_x', 'screen_coor_y', 'hover_duration']

# define the function to create the features
def feature_engineer(train):
    """Aggregate an event-level frame into one row per (session_id, level_group).

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain 'session_id', 'level_group' and every column listed in
        CATS and NUMS.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'session_id', with 'level_group' as the first column,
        followed by '<cat>_nunique' for each CATS column, the mean of each
        NUMS column (kept under the original column name) and '<num>_std'
        for each NUMS column. Missing aggregates (e.g. the std of a
        single-row group) are replaced by -1.
    """
    # Group once and reuse it; the grouping keys are the same for every aggregate.
    grouped = train.groupby(['session_id', 'level_group'])

    parts = [
        grouped[CATS].nunique().add_suffix('_nunique'),
        grouped[NUMS].mean(),
        grouped[NUMS].std().add_suffix('_std'),
    ]
    features = pd.concat(parts, axis=1).fillna(-1)

    # Keep session_id as the index and move level_group back into a column.
    return features.reset_index(level='level_group')

In [6]:
# Aggregate the raw event log into one row per (session_id, level_group).
new_train = feature_engineer(train)

# Model inputs: every aggregated column except the 'level_group' key.
FEATURES = [col for col in new_train.columns if col != 'level_group']
# Distinct index values (session ids); each one is treated as a user.
ALL_USERS = new_train.index.unique()

print('We will train with', len(FEATURES) ,'features')
print('We will train with', len(ALL_USERS) ,'users info')

We will train with 21 features
We will train with 23562 users info


In [87]:
def getPredictions(new_train, clf, n_splits=15):
    """Return out-of-fold predictions for questions 1-18 using grouped K-fold CV.

    Relies on the notebook globals ALL_USERS (row index of the result),
    FEATURES (model input columns) and trlabels (needs 'session', 'question'
    and 'correct' columns).

    Parameters
    ----------
    new_train : pandas.DataFrame
        Feature table indexed by session id, with a 'level_group' column and
        the FEATURES columns.
    clf : estimator
        Object exposing fit(X, y) and predict(X). It is refit in place for
        every (fold, question) pair.
    n_splits : int, default 15
        Number of GroupKFold splits; rows sharing an index value always fall
        in the same fold.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ALL_USERS and one column per question, labelled
        1..18, holding whatever clf.predict returned for that user/question.
    """
    questions = range(1, 19)
    gkf = GroupKFold(n_splits=n_splits)

    # Columns are labelled with the question number so that the write to
    # column t below lands in a pre-allocated column (default labels 0..17
    # would leave column 0 untouched and append an extra column 18).
    oof = pd.DataFrame(data=np.zeros((len(ALL_USERS), len(questions))),
                       index=ALL_USERS, columns=questions)

    # Labels per question, indexed by session; built once instead of per fold.
    labels_by_question = {
        t: trlabels.loc[trlabels.question == t].set_index('session')
        for t in questions
    }

    # GROUPED K-FOLD OVER SESSIONS
    for train_index, test_index in gkf.split(X=new_train, groups=new_train.index):
        fold_train = new_train.iloc[train_index]
        fold_valid = new_train.iloc[test_index]

        # ITERATE THRU QUESTIONS 1 THRU 18
        for t in questions:

            # LEVEL GROUP WHOSE FEATURES ARE USED FOR THIS QUESTION
            if t <= 3: grp = '0-4'
            elif t <= 13: grp = '5-12'
            else: grp = '13-22'

            # TRAIN DATA
            train_x = fold_train.loc[fold_train.level_group == grp]
            train_users = train_x.index.values
            train_y = labels_by_question[t].loc[train_users]

            # VALID DATA
            valid_x = fold_valid.loc[fold_valid.level_group == grp]
            valid_users = valid_x.index.values

            clf.fit(train_x[FEATURES].astype('float32'), train_y['correct'])

            oof.loc[valid_users, t] = clf.predict(valid_x[FEATURES].astype('float32'))
    return oof

In [None]:
# Out-of-fold predictions from a linear regression baseline ...
linear = getPredictions(new_train, LinearRegression())
# ... and from a shallow decision tree with a fixed seed.
tree = getPredictions(
    new_train,
    DecisionTreeClassifier(max_depth=5, min_samples_split=4, random_state=0),
)

In [None]:
#reformat dataframe
def _to_long_predictions(oof):
    """Turn a wide out-of-fold frame into long form with 0/1 predicted labels.

    The result has one row per (session, question) with columns 'session',
    'Pquestion' (the original column label) and 'Pcorrect', sorted by
    'Pquestion'.
    """
    long_preds = (
        oof.stack()
        .reset_index()
        .rename(columns={'session_id': 'session', 'level_1': 'Pquestion', 0: 'Pcorrect'})
    )
    # Threshold at 0.5 rather than casting to int: a plain int cast truncates
    # continuous regression outputs toward zero (0.73 -> 0). Predictions that
    # are already 0/1 are unchanged by the threshold.
    long_preds['Pcorrect'] = (long_preds['Pcorrect'] >= 0.5).astype('int')
    return long_preds.sort_values(by=['Pquestion'])

# NOTE: this cell overwrites its inputs, so run it exactly once after the
# getPredictions cell.
linear = _to_long_predictions(linear)
tree = _to_long_predictions(tree)

In [None]:
#put actual correct label and predicted label together
# Join on session AND question: joining on session alone pairs every true
# label with the predictions for all questions of that session, so labels and
# predictions of different questions would be compared.
linear_combined = pd.merge(
    trlabels[['session', 'correct', 'question']], linear,
    left_on=['session', 'question'], right_on=['session', 'Pquestion'], how='left',
)

tree_combined = pd.merge(
    trlabels[['session', 'correct', 'question']], tree,
    left_on=['session', 'question'], right_on=['session', 'Pquestion'], how='left',
)

#sort new dataframe (sort_values returns a new frame, so keep the result)
linear_combined = linear_combined.sort_values(by=['question', 'Pquestion'])
tree_combined = tree_combined.sort_values(by=['question', 'Pquestion'])

tree_combined.head()

In [None]:
# Classification report for the decision tree predictions against the true labels.
print(classification_report(
    y_true=tree_combined['correct'],
    y_pred=tree_combined['Pcorrect'],
    digits=3,
))

In [None]:
# Classification report for the linear model predictions. labels=[0, 1] limits
# the report to the two classes and zero_division=0 reports 0 for a metric
# whose denominator is zero.
# NOTE(review): the original note mentioned an error when computing this
# report; presumably these two arguments are the workaround -- confirm.
print(classification_report(
    y_true=linear_combined['correct'],
    y_pred=linear_combined['Pcorrect'],
    digits=3,
    zero_division=0,
    labels=[0, 1],
))