In [5]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import lightgbm as lgb

from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

In [6]:
%%time
keep_cols = ['event_id', 'game_session', 'installation_id', 'event_count', 'event_code',
             'title', 'game_time', 'type', 'world']
train = pd.read_csv('/input/data-science-bowl-2019/train.csv')
test = pd.read_csv('/input/data-science-bowl-2019/test.csv')
train_labels = pd.read_csv('/input/data-science-bowl-2019/train_labels.csv')
submission = pd.read_csv('/input/data-science-bowl-2019/sample_submission.csv')

Wall time: 59.1 s


In [7]:
test_assess = test[test['type'] == 'Assessment'].copy()
test_labels = submission.copy()
test_labels['title'] = test_labels['installation_id'] \
    .apply(lambda x: test_assess[test_assess['installation_id'] == x].iloc[-1].title)

In [8]:
def compute_time_states(group, col):
    return group[['installation_id', col, 'event_count', 'game_time']] \
    .groupby(['installation_id', col]).agg([np.mean, np.sum, np.std]) \
    .reset_index().pivot(columns=col, index='installation_id')

In [9]:
def group_and_reduce(df, df_labels):
    df = df[df['installation_id'].isin(df_labels['installation_id'].unique())]
    
    group_game_time = df.drop(columns=['event_id', 'event_code']) \
        .groupby(['game_session', 'installation_id', 'title', 'type', 'world']) \
        .max().reset_index()
    
    title_group = pd.get_dummies(group_game_time \
                                     .drop(columns=['game_session', 'event_count',
                                                    'game_time'])
                                 , columns=['title', 'type', 'world']) \
        .groupby('installation_id') \
        .sum()
    
    event_game_time_group = group_game_time[['installation_id', 'event_count', 'game_time']] \
        .groupby(['installation_id']) \
        .agg([np.sum, np.mean, np.std, np.min, np.max])
    
    world_time_states = compute_time_states(group_game_time, 'world')
    type_time_states = compute_time_states(group_game_time, 'type')
    
    return (
        title_group.join(event_game_time_group)
        .join(world_time_states)
        .join(type_time_states)
        .fillna(0)
    )

In [10]:
%%time
train_small = group_and_reduce(train, train_labels)
test_small = group_and_reduce(test, test_labels)

print(train_small.shape)
train_small.head()



(3614, 110)
Wall time: 2min 14s


Unnamed: 0_level_0,title_12 Monkeys,title_Air Show,title_All Star Sorting,title_Balancing Act,title_Bird Measurer (Assessment),title_Bottle Filler (Activity),title_Bubble Bath,title_Bug Measurer (Activity),title_Cart Balancer (Assessment),title_Cauldron Filler (Assessment),...,"(game_time, mean, Clip)","(game_time, mean, Game)","(game_time, sum, Activity)","(game_time, sum, Assessment)","(game_time, sum, Clip)","(game_time, sum, Game)","(game_time, std, Activity)","(game_time, std, Assessment)","(game_time, std, Clip)","(game_time, std, Game)"
installation_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0006a69f,2.0,2.0,4.0,0.0,2.0,2.0,2.0,2.0,0.0,0.0,...,0.0,106966.45,3199695.0,236429.0,0.0,2139329.0,350054.566401,28330.303185,0.0,58189.254197
0006c192,1.0,0.0,0.0,2.0,1.0,1.0,1.0,1.0,0.0,2.0,...,0.0,88345.5,1210530.0,323061.0,0.0,530073.0,127422.7825,98940.202632,0.0,62500.291205
00129856,0.0,0.0,0.0,1.0,1.0,2.0,0.0,2.0,0.0,0.0,...,0.0,0.0,1021179.0,39742.0,0.0,0.0,130499.803239,28043.854942,0.0,0.0
001d0ed0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,2.0,0.0,...,0.0,158426.166667,92282.0,201941.0,0.0,950557.0,24694.997226,17737.374861,0.0,123969.846618
00225f67,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,129984.75,294517.0,35637.0,0.0,519939.0,49028.831364,12301.536672,0.0,65432.543128


In [11]:
def create_title_mode(train_labels):
    titles = train_labels['title'].unique()
    title2mode = {}
    
    for title in titles:
        mode = train_labels[train_labels['title'] == title]['accuracy_group'].value_counts().index[0]
        title2mode[title] = mode
    
    return title2mode

def add_title_mode(labels, title2mode):
    labels['title_mode'] = labels['title'].apply(lambda x: title2mode[x])
    return labels

In [12]:
title2mode = create_title_mode(train_labels)
train_labels = add_title_mode(train_labels, title2mode)
test_labels = add_title_mode(test_labels, title2mode)

In [17]:
def preprocess_train(train_labels, last_records_only=True):
    final_train = pd.get_dummies(train_labels.set_index('installation_id')
                                   .drop(columns=['num_correct', 'num_incorrect', 'accuracy', 'game_session'])
                                   .join(train_small), columns=['title'])
    if last_records_only:
        final_train = final_train.reset_index().groupby('installation_id').apply(lambda x: x.iloc[-1])
        final_train = final_train.drop(columns='installation_id')

    return final_train

def preprocess_test(test_labels, test_small):
    return pd.get_dummies(test_labels.set_index('installation_id').join(test_small), columns=['title'])   

In [18]:
final_train = preprocess_train(train_labels)
print(final_train.shape)
final_train.head()

(3614, 117)


Unnamed: 0_level_0,accuracy_group,title_mode,title_12 Monkeys,title_Air Show,title_All Star Sorting,title_Balancing Act,title_Bird Measurer (Assessment),title_Bottle Filler (Activity),title_Bubble Bath,title_Bug Measurer (Activity),...,"(game_time, sum, Game)","(game_time, std, Activity)","(game_time, std, Assessment)","(game_time, std, Clip)","(game_time, std, Game)",title_Bird Measurer (Assessment),title_Cart Balancer (Assessment),title_Cauldron Filler (Assessment),title_Chest Sorter (Assessment),title_Mushroom Sorter (Assessment)
installation_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0006a69f,3,0,2.0,2.0,4.0,0.0,2.0,2.0,2.0,2.0,...,2139329.0,350054.566401,28330.303185,0.0,58189.254197,1,0,0,0,0
0006c192,0,3,1.0,0.0,0.0,2.0,1.0,1.0,1.0,1.0,...,530073.0,127422.7825,98940.202632,0.0,62500.291205,0,0,0,0,1
00129856,3,0,0.0,0.0,0.0,1.0,1.0,2.0,0.0,2.0,...,0.0,130499.803239,28043.854942,0.0,0.0,1,0,0,0,0
001d0ed0,3,3,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,...,950557.0,24694.997226,17737.374861,0.0,123969.846618,0,0,0,0,1
00225f67,0,0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,...,519939.0,49028.831364,12301.536672,0.0,65432.543128,1,0,0,0,0


In [19]:
final_test = preprocess_test(test_labels,  test_small)
print(final_test.shape)
final_test.head()

(1000, 117)


Unnamed: 0_level_0,accuracy_group,title_mode,title_12 Monkeys,title_Air Show,title_All Star Sorting,title_Balancing Act,title_Bird Measurer (Assessment),title_Bottle Filler (Activity),title_Bubble Bath,title_Bug Measurer (Activity),...,"(game_time, sum, Game)","(game_time, std, Activity)","(game_time, std, Assessment)","(game_time, std, Clip)","(game_time, std, Game)",title_Bird Measurer (Assessment),title_Cart Balancer (Assessment),title_Cauldron Filler (Assessment),title_Chest Sorter (Assessment),title_Mushroom Sorter (Assessment)
installation_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00abaee7,3,3,2.0,0.0,1.0,1.0,0.0,0.0,0.0,2.0,...,2285229.0,36886.664956,21240.073493,0.0,1038605.0,0,0,1,0,0
01242218,3,3,1.0,1.0,1.0,3.0,1.0,2.0,1.0,1.0,...,1420909.0,98521.245018,32761.743006,0.0,37797.81,0,1,0,0,0
017c5718,3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,6389.416875,0.0,0.0,0.0,0,0,0,0,1
01a44906,3,3,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,77204.0,43064.217188,0.0,0.0,0.0,0,0,0,0,1
01bc6cb6,3,3,0.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,...,984880.0,0.0,0.0,0.0,178042.6,0,1,0,0,0


In [23]:
def cv_train(X, y, cv, **kwargs):
    
    models = []

    kf = KFold(n_splits=cv, random_state=2019)
    
    for train, test in kf.split(X):
        X_train, X_val, y_train, y_val = X[train], X[test], y[train], y[test]
        
        train_set = lgb.Dataset(X_train, y_train)
        val_set = lgb.Dataset(X_val, y_val)
        
        model = lgb.train(train_set=train_set, valid_sets=[train_set, val_set], **kwargs)
        models.append(model)
        
        if kwargs.get("verbose_eval"):
            print("\n" + "="*50 + "\n")
        
    return models

def cv_predict(models, X):
    return np.mean([model.predict(X) for model in models], axis=0)

In [25]:
X = final_train.drop(columns=['accuracy_group']).values
y = final_train['accuracy_group'].values

params = {
    'learning_rate': 0.01,
    'bagging_fraction': 0.95,
    'feature_fraction': 0.2,
    'max_height': 3,
    'lambda_l1': 10,
    'lambda_l2': 10,
    'metric': 'multiclass',
    'objective': 'multiclass',
    'num_classes': 4,
    'random_state': 2019
}

models = cv_train(X, y, cv=10, params=params, num_boost_round=1000,
                  early_stopping_rounds=100, verbose_eval=500
                 )

Training until validation scores don't improve for 100 rounds
[500]	training's multi_logloss: 0.922009	valid_1's multi_logloss: 0.987031
[1000]	training's multi_logloss: 0.820585	valid_1's multi_logloss: 0.953917
Did not meet early stopping. Best iteration is:
[1000]	training's multi_logloss: 0.820585	valid_1's multi_logloss: 0.953917


Training until validation scores don't improve for 100 rounds
[500]	training's multi_logloss: 0.915911	valid_1's multi_logloss: 1.02331
[1000]	training's multi_logloss: 0.814876	valid_1's multi_logloss: 1.00596
Did not meet early stopping. Best iteration is:
[1000]	training's multi_logloss: 0.814876	valid_1's multi_logloss: 1.00596


Training until validation scores don't improve for 100 rounds
[500]	training's multi_logloss: 0.916114	valid_1's multi_logloss: 1.04186
[1000]	training's multi_logloss: 0.81556	valid_1's multi_logloss: 1.01739
Did not meet early stopping. Best iteration is:
[1000]	training's multi_logloss: 0.81556	valid_1's multi_logloss: 1