In [1]:
import numpy as np
import pandas as pd
pd.set_option('max_columns', 150)
import os

from collections import Counter
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import KFold, StratifiedKFold
import tensorflow as tf

import warnings
warnings.filterwarnings('ignore')
from numba import jit
from sklearn.model_selection import GroupKFold
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
import xgboost as xgb

In [2]:
for _, _, files in os.walk('../input/data-science-bowl-2019'):
    print(files)

['test.csv', 'specs.csv', 'train.csv', 'train_labels.csv', 'sample_submission.csv']


In [3]:
def read_csv():
    BASE_PATH = '../input/data-science-bowl-2019/'
    #train
    print('Reading train.csv as DataFrame...')
    train = pd.read_csv(BASE_PATH + 'train.csv')
    print('Completed, train have {} columns, {} rows.'.format(train.shape[0], train.shape[1]))
    #train_lebels
    print('Reading train_labels.csv as DataFrame...')
    train_labels = pd.read_csv(BASE_PATH + 'train_labels.csv')
    print('Completed, train_labels have {} columns, {} rows.'.format(train_labels.shape[0], train_labels.shape[1]))
    #test
    print('Reading test.csv as DataFrame...')
    test = pd.read_csv(BASE_PATH + 'test.csv')
    print('Completed, test have {} columns, {} rows.'.format(test.shape[0], test.shape[1]))
    #specs
    print('Reading specs.csv as DataFrame...')
    specs = pd.read_csv(BASE_PATH + 'specs.csv')
    print('Completed, specs have {} columns, {} rows.'.format(specs.shape[0], specs.shape[1]))
    #sample_submission
    print('Reading sample_submission.csv as DataFrame...')
    sample_submission = pd.read_csv(BASE_PATH + 'sample_submission.csv')
    print('Completed, sample_submission have {} columns, {} rows.'.format(sample_submission.shape[0], sample_submission.shape[1]))
    
    return train, train_labels, test, specs, sample_submission

In [4]:
def feature_encoder(train, test):
    print('Encoding feature...')
    train['title_event_code'] = list(map(lambda x, y: str(x) + '_' + str(y), train['title'], train['event_code']))
    test['title_event_code'] = list(map(lambda x, y: str(x) + '_' + str(y), test['title'], test['event_code']))
    train['world_type'] = list(map(lambda x, y: str(x) + '_' + str(y), train['world'], train['type']))
    test['world_type'] = list(map(lambda x, y: str(x) + '_' + str(y), test['world'], test['type']))
    
    def encoder(feature):
        list_ = list(set(train[feature].unique()).union(set(test[feature].unique())))
        map_ = dict(zip(list_, np.arange(len(list_))))
        labels = dict(zip(np.arange(len(list_)), list_))
        return list_, map_, labels
    
    title_list, title_map, title_labels = encoder('title')
    world_list, world_map, world_labels = encoder('world')
    eventId_list, _, _ = encoder('event_id')
    eventCode_list, _, _ = encoder('event_code')
    title_eventCode_list, _, _ = encoder('title_event_code')
    world_type_list, _, _ = encoder('world_type')
    asses_title_list = list(set(train[train.type == 'Assessment']['title'].unique()).union(set(test[test.type == 'Assessment']['title'].unique())))
    attempt_code = dict(zip(asses_title_list, (np.ones(len(asses_title_list)) * 4100).astype(int)))
    attempt_code['Bird Measurer (Assessment)'] = 4110

    train['title'] = train['title'].map(title_map)
    test['title'] = test['title'].map(title_map)
    train['world'] = train['world'].map(world_map)
    test['world'] = test['world'].map(world_map)
    print('Encoding completed.')
    return train, test, title_list, title_map, title_labels, world_list, world_map, world_labels, eventId_list, eventCode_list, title_eventCode_list, world_type_list, asses_title_list, attempt_code

In [5]:
def read_data(df, test_set=False):
    #count
    type_count = {'Clip': 0, 'Activity': 0, 'Assessment': 0, 'Game': 0}
    asses_count = {'acc_all': 0, 'acc_true': 0, 'acc_false': 0}
    group_count = {n: 0 for n in np.arange(4)}
    title_count = {ti: 0 for ti in title_list}
    world_count = {wo: 0 for wo in world_list}
    assesTrue_title_count = {at: 0 for at in asses_title_list}
    assesFalse_title_count = {af: 0 for af in asses_title_list}
    eventId_count = {ei: 0 for ei in eventId_list}
    eventCode_count = {ec: 0 for ec in eventCode_list}
    title_eventCode_count = {te: 0 for te in title_eventCode_list}
    world_type_count = {wt: 0 for wt in world_type_list}
    #time count
    title_time = {str(ti)+'_t': 0 for ti in title_list}
    world_time = {str(wo)+'_t': 0 for wo in world_list}
    
    features = []
    mean_time = []
    acc_time = 0
    
    for i, sess in df.groupby('game_session', sort=False):
        title = sess.title.iloc[0]
        world = sess.world.iloc[0]
        sess_type = sess.type.iloc[0]
        time = int(sess.game_time.iloc[-1] / 1000)
        
        if sess_type != 'Assessment':
            acc_time += time
        
        if (sess_type == 'Assessment') & (test_set or len(sess) > 1):
            
            feature = type_count.copy()
            feature['installation_id'] = sess.installation_id.iloc[0]
            feature['title'] = title
            feature['true_record'] = assesTrue_title_count[title_labels[title]]
            feature['false_record'] = assesFalse_title_count[title_labels[title]]
            feature['acc_play_time'] = acc_time
            
            #time
            if mean_time == []:
                feature['asses_time_mean'] = 0
                feature['asses_time_std'] = 0
            else:
                feature['asses_time_mean'] = np.mean(mean_time)
                feature['asses_time_std'] = np.std(mean_time)
            mean_time.append((sess.game_time.iloc[-1] - sess.game_time.iloc[0])/1000)
                
            #accuracy
            attempt_all = sess[sess.event_code == attempt_code[title_labels[title]]]['event_data'].shape[0]
            attempt_true = sess[sess.event_code == attempt_code[title_labels[title]]]['event_data'].str.contains('true').sum()
            attempt_false = sess[sess.event_code == attempt_code[title_labels[title]]]['event_data'].str.contains('false').sum()
            accuracy_rate = attempt_true / attempt_all if attempt_all != 0 else 0
            if accuracy_rate == 0:
                feature['accuracy_group'] = 0
            elif accuracy_rate == 1:
                feature['accuracy_group'] = 3
            elif accuracy_rate == 0.5:
                feature['accuracy_group'] = 2
            else:
                feature['accuracy_group'] = 1
            feature['acc_accuracy'] = asses_count['acc_true'] / asses_count['acc_all'] if asses_count['acc_all'] != 0 else 0
            feature.update(group_count)
            group_count[feature['accuracy_group']] += 1
            feature.update(asses_count)
            asses_count['acc_all'] += attempt_all 
            asses_count['acc_true'] += attempt_true
            asses_count['acc_false'] += attempt_false
            assesTrue_title_count[title_labels[title]] += attempt_true
            assesFalse_title_count[title_labels[title]] += attempt_false
            
            #Update count
            feature.update(title_count)
            feature.update(world_count)
            feature.update(eventId_count)
            feature.update(eventCode_count)
            feature.update(title_eventCode_count)
            feature.update(world_type_count)
            
            variety_features = [('var_event_code', eventCode_count),
                                ('var_event_id', eventId_count),
                                ('var_title', title_count),
                                ('var_title_event_code', title_eventCode_count)]
            
            for name, dict_counts in variety_features:
                arr = np.array(list(dict_counts.values()))
                feature[name] = np.count_nonzero(arr)
            
            #Update acctime
            feature.update(title_time)
            feature.update(world_time)
            
            if test_set:
                features = feature
            elif attempt_all != 0:
                features.append(feature)
                
        #count
        type_count[sess_type] += 1
        title_count[title_labels[title]] += 1
        world_count[world_labels[world]] += 1
            
        def count_updater(dic ,col):
            num_of_counter = Counter(sess[col])
            for i in num_of_counter.keys():
                dic[i] += num_of_counter[i]
                
        count_updater(eventId_count, 'event_id')
        count_updater(eventCode_count, 'event_code')
        count_updater(title_eventCode_count, 'title_event_code')
        count_updater(world_type_count, 'world_type')
        
        #acctime
        title_time[str(title_labels[title]) + '_t'] += time
        world_time[str(world_labels[world]) + '_t'] += time
    
    return features

In [6]:
def get_train_test(train, test):
    all_train = []
    all_test = []
    print('Getting train_encoder data...')
    for ins_id, data in tqdm(train.groupby('installation_id', sort=False), total=train.installation_id.nunique(), desc='installation_id'):
        all_train += read_data(data)
    print('Completed of train_en.')
    print('Getting test_encoder data...')
    for ins_id, data in tqdm(test.groupby('installation_id', sort=False), total=test.installation_id.nunique(), desc='installation_id'):
        tt = read_data(data, test_set=True)
        all_test.append(tt)
    print('Completed of test_en.')
    train_en = pd.DataFrame(all_train)
    test_en = pd.DataFrame(all_test)
    
    train_ohe = pd.get_dummies(train_en['title'], prefix='title')
    test_ohe = pd.get_dummies(test_en['title'], prefix='title')
    train_en = pd.concat([train_en, train_ohe], axis=1)
    test_en = pd.concat([test_en, test_ohe], axis=1)
    return train_en, test_en

In [7]:
def preprocess(train_en, test_en):
    for df in [train_en, test_en]:
        df['installation_session_count'] = df.groupby('installation_id')['Clip'].transform('count')
        df['installation_duration_time'] = df.groupby('installation_id')['asses_time_mean'].transform('mean')
        df['installation_title_nunique'] = df.groupby('installation_id')['title'].transform('nunique')
    train_feat = train_en.loc[(train_en.sum(axis=1) != 0), (train_en.sum(axis=0) != 0)].columns
    test_feat = test_en.loc[(test_en.sum(axis=1) != 0), (test_en.sum(axis=0) != 0)].columns
    features = [col for col in train_en.columns if col in train_feat or col in test_feat]
    return train_en[features], test_en[features]

In [8]:
train, train_labels, test, specs, sample_submission = read_csv()

Reading train.csv as DataFrame...
Completed, train have 11341042 columns, 11 rows.
Reading train_labels.csv as DataFrame...
Completed, train_labels have 17690 columns, 7 rows.
Reading test.csv as DataFrame...
Completed, test have 1156414 columns, 11 rows.
Reading specs.csv as DataFrame...
Completed, specs have 386 columns, 3 rows.
Reading sample_submission.csv as DataFrame...
Completed, sample_submission have 1000 columns, 2 rows.


In [9]:
keep_id = train[train.type == "Assessment"][['installation_id']].drop_duplicates()
train = pd.merge(train, keep_id, on="installation_id", how="inner")
print(f'remove installation_id which never made assessment')
print(f'reduce train to: {train.shape[0]} columns')

remove installation_id which never made assessment
reduce train to: 8294138 columns


In [10]:
train_s, test_s, title_list, title_map, title_labels, world_list, world_map, world_labels, eventId_list, eventCode_list, title_eventCode_list, world_type_list, asses_title_list, attempt_code = feature_encoder(train, test)
train_en, test_en = get_train_test(train_s, test_s)

Encoding feature...
Encoding completed.
Getting train_encoder data...


HBox(children=(IntProgress(value=0, description='installation_id', max=4242, style=ProgressStyle(description_w…


Completed of train_en.
Getting test_encoder data...


HBox(children=(IntProgress(value=0, description='installation_id', max=1000, style=ProgressStyle(description_w…


Completed of test_en.


In [11]:
train_en, test_en = preprocess(train_en, test_en)

In [12]:
features = train_en.loc[(train_en.sum(axis=1) != 0), (train_en.sum(axis=0) != 0)].columns
features = [col for col in features if col not in ['accuracy_group', 'installation_id']]
categoricals = ['session_title']

In [13]:
counter = 0
to_remove = []
for feat_1 in features:
    for feat_2 in features:
        if feat_1 != feat_2 and feat_1 not in to_remove and feat_2 not in to_remove:
            corr = np.corrcoef(train_en[feat_1], train_en[feat_2])[0][1]
            if corr > 0.995:
                counter += 1
                to_remove.append(feat_2)
                print('{}: FEAT_1: {} FEAT_2: {} - Correlation: {}'.format(counter, feat_1, feat_2, corr))

1: FEAT_1: Clip FEAT_2: 27253bdc - Correlation: 0.9999999999999999
2: FEAT_1: Costume Box FEAT_2: Costume Box_2000 - Correlation: 1.0
3: FEAT_1: Happy Camel FEAT_2: d9c005dd - Correlation: 1.0
4: FEAT_1: Happy Camel FEAT_2: Happy Camel_2000 - Correlation: 1.0
5: FEAT_1: Crystal Caves - Level 1 FEAT_2: Crystal Caves - Level 1_2000 - Correlation: 1.0
6: FEAT_1: Lifting Heavy Things FEAT_2: Lifting Heavy Things_2000 - Correlation: 0.9999999999999999
7: FEAT_1: Slop Problem FEAT_2: Slop Problem_2000 - Correlation: 1.0
8: FEAT_1: Air Show FEAT_2: 15ba1109 - Correlation: 1.0
9: FEAT_1: Air Show FEAT_2: Air Show_2000 - Correlation: 1.0
10: FEAT_1: Crystal Caves - Level 3 FEAT_2: Crystal Caves - Level 3_2000 - Correlation: 1.0
11: FEAT_1: Heavy, Heavier, Heaviest FEAT_2: Heavy, Heavier, Heaviest_2000 - Correlation: 1.0
12: FEAT_1: Bug Measurer (Activity) FEAT_2: c7f7f0e1 - Correlation: 1.0
13: FEAT_1: Bug Measurer (Activity) FEAT_2: Bug Measurer (Activity)_2000 - Correlation: 1.0
14: FEAT_1: W

In [14]:
y = train_en['accuracy_group']
X = train_en.drop(columns=(to_remove + ['accuracy_group', 'title']), axis=1)
test_predict = test_en.drop(columns=(to_remove + ['accuracy_group', 'installation_id', 'title']), axis=1)

In [15]:
from sklearn.base import BaseEstimator, TransformerMixin
@jit
def qwk(a1, a2):
    """
    Source: https://www.kaggle.com/c/data-science-bowl-2019/discussion/114133#latest-660168

    :param a1:
    :param a2:
    :param max_rat:
    :return:
    """
    max_rat = 3
    a1 = np.asarray(a1, dtype=int)
    a2 = np.asarray(a2, dtype=int)

    hist1 = np.zeros((max_rat + 1, ))
    hist2 = np.zeros((max_rat + 1, ))

    o = 0
    for k in range(a1.shape[0]):
        i, j = a1[k], a2[k]
        hist1[i] += 1
        hist2[j] += 1
        o +=  (i - j) * (i - j)

    e = 0
    for i in range(max_rat + 1):
        for j in range(max_rat + 1):
            e += hist1[i] * hist2[j] * (i - j) * (i - j)

    e = e / a1.shape[0]

    return 1 - o / e


def eval_qwk_lgb(y_true, y_pred):
    """
    Fast cappa eval function for lgb.
    """

    y_pred = y_pred.reshape(len(np.unique(y_true)), -1).argmax(axis=0)
    return 'cappa', qwk(y_true, y_pred), True


def eval_qwk_lgb_regr(y_true, y_pred):
    """
    Fast cappa eval function for lgb.
    """
    y_pred[y_pred <= 1.07765539] = 0
    y_pred[np.where(np.logical_and(y_pred > 1.07765539, y_pred <= 1.76235851))] = 1
    y_pred[np.where(np.logical_and(y_pred > 1.76235851, y_pred <= 2.24528698))] = 2
    y_pred[y_pred > 2.24528698] = 3
    # y_pred = y_pred.reshape(len(np.unique(y_true)), -1).argmax(axis=0)
    return 'cappa', qwk(y_true, y_pred), True

In [16]:
params = {'n_estimators': 2000,
          'boostin_type': 'gdbt',
          'objective': 'regression',
          'metric': 'rmse',
          'subsample': 0.75,
          'subsample': 1,
          'learning': 0.04,
          'feature_fraction': 0.9,
          'max_depth': 15,
          'lambda_l1': 1,
          'lambda_l2': 1,
          #'verbose': 500,
          'early_stopping_rounds': 100,
          'eval_metric': 'cappa'
         }

In [17]:
X.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in X.columns]

In [18]:
n_fold = 5
models = []
KFold = GroupKFold(n_splits=n_fold)
for i, (train_idx, val_idx) in enumerate(KFold.split(X, y, X['installation_id'])):
    X_r = X.drop(columns=['installation_id'], axis=1)
    X_train, y_train, X_val, y_val = X_r.iloc[train_idx], y.iloc[train_idx], X_r.iloc[val_idx], y.iloc[val_idx]
    print(f'Fold : {i}')
    model = lgb.LGBMRegressor()
    model = model.set_params(**params)
    model.fit(X=X_train, y=y_train,
              eval_set=[(X_train, y_train), (X_val, y_val)],
              eval_metric=eval_qwk_lgb_regr,
              early_stopping_rounds=params['early_stopping_rounds'])
    models.append(model)

Fold : 0
[1]	training's rmse: 1.21387	training's cappa: 0.204674	valid_1's rmse: 1.21702	valid_1's cappa: 0.230616
Training until validation scores don't improve for 100 rounds
[2]	training's rmse: 1.17765	training's cappa: 0.308179	valid_1's rmse: 1.18188	valid_1's cappa: 0.325176
[3]	training's rmse: 1.14728	training's cappa: 0.328405	valid_1's rmse: 1.15209	valid_1's cappa: 0.340008
[4]	training's rmse: 1.12143	training's cappa: 0.337484	valid_1's rmse: 1.12674	valid_1's cappa: 0.344512
[5]	training's rmse: 1.09939	training's cappa: 0.34796	valid_1's rmse: 1.10664	valid_1's cappa: 0.349442
[6]	training's rmse: 1.07961	training's cappa: 0.351975	valid_1's rmse: 1.08692	valid_1's cappa: 0.348807
[7]	training's rmse: 1.06294	training's cappa: 0.418126	valid_1's rmse: 1.07284	valid_1's cappa: 0.40325
[8]	training's rmse: 1.04804	training's cappa: 0.526664	valid_1's rmse: 1.05889	valid_1's cappa: 0.497801
[9]	training's rmse: 1.03533	training's cappa: 0.56433	valid_1's rmse: 1.04728	vali

In [19]:
from functools import partial
import scipy as sp
class OptimizedRounder(object):
    """
    An optimizer for rounding thresholds
    to maximize Quadratic Weighted Kappa (QWK) score
    # https://www.kaggle.com/naveenasaithambi/optimizedrounder-improved
    """
    def __init__(self):
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        """
        Get loss according to
        using current coefficients
        
        :param coef: A list of coefficients that will be used for rounding
        :param X: The raw predictions
        :param y: The ground truth labels
        """
        X_p = pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3])

        return -qwk(y, X_p)

    def fit(self, X, y):
        """
        Optimize rounding thresholds
        
        :param X: The raw predictions
        :param y: The ground truth labels
        """
        loss_partial = partial(self._kappa_loss, X=X, y=y)
        initial_coef = [0.5, 1.5, 2.5]
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead')

    def predict(self, X, coef):
        """
        Make predictions with specified thresholds
        
        :param X: The raw predictions
        :param coef: A list of coefficients that will be used for rounding
        """
        return pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3])


    def coefficients(self):
        """
        Return the optimized coefficients
        """
        return self.coef_['x']

In [20]:
feature_importance = list(zip(X_r.columns, list(model.feature_importances_)))
dt = pd.DataFrame(feature_importance,  columns=['feature' ,'importance'])
dt.sort_values(by=['importance'], ascending=False).head(10)

Unnamed: 0,feature,importance
402,installation_duration_time,90
4,true_record,76
9,acc_accuracy,63
348,4070,36
0,Clip,35
335,3120,34
5,false_record,33
336,3121,31
7,asses_time_mean,30
337,4020,29


In [21]:
%%time
pr_sum = np.zeros((y.shape))
for model in models:
    pr = model.predict(X_r)
    pr_sum += pr
pr1 = pr_sum / len(models)
optR = OptimizedRounder()
optR.fit(pr1.reshape(-1,), y)
coefficients = optR.coefficients()

CPU times: user 3.69 s, sys: 284 ms, total: 3.98 s
Wall time: 2.9 s


In [22]:
coefficients

array([0.96609687, 1.70854351, 2.23756297])

In [23]:
opt_preds = optR.predict(pr1.reshape(-1, ), coefficients)
qwk(y, opt_preds)

0.7167347829413321

In [24]:
%%time
pr_sum = np.zeros((test_predict.shape[0]))
for model in models:
    pr = model.predict(test_predict)
    pr_sum += pr
y_pred = pr_sum / len(models)

CPU times: user 724 ms, sys: 4 ms, total: 728 ms
Wall time: 475 ms


In [25]:
y_pred[y_pred <= coefficients[0]] = 0
y_pred[np.where(np.logical_and(y_pred > coefficients[0], y_pred <= coefficients[1]))] = 1
y_pred[np.where(np.logical_and(y_pred > coefficients[1], y_pred <= coefficients[2]))] = 2
y_pred[y_pred > coefficients[2]] = 3

In [26]:
sample_submission['accuracy_group'] = y_pred.astype(int)
sample_submission.to_csv('submission.csv', index=False)