In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # data vizualisation
import seaborn as sns # data vizualisation

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Any results you write to the current directory are saved as output.

In [None]:
root = '/kaggle/input/data-science-bowl-2019/'


def _load_csv(csv_name):
    """Read `csv_name` from `root`, printing a one-line progress message around the read."""
    print('Reading {} file ...'.format(csv_name), end='')
    frame = pd.read_csv(root + csv_name)
    print(' Done!')
    return frame


train = _load_csv('train.csv')  # training data
train_labels = _load_csv('train_labels.csv')
test = _load_csv('test.csv')
specs = _load_csv('specs.csv')
sample_sub = _load_csv('sample_submission.csv')

print('All data imported')

In [None]:
print('\t Shapes:')

# (label, frame) pairs; labels keep their original padding so the printed lines are unchanged.
shape_report = [
    ('train.csv \t', train),
    ('train_labels.csv', train_labels),
    ('test.csv \t', test),
    ('specs.csv \t', specs),
    ('sample_sub.csv \t', sample_sub),
]
for label, frame in shape_report:
    n_rows, n_cols = frame.shape
    print('{} - {} rows and {} columns'.format(label, n_rows, n_cols))

What variables do we have?

In [None]:
train.head()

These are the main data files which contain the gameplay events.

* `event_id` - Randomly generated unique identifier for the event type. Maps to `event_id` column in **specs** table.
* `game_session` - Randomly generated unique identifier grouping events within a single game or video play session.
* `timestamp` - Client-generated datetime 
* `event_data` - Semi-structured JSON formatted string containing the events parameters. Default fields are: `event_count`, `event_code`, and `game_time`; otherwise fields are determined by the event type.
* `installation_id` - Randomly generated unique identifier grouping game sessions within a single installed application instance.
* `event_count` - Incremental counter of events within a game session (offset at 1). Extracted from `event_data`.
* `event_code` - Identifier of the event 'class'. Unique per game, but may be duplicated across games. E.g. event code '2000' always identifies the 'Start Game' event for all games. Extracted from `event_data`.
* `game_time` - Time in milliseconds since the start of the game session. Extracted from `event_data`.
* `title` - Title of the game or video.
* `type` - Media type of the game or video. Possible values are: 'Game', 'Assessment', 'Activity', 'Clip'.
* `world` - The section of the application the game or video belongs to. Helpful to identify the educational curriculum goals of the media. Possible values are: 'NONE' (at the app's start screen), 'TREETOPCITY' (Length/Height), 'MAGMAPEAK' (Capacity/Displacement), 'CRYSTALCAVES' (Weight).

In [None]:
pd.to_datetime(train['timestamp'].head())

In [None]:
train_labels.head()

In [None]:
test.head()

**train_labels** is a transformation of the **train** data, on which we can train our models.
It seems that, in order to create a proper **train_labels** data frame, all the information we need to extract from **train** is in the `event_data` column.

The outcomes in this competition are grouped into 4 groups (labeled `accuracy_group` in the data):
* 3 : the assessment was solved on the first attempt     : `accuracy` = 1.0
* 2 : the assessment was solved on the second attempt    : `accuracy` = 0.5
* 1 : the assessment was solved after 3 or more attempts : 0 < `accuracy` < 0.5
* 0 : the assessment was never solved                    : `accuracy` = 0.0


What the submission should look like:

In [None]:
sample_sub.head()

Because the training data is so large, we will take a random sample of it for plotting. Sampling at random speeds up plotting and should still give us a good view of the data's format.

In [None]:
# Fixed seed so the sample (and everything derived from it below) is identical on every
# Restart & Run All; `min` keeps the call valid if `train` ever has fewer than 100000 rows.
train_ = train.sample(n=min(100000, len(train)), random_state=42)

Now we have to find how to create the variables `num_correct` and `num_incorrect` in order to create `accuracy` and then `accuracy_group`.

For that, whenever ***"correct":true*** appears in the column `event_data`, it means that the player succeeded at the current event (recognizable by `event_id`). In the same way, whenever ***"correct":false*** appears in the column `event_data`, it means that the player failed the current event.

So, we're looking for the number of appearances of ***"correct":true*** and ***"correct":false*** in the same `game_session`.

In [None]:
def make_labels(data):
    """
    Build per-session assessment labels from raw gameplay events.

    Input : Data in the same shape as train.csv. The columns `event_data`,
            `game_session`, `installation_id` and `title` are read.
    Output : Data in the same shape as train_labels.csv, with the columns
             `game_session`, `num_correct`, `num_incorrect`, `installation_id`,
             `title`, `accuracy` and `accuracy_group`. Only sessions with at
             least one correct or incorrect assessment attempt are returned.
    """
    event_data = data['event_data']

    # Which rows concern assessment attempts (event code 4100 or 4110)?
    # Plain substring matching (regex=False); rows with a missing payload count as "no match".
    is_attempt = (
        event_data.str.contains('event_code":4100', regex=False, na=False)
        | event_data.str.contains('event_code":4110', regex=False, na=False)
    )
    # Which rows report a correct / an incorrect answer?
    is_correct = event_data.str.contains('correct":true', regex=False, na=False)
    is_incorrect = event_data.str.contains('correct":false', regex=False, na=False)

    # Count attempts per session. Restricting to `is_attempt` keeps "correct" flags
    # emitted by non-assessment events out of the labels.
    num_correct = data[is_attempt & is_correct].groupby('game_session').size().rename('num_correct')
    num_incorrect = data[is_attempt & is_incorrect].groupby('game_session').size().rename('num_incorrect')

    # Outer-align the two counters: a session missing from one side gets 0 there.
    labels_ = (
        pd.concat([num_correct, num_incorrect], axis=1)
        .fillna(0)
        .astype(int)
        .rename_axis('game_session')
        .reset_index()
    )

    # Attach installation/title taken from the frame that was passed in (not from a global),
    # de-duplicated first so each session contributes one row per distinct combination.
    session_info = data[['installation_id', 'game_session', 'title']].drop_duplicates()
    labels_ = labels_.merge(session_info, how='inner', on='game_session')

    labels_["accuracy"] = labels_["num_correct"] / (labels_["num_correct"] + labels_["num_incorrect"])

    # 0 -> never solved, (0, 0.5) -> 1, [0.5, 0.9) -> 2, otherwise 3.
    accuracy = labels_["accuracy"].to_numpy()
    labels_["accuracy_group"] = np.select(
        [accuracy == 0, accuracy < 0.5, accuracy < 0.9],
        [0, 1, 2],
        default=3,
    )

    return labels_

In [None]:
# Assessment rows carry event code 4100 or 4110 (the same pair make_labels() looks for);
# the second test previously repeated 4100, so 4110 rows were never selected.
mask_assessment = train_['event_data'].str.contains('event_code":4100') | train_['event_data'].str.contains('event_code":4110')
# Among the assessment rows, which ones report a correct answer?
mask_correct = train_.loc[mask_assessment,'event_data'].str.contains('correct":true')

In [None]:
train_.loc[mask_assessment, 'event_data'].str.contains('correct":true')

In [None]:
# Per-session counts of sampled events whose payload reports a correct / an incorrect answer.
# Kept as one-column DataFrames because the label-building cell further down calls `.merge`
# on `num_correct` and also needs `num_incorrect`. (`pd.DataFrame('')` raised a ValueError.)
num_correct = (
    train_[train_['event_data'].str.contains('correct":true')]
    .groupby('game_session')['event_id']
    .count()
    .rename('num_correct')
    .to_frame()
)
num_incorrect = (
    train_[train_['event_data'].str.contains('correct":false')]
    .groupby('game_session')['event_id']
    .count()
    .rename('num_incorrect')
    .to_frame()
)

In [None]:
train[train['event_data'].str.contains('correct":true')].shape

In [None]:
train_labels.shape

In [None]:
len(train['game_session'].unique())

In [None]:
train_[train_['event_data'].str.contains('true')]

train_[train_['event_data'].str.contains('true')]          - 

In [None]:
labels_ = make_labels(train_)
labels_['num_correct'].unique()

In [None]:
# Inline, step-by-step version of the label construction that make_labels() wraps in a function.
# NOTE(review): this calls `.merge` on `num_correct`, so both counters must already exist as
# DataFrames when the cell runs — confirm they are built earlier on a fresh-kernel run.
labels_ = pd.DataFrame(num_correct.merge(num_incorrect, how='outer', left_on=num_correct.index, right_on=num_incorrect.index)).fillna(0)
# Presumably 'key_0' is the join-key column produced by merging on the two indexes, and the
# '_x'/'_y' names only occur when both inputs share the column name 'num_correct' — TODO confirm.
labels_ = labels_.rename(columns={'key_0':'game_session', 'num_correct_x':'num_correct', 'num_correct_y':'num_incorrect'})
# Share of correct answers among all counted answers of the session.
labels_["accuracy"] = labels_["num_correct"]/(labels_["num_correct"]+labels_["num_incorrect"])
# 0 -> group 0, below 0.5 -> group 1, below 0.9 -> group 2, otherwise group 3.
labels_["accuracy_group"] = labels_["accuracy"].apply(lambda x: 0 if x==0 else (1 if x<0.5 else (2 if x<0.9 else 3)))
# NOTE(review): unlike make_labels(), there is no drop_duplicates() after joining `train_`,
# so a session may be repeated once per matching row of `train_` — confirm this is intended.
labels_ = labels_.merge(train_[['installation_id', 'game_session', 'title']], how='inner', left_on='game_session', right_on='game_session')
labels_

In [None]:
train_[['installation_id', 'title']].head()

In [None]:
# Row counts of the correct-answer counter and of the assembled label table.
print('\t Lengths:')
print('num_correct   - {} rows'.format(len(num_correct)))
print('labels_   - {} rows'.format(len(labels_)))

In [None]:
num_correct.index

In [None]:
num_correct.head()

In [None]:
# Flag the sampled events whose payload reports a correct / an incorrect answer ...
has_correct = train_['event_data'].str.contains('correct":true')
has_incorrect = train_['event_data'].str.contains('correct":false')

# ... then count them per game session, keeping the first count column as a Series.
num_correct = train_.loc[has_correct].groupby('game_session').count().iloc[:, 0]
num_incorrect = train_.loc[has_incorrect].groupby('game_session').count().iloc[:, 0]

In [None]:
# Number of sessions with at least one correct / incorrect answer in the sample.
print('\t Lengths:')
for label, counter in (('num_correct', num_correct), ('num_incorrect', num_incorrect)):
    print('{:<13} - {} rows'.format(label, counter.shape[0]))

In [None]:
train_

In [None]:
pd.merge(train_, num_correct, on='game_session')

In [None]:
# Fit the regression model used for the submission.
# NOTE(review): MainTransformer, FeatureTransformer, RegressorModel, LGBWrapper_regr,
# reduce_train, y, folds, params and cols_to_drop are not defined in the cells visible here —
# confirm where they come from; on a fresh kernel this cell cannot run without them.
mt = MainTransformer()
ft = FeatureTransformer()
transformers = {'ft': ft}
regressor_model1 = RegressorModel(model_wrapper=LGBWrapper_regr())
# 'cappa' is passed through verbatim as the metric name — presumably a key registered inside
# RegressorModel; verify the spelling against that class.
regressor_model1.fit(X=reduce_train, y=y, folds=folds, params=params, preprocesser=mt, transformers=transformers,
                    eval_metric='cappa', cols_to_drop=cols_to_drop)

In [None]:
from functools import partial
import scipy as sp
class OptimizedRounder(object):
    """
    An optimizer for rounding thresholds
    to maximize Quadratic Weighted Kappa (QWK) score
    # https://www.kaggle.com/naveenasaithambi/optimizedrounder-improved

    Typical use: ``fit(raw_predictions, labels)`` and then ``predict(raw_predictions)``.
    """
    def __init__(self):
        # Placeholder until fit() stores the optimizer result here.
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        """
        Get the loss (negative QWK) obtained with the current coefficients

        :param coef: A list of coefficients that will be used for rounding
        :param X: The raw predictions
        :param y: The ground truth labels
        :return: ``-qwk(y, rounded_predictions)``, so that minimizing it maximizes QWK
        """
        X_p = self.predict(X, coef)

        # NOTE(review): `qwk` is not defined in this class — it must exist as a global
        # callable taking (y_true, y_pred); confirm it is defined before fit() is called.
        return -qwk(y, X_p)

    def fit(self, X, y):
        """
        Optimize rounding thresholds

        The result object returned by the optimizer is stored in ``self.coef_``.

        :param X: The raw predictions
        :param y: The ground truth labels
        """
        # Import the submodule explicitly: `import scipy as sp` alone is not guaranteed
        # to make `sp.optimize` available on every SciPy version.
        from scipy import optimize

        loss_partial = partial(self._kappa_loss, X=X, y=y)
        initial_coef = [0.5, 1.5, 2.5]
        self.coef_ = optimize.minimize(loss_partial, initial_coef, method='nelder-mead')

    def predict(self, X, coef=None):
        """
        Make predictions with specified thresholds

        :param X: The raw predictions
        :param coef: A list of coefficients that will be used for rounding
            (sorted internally); when omitted, the thresholds found by fit() are used
        :return: The result of ``pd.cut`` with labels 0, 1, 2, 3
        :raises TypeError: if ``coef`` is omitted and fit() has not been called
        """
        if coef is None:
            coef = self.coefficients()
        return pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3])


    def coefficients(self):
        """
        Return the optimized coefficients

        :raises TypeError: if fit() has not been called yet
        """
        try:
            return self.coef_['x']
        except TypeError:
            # Same exception type as before, but with an actionable message.
            raise TypeError('OptimizedRounder is not fitted yet: call fit() before using the coefficients') from None

In [None]:
%%time
# Raw predictions on the training features.
pr1 = regressor_model1.predict(reduce_train)

# Search the rounding thresholds on the flattened predictions and keep the result.
optR = OptimizedRounder()
optR.fit(pr1.reshape(-1), y)
coefficients = optR.coefficients()

In [None]:
# Round the flattened training predictions with the fitted thresholds and score them.
opt_preds = optR.predict(pr1.reshape(-1), coefficients)
qwk(y, opt_preds)

In [None]:
# some coefficients calculated by me.
low_cut, mid_cut, high_cut = 1.12232214, 1.73925866, 2.22506454

pr1 = regressor_model1.predict(reduce_test)

# Overwrite the raw predictions in place, bucket by bucket, lowest bucket first.
pr1[pr1 <= low_cut] = 0
pr1[(pr1 > low_cut) & (pr1 <= mid_cut)] = 1
pr1[(pr1 > mid_cut) & (pr1 <= high_cut)] = 2
pr1[pr1 > high_cut] = 3

In [None]:
# The sample submission is loaded as `sample_sub` in the data-loading cell; work on a copy so
# the loaded frame stays untouched and `sample_submission` is actually defined here.
# NOTE(review): assumes the rows of `pr1` are in the same order as the sample submission — confirm.
sample_submission = sample_sub.copy()
sample_submission['accuracy_group'] = pr1.astype(int)
sample_submission.to_csv('submission.csv', index=False)

# Share of each predicted accuracy group in the submission.
sample_submission['accuracy_group'].value_counts(normalize=True)