In [9]:
import numpy as np
import pandas as pd
pd.set_option('max_columns', 150)
import os

from collections import Counter
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import KFold, StratifiedKFold
import tensorflow as tf

In [10]:
for _, _, files in os.walk('/input/data-science-bowl-2019'):
    print(files)

['sample_submission.csv', 'specs.csv', 'test.csv', 'train.csv', 'train_labels.csv']


In [11]:
def read_csv():
    BASE_PATH = '/input/data-science-bowl-2019/'
    #train
    print('Reading train.csv as DataFrame...')
    train = pd.read_csv(BASE_PATH + 'train.csv')
    print('Completed, train have {} columns, {} rows.'.format(train.shape[0], train.shape[1]))
    #train_lebels
    print('Reading train_labels.csv as DataFrame...')
    train_labels = pd.read_csv(BASE_PATH + 'train_labels.csv')
    print('Completed, train_labels have {} columns, {} rows.'.format(train_labels.shape[0], train_labels.shape[1]))
    #test
    print('Reading test.csv as DataFrame...')
    test = pd.read_csv(BASE_PATH + 'test.csv')
    print('Completed, test have {} columns, {} rows.'.format(test.shape[0], test.shape[1]))
    #specs
    print('Reading specs.csv as DataFrame...')
    specs = pd.read_csv(BASE_PATH + 'specs.csv')
    print('Completed, specs have {} columns, {} rows.'.format(specs.shape[0], specs.shape[1]))
    #sample_submission
    print('Reading sample_submission.csv as DataFrame...')
    sample_submission = pd.read_csv(BASE_PATH + 'sample_submission.csv')
    print('Completed, sample_submission have {} columns, {} rows.'.format(sample_submission.shape[0], sample_submission.shape[1]))
    
    return train, train_labels, test, specs, sample_submission

In [12]:
def feature_encoder(train, test):
    print('Encoding feature...')
    train['title_event_code'] = list(map(lambda x, y: str(x) + '_' + str(y), train['title'], train['event_code']))
    test['title_event_code'] = list(map(lambda x, y: str(x) + '_' + str(y), test['title'], test['event_code']))
    train['world_type'] = list(map(lambda x, y: str(x) + '_' + str(y), train['world'], train['type']))
    test['world_type'] = list(map(lambda x, y: str(x) + '_' + str(y), test['world'], test['type']))
    
    def encoder(feature):
        list_ = list(set(train[feature].unique()).union(set(test[feature].unique())))
        map_ = dict(zip(list_, np.arange(len(list_))))
        labels = dict(zip(np.arange(len(list_)), list_))
        return list_, map_, labels
    
    title_list, title_map, title_labels = encoder('title')
    world_list, world_map, world_labels = encoder('world')
    eventId_list, _, _ = encoder('event_id')
    eventCode_list, _, _ = encoder('event_code')
    title_eventCode_list, _, _ = encoder('title_event_code')
    world_type_list, _, _ = encoder('world_type')
    asses_title_list = list(set(train[train.type == 'Assessment']['title'].unique()).union(set(test[test.type == 'Assessment']['title'].unique())))

    train['title'] = train['title'].map(title_map)
    test['title'] = test['title'].map(title_map)
    train['world'] = train['world'].map(world_map)
    test['world'] = test['world'].map(world_map)
    print('Encoding completed.')
    return train, test, title_list, title_map, title_labels, world_list, world_map, world_labels, eventId_list, eventCode_list, title_eventCode_list, world_type_list, asses_title_list

In [13]:
def read_data(df, test_set=False):
    #count
    type_count = {'Clip': 0, 'Activity': 0, 'Assessment': 0, 'Game': 0}
    asses_count = {'acc_all': 0, 'acc_true': 0, 'acc_false': 0}
    group_count = {n: 0 for n in np.arange(4)}
    title_count = {ti: 0 for ti in title_list}
    world_count = {wo: 0 for wo in world_list}
    assesTrue_title_count = {at: 0 for at in asses_title_list}
    assesFalse_title_count = {af: 0 for af in asses_title_list}
    eventId_count = {ei: 0 for ei in eventId_list}
    eventCode_count = {ec: 0 for ec in eventCode_list}
    title_eventCode_count = {te: 0 for te in title_eventCode_list}
    world_type_count = {wt: 0 for wt in world_type_list}
    #time count
    title_time = {str(ti)+'_t': 0 for ti in title_list}
    world_time = {str(wo)+'_t': 0 for wo in world_list}
    
    features = []
    mean_time = []
    acc_time = 0
    assess_attempt_code = [4100, 4110]
    
    for i, sess in df.groupby('game_session', sort=False):
        title = sess.title.iloc[0]
        world = sess.world.iloc[0]
        sess_type = sess.type.iloc[0]
        time = int(sess.game_time.iloc[-1] / 1000)
        
        if sess_type != 'Assessment':
            acc_time += time
        
        if (sess_type == 'Assessment') & (test_set or len(sess) > 1):
            
            feature = type_count.copy()
            feature['installation_id'] = sess.installation_id.iloc[0]
            feature['title'] = title
            feature['true_record'] = assesTrue_title_count[title_labels[title]]
            feature['false_record'] = assesFalse_title_count[title_labels[title]]
            feature['acc_play_time'] = acc_time
            
            #time
            if mean_time == []:
                feature['asses_time_mean'] = 0
                feature['asses_time_std'] = 0
            else:
                feature['asses_time_mean'] = np.mean(mean_time)
                feature['asses_time_std'] = np.std(mean_time)
            mean_time.append((sess.game_time.iloc[-1] - sess.game_time.iloc[0])/1000)
                
            #accuracy
            attempt_all = sess[(sess.event_code == 4100) | (sess.event_code == 4110)]['event_data'].shape[0]
            attempt_true = sess[(sess.event_code == 4100) | (sess.event_code == 4110)]['event_data'].str.contains('true').sum()
            attempt_false = sess[(sess.event_code == 4100) | (sess.event_code == 4110)]['event_data'].str.contains('false').sum()
            accuracy_rate = attempt_true / attempt_all if attempt_all != 0 else 0
            if accuracy_rate == 0:
                feature['accuracy_group'] = 0
            elif accuracy_rate == 1:
                feature['accuracy_group'] = 3
            elif accuracy_rate == 0.5:
                feature['accuracy_group'] = 2
            else:
                feature['accuracy_group'] = 1
            feature['acc_accuracy'] = asses_count['acc_true'] / asses_count['acc_all'] if asses_count['acc_all'] != 0 else 0
            feature.update(group_count)
            group_count[feature['accuracy_group']] += 1
            feature.update(asses_count)
            asses_count['acc_all'] += attempt_all 
            asses_count['acc_true'] += attempt_true
            asses_count['acc_false'] += attempt_false
            assesTrue_title_count[title_labels[title]] = attempt_true
            assesFalse_title_count[title_labels[title]] = attempt_false
            
            #Update count
            feature.update(title_count)
            feature.update(world_count)
            feature.update(eventId_count)
            feature.update(eventCode_count)
            feature.update(title_eventCode_count)
            feature.update(world_type_count)
            
            variety_features = [('var_event_code', eventCode_count),
                                ('var_event_id', eventId_count),
                                ('var_title', title_count),
                                ('var_title_event_code', title_eventCode_count)]
            
            for name, dict_counts in variety_features:
                arr = np.array(list(dict_counts.values()))
                feature[name] = np.count_nonzero(arr)
            
            #Update acctime
            feature.update(title_time)
            feature.update(world_time)
            
            if test_set:
                features = feature
            else:
                features.append(feature)
                
        #count
        type_count[sess_type] += 1
        title_count[title_labels[title]] += 1
        world_count[world_labels[world]] += 1
            
        def count_updater(dic ,col):
            num_of_counter = Counter(sess[col])
            for i in num_of_counter.keys():
                dic[i] += num_of_counter[i]
                
        count_updater(eventId_count, 'event_id')
        count_updater(eventCode_count, 'event_code')
        count_updater(title_eventCode_count, 'title_event_code')
        count_updater(world_type_count, 'world_type')
        
        #acctime
        title_time[str(title_labels[title]) + '_t'] += time
        world_time[str(world_labels[world]) + '_t'] += time
    
    return features

In [14]:
def get_train_test(train, test):
    all_train = []
    all_test = []
    print('Getting train_encoder data...')
    for ins_id, data in tqdm(train.groupby('installation_id', sort=False), total=train.installation_id.nunique(), desc='installation_id'):
        all_train += read_data(data)
    print('Completed of train_en.')
    print('Getting test_encoder data...')
    for ins_id, data in tqdm(test.groupby('installation_id', sort=False), total=test.installation_id.nunique(), desc='installation_id'):
        tt = read_data(data, test_set=True)
        all_test.append(tt)
    print('Completed of test_en.')
    train_en = pd.DataFrame(all_train)
    test_en = pd.DataFrame(all_test)
    return train_en, test_en

In [15]:
def preprocess(train_en, test_en):
    for df in [train_en, test_en]:
        df['installation_session_count'] = df.groupby('installation_id')['Clip'].transform('count')
        df['installation_duration_time'] = df.groupby('installation_id')['asses_time_mean'].transform('mean')
        df['installation_title_nunique'] = df.groupby('installation_id')['title'].transform('nunique')
        
    return train_en, test_en

In [16]:
train, train_labels, test, specs, sample_submission = read_csv()

Reading train.csv as DataFrame...
Completed, train have 11341042 columns, 11 rows.
Reading train_labels.csv as DataFrame...
Completed, train_labels have 17690 columns, 7 rows.
Reading test.csv as DataFrame...
Completed, test have 1156414 columns, 11 rows.
Reading specs.csv as DataFrame...
Completed, specs have 386 columns, 3 rows.
Reading sample_submission.csv as DataFrame...
Completed, sample_submission have 1000 columns, 2 rows.


In [17]:
keep_id = train[train.type == "Assessment"][['installation_id']].drop_duplicates()
train = pd.merge(train, keep_id, on="installation_id", how="inner")
print(f'remove installation_id which never made assessment')
print(f'reduce train to: {train.shape[0]} columns')

remove installation_id which never made assessment
reduce train to: 8294138 columns


In [18]:
train, test, title_list, title_map, title_labels, world_list, world_map, world_labels, eventId_list, eventCode_list, title_eventCode_list, world_type_list, asses_title_list = feature_encoder(train, test)
train_en, test_en = get_train_test(train, test)

Encoding feature...
Encoding completed.
Getting train_encoder data...


HBox(children=(IntProgress(value=0, description='installation_id', max=4242, style=ProgressStyle(description_w…


Completed of train_en.
Getting test_encoder data...


HBox(children=(IntProgress(value=0, description='installation_id', max=1000, style=ProgressStyle(description_w…


Completed of test_en.


In [19]:
train_en, test_en = preprocess(train_en, test_en)

In [20]:
features = train_en.loc[(train_en.sum(axis=1) != 0), (train_en.sum(axis=0) != 0)].columns
features = [col for col in features if col not in ['accuracy_group', 'installation_id']]
categoricals = ['session_title']

In [21]:
counter = 0
to_remove = []
for feat_1 in features:
    for feat_2 in features:
        if feat_1 != feat_2 and feat_1 not in to_remove and feat_2 not in to_remove:
            corr = np.corrcoef(train_en[feat_1], train_en[feat_2])[0][1]
            if corr > 0.995:
                counter += 1
                to_remove.append(feat_2)
                print('{}: FEAT_1: {} FEAT_2: {} - Correlation: {}'.format(counter, feat_1, feat_2, corr))

1: FEAT_1: Clip FEAT_2: 27253bdc - Correlation: 1.0
2: FEAT_1: Bird Measurer (Assessment) FEAT_2: f56e0afc - Correlation: 1.0
3: FEAT_1: Bird Measurer (Assessment) FEAT_2: Bird Measurer (Assessment)_2000 - Correlation: 1.0
4: FEAT_1: Happy Camel FEAT_2: d9c005dd - Correlation: 1.0
5: FEAT_1: Happy Camel FEAT_2: Happy Camel_2000 - Correlation: 1.0
6: FEAT_1: Crystal Caves - Level 3 FEAT_2: Crystal Caves - Level 3_2000 - Correlation: 0.9999999999999999
7: FEAT_1: Crystal Caves - Level 2 FEAT_2: Crystal Caves - Level 2_2000 - Correlation: 0.9999999999999999
8: FEAT_1: Mushroom Sorter (Assessment) FEAT_2: db02c830 - Correlation: 0.9999988147664092
9: FEAT_1: Mushroom Sorter (Assessment) FEAT_2: 3bfd1a65 - Correlation: 1.0
10: FEAT_1: Mushroom Sorter (Assessment) FEAT_2: Mushroom Sorter (Assessment)_2025 - Correlation: 0.9999988147664092
11: FEAT_1: Mushroom Sorter (Assessment) FEAT_2: Mushroom Sorter (Assessment)_2000 - Correlation: 1.0
12: FEAT_1: Flower Waterer (Activity) FEAT_2: 9b01374

100: FEAT_1: b5053438 FEAT_2: d3268efa - Correlation: 0.999484815123522
101: FEAT_1: b5053438 FEAT_2: 28520915 - Correlation: 0.9990950947374551
102: FEAT_1: b5053438 FEAT_2: Cauldron Filler (Assessment)_3021 - Correlation: 0.999484815123522
103: FEAT_1: b5053438 FEAT_2: Cauldron Filler (Assessment)_2030 - Correlation: 0.9990950947374551
104: FEAT_1: b5053438 FEAT_2: Cauldron Filler (Assessment)_3121 - Correlation: 0.9999999999999998
105: FEAT_1: cc5087a3 FEAT_2: Crystals Rule_4010 - Correlation: 0.9999999999999998
106: FEAT_1: 4c2ec19f FEAT_2: Egg Dropper (Activity)_4025 - Correlation: 1.0
107: FEAT_1: 709b1251 FEAT_2: e3ff61fb - Correlation: 0.999572524598253
108: FEAT_1: 709b1251 FEAT_2: 7961e599 - Correlation: 0.9957962554268273
109: FEAT_1: 709b1251 FEAT_2: Dino Dive_3121 - Correlation: 1.0
110: FEAT_1: 709b1251 FEAT_2: Dino Dive_3021 - Correlation: 0.999572524598253
111: FEAT_1: 709b1251 FEAT_2: Dino Dive_2020 - Correlation: 0.9957962554268273
112: FEAT_1: 9d29771f FEAT_2: 3dfd4a

204: FEAT_1: a592d54e FEAT_2: Pan Balance_2030 - Correlation: 0.9968474912288281
205: FEAT_1: a592d54e FEAT_2: Pan Balance_2020 - Correlation: 1.0
206: FEAT_1: a592d54e FEAT_2: Pan Balance_3121 - Correlation: 0.9965233238739293
207: FEAT_1: 84b0e0c8 FEAT_2: ea321fb1 - Correlation: 0.9992626433101821
208: FEAT_1: 84b0e0c8 FEAT_2: Chicken Balancer (Activity)_3110 - Correlation: 0.9999999999999999
209: FEAT_1: 84b0e0c8 FEAT_2: Chicken Balancer (Activity)_3010 - Correlation: 0.9992626433101821
210: FEAT_1: 3a4be871 FEAT_2: Flower Waterer (Activity)_4080 - Correlation: 1.0
211: FEAT_1: 17113b36 FEAT_2: e37a2b78 - Correlation: 0.9981774724155311
212: FEAT_1: 17113b36 FEAT_2: ad2fc29c - Correlation: 0.9988381596234415
213: FEAT_1: 17113b36 FEAT_2: Bird Measurer (Assessment)_4110 - Correlation: 1.0
214: FEAT_1: 17113b36 FEAT_2: Bird Measurer (Assessment)_3020 - Correlation: 0.9988381596234415
215: FEAT_1: 17113b36 FEAT_2: Bird Measurer (Assessment)_3120 - Correlation: 0.9981774724155311
216: F

307: FEAT_1: beb0a7b9 FEAT_2: Fireworks (Activity)_3110 - Correlation: 0.9998524462332803
308: FEAT_1: f54238ee FEAT_2: Fireworks (Activity)_4090 - Correlation: 1.0
309: FEAT_1: 7040c096 FEAT_2: Scrub-A-Dub_4010 - Correlation: 1.0
310: FEAT_1: 53c6e11a FEAT_2: Leaf Leader_2075 - Correlation: 1.0
311: FEAT_1: ecc6157f FEAT_2: Cart Balancer (Assessment)_4080 - Correlation: 1.0
312: FEAT_1: f32856e4 FEAT_2: Leaf Leader_2020 - Correlation: 1.0
313: FEAT_1: 0d18d96c FEAT_2: Mushroom Sorter (Assessment)_4035 - Correlation: 0.9999999999999999
314: FEAT_1: c58186bf FEAT_2: Sandcastle Builder (Activity)_4035 - Correlation: 1.0
315: FEAT_1: 90efca10 FEAT_2: Bottle Filler (Activity)_4020 - Correlation: 1.0
316: FEAT_1: f71c4741 FEAT_2: f7e47413 - Correlation: 0.9999435624173822
317: FEAT_1: f71c4741 FEAT_2: Scrub-A-Dub_3010 - Correlation: 1.0
318: FEAT_1: f71c4741 FEAT_2: Scrub-A-Dub_3110 - Correlation: 0.9999435624173822
319: FEAT_1: d02b7a8e FEAT_2: All Star Sorting_4035 - Correlation: 0.999999

413: FEAT_1: 262136f4 FEAT_2: Leaf Leader_4020 - Correlation: 1.0
414: FEAT_1: 4074bac2 FEAT_2: Pan Balance_2010 - Correlation: 1.0
415: FEAT_1: 19967db1 FEAT_2: Chow Time_4090 - Correlation: 1.0
416: FEAT_1: 611485c5 FEAT_2: Fireworks (Activity)_4080 - Correlation: 0.9999999999999999
417: FEAT_1: b1d5101d FEAT_2: All Star Sorting_4095 - Correlation: 1.0
418: FEAT_1: bc8f2793 FEAT_2: Pan Balance_4035 - Correlation: 1.0
419: FEAT_1: b80e5e84 FEAT_2: 7ab78247 - Correlation: 0.9997119008494356
420: FEAT_1: b80e5e84 FEAT_2: Egg Dropper (Activity)_3010 - Correlation: 0.9997119008494356
421: FEAT_1: b80e5e84 FEAT_2: Egg Dropper (Activity)_3110 - Correlation: 0.9999999999999998
422: FEAT_1: 532a2afb FEAT_2: Cauldron Filler (Assessment)_2020 - Correlation: 1.0
423: FEAT_1: 33505eae FEAT_2: 2a512369 - Correlation: 0.9994595194498185
424: FEAT_1: 33505eae FEAT_2: Leaf Leader_3110 - Correlation: 0.9994595194498185
425: FEAT_1: 33505eae FEAT_2: Leaf Leader_3010 - Correlation: 0.9999999999999998
42

518: FEAT_1: 0db6d71d FEAT_2: Chest Sorter (Assessment)_4020 - Correlation: 0.9999999999999998
519: FEAT_1: 67aa2ada FEAT_2: Leaf Leader_4090 - Correlation: 1.0
520: FEAT_1: 91561152 FEAT_2: Cauldron Filler (Assessment)_4025 - Correlation: 1.0
521: FEAT_1: 3b2048ee FEAT_2: Leaf Leader_4095 - Correlation: 0.9999999999999998
522: FEAT_1: acf5c23f FEAT_2: Cart Balancer (Assessment)_4070 - Correlation: 1.0
523: FEAT_1: 51102b85 FEAT_2: Bird Measurer (Assessment)_4030 - Correlation: 1.0
524: FEAT_1: 795e4a37 FEAT_2: Cart Balancer (Assessment)_3010 - Correlation: 1.0
525: FEAT_1: 99ea62f3 FEAT_2: Bubble Bath_2083 - Correlation: 1.0
526: FEAT_1: 6043a2b4 FEAT_2: All Star Sorting_4090 - Correlation: 1.0
527: FEAT_1: 6aeafed4 FEAT_2: Bubble Bath_4090 - Correlation: 1.0
528: FEAT_1: 8af75982 FEAT_2: Happy Camel_4020 - Correlation: 1.0
529: FEAT_1: 3110 FEAT_2: 3010 - Correlation: 0.9998806305811936
530: FEAT_1: 3120 FEAT_2: 3020 - Correlation: 0.9998524456510475
531: FEAT_1: 3121 FEAT_2: 3021 - 

In [22]:
y = train_en['accuracy_group']
X = train_en.drop(columns=(to_remove + ['accuracy_group', 'installation_id']), axis=1)
test_predict = test_en.drop(columns=(to_remove + ['accuracy_group', 'installation_id']), axis=1)

In [25]:
def NN_model(X_train, y_train, X_val, y_val):
    
    model = tf.keras.models.Sequential([
        tf.keras.layers.Input(shape=(X_train.shape[1],)),
        tf.keras.layers.Dense(200, activation='relu'),
        tf.keras.layers.LayerNormalization(),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(100, activation='relu'),
        tf.keras.layers.LayerNormalization(),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(50, activation='relu'),
        tf.keras.layers.LayerNormalization(),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(25, activation='relu'),
        tf.keras.layers.LayerNormalization(),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(1, activation='relu')
    ])
    
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=4e-4),
                  loss='mse')
    print(model.summary())
    save_best = tf.keras.callbacks.ModelCheckpoint('nn_model.w8', save_weights_only=True,
                                                      save_best_only=True, verbose=1)
    early_stop = tf.keras.callbacks.EarlyStopping(patience=20)
    model.fit(X_train, y_train,
              validation_data=(X_val, y_val),
              epochs=100,
              callbacks=[save_best, early_stop],
              verbose=100)
    return model

In [26]:
n_splits = 5
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=46)
models = []
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    model = NN_model(X_train, y_train, X_val, y_val)
    models.append(model)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_5 (Dense)              (None, 200)               85200     
_________________________________________________________________
layer_normalization_4 (Layer (None, 200)               400       
_________________________________________________________________
dropout_4 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 100)               20100     
_________________________________________________________________
layer_normalization_5 (Layer (None, 100)               200       
_________________________________________________________________
dropout_5 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 50)               

_________________________________________________________________
layer_normalization_10 (Laye (None, 50)                100       
_________________________________________________________________
dropout_10 (Dropout)         (None, 50)                0         
_________________________________________________________________
dense_13 (Dense)             (None, 25)                1275      
_________________________________________________________________
layer_normalization_11 (Laye (None, 25)                50        
_________________________________________________________________
dropout_11 (Dropout)         (None, 25)                0         
_________________________________________________________________
dense_14 (Dense)             (None, 1)                 26        
Total params: 112,401
Trainable params: 112,401
Non-trainable params: 0
_________________________________________________________________
None
Train on 16951 samples, validate on 4239 samples
Epoch 1/100

Epo

Epoch 00009: val_loss improved from 1.55995 to 1.55897, saving model to nn_model.w8
Epoch 10/100

Epoch 00010: val_loss improved from 1.55897 to 1.53430, saving model to nn_model.w8
Epoch 11/100

Epoch 00011: val_loss did not improve from 1.53430
Epoch 12/100

Epoch 00012: val_loss improved from 1.53430 to 1.52917, saving model to nn_model.w8
Epoch 13/100

Epoch 00013: val_loss improved from 1.52917 to 1.51918, saving model to nn_model.w8
Epoch 14/100

Epoch 00014: val_loss improved from 1.51918 to 1.51757, saving model to nn_model.w8
Epoch 15/100

Epoch 00015: val_loss did not improve from 1.51757
Epoch 16/100

Epoch 00016: val_loss improved from 1.51757 to 1.51241, saving model to nn_model.w8
Epoch 17/100

Epoch 00017: val_loss did not improve from 1.51241
Epoch 18/100

Epoch 00018: val_loss did not improve from 1.51241
Epoch 19/100

Epoch 00019: val_loss did not improve from 1.51241
Epoch 20/100

Epoch 00020: val_loss improved from 1.51241 to 1.49350, saving model to nn_model.w8
Epo

KeyboardInterrupt: 

In [None]:
pred_ensemble = np.zeros((test_predict.shape[0], 1))
for model in models:
    pred = model.predict(test_predict, verbose=0)
    pred_ensemble += pred
final_pred = pred_ensemble / len(models)

In [None]:
dist = Counter(train_en['accuracy_group'])
for k in dist:
    dist[k] /= len(train_en)
train_en['accuracy_group'].hist()

acum = 0
bound = {}
for i in range(3):
    acum += dist[i]
    bound[i] = np.percentile(final_pred, acum * 100)
print(bound)

def classify(x):
    if x <= bound[0]:
        return 0
    elif x <= bound[1]:
        return 1
    elif x <= bound[2]:
        return 2
    else:
        return 3
    
final_pred = np.array(list(map(classify, final_pred)))

sample_submission['accuracy_group'] = final_pred.astype(int)
sample_submission.to_csv('submission.csv', index=False)
sample_submission['accuracy_group'].value_counts(normalize=True)