# Hackathon

Some utilities

## Import Utils

In [None]:
!pip install keras
!pip install tables

In [1]:
import keras
from keras.metrics import top_k_categorical_accuracy
import h5py as h5
import numpy as np
import pandas as pd
import tables

# HDF5 file read by get_idxs() and generator() for training/validation.
PATH_DATA = '/notebooks/data/balanced_7classes.h5'
# HDF5 file read by prediction_generator() to produce the submission.
PATH_PREDICT_WITHOUT_GT = '/notebooks/data/pred_students/pred_from_half/pred_eighties_from_half_1_without_gt.h5'
# CSV file the final predictions are written to.
PATH_SUBMIT = '/notebooks/ChallengeHacka/pred_eighties_from_half_1_gr6.csv'
# PATH_PREDICT_WITH_GT = '/pred_teachers/pred_eighties_from_half_1.h5'

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Number of samples per batch, shared by the training, validation and prediction generators.
BATCH_SIZE = 32
from keras.models import Sequential
from keras.layers import Dense, Flatten, Conv2D, BatchNormalization, Activation, MaxPooling2D, Dropout
import keras.layers.normalization 
from keras.callbacks import Callback
%load_ext autoreload
%autoreload 2

In [3]:
def get_idxs(h5_path):
    """Return the sample indices available in an HDF5 file.

    Parameters
    ----------
    h5_path : str
        Path to an HDF5 file that contains an 'S2' dataset.

    Returns
    -------
    range
        ``range(n)`` where ``n`` is the length of the 'S2' dataset.
    """
    # Open explicitly read-only and release the handle as soon as the length
    # is known; the previous version left a handle open in h5py's default mode.
    with h5.File(h5_path, 'r') as f:
        return range(len(f['S2']))

def shuffle_idx(sample_idxs):
    """Return the given indices as a new list in random order.

    The input is not modified; randomness comes from numpy's global RNG.
    """
    return [idx for idx in np.random.permutation(sample_idxs)]

def split_train_val(sample_idxs, proportion):
    """Split a sequence into a leading train part and a trailing validation part.

    ``proportion`` is the fraction of samples that goes to the validation part;
    the cut position is truncated towards zero.
    """
    cut = int((1. - proportion) * len(sample_idxs))
    train_part = sample_idxs[:cut]
    val_part = sample_idxs[cut:]
    return train_part, val_part

def get_batch_count(idxs, batch_size):
    """Return how many batches of ``batch_size`` are needed to cover ``idxs``.

    A trailing partial batch counts as one extra batch.
    """
    full_batches, leftover = divmod(len(idxs), batch_size)
    return int(full_batches) + (1 if leftover > 0 else 0)

In [4]:
# Raw TOP_LANDCOVER code -> contiguous class index fed to the network.
_LANDCOVER_TO_CLASS = {1: 0, 2: 1, 3: 2, 5: 3, 10: 4, 12: 5, 19: 6}


def _encode_landcover(raw_labels):
    """Map raw TOP_LANDCOVER codes to class indices 0..6, keeping the input shape."""
    raw_labels = np.asarray(raw_labels)
    encoded = np.full(raw_labels.shape, -1, dtype=np.int64)
    for code, class_idx in _LANDCOVER_TO_CLASS.items():
        encoded[raw_labels == code] = class_idx
    unknown = np.unique(raw_labels[encoded < 0])
    if unknown.size:
        # Fail loudly: an unmapped code would otherwise be one-hot encoded as
        # the wrong class (or crash later inside to_categorical).
        raise ValueError('value class out of range : %s' % unknown.tolist())
    return encoded


def generator(h5_path, batch_size, idxs):
    """Endlessly yield shuffled ``(X, Y)`` training batches from an HDF5 file.

    Parameters
    ----------
    h5_path : str
        Path to an HDF5 file with 'S2' (inputs) and 'TOP_LANDCOVER' (labels)
        datasets.
    batch_size : int
        Maximum number of samples per batch; the last batch of a pass may be
        smaller.
    idxs : sequence of int
        Row indices this generator is allowed to draw from. They must be
        unique, because h5py only accepts strictly increasing index lists.

    Yields
    ------
    tuple
        ``(X, Y)`` where ``X`` holds the 'S2' rows of the batch and ``Y`` is
        the one-hot encoding (7 classes) of the matching labels.

    Raises
    ------
    ValueError
        On first iteration if ``idxs`` is empty, or when a label is not one
        of the known land-cover codes.
    """
    idxs = list(idxs)
    if not idxs:
        # With no indices the loop below would spin forever without yielding.
        raise ValueError('generator needs at least one sample index')
    batch_count = get_batch_count(idxs, batch_size)

    # The context manager closes the file when the generator is closed or
    # garbage-collected.
    with h5.File(h5_path, 'r') as f:
        while True:
            # New sample order for every pass over the data.
            idxs = shuffle_idx(idxs)
            for b in range(batch_count):
                # Read exactly the rows selected for this batch. The previous
                # version read a contiguous slice starting at one shuffled
                # index, which pulled in rows outside ``idxs``.
                # h5py needs the index list sorted in increasing order.
                batch_idxs = sorted(int(i) for i in idxs[b * batch_size:(b + 1) * batch_size])
                X = f['S2'][batch_idxs, :, :, :]
                Y = f['TOP_LANDCOVER'][batch_idxs, :]

                yield np.array(X), keras.utils.np_utils.to_categorical(_encode_landcover(Y), 7)

In [5]:
# Fix the RNG seed so the train/validation split is the same on every run;
# without it, re-running the notebook silently changes which samples are
# held out.
SEED = 42
np.random.seed(SEED)

idxs = get_idxs(PATH_DATA)
shuffled_idxs = shuffle_idx(idxs)
# Hold out 20% of the shuffled samples for validation.
train_idxs, val_idxs = split_train_val(shuffled_idxs, 0.2)

print(len(idxs))

2100000


In [6]:
# Batches per pass over each split (used as step counts by Keras).
train_batch_count = get_batch_count(idxs=train_idxs, batch_size=BATCH_SIZE)
val_batch_count = get_batch_count(idxs=val_idxs, batch_size=BATCH_SIZE)

# Endless batch streams over the same file, one per split.
train_gen = generator(h5_path=PATH_DATA, batch_size=BATCH_SIZE, idxs=train_idxs)
val_gen = generator(h5_path=PATH_DATA, batch_size=BATCH_SIZE, idxs=val_idxs)

In [7]:
# Number of batches per pass over the training and validation splits.
print(train_batch_count, val_batch_count)

52500 13125


In [8]:
# Open the data file once, explicitly read-only, and take both datasets from
# the same handle. The previous version opened the file twice in h5py's
# default mode. The handle stays open on purpose: `label` and `images` are
# lazy h5py datasets that read from it on access.
data_file = h5.File(PATH_DATA, 'r')
label = data_file['TOP_LANDCOVER']
images = data_file['S2']

In [10]:
# Distinct raw label values in the dataset and how often each one occurs.
unique, counts = np.unique(label, return_counts=True)
print(unique, counts, sep='\n')

[ 1.  2.  3.  5. 10. 12. 19.]
[300000 300000 300000 300000 300000 300000 300000]


In [11]:
def top_3_accuracy(y_true, y_pred):
    """Keras metric: top-k categorical accuracy with k fixed to 3."""
    return top_k_categorical_accuracy(y_true=y_true, y_pred=y_pred, k=3)

def top_5_accuracy(y_true, y_pred):
    """Keras metric: top-k categorical accuracy with k fixed to 5."""
    return top_k_categorical_accuracy(y_true=y_true, y_pred=y_pred, k=5)

# Model instantiation

In [12]:
# Shape of one input sample passed to the first convolution.
input_shape = (16,16,4)

# Tunable hyper-parameters. `dropout_rate` used to be defined but ignored
# (the Dropout layer hard-coded 0.5); it is now actually applied.
dropout_rate = 0.5
# Must match the number of classes produced by the training generator.
n_classes = 7

model = Sequential()

# Convolutional feature extractor: two 3x3 conv layers, each followed by
# batch normalization, then 2x2 max pooling.
model.add(Conv2D(32, (3,3), padding='same', activation='relu', input_shape=input_shape))
model.add(BatchNormalization())
model.add(Conv2D(32, (3, 3), padding='same', activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=(2, 2)))

# Classifier head.
model.add(Flatten())
model.add(BatchNormalization())
model.add(Dense(128, activation='relu'))
model.add(Dropout(dropout_rate))
model.add(Dense(n_classes, activation='softmax'))

# Fit

In [None]:
# RMSprop with a small learning rate and a slight per-update decay.
optim = keras.optimizers.RMSprop(lr=0.0001, decay=1e-6)

model.compile(optimizer=optim,
              loss='categorical_crossentropy',
              metrics=['accuracy', top_3_accuracy])

# `nb_val_samples` is a Keras 1 argument; fit_generator in Keras 2 expects
# `validation_steps`, counted in batches. Use the batch count computed for the
# validation split so each epoch evaluates one full pass over it.
history = model.fit_generator(
    train_gen,
    steps_per_epoch=train_batch_count,
    epochs=3,
    verbose=1,
    validation_data=val_gen,
    validation_steps=val_batch_count,
)

  
  


Epoch 1/3
 8403/52500 [===>..........................] - ETA: 33:54 - loss: 1.4880 - acc: 0.4169 - top_3_accuracy: 0.7745

## Prediction routines

In order to submit a result, here are some utilities.

In [None]:
import os 
def prediction_generator(h5_path, batch_size, idxs):
    """Yield the 'S2' inputs of an HDF5 file batch by batch, once, in order.

    Parameters
    ----------
    h5_path : str
        Path to an HDF5 file that contains an 'S2' dataset.
    batch_size : int
        Maximum number of samples per batch; the last batch may be smaller.
    idxs : sequence of int
        Row indices to read. Batches follow the order of ``idxs``; within a
        batch the rows are read in increasing index order.

    Yields
    ------
    numpy.ndarray
        The 'S2' rows of one batch.
    """
    batch_count = get_batch_count(idxs, batch_size)

    # The context manager closes the file once the generator is exhausted,
    # closed or garbage-collected; previously the handle was never released.
    with h5.File(h5_path, 'r') as f:
        for b in range(batch_count):
            batch_idxs = idxs[b*batch_size:(b+1)*batch_size]
            # h5py needs the index list sorted in increasing order.
            batch_idxs = sorted(batch_idxs)
            X = f['S2'][batch_idxs, :,:,:]
            yield np.array(X)

In [None]:
# Run the model over the validation stream and report how many predictions come back.
prediction = model.predict_generator(val_gen, steps=val_batch_count, verbose=1)
print(len(prediction))
# TODO: convert to a DataFrame and write a CSV with index = ID and column = TOP_LANDCOVER

In [None]:
pred_idx = get_idxs(PATH_PREDICT_WITHOUT_GT)
print(len(pred_idx))
pred_gen = prediction_generator(PATH_PREDICT_WITHOUT_GT, BATCH_SIZE, pred_idx)
prediction = model.predict_generator(pred_gen, steps=get_batch_count(pred_idx, BATCH_SIZE), verbose=1)
print(len(prediction))

# Class index -> raw TOP_LANDCOVER code. This is the exact inverse of the
# encoding applied by the training generator (1,2,3 -> 0,1,2; 5 -> 3; 10 -> 4;
# 12 -> 5; 19 -> 6). The previous loop used a 12-class table from a different
# model, remapped in place (so values cascaded into each other), and its
# result was then discarded: the CSV was written from the raw argmax indices.
class_to_landcover = np.array([1, 2, 3, 5, 10, 12, 19])
preds = class_to_landcover[np.argmax(prediction, axis=1)]

# One row per predicted sample; the row index is written as the 'ID' column.
panda_prediction = pd.DataFrame({'TOP_LANDCOVER': preds})
panda_prediction.to_csv(PATH_SUBMIT, index_label='ID')

panda_prediction.head()

In [None]:
# Distinct predicted class indices (argmax over the model outputs) and how
# often each one occurs.
predicted_classes = np.argmax(prediction, axis=1)
unique, counts = np.unique(predicted_classes, return_counts=True)
print(unique, counts, sep='\n')