# Altegrad Experiments

### Vincent Jacob, Cyril Equilbec

## Imports

In [1]:
import json
import os
import sys

# Set the TensorFlow C++ log level *before* Keras is imported: importing Keras
# loads its backend, so the variable has to be in place beforehand for it to
# apply to messages emitted during that import as well.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import numpy as np
import pandas as pd
from keras.optimizers import SGD, Nadam, Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

Using TensorFlow backend.


## Global Variables

In [2]:
# index of the target we want to train; also used to build the code path,
# the target file name and the model / prediction file names
tgt = 2
# flag forwarded to make_model() when the network is built
is_GPU = True
# if True, the training history is dumped to a JSON file after training
save_hist = False
# if True, the ModelCheckpoint callback is added to the training callbacks
save_weights = True

## Data Loading

In [3]:
# The project root is the parent of the current working directory; data and
# target-specific code live in fixed sub-folders of it.
path_root = os.path.dirname(os.path.abspath('').replace('\\', '/'))
path_to_data = '{}/data/'.format(path_root)
path_to_code = '{}/code/experiments/target_{}/'.format(path_root, tgt)
# make the target-specific modules importable
sys.path.insert(0, path_to_code)

# precomputed documents (indexed below along three axes) and embedding matrix
docs = np.load(path_to_data + 'documents.npy')
embeddings = np.load(path_to_data + 'embeddings.npy')


# ---- training set: read the document indices, then split train / validation ----
with open(path_to_data + 'train_idxs.txt', 'r') as fh:
    train_idxs = [int(line) for line in fh.read().splitlines()]

n_available = len(train_idxs)
positions = range(n_available)

# 90% of the positions are drawn without replacement for training; the
# remaining ones form the validation set.
# NOTE: no seed is set in this cell, so the split changes between runs unless
# the NumPy RNG is seeded elsewhere.
idxs_select_train = np.random.choice(positions, size=int(n_available * 0.90), replace=False)
idxs_select_val = np.setdiff1d(positions, idxs_select_train)

# map positions in the index file back to row indices of `docs`
train_idxs_new = [train_idxs[pos] for pos in idxs_select_train]
val_idxs = [train_idxs[pos] for pos in idxs_select_val]

docs_train = docs[train_idxs_new, :, :]
docs_val = docs[val_idxs, :, :]

# ---- targets: one value per line, selected with the same positions as the documents ----
with open(path_to_data + 'targets/train/target_' + str(tgt) + '.txt', 'r') as fh:
    target = fh.read().splitlines()

target_train = np.array([target[pos] for pos in idxs_select_train]).astype('float')
target_val = np.array([target[pos] for pos in idxs_select_val]).astype('float')


# ---- test set ----
with open(path_to_data + 'test_idxs.txt', 'r') as fh:
    test_idxs = [int(line) for line in fh.read().splitlines()]

docs_test = docs[test_idxs, :, :]

# CODE TO TRAIN ONE TARGET

## hyper-parameters

In [7]:
# number of GRU units
n_units = 45
# mode by which outputs of the forward and backward RNNs will be combined.
merge_mode = 'concat'
# dropout rate passed to make_model() as its drop_rate argument
drop_rate = 0.5 
# dropout rate after the embedding layer
drop_rate_emb = 0.1
# whether to use cosine similarity (True) or an unnormalized dot product (False)
att_cosine = True
# whether to use an MLP for computing the hidden attention state
use_dense_layer = False
# the activation function used by the MLP 
att_activation = None
# mini-batch size passed to model.fit()
batch_size = 120
# maximum number of training epochs passed to model.fit()
nb_epochs = 120
# optimizer passed to model.compile(): Nadam built with no arguments
my_optimizer = Nadam()
# patience (in epochs) given to the EarlyStopping callback
my_patience = 12

###############################
# Training from scratch or not:
# when True, previously saved weights are loaded into the model before training
pretrained = True
###############################
###############################

## Callbacks

In [5]:
# stop the training once the validation loss ('val_loss') has not improved
# for 'my_patience' epochs
early_stopping = EarlyStopping(monitor='val_loss',
                                   patience=my_patience,
                                   mode='min')

# save the weights only (save_weights_only=True) of the best epoch
# (save_best_only=True) to '<data>/model_<tgt>'
checkpointer = ModelCheckpoint(filepath=path_to_data + 'model_' + str(tgt), 
                                   verbose=1, 
                                   save_best_only=True,
                                   save_weights_only=True)

# when 'val_loss' plateaus for 5 epochs, multiply the learning rate by 0.2
# (i.e. an 80% reduction, not 20%), never going below min_lr
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                              patience=5, min_lr=0.001)

In [8]:
# Load the attention mechanism and architecture for the target being trained.
# NOTE(review): the module name is hard-coded for target 2 -- confirm it
# matches the value of `tgt` before training another target.
from make_model_tgt2 import make_model

# Build a model according to the above hyperparameters.
# NOTE(review): a recorded run of this cell ended with
# "TypeError: make_model() takes 9 positional arguments but 10 were given";
# the argument list below must be checked against the make_model() signature
# of the module imported above (not visible from this notebook).
model = make_model(n_units, merge_mode, drop_rate, drop_rate_emb,
                   att_cosine, att_activation, use_dense_layer,
                   embeddings, docs_train, is_GPU)

# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

if pretrained:
    # resume from the weights file written by the checkpoint callback
    model.load_weights(path_to_data + 'model_' + str(tgt))
    print("Weights loaded")

model.compile(loss='mean_squared_error',
              optimizer=my_optimizer,
              metrics=['mse'])

# = = = = = training = = = = =

# learning-rate reduction and early stopping are always active;
# checkpointing of the best weights is optional
if save_weights:
    my_callbacks = [checkpointer, reduce_lr, early_stopping]
else:
    my_callbacks = [reduce_lr, early_stopping]

history = model.fit(docs_train,
                    target_train,
                    batch_size=batch_size,
                    epochs=nb_epochs,
                    validation_data=(docs_val, target_val),
                    callbacks=my_callbacks)

if save_hist:
    # The history may contain NumPy scalars (e.g. the float32 learning rate
    # logged by ReduceLROnPlateau) that json cannot serialise; convert every
    # value to a built-in float first.
    hist = {name: [float(value) for value in values]
            for name, values in history.history.items()}
    with open(path_to_data + 'model_history_' + str(tgt) + '.json', 'w') as file:
        json.dump(hist, file, sort_keys=False, indent=4)

print('* * * * * * * target', tgt, 'done * * * * * * *')

TypeError: make_model() takes 9 positional arguments but 10 were given

best : 0.31090

# CODE TO PREDICT ONE TARGET

In [None]:
all_preds_han = []

# Row ids written to the prediction file: every target owns a contiguous
# range of `ids_per_target` ids (target 0 -> 0..18743, target 1 -> 18744..37487, ...).
# presumably 18744 is the number of test documents -- TODO confirm
ids_per_target = 18744
indx_tg0 = list(range(0 * ids_per_target, 1 * ids_per_target))
indx_tg1 = list(range(1 * ids_per_target, 2 * ids_per_target))
indx_tg2 = list(range(2 * ids_per_target, 3 * ids_per_target))
indx_tg3 = list(range(3 * ids_per_target, 4 * ids_per_target))

idx = [indx_tg0, indx_tg1, indx_tg2, indx_tg3]

# * * * HAN * * *

# relevant hyper-parameters
n_units = 45
drop_rate = 0 # prediction mode

# NOTE(review): make_model() is called here with 5 arguments whereas the
# training cell passes 10 -- confirm which signature the imported module expects.
model = make_model(n_units, drop_rate, embeddings, docs_test, is_GPU)

model.load_weights(path_to_data + 'model_' + str(tgt))
all_preds_han.append(model.predict(docs_test).tolist())

# flatten: keep the first (only used) output value of every predicted row
all_preds_han = [elt[0] for sublist in all_preds_han for elt in sublist]

# write the predictions of a single target with the correct row ids
with open(path_to_data + 'predictions_han_' + str(tgt) + '.txt', 'w') as file:
    # only the file of target 0 carries the header, so that the per-target
    # files can later be concatenated into a single CSV
    if tgt == 0:
        file.write('id,pred\n')
    # the loop variable must not be called `idx`: that would overwrite the
    # list of id ranges defined above
    for row_id, pred in zip(idx[tgt], all_preds_han):
        file.write(str(row_id) + ',' + format(pred, '.7f') + '\n')

# Concatenate predictions of all targets

In [None]:
def from_txt_to_csv(file_name, folder_name):
    """
    Convert a comma-separated predictions text file into a CSV submission file.

    Paths are resolved relative to the 'data' folder located in the parent of
    the current working directory: '<data>/<file_name>.txt' is parsed with
    pandas and written, without the index column, to
    '<data>/<folder_name>/<file_name>.csv'.

    inputs :
    - file_name is the name (string) of the txt file to convert, without the .txt extension
    - folder_name is the name (string) of the folder, inside the data folder, where the
      submission is stored; it is created if it does not exist yet and may be '' to
      write directly into the data folder

    output : None
    """
    path_root = os.path.dirname(os.path.abspath('').replace('\\', '/')) + "/data/"
    path_file = path_root + file_name + ".txt"
    path_folder = path_root + folder_name

    # read first, so that a missing or malformed input does not leave an
    # empty output folder behind
    df = pd.read_csv(path_file)

    # exist_ok=True replaces the separate os.path.exists() check and its
    # check-then-create race
    os.makedirs(path_folder, exist_ok=True)

    df.to_csv(path_folder + "/" + file_name + ".csv", index=False)
    print("Submission saved in '{}'".format(path_folder))

In [None]:
# per-target prediction files, in target order (0 to 3)
filenames = ['{}predictions_han_{}.txt'.format(path_to_data, k) for k in range(4)]

# copy the content of every per-target file, one after the other, into a single text file
with open(path_to_data + 'predictions_all.txt', 'w') as outfile:
    for fname in filenames:
        with open(fname) as infile:
            outfile.writelines(infile)

# convert the merged text file to CSV; the empty folder name makes the
# output land directly in the data folder
from_txt_to_csv('predictions_all', '')