# Altegrad Experiments

# CODE TO TRAIN ONE TARGET

In [1]:
import sys
import json
import numpy as np

from keras.models import Model
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Input, Embedding, Dropout, Bidirectional, GRU, CuDNNGRU, TimeDistributed, Dense

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Train the model for a single target `tgt` on a random 80/20 split of the
# training documents. Optionally checkpoints the best weights (lowest val_loss)
# and dumps the training history to JSON.

tgt = 1

is_GPU = True
save_hist = False
save_weights = True
split_seed = 0  # seed of the train/validation split, so that the split is reproducible
# project root = parent of the current working directory (forward slashes on Windows too)
path_root = os.path.dirname(os.path.abspath('').replace('\\', '/'))
path_to_data = path_root + '/data/'
path_to_code = path_root + '/code/experiments/target_' + str(tgt) + '/'
# make the target-specific modules imported below discoverable
sys.path.insert(0, path_to_code)

# = = = = = = = = = = = = = = =

from AttentionWithContext import AttentionWithContext
from make_model_tgt1 import make_model

# = = = = = hyper-parameters = = = = =


n_units = 60
drop_rate = 0.1
batch_size = 96
nb_epochs = 60
my_optimizer = 'adam'
my_patience = 6

# = = = = = data loading = = = = =

docs = np.load(path_to_data + 'documents.npy')
embeddings = np.load(path_to_data + 'embeddings.npy')

with open(path_to_data + 'train_idxs.txt', 'r') as file:
    train_idxs = file.read().splitlines()

train_idxs = [int(elt) for elt in train_idxs]

# Positions (into train_idxs) of the 80% used for training; the remaining 20%
# form the validation set. A dedicated seeded generator keeps the split stable
# from one run to the next without touching the global NumPy RNG.
split_rng = np.random.RandomState(split_seed)
n_train_total = len(train_idxs)
idxs_select_train = split_rng.choice(n_train_total, size=int(n_train_total * 0.80), replace=False)
idxs_select_val = np.setdiff1d(np.arange(n_train_total), idxs_select_train)

train_idxs_new = [train_idxs[elt] for elt in idxs_select_train]
val_idxs = [train_idxs[elt] for elt in idxs_select_val]

docs_train = docs[train_idxs_new, :, :]
docs_val = docs[val_idxs, :, :]


with open(path_to_data + 'targets/train/target_' + str(tgt) + '.txt', 'r') as file:
    target = file.read().splitlines()

# targets are indexed by position in train_idxs, not by document id
# NOTE(review): assumes the target file has one line per entry of train_idxs, in the same order - confirm
target_train = np.array([target[elt] for elt in idxs_select_train]).astype('float')
target_val = np.array([target[elt] for elt in idxs_select_val]).astype('float')

model = make_model(n_units, drop_rate, embeddings, docs_train, is_GPU)
# summary() prints the table itself; wrapping it in print() only adds a stray "None"
model.summary()

model.compile(loss='mean_squared_error',
              optimizer=my_optimizer,
              metrics=['mse'])

# = = = = = training = = = = =

early_stopping = EarlyStopping(monitor='val_loss',
                               patience=my_patience,
                               mode='min')

# save model corresponding to best epoch
checkpointer = ModelCheckpoint(filepath=path_to_data + 'model_' + str(tgt),
                               verbose=1,
                               save_best_only=True,
                               save_weights_only=True)

if save_weights:
    my_callbacks = [early_stopping, checkpointer]
else:
    my_callbacks = [early_stopping]

history = model.fit(docs_train,
                    target_train,
                    batch_size=batch_size,
                    epochs=nb_epochs,
                    validation_data=(docs_val, target_val),
                    callbacks=my_callbacks)


if save_hist:
    # cast to built-in floats: logged metrics may be NumPy scalars, which json cannot serialize
    hist = {name: [float(value) for value in values]
            for name, values in history.history.items()}
    with open(path_to_data + 'model_history_' + str(tgt) + '.json', 'w') as file:
        json.dump(hist, file, sort_keys=False, indent=4)

print('* * * * * * * target', tgt, 'done * * * * * * *')

Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 70, 11)            0         
_________________________________________________________________
time_distributed_2 (TimeDist (None, 70, 60)            21972855  
_________________________________________________________________
bidirectional_2 (Bidirection (None, 70, 60)            43920     
_________________________________________________________________
attention_with_context_2 (At (None, 60)                3720      
_________________________________________________________________
dropout_2 (Dropout)          (None, 60)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 60)                3660      
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 61        
Total para


Epoch 00024: val_loss did not improve from 0.07509
Epoch 25/60

Epoch 00025: val_loss improved from 0.07509 to 0.07385, saving model to D:/Scolaire/Code/Python/Machine_Learning/Kaggle_Challenges_M2/HAN_Graphs/data/model_1
Epoch 26/60

Epoch 00026: val_loss did not improve from 0.07385
Epoch 27/60

Epoch 00027: val_loss did not improve from 0.07385
Epoch 28/60

Epoch 00028: val_loss did not improve from 0.07385
Epoch 29/60

Epoch 00029: val_loss improved from 0.07385 to 0.07266, saving model to D:/Scolaire/Code/Python/Machine_Learning/Kaggle_Challenges_M2/HAN_Graphs/data/model_1
Epoch 30/60

Epoch 00030: val_loss improved from 0.07266 to 0.07010, saving model to D:/Scolaire/Code/Python/Machine_Learning/Kaggle_Challenges_M2/HAN_Graphs/data/model_1
Epoch 31/60

Epoch 00031: val_loss did not improve from 0.07010
Epoch 32/60

Epoch 00032: val_loss improved from 0.07010 to 0.06838, saving model to D:/Scolaire/Code/Python/Machine_Learning/Kaggle_Challenges_M2/HAN_Graphs/data/model_1
Epoch 33

# CODE TO PREDICT ONE TARGET

In [3]:
import sys
import json
import numpy as np
import pandas as pd

from keras.models import Model
from keras.layers import Input, Embedding, Dropout, Bidirectional, GRU, CuDNNGRU, TimeDistributed, Dense

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# = = = = = = = = = = = = = = =

# Predict a single target `tgt` on the test documents with the weights saved by
# the training cell, and write the predictions as "id,pred" rows to
# predictions_han_<tgt>.txt.

tgt = 1

is_GPU = True
save_hist = False
# project root = parent of the current working directory (forward slashes on Windows too)
path_root = os.path.dirname(os.path.abspath('').replace('\\', '/'))
path_to_data = path_root + '/data/'
path_to_code = path_root + '/code/experiments/target_' + str(tgt) + '/'
# make the target-specific modules imported below discoverable
sys.path.insert(0, path_to_code)

# = = = = = = = = = = = = = = =

from AttentionWithContext import AttentionWithContext
# the module name below is specific to the target: change it when `tgt` changes
from make_model_tgt1 import make_model
# = = = = = = = = = = = = = = =

docs = np.load(path_to_data + 'documents.npy')
embeddings = np.load(path_to_data + 'embeddings.npy')

with open(path_to_data + 'test_idxs.txt', 'r') as file:
    test_idxs = file.read().splitlines()


test_idxs = [int(elt) for elt in test_idxs]
docs_test = docs[test_idxs, :, :]

all_preds_han = []

# Submission ids are laid out target after target, in consecutive blocks of
# `n_ids_per_target` ids: target k owns ids [k * n, (k + 1) * n).
# NOTE(review): presumably 18744 is the number of test documents - confirm against test_idxs.txt
n_ids_per_target = 18744
indx_tg0 = list(range(0 * n_ids_per_target, 1 * n_ids_per_target))
indx_tg1 = list(range(1 * n_ids_per_target, 2 * n_ids_per_target))
indx_tg2 = list(range(2 * n_ids_per_target, 3 * n_ids_per_target))
indx_tg3 = list(range(3 * n_ids_per_target, 4 * n_ids_per_target))

idx = [indx_tg0, indx_tg1, indx_tg2, indx_tg3]

# * * * HAN * * *

# relevant hyper-parameters
n_units = 60
drop_rate = 0 # prediction mode

model = make_model(n_units, drop_rate, embeddings, docs_test, is_GPU)

model.load_weights(path_to_data + 'model_' + str(tgt))
all_preds_han.append(model.predict(docs_test).tolist())

# flatten: keep the first (only) value of each per-document prediction
all_preds_han = [elt[0] for sublist in all_preds_han for elt in sublist]

# write the predictions of a single target with the correct ids
with open(path_to_data + 'predictions_han_' + str(tgt) + '.txt', 'w') as file:
    # Only the first target's file carries the CSV header, and it must be written
    # once, before the rows (it used to be emitted before every row).
    if tgt == 0:
        file.write('id,pred\n')
    # `row_id` instead of `idx`: the loop must not overwrite the `idx` table above,
    # otherwise this step cannot be re-run in the same kernel
    for row_id, pred in zip(idx[tgt], all_preds_han):
        file.write(str(row_id) + ',' + format(pred, '.7f') + '\n')
        


# Concatenate predictions of all targets

In [6]:
import pandas as pd
import os

def from_txt_to_csv(file_name, folder_name):
    """
    Convert a comma-separated prediction text file into a CSV submission file.

    The text file is read from `<parent of the current working directory>/data/`
    and the CSV is written, without the pandas index column, to the sub-folder
    `folder_name` of that same data directory (created if it does not exist).

    inputs :
    - file_name : name (string) of the txt file, without the .txt extension ;
      the csv file gets the same base name
    - folder_name : name (string) of the destination folder inside the data directory

    output : None (the destination folder is printed)

    """
    # data directory, derived from the working directory -- adapt to your own layout
    data_dir = os.path.dirname(os.path.abspath('').replace('\\', '/')) + "/data/"
    source_txt = data_dir + file_name + ".txt"
    target_dir = data_dir + folder_name

    # make sure the destination folder is there before writing into it
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    target_csv = target_dir + "/" + file_name + ".csv"
    pd.read_csv(source_txt).to_csv(target_csv, index=False)
    print("Submission saved in '{}'".format(target_dir))
    

In [8]:
# Merge the per-target prediction files (targets 0 to 3, in that order) into a
# single text file, then convert that file to csv.
# Relies on `path_to_data` and `from_txt_to_csv` defined in earlier cells.
part_paths = []
for target_id in range(4):
    part_paths.append(path_to_data + 'predictions_han_' + str(target_id) + '.txt')

merged_path = path_to_data + 'predictions_all.txt'
with open(merged_path, 'w') as merged:
    for part_path in part_paths:
        with open(part_path) as part:
            # copy the file line by line, unchanged
            merged.writelines(part)

# empty folder name: the csv is written directly into the data directory
from_txt_to_csv('predictions_all', '')

Submission saved in 'D:/Scolaire/Code/Python/Machine_Learning/Kaggle_Challenges_M2/HAN_Graphs/data/'
