# Projet AIOI

https://www.kaggle.com/c/stanford-covid-vaccine

## Les imports

In [2]:
%%capture

import numpy as np
import pandas as pd
from collections import Counter
from keras import models
from keras import layers
from keras import optimizers
from keras.utils.vis_utils import plot_model
from keras.wrappers.scikit_learn import KerasClassifier
import matplotlib.pyplot as plt
from sklearn import model_selection
import tensorflow as tf

# Modules
from aioi.data import data
from aioi.files.read_file import read_json, read_npy
import aioi.graphique.plot as plot
from aioi.models import models as mdl

## Charger les données

In [3]:
# Load the competition's training and test sets with the project's JSON reader.
# Paths are relative to the notebook's working directory.
arn_train = read_json('./Data/train.json')
arn_test = read_json('./Data/test.json')

In [4]:
# Compare the columns of both sets; per the recorded output, the test set
# lacks the measurement, error and signal-to-noise columns.
arn_train.columns, arn_test.columns

(Index(['index', 'id', 'sequence', 'structure', 'predicted_loop_type',
        'signal_to_noise', 'SN_filter', 'seq_length', 'seq_scored',
        'reactivity_error', 'deg_error_Mg_pH10', 'deg_error_pH10',
        'deg_error_Mg_50C', 'deg_error_50C', 'reactivity', 'deg_Mg_pH10',
        'deg_pH10', 'deg_Mg_50C', 'deg_50C'],
       dtype='object'),
 Index(['index', 'id', 'sequence', 'structure', 'predicted_loop_type',
        'seq_length', 'seq_scored'],
       dtype='object'))

## Les données

In [5]:
# Sizes of both sets, plus the number of distinct training ids
# (equal to the row count in the recorded output, i.e. no duplicate id).
arn_train.shape, arn_train['id'].nunique(), arn_test.shape

((2400, 19), 2400, (3634, 7))

<div class="alert alert-block alert-warning">
    Nettoyer les données, i.e. ne prendre en compte que les ARN qui ont un <b>SN_filter de 1</b>
</div>

In [6]:
# Keep only the training RNAs whose SN_filter flag equals 1.
# NOTE(review): this rebinds `arn_train`, so the unfiltered frame is no longer
# reachable from later cells without reloading the JSON file.
arn_train = arn_train.query('SN_filter == 1')
arn_train.shape

(1589, 19)

In [7]:
# Preview the first rows of the filtered training frame.
arn_train.head()

Unnamed: 0,index,id,sequence,structure,predicted_loop_type,signal_to_noise,SN_filter,seq_length,seq_scored,reactivity_error,deg_error_Mg_pH10,deg_error_pH10,deg_error_Mg_50C,deg_error_50C,reactivity,deg_Mg_pH10,deg_pH10,deg_Mg_50C,deg_50C
0,0,id_001f94081,GGAAAAGCUCUAAUAACAGGAGACUAGGACUACGUAUUUCUAGGUA...,.....((((((.......)))).)).((.....((..((((((......,EEEEESSSSSSHHHHHHHSSSSBSSXSSIIIIISSIISSSSSSHHH...,6.894,1,107,68,"[0.1359, 0.20700000000000002, 0.1633, 0.1452, ...","[0.26130000000000003, 0.38420000000000004, 0.1...","[0.2631, 0.28600000000000003, 0.0964, 0.1574, ...","[0.1501, 0.275, 0.0947, 0.18660000000000002, 0...","[0.2167, 0.34750000000000003, 0.188, 0.2124, 0...","[0.3297, 1.5693000000000001, 1.1227, 0.8686, 0...","[0.7556, 2.983, 0.2526, 1.3789, 0.637600000000...","[2.3375, 3.5060000000000002, 0.3008, 1.0108, 0...","[0.35810000000000003, 2.9683, 0.2589, 1.4552, ...","[0.6382, 3.4773, 0.9988, 1.3228, 0.78770000000..."
2,2,id_006f36f57,GGAAAGUGCUCAGAUAAGCUAAGCUCGAAUAGCAAUCGAAUAGAAU...,.....((((.((.....((((.(((.....)))..((((......)...,EEEEESSSSISSIIIIISSSSMSSSHHHHHSSSMMSSSSHHHHHHS...,8.8,1,107,68,"[0.0931, 0.13290000000000002, 0.11280000000000...","[0.1365, 0.2237, 0.1812, 0.1333, 0.1148, 0.160...","[0.17020000000000002, 0.178, 0.111, 0.091, 0.0...","[0.1033, 0.1464, 0.1126, 0.09620000000000001, ...","[0.14980000000000002, 0.1761, 0.1517, 0.116700...","[0.44820000000000004, 1.4822, 1.1819, 0.743400...","[0.2504, 1.4021, 0.9804, 0.49670000000000003, ...","[2.243, 2.9361, 1.0553, 0.721, 0.6396000000000...","[0.5163, 1.6823000000000001, 1.0426, 0.7902, 0...","[0.9501000000000001, 1.7974999999999999, 1.499..."
5,5,id_00ab2d761,GGAAAGCGCCGCGGCGGUAGCGGCAGCGAGGAGCGCUACCAAGGCA...,.....(.(((((.(((((((((...........)))))))..(((....,EEEEESISSSSSISSSSSSSSSHHHHHHHHHHHSSSSSSSMMSSSH...,4.136,1,107,68,"[0.1942, 0.2041, 0.1626, 0.1213, 0.10590000000...","[0.2726, 0.2984, 0.21660000000000001, 0.1637, ...","[0.3393, 0.2728, 0.2005, 0.1703, 0.1495, 0.134...","[0.165, 0.20520000000000002, 0.179, 0.1333, 0....","[0.2864, 0.24710000000000001, 0.2222, 0.1903, ...","[0.7642, 1.6641, 1.0622, 0.5008, 0.4107, 0.133...","[0.9559000000000001, 1.9442, 1.0114, 0.5105000...","[1.9554, 2.1298, 1.0403, 0.609, 0.5486, 0.386,...","[0.22460000000000002, 1.7281, 1.381, 0.6623, 0...","[0.5882000000000001, 1.1786, 0.9704, 0.6035, 0..."
6,6,id_00abef1d7,GGAAAACAAUUGCAUCGUUAGUACGACUCCACAGCGUAAGCUGUGG...,.........((((((((......((((((((((((....)))))))...,EEEEEEEEESSSSSSSSIIIIIISSSSSSSSSSSSHHHHSSSSSSS...,2.485,1,107,68,"[0.422, 0.5478000000000001, 0.4749000000000000...","[0.4801, 0.7943, 0.42160000000000003, 0.397300...","[0.9822000000000001, 1.272, 0.6940000000000001...","[0.5827, 0.7555000000000001, 0.5949, 0.4511, 0...","[0.9306000000000001, 1.0496, 0.5844, 0.7796000...","[0.895, 2.3377, 2.2305, 2.003, 1.9006, 1.0373,...","[0.46040000000000003, 3.6695, 0.78550000000000...","[2.7711, 7.365, 1.6924000000000001, 1.43840000...","[1.073, 2.8604000000000003, 1.9936, 1.0273, 1....","[2.0964, 3.3688000000000002, 0.6399, 2.1053, 1..."
7,7,id_00b436dec,GGAAAUCAUCGAGGACGGGUCCGUUCAGCACGCGAAAGCGUCGUGA...,.....(((((((((((..(((((((((..((((....))))..)))...,EEEEESSSSSSSSSSSIISSSSSSSSSIISSSSHHHHSSSSIISSS...,1.727,1,107,68,"[0.4843, 0.5233, 0.4554, 0.43520000000000003, ...","[0.8719, 1.0307, 0.6649, 0.34500000000000003, ...","[0.7045, 0.7775000000000001, 0.5662, 0.4561, 0...","[0.384, 0.723, 0.4766, 0.30260000000000004, 0....","[0.7429, 0.9137000000000001, 0.480400000000000...","[1.1576, 1.5137, 1.3382, 1.5622, 1.2121, 0.295...","[1.6912, 5.2652, 2.3901, 0.45890000000000003, ...","[1.8641, 2.3767, 1.149, 1.0132, 0.9876, 0.0, 0...","[0.49060000000000004, 4.6339, 1.95860000000000...","[1.2852000000000001, 2.5460000000000003, 0.234..."


In [8]:
# Preview the first rows of the test frame.
arn_test.head()

Unnamed: 0,index,id,sequence,structure,predicted_loop_type,seq_length,seq_scored
0,0,id_00073f8be,GGAAAAGUACGACUUGAGUACGGAAAACGUACCAACUCGAUUAAAA...,......((((((((((.(((((.....))))))))((((((((......,EEEEEESSSSSSSSSSBSSSSSHHHHHSSSSSSSSSSSSSSSSHHH...,107,68
1,1,id_000ae4237,GGAAACGGGUUCCGCGGAUUGCUGCUAAUAAGAGUAAUCUCUAAAU...,.....((((..((((((...(((((.....((((....)))).......,EEEEESSSSIISSSSSSIIISSSSSIIIIISSSSHHHHSSSSIIII...,130,91
2,2,id_00131c573,GGAAAACAAAACGGCCUGGAAGACGAAGGAAUUCGGCGCGAAGGCC...,...........((.(((.(.(..((..((..((((...))))..))...,EEEEEEEEEEESSISSSISISIISSIISSIISSSSHHHSSSSIISS...,107,68
3,3,id_00181fd34,GGAAAGGAUCUCUAUCGAAGGAUAGAGAUCGCUCGCGACGGCACGA...,......((((((((((....))))))))))((((((..((.(((.....,EEEEEESSSSSSSSSSHHHHSSSSSSSSSSSSSSSSIISSISSSHH...,107,68
4,4,id_0020473f7,GGAAACCCGCCCGCGCCCGCCCGCGCUGCUGCCGUGCCUCCUCUCC...,.....(((((((((((((((((((((((((((((((((((((((((...,EEEEESSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS...,130,91


In [9]:
# Distribution of sequence lengths in train and test, and of scored lengths in
# test. Per the recorded output, train only has length 107 while test mixes
# 107 and 130.
Counter(arn_train['seq_length']), Counter(arn_test['seq_length']), Counter(arn_test['seq_scored'])

(Counter({107: 1589}),
 Counter({107: 629, 130: 3005}),
 Counter({68: 629, 91: 3005}))

### Input x

In [10]:
# Select the three descriptive columns used as model inputs.
# `col_names` is rebound further down when the target columns are selected.
col_names = ['sequence', 'structure', 'predicted_loop_type']
X_train = arn_train[col_names]
X_train.shape

(1589, 3)

> One hot encoding des séquences

In [11]:
# Encode the nucleotide sequences with the project helper (a one-hot encoding
# according to the notebook's heading); recorded output shape: (1589, 107, 4).
x_seq = data.x_sequence(X_train['sequence'])
x_seq.shape

(1589, 107, 4)

> One hot encoding des structures

In [12]:
# Encode the structure strings with the project helper (a one-hot encoding
# according to the notebook's heading); recorded output shape: (1589, 107, 3).
x_struc = data.x_structure(X_train['structure'])
x_struc.shape

(1589, 107, 3)

> One hot encoding des loops type

In [13]:
# Encode the predicted loop types with the project helper (a one-hot encoding
# according to the notebook's heading); recorded output shape: (1589, 107, 7).
x_loops = data.x_predicted_loops(X_train['predicted_loop_type'])
x_loops.shape

(1589, 107, 7)

> Regroupement dans une seule matrice de dimension (1589, 107, 14)

Avec

- 1589: le nombre d'ARN (après filtrage sur SN_filter == 1)
- 107: la taille des séquences
- 14: le nombre de classes (4+3+7)

In [14]:
# Merge the three encodings into a single input array. The recorded shape
# (1589, 107, 14) matches 4 + 3 + 7 features per sequence position.
x_train = data.x_concatenation(x_seq, x_struc, x_loops)
x_train.shape

(1589, 107, 14)

### Output y

In [15]:
# Select the five measured target columns (rebinding `col_names`).
col_names = ['reactivity', 'deg_Mg_pH10', 'deg_pH10', 'deg_Mg_50C', 'deg_50C']
Y_output = arn_train[col_names]
Y_output.shape

(1589, 5)

> Formatage Y_output

In [16]:
# Convert the target columns into one array with the project helper, which
# also prints the resulting shape; recorded output shape: (1589, 68, 5).
y_train = data.y_output(Y_output)
y_train.shape

Y_train shape: (1589, 68, 5)



(1589, 68, 5)

# Modèle keras 

In [81]:
def resize_shape_data(model):
    """
    Shorten the sequence axis of a tensor with three unpadded Conv1D layers.

    Each layer has 5 filters and a ReLU activation; the kernel sizes are
    20, 20 and 2, applied in that order. For a 107-step input, the model
    summary recorded in this notebook shows 107 -> 88 -> 69 after the first
    two layers, and the last layer removes one more step so the result lines
    up with the (68, 5) targets.

    Parameters
    ----------
    model :
        Keras tensor produced by the previous layer (despite its name, this is
        not a `Model` instance).

    Returns
    -------
    The Keras tensor produced by the last convolution.
    """
    tensor = model
    for kernel in ((20,), (20,), (2,)):
        shrink = layers.Conv1D(filters=5, kernel_size=kernel, activation="relu")
        tensor = shrink(tensor)

    return tensor

def model_simple(input_shape, learning_rate):
    """
    Build and compile a first, simple 1D-convolutional model.

    Four same-padded Conv1D layers (45, 35, 25 then 15 filters, kernel size 3,
    ReLU) are stacked on the input; `resize_shape_data` then shortens the
    sequence axis to produce the network output.

    Parameters
    ----------
    input_shape : tuple
        Shape of one sample, without the batch axis.
    learning_rate : float
        Step size handed to the SGD optimizer.

    Returns
    -------
    The Keras model, compiled with an MSE loss and an MSE metric.
    """
    input_ = layers.Input(shape=input_shape)

    # Feature extractor: the sequence length is preserved by padding="same".
    features = input_
    for n_filters in (45, 35, 25, 15):
        features = layers.Conv1D(
            filters=n_filters,
            kernel_size=(3,),
            activation="relu",
            padding="same",
        )(features)

    # Output head: resize the data so it fits the (68, 5) targets.
    output = resize_shape_data(features)
    network = models.Model(inputs=input_, outputs=output)

    # NOTE(review): `lr` is the keyword this notebook's Keras accepts; newer
    # releases call it `learning_rate` -- confirm before upgrading Keras.
    sgd = optimizers.SGD(lr=learning_rate)
    network.compile(loss='mse', optimizer=sgd, metrics=['mse'])

    return network

# Modèle apprentissage

In [82]:
# Build the baseline network for (107, 14) inputs with a learning rate of 1e-2
# and print its layer-by-layer summary.
model = model_simple((107, 14), 1E-2)
model.summary()

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 107, 14)           0         
_________________________________________________________________
conv1d_15 (Conv1D)           (None, 107, 45)           1935      
_________________________________________________________________
conv1d_16 (Conv1D)           (None, 107, 35)           4760      
_________________________________________________________________
conv1d_17 (Conv1D)           (None, 107, 25)           2650      
_________________________________________________________________
conv1d_18 (Conv1D)           (None, 107, 15)           1140      
_________________________________________________________________
conv1d_19 (Conv1D)           (None, 88, 5)             1505      
_________________________________________________________________
conv1d_20 (Conv1D)           (None, 69, 5)             505 

In [83]:
# Train for 5 epochs with batches of 10, holding out 20% of the samples for
# validation (1271 / 318 in the recorded log). The returned object's `.history`
# is what gets saved further down.
fit_out = model.fit(x=x_train, y=y_train, epochs=5, batch_size=10, verbose=1, validation_split=0.2)

Train on 1271 samples, validate on 318 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Save + load loss & mse dans un fichier npy

In [112]:
# Wrap the training history in a dict keyed by model name.
dico = {'model_simple': fit_out.history}

In [108]:
# Write the history dict to 'history-test.npy' in the working directory.
# np.save has to pickle a dict, which is why reading it back needs
# allow_pickle=True.
np.save('history-test.npy', dico)

In [119]:
# Read a stored history back; `.item()` unwraps the single stored object
# (presumably a dict saved like the one above -- verify).
# NOTE(review): this reads './Models/models_history.npy', not the
# 'history-test.npy' file written by the previous cell.
# NOTE(review): allow_pickle=True runs the unpickler; only load trusted files.
tmp = np.load('./Models/models_history.npy', allow_pickle=True).item()

## Save + load model dans un fichier h5

In [84]:
# Save the trained model to 'test.h5' in the working directory.
model.save("test.h5")

In [115]:
# Load a previously saved model.
# NOTE(review): this reads './Models/Keras_models/model_simple.h5', not the
# 'test.h5' file written by the previous cell.
model_load = models.load_model("./Models/Keras_models/model_simple.h5")

# Repeated kfold validation

In [63]:
# Sanity check of the arrays passed to the validation routine below.
x_train.shape, y_train.shape

((1589, 107, 14), (1589, 68, 5))

In [59]:
def repeated_kfold_validation(X, Y, n_repeats=2, test_size=0.3, epochs=15,
                              batch_size=10, random_state=2):
    """
    Evaluate each candidate model on repeated random train/test splits.

    Despite its name, this routine does not run a k-fold scheme: each
    repetition draws one hold-out split with `train_test_split`, trains a
    freshly built model on the training part and scores it on the held-out
    part. The models' learning rates are assumed to have been tuned
    beforehand.

    Parameters
    ----------
    X : array-like
        Input samples; the model input shape is taken from `X` without its
        first (sample) axis.
    Y : array-like
        Targets aligned with `X`.
    n_repeats : int, default 2
        Number of independent splits evaluated per model.
    test_size : float, default 0.3
        Fraction of the samples held out for evaluation in each split.
    epochs : int, default 15
        Training epochs per repetition.
    batch_size : int, default 10
        Mini-batch size used for training.
    random_state : int or None, default 2
        Base seed. Repetition `i` uses `random_state + i`, so every repetition
        gets a different split while the whole run stays reproducible. With
        `None`, each split is drawn without a fixed seed.

    Returns
    -------
    dict
        `{model_name: {'Loss': [...], 'Mse': [...]}}` with one entry per
        repetition in each list.
    """
    # Shape of one input sample, derived from the data.
    input_shape = tuple(np.shape(X)[1:])

    # Models to evaluate.
    list_model = [model_simple]

    # Tuned learning rate for each model, in the same order as `list_model`.
    learning_rate = [1E-2]

    scores = {}

    for build_model, rate in zip(list_model, learning_rate):
        name = build_model.__name__
        print("\n#####\n{}\n#####".format(name))

        scores[name] = {'Loss': [], 'Mse': []}

        for repeat in range(n_repeats):
            # Vary the seed per repetition: a constant seed would reuse the
            # exact same split every time, so the repetitions would only
            # measure weight-initialisation noise.
            seed = None if random_state is None else random_state + repeat
            x_fit, x_eval, y_fit, y_eval = model_selection.train_test_split(
                X, Y, test_size=test_size, random_state=seed)

            # A new, untrained network for every repetition.
            network = build_model(input_shape, rate)
            network.fit(x=x_fit, y=y_fit, epochs=epochs,
                        batch_size=batch_size, verbose=0)

            # evaluate() returns the loss followed by the compiled metrics.
            loss, mse = network.evaluate(x_eval, y_eval, verbose=0)

            scores[name]['Loss'].append(loss)
            scores[name]['Mse'].append(mse)

    return scores

# Run the evaluation on the full filtered training arrays; the function prints
# a banner with each model's name.
scores = repeated_kfold_validation(x_train, y_train)


#####
model_simple
#####


In [60]:
# Per-repetition loss and MSE for each evaluated model.
scores

{'model_simple': {'Loss': [0.1870537687959911, 0.3147180190376266],
  'Mse': [0.18705378472805023, 0.31471800804138184]}}