# Projet AIOI

https://www.kaggle.com/c/stanford-covid-vaccine

## Les imports

In [167]:
%%capture

import csv
import numpy as np
import pandas as pd
from collections import Counter
from keras import models
from keras import layers
from keras import optimizers
from keras.utils.vis_utils import plot_model
from keras.wrappers.scikit_learn import KerasClassifier
import matplotlib.pyplot as plt
from sklearn import model_selection
import tensorflow as tf

# Modules
from aioi.data import data
from aioi.files import read_file as rf
import aioi.graphique.plot as plot
from aioi.models import models as mdl

## Charger les données

In [20]:
arn_train = rf.read_json('./Data/train.json')
arn_test = rf.read_json('./Data/test.json')

In [21]:
arn_train.columns, arn_test.columns

(Index(['index', 'id', 'sequence', 'structure', 'predicted_loop_type',
        'signal_to_noise', 'SN_filter', 'seq_length', 'seq_scored',
        'reactivity_error', 'deg_error_Mg_pH10', 'deg_error_pH10',
        'deg_error_Mg_50C', 'deg_error_50C', 'reactivity', 'deg_Mg_pH10',
        'deg_pH10', 'deg_Mg_50C', 'deg_50C'],
       dtype='object'),
 Index(['index', 'id', 'sequence', 'structure', 'predicted_loop_type',
        'seq_length', 'seq_scored'],
       dtype='object'))

## Les données

In [22]:
arn_train.shape, arn_train['id'].nunique(), arn_test.shape

((2400, 19), 2400, (3634, 7))

<div class="alert alert-block alert-warning">
    Nettoyer les données, i.e ne prendre en compte que les ARN qui ont un <bold>SN_filter de 1<bold>
</div>

In [23]:
arn_train = arn_train.query('SN_filter == 1')
arn_train.shape

(1589, 19)

In [24]:
arn_train.head()

Unnamed: 0,index,id,sequence,structure,predicted_loop_type,signal_to_noise,SN_filter,seq_length,seq_scored,reactivity_error,deg_error_Mg_pH10,deg_error_pH10,deg_error_Mg_50C,deg_error_50C,reactivity,deg_Mg_pH10,deg_pH10,deg_Mg_50C,deg_50C
0,0,id_001f94081,GGAAAAGCUCUAAUAACAGGAGACUAGGACUACGUAUUUCUAGGUA...,.....((((((.......)))).)).((.....((..((((((......,EEEEESSSSSSHHHHHHHSSSSBSSXSSIIIIISSIISSSSSSHHH...,6.894,1,107,68,"[0.1359, 0.20700000000000002, 0.1633, 0.1452, ...","[0.26130000000000003, 0.38420000000000004, 0.1...","[0.2631, 0.28600000000000003, 0.0964, 0.1574, ...","[0.1501, 0.275, 0.0947, 0.18660000000000002, 0...","[0.2167, 0.34750000000000003, 0.188, 0.2124, 0...","[0.3297, 1.5693000000000001, 1.1227, 0.8686, 0...","[0.7556, 2.983, 0.2526, 1.3789, 0.637600000000...","[2.3375, 3.5060000000000002, 0.3008, 1.0108, 0...","[0.35810000000000003, 2.9683, 0.2589, 1.4552, ...","[0.6382, 3.4773, 0.9988, 1.3228, 0.78770000000..."
2,2,id_006f36f57,GGAAAGUGCUCAGAUAAGCUAAGCUCGAAUAGCAAUCGAAUAGAAU...,.....((((.((.....((((.(((.....)))..((((......)...,EEEEESSSSISSIIIIISSSSMSSSHHHHHSSSMMSSSSHHHHHHS...,8.8,1,107,68,"[0.0931, 0.13290000000000002, 0.11280000000000...","[0.1365, 0.2237, 0.1812, 0.1333, 0.1148, 0.160...","[0.17020000000000002, 0.178, 0.111, 0.091, 0.0...","[0.1033, 0.1464, 0.1126, 0.09620000000000001, ...","[0.14980000000000002, 0.1761, 0.1517, 0.116700...","[0.44820000000000004, 1.4822, 1.1819, 0.743400...","[0.2504, 1.4021, 0.9804, 0.49670000000000003, ...","[2.243, 2.9361, 1.0553, 0.721, 0.6396000000000...","[0.5163, 1.6823000000000001, 1.0426, 0.7902, 0...","[0.9501000000000001, 1.7974999999999999, 1.499..."
5,5,id_00ab2d761,GGAAAGCGCCGCGGCGGUAGCGGCAGCGAGGAGCGCUACCAAGGCA...,.....(.(((((.(((((((((...........)))))))..(((....,EEEEESISSSSSISSSSSSSSSHHHHHHHHHHHSSSSSSSMMSSSH...,4.136,1,107,68,"[0.1942, 0.2041, 0.1626, 0.1213, 0.10590000000...","[0.2726, 0.2984, 0.21660000000000001, 0.1637, ...","[0.3393, 0.2728, 0.2005, 0.1703, 0.1495, 0.134...","[0.165, 0.20520000000000002, 0.179, 0.1333, 0....","[0.2864, 0.24710000000000001, 0.2222, 0.1903, ...","[0.7642, 1.6641, 1.0622, 0.5008, 0.4107, 0.133...","[0.9559000000000001, 1.9442, 1.0114, 0.5105000...","[1.9554, 2.1298, 1.0403, 0.609, 0.5486, 0.386,...","[0.22460000000000002, 1.7281, 1.381, 0.6623, 0...","[0.5882000000000001, 1.1786, 0.9704, 0.6035, 0..."
6,6,id_00abef1d7,GGAAAACAAUUGCAUCGUUAGUACGACUCCACAGCGUAAGCUGUGG...,.........((((((((......((((((((((((....)))))))...,EEEEEEEEESSSSSSSSIIIIIISSSSSSSSSSSSHHHHSSSSSSS...,2.485,1,107,68,"[0.422, 0.5478000000000001, 0.4749000000000000...","[0.4801, 0.7943, 0.42160000000000003, 0.397300...","[0.9822000000000001, 1.272, 0.6940000000000001...","[0.5827, 0.7555000000000001, 0.5949, 0.4511, 0...","[0.9306000000000001, 1.0496, 0.5844, 0.7796000...","[0.895, 2.3377, 2.2305, 2.003, 1.9006, 1.0373,...","[0.46040000000000003, 3.6695, 0.78550000000000...","[2.7711, 7.365, 1.6924000000000001, 1.43840000...","[1.073, 2.8604000000000003, 1.9936, 1.0273, 1....","[2.0964, 3.3688000000000002, 0.6399, 2.1053, 1..."
7,7,id_00b436dec,GGAAAUCAUCGAGGACGGGUCCGUUCAGCACGCGAAAGCGUCGUGA...,.....(((((((((((..(((((((((..((((....))))..)))...,EEEEESSSSSSSSSSSIISSSSSSSSSIISSSSHHHHSSSSIISSS...,1.727,1,107,68,"[0.4843, 0.5233, 0.4554, 0.43520000000000003, ...","[0.8719, 1.0307, 0.6649, 0.34500000000000003, ...","[0.7045, 0.7775000000000001, 0.5662, 0.4561, 0...","[0.384, 0.723, 0.4766, 0.30260000000000004, 0....","[0.7429, 0.9137000000000001, 0.480400000000000...","[1.1576, 1.5137, 1.3382, 1.5622, 1.2121, 0.295...","[1.6912, 5.2652, 2.3901, 0.45890000000000003, ...","[1.8641, 2.3767, 1.149, 1.0132, 0.9876, 0.0, 0...","[0.49060000000000004, 4.6339, 1.95860000000000...","[1.2852000000000001, 2.5460000000000003, 0.234..."


In [25]:
arn_test.head()

Unnamed: 0,index,id,sequence,structure,predicted_loop_type,seq_length,seq_scored
0,0,id_00073f8be,GGAAAAGUACGACUUGAGUACGGAAAACGUACCAACUCGAUUAAAA...,......((((((((((.(((((.....))))))))((((((((......,EEEEEESSSSSSSSSSBSSSSSHHHHHSSSSSSSSSSSSSSSSHHH...,107,68
1,1,id_000ae4237,GGAAACGGGUUCCGCGGAUUGCUGCUAAUAAGAGUAAUCUCUAAAU...,.....((((..((((((...(((((.....((((....)))).......,EEEEESSSSIISSSSSSIIISSSSSIIIIISSSSHHHHSSSSIIII...,130,91
2,2,id_00131c573,GGAAAACAAAACGGCCUGGAAGACGAAGGAAUUCGGCGCGAAGGCC...,...........((.(((.(.(..((..((..((((...))))..))...,EEEEEEEEEEESSISSSISISIISSIISSIISSSSHHHSSSSIISS...,107,68
3,3,id_00181fd34,GGAAAGGAUCUCUAUCGAAGGAUAGAGAUCGCUCGCGACGGCACGA...,......((((((((((....))))))))))((((((..((.(((.....,EEEEEESSSSSSSSSSHHHHSSSSSSSSSSSSSSSSIISSISSSHH...,107,68
4,4,id_0020473f7,GGAAACCCGCCCGCGCCCGCCCGCGCUGCUGCCGUGCCUCCUCUCC...,.....(((((((((((((((((((((((((((((((((((((((((...,EEEEESSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS...,130,91


In [26]:
Counter(arn_train['seq_length']), Counter(arn_test['seq_length']), Counter(arn_test['seq_scored'])

(Counter({107: 1589}),
 Counter({107: 629, 130: 3005}),
 Counter({68: 629, 91: 3005}))

Un traitement préalable des arn_train & arn_test va être réalisé :

> Séquence de taille 107

- Part1: seq[:68]
- Part2: seq[107-68:]

> Séquence de taille 130

- Part1: seq[:68]
- Part2: seq[130-68:]

Ce traitement permet d'obtenir des séquences de tailles 68 qui correspond à la taille minimum des séquence scored.

### Input x

In [27]:
arn_train.shape

(1589, 19)

In [38]:
train_data = data.formatage_x(arn_train)
train_data.shape

(3178, 6)

In [29]:
train_data.head()

Unnamed: 0,index,id,sequence,structure,predicted_loop_type,seq_length
0,0,id_001f94081,GGAAAAGCUCUAAUAACAGGAGACUAGGACUACGUAUUUCUAGGUA...,.....((((((.......)))).)).((.....((..((((((......,EEEEESSSSSSHHHHHHHSSSSBSSXSSIIIIISSIISSSSSSHHH...,107
1,0,,CUAGGUAACUGGAAUAACCCAUACCAGCAGUUAGAGUUCGCUCUAA...,((((....))))))..)).....))....(((((((....))))))...,SSSSHHHHSSSSSSIISSIIIIISSXXXXSSSSSSSHHHHSSSSSS...,107
2,2,id_006f36f57,GGAAAGUGCUCAGAUAAGCUAAGCUCGAAUAGCAAUCGAAUAGAAU...,.....((((.((.....((((.(((.....)))..((((......)...,EEEEESSSSISSIIIIISSSSMSSSHHHHHSSSMMSSSSHHHHHHS...,107
3,2,,AUAGAAUCGAAAUAGCAUCGAUGUGUAUAUGGGUGGUUCGCCGCUC...,......))))..)))).....)).)))).(((((((....))))))...,HHHHHHSSSSMMSSSSIIIIISSISSSSXSSSSSSSHHHHSSSSSS...,107
4,5,id_00ab2d761,GGAAAGCGCCGCGGCGGUAGCGGCAGCGAGGAGCGCUACCAAGGCA...,.....(.(((((.(((((((((...........)))))))..(((....,EEEEESISSSSSISSSSSSSSSHHHHHHHHHHHSSSSSSSMMSSSH...,107


> One hot encoding des séquences

In [30]:
x_seq = data.x_sequence(train_data['sequence'])
x_seq.shape

(3178, 68, 4)

> One hot encoding des structures

In [31]:
x_struc = data.x_structure(train_data['structure'])
x_struc.shape

(3178, 68, 3)

> One hot encoding des loops type

In [32]:
x_loops = data.x_predicted_loops(train_data['predicted_loop_type'])
x_loops.shape

(3178, 68, 7)

> Regroupement dans une seul matrice de dimension (2400, 107, 14)

Avec

- 2400: le nombre d'ARN
- 68: la taille des séquences
- 14: le nombre de classes (4+3+7)

In [33]:
x_train = data.x_concatenation(x_seq, x_struc, x_loops)
x_train.shape

(3178, 68, 14)

### Output y

In [34]:
col_names = ['reactivity', 'deg_Mg_pH10', 'deg_pH10', 'deg_Mg_50C', 'deg_50C']
Y_output = arn_train[col_names]
Y_output.shape

(1589, 5)

> Formatage Y_output

In [36]:
y_train = data.y_output(Y_output)

Y_train shape: (3178, 68, 5)



# Modèle keras 

In [None]:
def resize_shape_data(model):
    model = layers.Conv1D(
        filters=5, kernel_size=(20,), activation="relu")(model)
    model = layers.Conv1D(
        filters=5, kernel_size=(20,), activation="relu")(model)
    model = layers.Conv1D(
        filters=5, kernel_size=(2,), activation="relu")(model)

    return model

def model_simple(input_shape, learning_rate):
    """
    Définition d'un 1er modèle simple.
    """
    # Convolution layers
    input_ = layers.Input(shape = input_shape)

    conv_1 = layers.Conv1D(
        filters=45, kernel_size=(3,), activation="relu", padding="same")(input_)
    conv_2 = layers.Conv1D(
        filters=35, kernel_size=(3,), activation="relu", padding="same")(conv_1)
    conv_3 = layers.Conv1D(
        filters=25, kernel_size=(3,), activation="relu", padding="same")(conv_2)
    conv_4 = layers.Conv1D(
        filters=15, kernel_size=(3,), activation="relu", padding="same")(conv_3)

    # Output layer - resize shape of the data to fit with the ouput (68,5)
    output = resize_shape_data(conv_4)
    model = models.Model(inputs=input_, outputs=output)

    # Compilation
    opt = optimizers.SGD(lr=learning_rate)
    model.compile(loss='mse', optimizer=opt, metrics=['mse'])

    return model

# Modèle apprentissage

# History

## Load history

In [None]:
history = rf.load_history()

## Plot loss & mse

In [None]:
for mdl in history:
    
    label = [('loss', 'val_loss'), ('mse', 'val_mse')]
    title = ['Model loss', 'Model mse']
    
    fig, axs = plt.subplots(2, 1, figsize=(12,10), constrained_layout=True)
    
    for i in range(2):
        axs[i].plot(history[mdl][label[i][0]], color="#8b1538", label=label[i][0])
        axs[i].plot(history[mdl][label[i][1]], color="#1F618D", label=label[i][1])
        axs[i].set_title(title[i], fontsize="x-large")
        
        # Ajouter la légende au dessus du plot sans changer sa taille
        axs[i].legend(loc=1, ncol=1, fontsize="x-large")
        
    fig.suptitle(mdl, fontsize="xx-large")
    break

## Save + load model dans un fichier h5

In [15]:
keras_models = rf.load_keras_models()

In [16]:
keras_models

{'model_resnet_15': <keras.engine.training.Model at 0x7f57a6d11650>,
 'model_scatter_data': <keras.engine.training.Model at 0x7f57a6cd7f50>,
 'model_inception': <keras.engine.training.Model at 0x7f57a00753d0>,
 'model_compact_data': <keras.engine.training.Model at 0x7f57a00c6dd0>,
 'model_simple': <keras.engine.training.Model at 0x7f57a6cd7450>,
 'model_resnext': <keras.engine.training.Model at 0x7f575643af90>}

In [39]:
test_data, x_test = data.new_x(arn_test)
test_data.shape, x_test.shape

Data frame arn_test shape: (7268, 6)
Sequences shape: (7268, 68, 4) - 4 classes
Structures shape: (7268, 68, 3) - 3 classes
Loops shape: (7268, 68, 7) - 7 classes
x_new shape: (7268, 68, 14)



((7268, 6), (7268, 68, 14))

In [25]:
def prediction(x_new, keras_models):
    y_new = {}
    for mdl in keras_models:
        mdl = 'model_simple'
        y_new[mdl] = keras_models[mdl].predict(x_new)
        break
    
    return y_new

predict = prediction(x_new, keras_models)

In [34]:
for mdl in predict:
    print(predict[mdl].shape)

(6639, 68, 5)


In [159]:
def traiter_predict_output(arn, predict):
    output = {}
    
    for i in range(0, len(predict), 2):
        seq_length = arn['seq_length'][i]
        prot = arn['id'][i]
    
        output[prot] = {'reactivity': [], 'deg_Mg_pH10': [], 'deg_pH10': [], 'deg_Mg_50C': [], 'deg_50C': []}
    
        if seq_length == 107:
            
            # Ajout des 39 premiers éléments du segment 1
            for index, key in enumerate(output[prot].keys()):
                for ele in predict[i][:,index][:39]:
                    output[prot][key].append(ele)
            
            # Ajout des 29 éléments en commun entre segment 1 & 2
            # Ici une moyenne des valeurs observées à une position donnée est faite
            for index, key in enumerate(output[prot].keys()):
                sum_ = np.add(predict[i][:,index][39:], predict[i+1][:,index][:29])
                mean_ = np.divide(sum_, 2)
                for ele in mean_:
                    output[prot][key].append(ele)
            
            # Ajout des 39 derniers éléments du segment 2
            for key in output[prot]:
                for ele in predict[i+1][:,index][29:]:
                    output[prot][key].append(ele)
            
        else:
            # Ajout des 62 premiers éléments du segment 1
            for index, key in enumerate(output[prot].keys()):
                for ele in predict[i][:,index][:62]:
                    output[prot][key].append(ele)
            
            # Ajout des 6 éléments en commun entre segment 1 & 2
            # Ici une moyenne des valeurs observées à une position donnée est faite
            for index, key in enumerate(output[prot].keys()):
                sum_ = np.add(predict[i][:,index][62:], predict[i+1][:,index][:6])
                mean_ = np.divide(sum_, 2)
                for ele in mean_:
                    output[prot][key].append(ele)
            
            # Ajout des 62 derniers éléments du segment 2
            for key in output[prot]:
                for ele in predict[i+1][:,index][6:]:
                    output[prot][key].append(ele)
    
    return output

output = traiter_output(test_data, y_train)

In [160]:
len(output)

1589

In [165]:
for prot in output:
    for i in range(len(output[prot]['reactivity'])):
        print("{}_{} {} {} {} {} {}".format(prot, i, 
                                            output[prot]['reactivity'][i],
                                            output[prot]['deg_Mg_pH10'][i],
                                            output[prot]['deg_pH10'][i],
                                            output[prot]['deg_Mg_50C'][i],
                                            output[prot]['deg_50C'][i]))
        break
    break

id_00073f8be_0 0.3297 0.7556 2.3375 0.35810000000000003 0.6382


In [169]:
def write_submission(output):
    with open('submission.csv', 'w') as filout:
        fields = ['id_seqpos', 'reactivity', 'deg_Mg_pH10', 'deg_pH10', 'deg_Mg_50C', 'deg_50C']
        f_writer = csv.DictWriter(filout, fieldnames=fields)
        f_writer.writeheader()
    
        for prot in output:
            for i in range(len(output[prot]['reactivity'])):
                data = {
                    'id_seqpos': f"{prot}_{i}",
                    'reactivity': output[prot]['reactivity'][i], 
                    'deg_Mg_pH10': output[prot]['deg_Mg_pH10'][i], 
                    'deg_pH10': output[prot]['deg_pH10'][i], 
                    'deg_Mg_50C': output[prot]['deg_Mg_50C'][i], 
                    'deg_50C': output[prot]['deg_50C'][i]
                }
                f_writer.writerow(data)
write_submission(output)