In [29]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import layers, models

# Show every column when a DataFrame is displayed (no truncation of wide frames).
pd.options.display.max_columns = None


# Fonctions

In [30]:
def create_benchmark_model(seq_len=100, n_features=19, n_classes=24, learning_rate=3e-3):
    """Build and compile the benchmark sequence classifier.

    The same input sequence is read by two GRU layers of 64 units, one in
    the natural order and one with ``go_backwards=True``. Only the final state
    of each is kept (``return_sequences=False``); the two states are
    concatenated and passed through four SELU dense layers (64, 128, 128, 64)
    and a softmax output layer.

    Parameters
    ----------
    seq_len : int, default 100
        Number of time steps in each input sequence.
    n_features : int, default 19
        Number of features per time step.
    n_classes : int, default 24
        Size of the softmax output layer.
    learning_rate : float, default 3e-3
        Learning rate given to the Adam optimizer.

    Returns
    -------
    tf.keras.Model
        Model compiled with sparse categorical cross-entropy (integer class
        labels) and the ``accuracy`` metric.
    """
    inputs = layers.Input(shape=(seq_len, n_features))

    # Two independent GRUs over the same input: forward and reversed order.
    gru_fwd = layers.GRU(64, return_sequences=False)(inputs)
    gru_bwd = layers.GRU(64, return_sequences=False, go_backwards=True)(inputs)

    concatenated = layers.concatenate([gru_fwd, gru_bwd])
    dense_1 = layers.Dense(64, activation='selu')(concatenated)
    dense_2 = layers.Dense(128, activation='selu')(dense_1)
    dense_3 = layers.Dense(128, activation='selu')(dense_2)
    dense_4 = layers.Dense(64, activation='selu')(dense_3)
    output = layers.Dense(n_classes, activation='softmax')(dense_4)

    model = tf.keras.Model(inputs=inputs, outputs=output)

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    return model

# Création de la base de données

In [31]:
# Directory holding the challenge CSV files. It defaults to the original local
# path and can be redirected with the PROJET_2_DATA_DIR environment variable,
# so the notebook can run on another machine without editing this cell.
DATA_DIR = Path(os.environ.get(
    "PROJET_2_DATA_DIR",
    "/Users/aurelientarroux/Desktop/Projet_prog/Projet_2/Data",
))

# Raw training features, training labels and test features, as read from disk.
X_train_intitial = pd.read_csv(DATA_DIR / "X_train_N1UvY30.csv")
y_train_intitial = pd.read_csv(DATA_DIR / "y_train_or6m3Ta.csv")
X_test_initial = pd.read_csv(DATA_DIR / "X_test_m4HAPAP.csv")

In [32]:
def data_base(X, seq_len=100):
    """Turn a flat table of rows into a 3-D array of fixed-length sequences.

    Steps, in order:
      1. add ``bid_ask_spread = ask - bid`` and cast ``trade`` to int;
      2. one-hot encode ``venue``, ``action`` and ``side`` (the indicator
         columns are appended after the remaining columns);
      3. drop the identifier columns ``obs_id`` and ``order_id``;
      4. divide ``price``, ``bid``, ``ask``, ``bid_size``, ``ask_size`` and
         ``bid_ask_spread`` by their respective column maximum;
      5. reshape consecutive groups of ``seq_len`` rows into one sequence.

    The input frame is left untouched; all work is done on a new frame.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain the columns named above. Its number of rows must be a
        multiple of ``seq_len``.
    seq_len : int, default 100
        Number of consecutive rows that form one sequence.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(X) // seq_len, seq_len, n_features)`` where
        ``n_features`` is the number of columns left after encoding.

    Raises
    ------
    ValueError
        If the number of rows is not a multiple of ``seq_len``.
    """
    n_rows = len(X)
    if n_rows % seq_len != 0:
        raise ValueError(
            f"number of rows ({n_rows}) is not a multiple of seq_len ({seq_len})"
        )

    categorical_cols = ['venue', 'action', 'side']
    scaled_cols = ['price', 'bid', 'ask', 'bid_size', 'ask_size', 'bid_ask_spread']

    # assign() returns a new frame, so the caller's DataFrame is not modified.
    features = X.assign(
        bid_ask_spread=X['ask'] - X['bid'],
        trade=X['trade'].astype(int),
    )

    # One-hot encode the categorical columns; they are replaced by integer
    # indicator columns named "<column>_<value>".
    features = pd.get_dummies(features, columns=categorical_cols, dtype=int)
    features = features.drop(columns=['obs_id', 'order_id'])

    # Scale by the column maximum of the frame that was passed in. Separate
    # calls (for instance on train and on test data) therefore use their own
    # scale factors.
    for col in scaled_cols:
        features[col] = features[col] / features[col].max()

    return features.to_numpy().reshape(n_rows // seq_len, seq_len, features.shape[1])

# Test du modèle

In [None]:
# Encode the labelled data and hold out 20% of the sequences for validation
# (fixed random_state so the split is reproducible).
X_train, X_test, y_train, y_test = train_test_split(
    data_base(X_train_intitial),
    y_train_intitial['eqt_code_cat'].to_numpy(),
    test_size=0.2,
    random_state=42,
)

In [None]:
# Train a freshly built benchmark network on the training part of the split.
model = create_benchmark_model()
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=1000,
    epochs=10,
)

In [None]:
# Predict class probabilities for the held-out sequences and keep, for each
# sequence, the index of the most probable class (one vectorised argmax over
# the class axis instead of a Python loop over rows).
prediction_prob = model.predict(X_test)
y_pred = prediction_prob.argmax(axis=1).tolist()

In [None]:
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy gives the
# same value either way, but the documented order keeps this cell correct if
# the metric is later swapped for an asymmetric one.
accuracy_score(y_test, y_pred)

# Mise en place du modèle

In [33]:
# Inputs for the final model: all labelled sequences are used for training and
# the unlabelled test file goes through the same encoding function.
# These assignments replace the X_train / X_test / y_train names produced by
# the validation split.
X_train = data_base(X_train_intitial)
X_test = data_base(X_test_initial)
y_train = y_train_intitial['eqt_code_cat'].to_numpy()

In [34]:
# Train a freshly built benchmark network on the full labelled data set.
# NOTE(review): the saved output of this cell reports "Epoch n/30" although
# 10 epochs are requested here -- the output appears to predate an edit of the
# cell; re-run it to refresh.
model = create_benchmark_model()
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=1000,
    epochs=10,
)

Epoch 1/30
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 254ms/step - accuracy: 0.0684 - loss: 3.2742
Epoch 2/30
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 359ms/step - accuracy: 0.1609 - loss: 2.6223
Epoch 3/30
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 313ms/step - accuracy: 0.2272 - loss: 2.3975
Epoch 4/30
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 584ms/step - accuracy: 0.2671 - loss: 2.2739
Epoch 5/30
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 695ms/step - accuracy: 0.3108 - loss: 2.1413
Epoch 6/30
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 640ms/step - accuracy: 0.3367 - loss: 2.0524
Epoch 7/30
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m119s[0m 739ms/step - accuracy: 0.3635 - loss: 1.9783
Epoch 8/30
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 702ms/step - accuracy: 0.3886 - loss: 1.8945
Epoch 9/30


In [35]:
# Predict class probabilities for the test sequences and keep, for each
# sequence, the index of the most probable class (one vectorised argmax over
# the class axis instead of a Python loop over rows).
prediction_prob = model.predict(X_test)
y_pred_brut = prediction_prob.argmax(axis=1).tolist()

[1m2550/2550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 8ms/step


In [36]:
# Build the submission table: reset_index() turns the positional index into
# the first column, and the two columns then take their names from the
# training-label file so the export has the same header.
y_pred = (
    pd.DataFrame(y_pred_brut)
    .reset_index()
    .set_axis(y_train_intitial.columns, axis=1)
)
y_pred

Unnamed: 0,obs_id,eqt_code_cat
0,0,12
1,1,3
2,2,19
3,3,19
4,4,18
...,...,...
81595,81595,9
81596,81596,0
81597,81597,12
81598,81598,10


In [37]:
# Write the predictions to disk without the DataFrame index column.
y_pred.to_csv(path_or_buf='export_data_2.csv', index=False)