In [56]:
import pandas as pd 
import numpy as np 
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU, Dropout
from tensorflow.keras.activations import linear, relu, sigmoid
from modules.first_model import change_X_data_1, change_y_data



In [2]:
X = pd.read_csv("/Users/elouan/Data/Data_ENSxCFM/X_train.csv")
y = pd.read_csv("/Users/elouan/Data/Data_ENSxCFM/y_train.csv")

Categorical Features: action and side are object types. We'll need to convert them to numerical values (e.g., one-hot encoding) before feeding them into the model. The trade feature is a boolean and can also be treated as a binary numeric value.

For **trade** :

False = 0, True = 1

For **side** :

A = 0 , B = 1

For **action** :
- **A**: \([1, 0, 0]\)
- **D**: \([0, 1, 0]\)
- **U**: \([0, 0, 1]\)


# One-Hot Encoding

## Introduction
One-Hot Encoding is a technique used to convert categorical variables into a format that can be provided to machine learning algorithms. This process helps to represent categorical data as binary vectors.

## Definition
Given a categorical variable with \( n \) unique categories, One-Hot Encoding transforms it into \( n \) binary variables, where each variable represents one category. If the category is present, it is represented by \( 1 \) (hot), and if it is absent, it is represented by \( 0 \) (cold).

## Example
Consider a categorical variable, **action**, with three categories: **A**, **D**, and **U**. The One-Hot Encoding for this variable would be:

- **A**: \([1, 0, 0]\)
- **D**: \([0, 1, 0]\)
- **U**: \([0, 0, 1]\)

Each vector corresponds to one of the categories.

This results in three new binary columns representing the original categorical variable.

## Purpose
One-Hot Encoding is essential in machine learning because:

- It prevents the algorithm from interpreting categorical variables as ordinal data.
- It allows for the inclusion of categorical data in algorithms that require numerical input.
- It helps improve model performance by providing a clearer representation of categorical variables.

## Conclusion
One-Hot Encoding is a straightforward and effective technique for preprocessing categorical data in machine learning. By transforming categories into binary vectors, it allows algorithms to better interpret and utilize this data.


In [3]:
X = change_X_data_1(X)
X.head()

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,obs_id,venue,order_id,action,side,price,bid,ask,bid_size,ask_size,trade,flux
0,0,3.17,32.33,0.49,0.48,0.535,0.0,1.52,5.065281,3.316211,0.0,2.99
1,1,3.51,29.13,0.59,0.32,-0.257,1.34,5.26,4.393798,5.215185,0.03,2.8
2,2,3.44,38.32,0.46,0.56,0.12,0.0,1.07,7.417132,6.453014,0.01,9.0
3,3,2.6,34.08,0.51,0.84,1.797,6.9,20.0,4.133141,4.470712,0.0,1.32
4,4,3.01,36.4,0.52,0.53,0.193,0.97,2.71,4.639798,6.414855,0.0,-10.59


In [4]:
X.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160800 entries, 0 to 160799
Data columns (total 12 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   obs_id    160800 non-null  int64  
 1   venue     160800 non-null  float64
 2   order_id  160800 non-null  float64
 3   action    160800 non-null  float64
 4   side      160800 non-null  float64
 5   price     160800 non-null  float64
 6   bid       160800 non-null  float64
 7   ask       160800 non-null  float64
 8   bid_size  160800 non-null  float64
 9   ask_size  160800 non-null  float64
 10  trade     160800 non-null  float64
 11  flux      160800 non-null  float64
dtypes: float64(11), int64(1)
memory usage: 14.7 MB


In [5]:
#y = change_y_data(y)
y = y.drop(columns=["obs_id"])

y.head()

Unnamed: 0,eqt_code_cat
0,10
1,15
2,0
3,13
4,0


In [6]:
y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160800 entries, 0 to 160799
Data columns (total 1 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   eqt_code_cat  160800 non-null  int64
dtypes: int64(1)
memory usage: 1.2 MB


In [88]:
tf.random.set_seed(1234) # for consistent results
model = Sequential(
    [   GRU(64, return_sequences=True, input_shape = ((X.shape[1]), 1) ),            
        Dropout(0.2),
        GRU(64),
        Dropout(0.2),
        Dense(120, activation = 'relu',   name = "L1"),
        Dense(48, activation = 'relu',   name = "L2"),
        Dense(24, activation = 'softmax',   name = "L4")
        
        
    ], name = "first_multiclass_model" 
)

  super().__init__(**kwargs)


In [89]:
model.summary()

In [90]:
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.006),
    metrics=['accuracy']
)

history = model.fit(
    X,y,
    epochs=10,
)

Epoch 1/10
[1m5025/5025[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 7ms/step - loss: 2.6462
Epoch 2/10
[1m5025/5025[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 7ms/step - loss: 2.5194
Epoch 3/10
[1m5025/5025[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 7ms/step - loss: 2.5198
Epoch 4/10
[1m3909/5025[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m8s[0m 8ms/step - loss: 2.5326

KeyboardInterrupt: 

In [91]:
model.save("../models/model_mean1.h5")  # Saves the model as an HDF5 file




In [70]:
x_test = pd.read_csv("/Users/elouan/Data/Data_ENSxCFM/X_test.csv")

In [None]:
x_test.info()


In [None]:
x_test.head()


In [71]:
x_test = convert_X_data(x_test)
x_test.head()

Unnamed: 0,obs_id,venue,order_id,action,side,price,bid,ask,bid_size,ask_size,trade,flux
0,0,3.42,25.49,0.5,0.72,5.225,0.0,15.48,6.369905,4.952439,0.0,0.0
1,1,2.62,36.48,0.54,0.97,0.993,0.0,4.0,3.78419,6.631546,0.0,-2.79
2,2,1.64,28.69,0.47,0.5,3.889,0.76,11.0,2.322319,6.330219,0.0,1.64
3,3,2.85,21.91,0.58,0.3,-13.518,4.83,8.86,2.952731,2.874551,0.0,-5.39
4,4,3.34,33.67,0.51,0.4,-2.852,0.0,2.0,7.744464,6.856844,0.0,-3.99


In [72]:
x_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81600 entries, 0 to 81599
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   obs_id    81600 non-null  int64  
 1   venue     81600 non-null  float64
 2   order_id  81600 non-null  float64
 3   action    81600 non-null  float64
 4   side      81600 non-null  float64
 5   price     81600 non-null  float64
 6   bid       81600 non-null  float64
 7   ask       81600 non-null  float64
 8   bid_size  81600 non-null  float64
 9   ask_size  81600 non-null  float64
 10  trade     81600 non-null  float64
 11  flux      81600 non-null  float64
dtypes: float64(11), int64(1)
memory usage: 7.5 MB


In [86]:
n = 14

prediction = model.predict(np.array(x_test.iloc[n]).reshape(1,12))
prediction_p = tf.nn.softmax(prediction)
yhat = np.argmax(prediction_p)
print(prediction_p)
print("prediction :")
print(yhat)
print("expected value : ")
print(y.iloc[n])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
tf.Tensor(
[[2.83120840e-04 7.51730427e-02 9.15032327e-02 1.48889367e-02
  2.97237444e-03 4.28154804e-02 1.13305867e-01 9.53647941e-02
  5.94788529e-02 1.16557974e-04 5.41981636e-03 2.39028260e-02
  3.55553515e-02 1.02358788e-01 4.91270870e-02 3.55676636e-02
  1.48213119e-04 6.29185932e-04 4.45173384e-04 1.27164468e-01
  7.28455465e-03 2.74219830e-03 4.00100660e-04 1.13352306e-01]], shape=(1, 24), dtype=float32)
prediction :
19
expected value : 
eqt_code_cat    14
Name: 14, dtype: int64


In [92]:
m = 1000
S = 0

for i in range(m):
    
    prediction = model.predict(np.array(x_test.iloc[i]).reshape(1,12))
    prediction_p = tf.nn.softmax(prediction)
    yhat = np.argmax(prediction_p)

    if yhat == y["eqt_code_cat"].iloc[i] :
        S+=1
    
print(f"{(S/m)*100}% of precision")

    

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 249ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1