In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow.keras.activations import linear, relu, sigmoid
from tensorflow.keras.layers import Dense, GRU, Dropout
from tensorflow.keras.models import Sequential
from modules.first_model import change_X_data_1, change_y_data, change_X_data_2



In [2]:
# Folder containing the challenge CSV files. Set the ENSXCFM_DATA_DIR
# environment variable to run the notebook on another machine; when it is not
# set, the original location is used.
DATA_DIR = Path(os.environ.get("ENSXCFM_DATA_DIR", "/Users/elouan/Data/Data_ENSxCFM"))

# X: raw training features; y: the matching labels (see the previews below).
X = pd.read_csv(DATA_DIR / "X_train.csv")
y = pd.read_csv(DATA_DIR / "y_train.csv")

**Categorical features:** `action` and `side` are object (string) columns, so they must be converted to numerical values (e.g., with one-hot encoding) before being fed into the model. The `trade` feature is a boolean and can be treated as a binary numeric value.

For **trade** :

False = 0, True = 1

For **side** :

A = 0 , B = 1

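For **action** :
- **A**: $[1, 0, 0]$
- **D**: $[0, 1, 0]$
- **U**: $[0, 0, 1]$

> **Note:** the arrays built later in this notebook have 11 features per event, i.e. one per raw column (`venue`, `order_id`, `action`, `side`, `price`, `bid`, `ask`, `bid_size`, `ask_size`, `trade`, `flux`). `action` therefore appears to reach the model as a single integer code rather than as three one-hot columns — to be confirmed against `change_X_data_2`.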


# One-Hot Encoding

## Introduction
One-Hot Encoding is a technique used to convert categorical variables into a format that can be provided to machine learning algorithms. This process helps to represent categorical data as binary vectors.

## Definition
Given a categorical variable with $n$ unique categories, One-Hot Encoding transforms it into $n$ binary variables, where each variable represents one category. If the category is present, it is represented by $1$ (hot), and if it is absent, it is represented by $0$ (cold).

## Example
Consider a categorical variable, **action**, with three categories: **A**, **D**, and **U**. The One-Hot Encoding for this variable would be:

- **A**: $[1, 0, 0]$
- **D**: $[0, 1, 0]$
- **U**: $[0, 0, 1]$

Each vector corresponds to one of the categories.

This results in three new binary columns representing the original categorical variable.

## Purpose
One-Hot Encoding is essential in machine learning because:

- It prevents the algorithm from interpreting categorical variables as ordinal data.
- It allows for the inclusion of categorical data in algorithms that require numerical input.
- It helps improve model performance by providing a clearer representation of categorical variables.

## Conclusion
One-Hot Encoding is a straightforward and effective technique for preprocessing categorical data in machine learning. By transforming categories into binary vectors, it allows algorithms to better interpret and utilize this data.


In [3]:
# Convert the raw training frame with the project helper; the result holds one
# matrix per row in a single column named "column 1" (see the preview below).
# NOTE(review): the recorded output of this cell is the tail of a warning raised
# inside a ufunc call (message truncated) — TODO confirm what triggers it in
# change_X_data_2 (possibly a log of a non-positive value).
X2 = change_X_data_2(X)


  result = getattr(ufunc, method)(*inputs, **kwargs)


In [4]:
# Preview the first rows; every cell of "column 1" holds a nested numeric matrix.
X2.head()

Unnamed: 0,column 1
0,"[[4.0, 0.0, 0.0, 1.0, 3.0, 0.0, 1.0, 4.6151204..."
1,"[[4.0, 0.0, 0.0, 0.0, -0.8, 0.0, 4.0, 1.098612..."
2,"[[4.0, 0.0, 0.0, 1.0, 0.1, 0.0, 1.0, 6.947937,..."
3,"[[4.0, 0.0, 0.0, 1.0, 2.1, 0.0, 20.0, 6.527958..."
4,"[[4.0, 0.0, 1.0, 1.0, 0.1, 0.0, 1.0, 6.302619,..."


In [5]:
# Inspect the matrix stored at row label 2 of "column 1"
# (label-based lookup; `.at[2, "column 1"]` would return the same object).
value = X2.loc[2, "column 1"]
print(value)


[[   4.           0.           0.        ...    7.7836404    0.
   100.       ]
 [   4.           1.           0.        ...    7.8244457    0.
   100.       ]
 [   4.           1.           1.        ...    7.7836404    0.
  -100.       ]
 ...
 [   4.          69.           1.        ...    5.9939613    0.
  -100.       ]
 [   4.          76.           0.        ...    5.9939613    0.
    10.       ]
 [   4.          77.           0.        ...    5.9939613    0.
   100.       ]]


In [6]:
# Row count, dtype and non-null count of the transformed frame.
X2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160800 entries, 0 to 160799
Data columns (total 1 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   column 1  160800 non-null  object
dtypes: object(1)
memory usage: 1.2+ MB


In [38]:
# Stack the per-row matrices into one dense array. np.stack requires every
# matrix to have the same shape and avoids re-copying each row individually.
X3 = np.stack(X2["column 1"].tolist())
# Fail early if the result is not a 3-D array
# (recorded shape of the original run: (160800, 100, 11)).
assert X3.ndim == 3, f"expected a 3-D array, got shape {X3.shape}"

In [12]:
# Second event (index 1) of the first observation (index 0).
X3[0, 1]

array([  4.       ,   1.       ,   0.       ,   0.       ,  -1.7      ,
         0.       ,   1.       ,   4.6151204,   0.6931472,   0.       ,
       100.       ], dtype=float32)

In [48]:
# Shape of the stacked feature array — presumably
# (observations, events per observation, features per event); TODO confirm axis meaning.
X3.shape

(160800, 100, 11)

In [7]:
# Drop the `obs_id` column so that only the target column `eqt_code_cat` remains.
y1 = y.drop("obs_id", axis="columns")

y1.head()

Unnamed: 0,eqt_code_cat
0,10
1,15
2,0
3,13
4,0


In [8]:
# Check the label column's dtype and that it has no missing values.
y1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160800 entries, 0 to 160799
Data columns (total 1 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   eqt_code_cat  160800 non-null  int64
dtypes: int64(1)
memory usage: 1.2 MB


In [47]:
# Labels as a standalone 1-D NumPy array (explicit copy, detached from the frame).
y2 = y1["eqt_code_cat"].to_numpy(copy=True)
y2.shape

(160800,)

In [64]:
# True if at least one NaN is present; both lines should print False.
print(np.isnan(X3).any())  # features
print(np.isnan(y2).any())  # labels

False
False


In [63]:
# Replace NaNs with 0.0 (the function's default, spelled out here).
# NOTE(review): with the default settings +inf/-inf are mapped to the largest /
# smallest finite value of the dtype, not to 0 — confirm the features contain no
# infinities, otherwise those values will dominate the network inputs.
X3 = np.nan_to_num(X3, nan=0.0)

In [65]:
# One dataset element per observation: (feature matrix, integer label).
dataset = tf.data.Dataset.from_tensor_slices(tensors=(X3, y2))

In [66]:
# Print the first three (features, label) pairs as a sanity check.
# The loop variables are deliberately NOT called `x`/`y`: `y` is the label
# DataFrame loaded at the top of the notebook, and rebinding it here would break
# any later cell (or re-run of an earlier one) that still needs that frame.
for features, label in dataset.take(3):  # `.take()` limits output to 3 samples
    print("Features:", features.numpy())
    print("Label:", label.numpy())


Features: [[   4.           0.           0.        ...    0.6931472    0.
   100.       ]
 [   4.           1.           0.        ...    0.6931472    0.
   100.       ]
 [   4.           2.           1.        ...    0.6931472    0.
  -100.       ]
 ...
 [   4.          64.           0.        ...    5.739793     0.
   100.       ]
 [   4.          63.           1.        ...    5.739793     0.
  -100.       ]
 [   4.          65.           0.        ...    5.739793     0.
   100.       ]]
Label: 10
Features: [[   4.           0.           0.        ...    4.3307333    0.
   100.       ]
 [   4.           1.           1.        ...    4.3307333    0.
   -20.       ]
 [   4.           2.           0.        ...    4.3307333    0.
   100.       ]
 ...
 [   4.          60.           0.        ...    4.6151204    0.
    10.       ]
 [   2.          61.           0.        ...    4.6151204    0.
    10.       ]
 [   4.          37.           1.        ...    4.6151204    0.
  -100.       ]

In [67]:
# Show the shape and dtype of a single (features, label) element.
print(dataset.element_spec)


(TensorSpec(shape=(100, 11), dtype=tf.float32, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))


In [68]:
batch_size = 32
# Shuffle buffer as large as the whole dataset, so all samples are mixed together.
dataset = dataset.shuffle(len(X3))



In [69]:
# Group samples into mini-batches of `batch_size`.
# NOTE(review): `dataset` is rebound to its own transformation, so running this
# cell a second time batches the batches. Rebuild `dataset` from the
# `from_tensor_slices` cell before re-running.
dataset = dataset.batch(batch_size)

In [70]:
# Check the shape of one mini-batch.
# Loop variables avoid the names `x`/`y` so the label DataFrame `y` loaded at the
# top of the notebook is not overwritten by a tensor.
for features_batch, labels_batch in dataset.take(1):
    print(f"Features batch shape: {features_batch.shape}, Labels batch shape: {labels_batch.shape}")

Features batch shape: (32, 100, 11), Labels batch shape: (32,)


In [71]:
tf.random.set_seed(1234)  # for consistent results

# Sequence classifier: two stacked GRU layers followed by a small dense head.
# The input shape is declared with an explicit Input object rather than the
# `input_shape=` keyword on the first layer, which newer Keras versions deprecate.
model = Sequential(
    [
        tf.keras.Input(shape=(100, 11)),   # one observation, matching X3.shape[1:]
        GRU(64, return_sequences=True),    # emits the full sequence for the next GRU
        Dropout(0.2),
        GRU(64),                           # keeps only the final state
        Dropout(0.2),
        Dense(32, activation="relu", name="L1"),
        # Softmax over 24 units — presumably one per `eqt_code_cat` class;
        # TODO confirm the labels span 0..23.
        Dense(24, activation="softmax", name="output"),
    ],
    name="first_multiclass_model",
)

In [72]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [104]:
# Labels are integer class ids and the output layer already applies softmax, so
# the loss is sparse categorical cross-entropy computed on probabilities
# (the string identifier uses from_logits=False).
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# No validation data is passed, so the logged accuracy is training accuracy only.
# The recorded log of the original run ends after epoch 8 of 50.
history = model.fit(dataset, epochs=50)

Epoch 1/50
[1m5025/5025[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m360s[0m 71ms/step - accuracy: 0.5480 - loss: 1.3203
Epoch 2/50
[1m5025/5025[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m342s[0m 68ms/step - accuracy: 0.5576 - loss: 1.2901
Epoch 3/50
[1m5025/5025[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m358s[0m 71ms/step - accuracy: 0.5645 - loss: 1.2781
Epoch 4/50
[1m5025/5025[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m358s[0m 71ms/step - accuracy: 0.5668 - loss: 1.2677
Epoch 5/50
[1m5025/5025[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m349s[0m 69ms/step - accuracy: 0.5763 - loss: 1.2412
Epoch 6/50
[1m5025/5025[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m351s[0m 70ms/step - accuracy: 0.5813 - loss: 1.2227
Epoch 7/50
[1m5025/5025[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m351s[0m 70ms/step - accuracy: 0.5863 - loss: 1.2095
Epoch 8/50
[1m5025/5025[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m350s[0m 69ms/step - accuracy: 0.5888 - loss: 1.2010


In [105]:
# Persist the trained model; the relative target folder must already exist.
# NOTE(review): recent Keras releases recommend the native `.keras` format over
# HDF5 — the `.h5` name is kept here so anything loading this path keeps working.
model.save("../models/model_matrix_2.h5")  # Saves the model as an HDF5 file




In [75]:
# Read the test features from the data folder. The folder can be overridden with
# the ENSXCFM_DATA_DIR environment variable; the default is the original location.
x_test = pd.read_csv(
    Path(os.environ.get("ENSXCFM_DATA_DIR", "/Users/elouan/Data/Data_ENSxCFM")) / "X_test.csv"
)

In [None]:
# Column dtypes and non-null counts of the raw test frame.
x_test.info()


In [103]:
# Preview the first test rows.
# NOTE(review): this cell's execution count (103) is later than the transform
# cell's (76), so the recorded preview may show the frame after
# `change_X_data_2` has touched it — confirm.
x_test.head()


Unnamed: 0,obs_id,venue,order_id,action,side,price,bid,ask,bid_size,ask_size,trade,flux
0,0,4,0,0,1,1.5,0.0,15.0,6.238325,4.615121,0,100
1,0,2,1,1,1,1.6,0.0,15.0,6.238325,4.615121,0,-100
2,0,4,2,1,1,16.3,0.0,15.0,6.238325,4.615121,0,-100
3,0,4,3,0,1,16.2,0.0,15.0,6.238325,4.615121,0,100
4,0,2,4,0,1,1.5,0.0,15.0,6.238325,5.303305,0,100


In [76]:
# Apply the same project helper used for the training features, then preview.
# NOTE(review): TODO confirm whether change_X_data_2 modifies its argument in
# place; if so, re-running this cell on an already-transformed `x_test` may not
# give the same result.
x_test_2 = change_X_data_2(x_test)
x_test_2.head()

Unnamed: 0,column 1
0,"[[4.0, 0.0, 0.0, 1.0, 1.5, 0.0, 15.0, 6.238324..."
1,"[[5.0, 0.0, 0.0, 1.0, 0.4, 0.0, 4.0, 3.7841897..."
2,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 11.0, 6.456769..."
3,"[[4.0, 0.0, 1.0, 0.0, -7.0, 0.0, 8.0, 2.302585..."
4,"[[0.0, 0.0, 0.0, 0.0, -1.0, 0.0, 2.0, 7.189922..."


In [77]:
# Stack the per-row matrices into one dense array. np.stack requires every
# matrix to have the same shape and avoids re-copying each row individually.
x_test_3 = np.stack(x_test_2["column 1"].tolist())
# Fail early if the result is not a 3-D array
# (recorded shape of the original run: (81600, 100, 11)).
assert x_test_3.ndim == 3, f"expected a 3-D array, got shape {x_test_3.shape}"

In [101]:
# First event (index 0) of the first test observation (index 0).
x_test_3[0, 0]

array([  4.       ,   0.       ,   0.       ,   1.       ,   1.5      ,
         0.       ,  15.       ,   6.2383246,   4.6151204,   0.       ,
       100.       ], dtype=float32)

In [80]:
# Shape of the stacked test array; the last two dimensions must match the
# model's (100, 11) input.
x_test_3.shape

(81600, 100, 11)

In [84]:
# Take as many labels as there are test observations (was a hard-coded 81600).
# NOTE(review): `y2` comes from y_train.csv, so these are the labels of the first
# TRAINING rows, not labels for X_test.csv. Any loss/accuracy computed against
# them says nothing about test performance. Replace with real test labels, or
# hold out part of the training data for validation instead.
y_test = y2[:len(x_test_3)]

In [85]:
# Must equal the number of test observations (first dimension of x_test_3).
y_test.shape

(81600,)

In [86]:
# One dataset element per test observation: (feature matrix, label from `y_test`).
dataset_test = tf.data.Dataset.from_tensor_slices(tensors=(x_test_3, y_test))

In [100]:
# Print the first three elements of the test dataset as a sanity check.
# Loop variables avoid the names `x`/`y` so the label DataFrame `y` loaded at the
# top of the notebook is not overwritten by a tensor.
for features, label in dataset_test.take(3):  # `.take()` limits output to 3 samples
    print("Features:", features.numpy())
    print("Label:", label.numpy())

Features: [[[   3.           0.           0.        ...    5.2832036    0.
     59.       ]
  [   3.           1.           0.        ...    5.2832036    0.
    100.       ]
  [   5.           2.           0.        ...    5.2832036    0.
    100.       ]
  ...
  [   4.          60.           1.        ...    0.6931472    0.
     -1.       ]
  [   4.          57.           1.        ...    5.924256     0.
   -100.       ]
  [   2.          64.           0.        ...    5.924256     0.
    100.       ]]

 [[   4.           0.           1.        ...    5.5947113    0.
   -200.       ]
  [   4.           1.           1.        ...    5.1298985    0.
   -100.       ]
  [   4.           2.           0.        ...    5.1298985    0.
    300.       ]
  ...
  [   1.          71.           0.        ...    7.2950563    0.
     45.       ]
  [   5.          72.           0.        ...    7.3607397    0.
    100.       ]
  [   5.          73.           0.        ...    7.386471     0.
     41. 

In [93]:
batch_size = 32
# The evaluation set is intentionally NOT shuffled: loss and accuracy do not
# depend on sample order, and a shuffle buffer the size of the whole test set
# only costs memory and start-up time.



In [95]:
# Group test samples into mini-batches of `batch_size`.
# NOTE(review): `dataset_test` is rebound to its own transformation, so running
# this cell twice batches the batches. Rebuild `dataset_test` before re-running.
dataset_test = dataset_test.batch(batch_size)

In [107]:
# Compute loss and accuracy of the trained model on `dataset_test`.
# NOTE(review): the recorded output below reports 5025 steps, which is the
# number of TRAINING batches (160800 / 32); the 81600-row test set would give
# 2550. The saved numbers therefore come from stale kernel state, not from the
# test data. In addition, `y_test` is sliced from the training labels, so even a
# clean re-run does not measure real test accuracy.
test_loss, test_accuracy = model.evaluate(dataset_test)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

[1m5025/5025[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 15ms/step - accuracy: 0.6942 - loss: 0.8825
Test Loss: 0.879999041557312, Test Accuracy: 0.6941915154457092


In [102]:
# Add a leading batch axis: (100, 11) -> (1, 100, 11), the shape the model expects.
single_sequence = x_test_3[0][np.newaxis, ...]

# The model returns one probability per class; keep the index of the largest.
prediction = model.predict(single_sequence)
yhat = prediction.argmax()

print("Prediction for the single sequence:", yhat)
# NOTE(review): y2[0] is the label of TRAINING row 0, while the prediction is for
# TEST row 0, so this "expected" value is not the ground truth for that sequence.
print("expected : ", y2[0])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
Prediction for the single sequence: 13
expected :  10


In [None]:
# NOTE(review): disabled experiment, kept for reference only. It no longer
# matches this notebook: `x_test_3` is a NumPy array (no `.iloc`), the model
# expects inputs of shape (1, 100, 11) rather than (1, 12), and the output layer
# already applies softmax, so the extra `tf.nn.softmax` would be redundant.
# n = 14

# prediction = model.predict(np.array(x_test_3.iloc[n]).reshape(1,12))
# prediction_p = tf.nn.softmax(prediction)
# yhat = np.argmax(prediction_p)
# print(prediction_p)
# print("prediction :")
# print(yhat)
# print("expected value : ")
# print(y.iloc[n])

In [None]:
# NOTE(review): disabled experiment, kept for reference only. It reshapes each
# row to (1, 12), which does not match the (1, 100, 11) input of the model
# defined above, and it calls `model.predict` once per sample; `model.evaluate`
# or a single batched `predict` gives the same information much faster.
# m = 1000
# S = 0

# for i in range(m):
    
#     prediction = model.predict(np.array(x_test.iloc[i]).reshape(1,12))
#     prediction_p = tf.nn.softmax(prediction)
#     yhat = np.argmax(prediction_p)

#     if yhat == y["eqt_code_cat"].iloc[i] :
#         S+=1
    
# print(f"{(S/m)*100}% of precision")

    