In [21]:
import pandas as pd 
import numpy as np 
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.activations import linear, relu, sigmoid



In [16]:
# Read the two CSV files used throughout the notebook: the event-level
# feature table (X) and the label table (y).
features_csv = "../data/X_train.csv"
labels_csv = "../data/y_train.csv"

X = pd.read_csv(features_csv)
y = pd.read_csv(labels_csv)

**Categorical features:** `action` and `side` have dtype `object` (strings), so they must be converted to numeric values before being fed to the model: `side` takes two values and becomes a single 0/1 column, while `action` is one-hot encoded. The `trade` feature is a boolean and is likewise treated as a binary (0/1) numeric value.

For **trade** :

False = 0, True = 1

For **side** :

A = 0, B = 1

For **action** :
- **A**: $[1, 0, 0]$
- **D**: $[0, 1, 0]$
- **U**: $[0, 0, 1]$


# One-Hot Encoding

## Introduction
One-Hot Encoding is a technique used to convert categorical variables into a format that can be provided to machine learning algorithms. This process helps to represent categorical data as binary vectors.

## Definition
Given a categorical variable with $n$ unique categories, One-Hot Encoding transforms it into $n$ binary variables, where each variable represents one category. For a given row, the variable of the category that is present is set to $1$ (hot), and the variables of all other categories are set to $0$ (cold).

## Example
Consider a categorical variable, **action**, with three categories: **A**, **D**, and **U**. The One-Hot Encoding for this variable would be:

- **A**: $[1, 0, 0]$
- **D**: $[0, 1, 0]$
- **U**: $[0, 0, 1]$

Each vector corresponds to one of the categories.

This results in three new binary columns representing the original categorical variable.

## Purpose
One-Hot Encoding is essential in machine learning because:

- It prevents the algorithm from interpreting categorical variables as ordinal data.
- It allows for the inclusion of categorical data in algorithms that require numerical input.
- It helps improve model performance by providing a clearer representation of categorical variables.

## Conclusion
One-Hot Encoding is a straightforward and effective technique for preprocessing categorical data in machine learning. By transforming categories into binary vectors, it allows algorithms to better interpret and utilize this data.


In [33]:

# Split the full event table into one DataFrame per obs_id; the resulting
# dict maps each obs_id value to the rows that carry it.
grouped_dfs = dict(iter(X.groupby('obs_id')))


In [34]:
# Pull out the per-observation frames for obs_id 0 and obs_id 1 for inspection.
X_0, X_1 = (grouped_dfs[obs_key] for obs_key in (0, 1))

In [35]:
# Preview the first rows of the frame selected for obs_id 0.
X_0.head()

Unnamed: 0,obs_id,venue,order_id,action,side,price,bid,ask,bid_size,ask_size,trade,flux
0,0,4,0,A,A,0.3,0.0,0.01,100,1,False,100
1,0,4,1,A,B,-0.17,0.0,0.01,100,1,False,100
2,0,4,2,D,A,0.28,0.0,0.01,100,1,False,-100
3,0,4,3,A,A,0.3,0.0,0.01,100,1,False,100
4,0,4,4,D,A,0.37,0.0,0.01,100,1,False,-100


In [36]:
# Preview the first rows of the frame selected for obs_id 1.
X_1.head()

Unnamed: 0,obs_id,venue,order_id,action,side,price,bid,ask,bid_size,ask_size,trade,flux
100,1,4,0,A,B,-0.08,0.0,0.04,2,75,False,100
101,1,4,1,D,A,0.17,0.0,0.04,2,75,False,-20
102,1,4,2,A,A,0.2,0.0,0.04,2,75,False,100
103,1,0,3,A,A,0.15,0.0,0.04,2,75,False,100
104,1,0,4,A,A,0.15,0.0,0.04,2,75,False,100


In [37]:
# Row count, per-column dtypes and non-null counts for the obs_id 0 frame.
X_0.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 0 to 99
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   obs_id    100 non-null    int64  
 1   venue     100 non-null    int64  
 2   order_id  100 non-null    int64  
 3   action    100 non-null    object 
 4   side      100 non-null    object 
 5   price     100 non-null    float64
 6   bid       100 non-null    float64
 7   ask       100 non-null    float64
 8   bid_size  100 non-null    int64  
 9   ask_size  100 non-null    int64  
 10  trade     100 non-null    bool   
 11  flux      100 non-null    int64  
dtypes: bool(1), float64(3), int64(6), object(2)
memory usage: 9.5+ KB


In [38]:
# Summary of the full event table. NOTE: for very large frames pandas omits the
# per-column non-null counts by default (see `display.max_info_rows`); call
# X.info(show_counts=True) if those counts are needed.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16080000 entries, 0 to 16079999
Data columns (total 12 columns):
 #   Column    Dtype  
---  ------    -----  
 0   obs_id    int64  
 1   venue     int64  
 2   order_id  int64  
 3   action    object 
 4   side      object 
 5   price     float64
 6   bid       float64
 7   ask       float64
 8   bid_size  int64  
 9   ask_size  int64  
 10  trade     bool   
 11  flux      int64  
dtypes: bool(1), float64(3), int64(6), object(2)
memory usage: 1.3+ GB


In [19]:
# Summary of the label table (columns, dtypes, non-null counts).
y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160800 entries, 0 to 160799
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   obs_id        160800 non-null  int64
 1   eqt_code_cat  160800 non-null  int64
dtypes: int64(2)
memory usage: 2.5 MB


In [12]:
tf.random.set_seed(1234) # for consistent results
model = Sequential(
    [               
        ### START CODE HERE ### 
        tf.keras.Input(shape=(11,)),
        Dense(25, activation = 'relu',   name = "L1"),
        Dense(15, activation = 'relu',   name = "L2"),
        Dense(24, activation = 'linear',   name = "L3")
        
        
        ### END CODE HERE ### 
    ], name = "first_multiclass_model" 
)

In [15]:
model.summary()

In [29]:
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
)

history = model.fit(
    X_0,y,
    epochs=40
)

ValueError: could not convert string to float: 'A'