<a href="https://colab.research.google.com/github/AbelKosh/Jane-Street-Challenge/blob/main/Initial_Autoencoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os, gc
import joblib
import numpy as np
import pandas as pd

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Preprocessing

In [None]:
# Optimize memory usage

def reduce_mem_usage(self, float16_as32=True):
    """Downcast numeric columns to the smallest dtype that can hold their values.

    The frame is modified in place (so a large frame is never duplicated) and
    is also returned for convenience. Memory usage before and after is printed.

    Args:
        self: The pandas DataFrame to shrink. The parameter name is kept for
            backward compatibility with existing positional callers; it is a
            plain DataFrame, not a class instance.
        float16_as32: When True, float columns whose range fits in float16 are
            stored as float32 instead (more precision, less saving). When
            False they are stored as float16.

    Returns:
        The same DataFrame object, with integer and float columns downcast.
        Columns of any other dtype (object, category, bool, datetime, pandas
        extension dtypes) are left untouched.
    """
    # Work on the argument itself rather than on whatever global happens to be
    # called `df` in the notebook.
    df = self

    start_mem = df.memory_usage().sum() / 1024**2  # size in MB before downcasting
    print(f'Memory usage of dataframe is {start_mem:.2f} MB')

    # Candidate dtypes, smallest first; the first one whose range covers the
    # column wins.
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        # Dispatch on the NumPy dtype kind instead of on the dtype's string
        # name, so unsigned integers and booleans are not mistaken for floats.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min, c_max = df[col].min(), df[col].max()

        if col_type.kind in 'iu':
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype limit still fits.
                # For an empty column min/max are NaN, every comparison is
                # False, and the column keeps its dtype.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                # Range fits float16; the caller chooses whether to trade the
                # extra memory for float32 precision.
                target = np.float32 if float16_as32 else np.float16
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                target = np.float32
            else:
                # Also reached for all-NaN or infinite ranges, where the
                # comparisons above are False.
                target = np.float64
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2  # size in MB after downcasting
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
###############################################################################
# Step 1: Load and Prepare Data
###############################################################################

# Directory on the mounted Google Drive that holds the competition data
input_path =  '/content/drive/MyDrive/Jane Street Challenge'

# Create a directory to store the trained models.
# os.makedirs with exist_ok=True is a no-op when the directory already exists,
# so re-running this cell does not spawn a shell or print a "File exists" error.
os.makedirs('models', exist_ok=True)

# Flag to determine if the script is in training mode or not.
# NOTE: `df`, `train_dates` and `valid_dates` are only defined when this is True.
TRAINING = True

# Define the feature names based on the number of features (79 in this case)
feature_names = [f"feature_{i:02d}" for i in range(79)]
target_col = "responder_6"

# Number of dates to skip from the beginning of the dataset
skip_dates = 500

# Number of validation dates to use
num_valid_dates = 100

# If in training mode, load the training data
if TRAINING:
    # Load the training data from a Parquet file
    df = pd.read_parquet(f'{input_path}/train.parquet')

    # Forward-fill missing feature values, then replace any remaining NaNs
    # (those with no earlier row to copy from) with 0.
    # NOTE(review): the fill runs over consecutive rows of the whole frame, so a
    # gap may be filled from a row belonging to a different symbol_id. Confirm
    # whether a per-symbol fill is intended.
    df[feature_names] = df[feature_names].ffill().fillna(0)

    # Downcast numeric columns; float16_as32=False stores floats as float16
    # where their range allows.
    df = reduce_mem_usage(df, False)

    # Filter the DataFrame to include only dates greater than or equal to skip_dates
    df = df[df['date_id'] >= skip_dates].reset_index(drop=True)

    # Unique dates, sorted explicitly so that "the last N dates" below does not
    # depend on the row order of the file.
    dates = np.sort(df['date_id'].unique())

    # Define validation dates as the last `num_valid_dates` dates
    valid_dates = dates[-num_valid_dates:]

    # Define training dates as all dates except the last `num_valid_dates` dates
    train_dates = dates[:-num_valid_dates]

    # Display the last few rows of the DataFrame (for debugging purposes)
    print(df.tail())

Memory usage of dataframe is 15910.22 MB
Memory usage after optimization is: 8179.83 MB
Decreased by 48.6%
          date_id  time_id  symbol_id    weight  feature_00  feature_01  \
39577176     1698      967         34  3.242188    2.525391   -0.722168   
39577177     1698      967         35  1.079102    1.857422   -0.790527   
39577178     1698      967         36  1.033203    2.515625   -0.672363   
39577179     1698      967         37  1.243164    2.664062   -0.889160   
39577180     1698      967         38  3.193359    2.728516   -0.745117   

          feature_02  feature_03  feature_04  feature_05  ...  responder_0  \
39577176    2.544922    2.478516    0.417480    0.785645  ...     0.243530   
39577177    2.746094    2.339844    0.845215    0.651367  ...     0.850098   
39577178    2.289062    2.521484    0.255127    0.919922  ...     0.395752   
39577179    2.312500    3.101562    0.324463    0.619141  ...     1.925781   
39577180    2.789062    2.343750    0.454834    0.86

In [None]:
# Pull the model inputs out of the frame as NumPy arrays.

# Feature matrix: one row per sample, one column per entry of feature_names
X = df[feature_names].to_numpy()

# Per-row target, date id and sample weight
y = df['responder_6'].to_numpy()
days = df['date_id'].to_numpy()
weights = df['weight'].to_numpy()

In [None]:
# Normalise the sample weights so that they sum to 1.
#
# The weights come out of the memory-reduced frame and may be float16. Summing
# many float16 values with a float16 accumulator overflows to inf (float16 tops
# out at 65504), which would turn every normalised weight into 0. The sum is
# therefore accumulated in float64 and the result kept in float32, which can
# also represent the very small per-row values that float16 would flush to 0.
weights = weights.astype(np.float32)
weights = (weights / weights.sum(dtype=np.float64)).astype(np.float32)


In [None]:
# Rescale the weights so that the largest one equals 1.
# Done out-of-place on purpose: `weights` may share memory with the DataFrame
# column it was taken from, and an in-place `/=` would then silently rewrite
# that column (or fail if the array is read-only).
# NOTE(review): this rescaling supersedes any earlier sum-normalisation of
# `weights`; only one of the two schemes is effectively applied.
weights = weights / weights.max()

In [None]:
import tensorflow as tf
from tensorflow.config import threading

# Number of threads TensorFlow may use to parallelise work inside a single op.
# A value of 0 lets TensorFlow pick an appropriate number itself.
NUM_INTRA_OP_THREADS = 8
threading.set_intra_op_parallelism_threads(NUM_INTRA_OP_THREADS)

# Verify the configuration: report the intra-op value that was just set as well
# as the (unchanged) inter-op value.
print(f"Intra-op threads: {threading.get_intra_op_parallelism_threads()}")
print(f"Inter-op threads: {threading.get_inter_op_parallelism_threads()}")

# Optional TPU setup, kept for reference but disabled. It is written as comments
# rather than as a string literal so the cell does not echo it back as output.
#
# # 1. Detect TPU
# try:
#     tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
#     print('Running on TPU:', tpu.master())
# except ValueError:
#     tpu = None
#     print('TPU not found. Using CPU/GPU.')
#
# # 2. Initialize TPU
# if tpu:
#     tf.config.experimental_connect_to_cluster(tpu)
#     tf.tpu.experimental.initialize_tpu_system(tpu)
#     strategy = tf.distribute.TPUStrategy(tpu)
# else:
#     strategy = tf.distribute.get_strategy()
#
# print("Number of replicas:", strategy.num_replicas_in_sync)


Inter-op threads: 0


'\n# 1. Detect TPU\ntry:\n    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()\n    print(\'Running on TPU:\', tpu.master())\nexcept ValueError:\n    tpu = None\n    print(\'TPU not found. Using CPU/GPU.\')\n\n# 2. Initialize TPU\nif tpu:\n    tf.config.experimental_connect_to_cluster(tpu)\n    tf.tpu.experimental.initialize_tpu_system(tpu)\n    strategy = tf.distribute.TPUStrategy(tpu)\nelse:\n    strategy = tf.distribute.get_strategy() \n\nprint("Number of replicas:", strategy.num_replicas_in_sync)\n'

In [None]:
# Boolean row masks selecting the validation and training dates
is_valid_row = df['date_id'].isin(valid_dates)
is_train_row = df['date_id'].isin(train_dates)

# Feature matrices for each split, selecting rows and columns in one step
X_valid = df.loc[is_valid_row, feature_names].values
X_train = df.loc[is_train_row, feature_names].values

In [None]:
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam


# Define architecture: a symmetric dense autoencoder,
# input -> 32 -> 16 -> 4 (bottleneck) -> 16 -> 32 -> input.

input_dim = X.shape[1]

input_layer = Input(shape=(input_dim,))

# Encoder: two hidden layers followed by the 4-unit bottleneck
encoder_layer = input_layer
for units in (32, 16, 4):
    encoder_layer = Dense(units, activation='relu')(encoder_layer)

# Decoder: mirrors the encoder's hidden layers
decoder_layer = encoder_layer
for units in (16, 32):
    decoder_layer = Dense(units, activation='relu')(decoder_layer)

# Linear output layer reconstructs the original features
output_layer = Dense(input_dim, activation='linear')(decoder_layer)

autoencoder = Model(inputs=input_layer, outputs=output_layer)

# Train to minimise the mean squared reconstruction error
autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

In [None]:
# Train the autoencoder to reproduce its own input; the held-out dates are used
# for validation loss only.
# validation_data is passed as an (inputs, targets) tuple, the form documented
# by Keras. The History object is kept so the loss curves can be inspected
# afterwards instead of being echoed as the cell's output.
history = autoencoder.fit(
    X_train, X_train,
    epochs=50,
    batch_size=8192,
    validation_data=(X_valid, X_valid),
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7c5b78173c70>

In [None]:
# Reconstruct the full input (training and validation rows together).
# An explicit large batch size is used because predict() otherwise processes
# 32 rows per step, which is needlessly slow on a dataset of this size.
X_reconstructed = autoencoder.predict(X, batch_size=8192)

# Per-row reconstruction error: mean squared difference across the features
reconstruction_error = np.mean(np.square(X - X_reconstructed), axis=1)
print("Reconstruction Error (first 10 samples):", reconstruction_error[:10])

Reconstruction Error (first 10 samples): [1.0698845 1.2499732 1.3263661 1.1549952 1.5845288 1.0523058 0.8955766
 1.2136109 1.1093584 2.0604274]


In [None]:
# Reconstruct the validation rows only.
# An explicit large batch size is used because predict() otherwise processes
# 32 rows per step, which is needlessly slow on millions of rows.
X_reconstruct = autoencoder.predict(X_valid, batch_size=8192)

# Per-row reconstruction error on the validation set.
# NOTE: this reuses the name `reconstruction_error`, so from here on it refers
# to the validation rows, not to the full dataset.
reconstruction_error = np.mean(np.square(X_valid - X_reconstruct), axis=1)
print("Reconstruction Error (first 10 samples):", reconstruction_error[:10])

Reconstruction Error (first 10 samples): [1.0294298 1.3772562 1.1812094 0.9509004 1.2838373 1.359329  2.2602086
 1.0162623 1.0174031 1.7014194]


In [None]:
# Extract the encoder model.
# Layer index 0 is the input layer, so the Dense(32), Dense(16) and Dense(4)
# layers sit at indices 1, 2 and 3. The 4-unit bottleneck is therefore index 3;
# index 2 would return the 16-unit hidden layer instead.
bottleneck_layer = autoencoder.get_layer(index=3)
encoder = Model(inputs=autoencoder.input, outputs=bottleneck_layer.output)

# Get latent representations (one bottleneck vector per input row)
latent_representations = encoder.predict(X, batch_size=8192)
print("Latent Representations (first 5 samples):")
print(latent_representations[:5])

Latent Representations (first 5 samples):
[[ 7.4739966   0.          0.          7.1995497   5.4352612  32.019962
   0.          7.883161    1.1627146   4.2156353   0.          4.0005655
   6.112464   15.459115    1.6614232   0.        ]
 [ 0.          8.884517   12.341624   17.718355    0.         31.952633
   0.         51.99388    14.388683    7.340246    0.         18.85402
  10.702978    0.         15.232538    4.894465  ]
 [ 2.440923    0.          0.          4.1892395   2.2123868   5.1951084
   0.          3.0551047   2.0659585   3.0791738   0.          3.9523578
   1.3333149   7.36479     0.          0.        ]
 [ 6.693313    0.          0.         18.836302    2.10054    64.50085
   0.          0.3724654   0.6031202  10.317776    0.          1.6614748
   4.7767944  13.1036825   0.          0.        ]
 [ 7.7207236   0.          0.          7.7659307   5.916567   26.937994
   0.          2.6678982   0.77926576  9.121556    0.          7.0814943
   8.430646   15.732739    1.73

In [None]:
# Anomaly threshold: two standard deviations above the mean reconstruction error.
# `reconstruction_error` is whichever error array was computed most recently
# (the validation-set error when the cells are run in order).
error_mean = reconstruction_error.mean()
error_std = reconstruction_error.std()
threshold = error_mean + 2 * error_std

# Positions of the rows whose error exceeds the threshold
anomalies = np.flatnonzero(reconstruction_error > threshold)
print("Anomalous Samples Indices:", anomalies)

Anomalous Samples Indices: [  36845   54437   54496 ... 3709163 3711486 3711854]


In [None]:
from sklearn.metrics import mean_absolute_error, r2_score

# View both arrays as (rows, features) so the metrics see matching 2-D inputs
n_features = X.shape[-1]
X_flat = X.reshape(-1, n_features)
X_reconstructed_flat = X_reconstructed.reshape(-1, n_features)

# Reconstruction quality over the full input: MAE and R^2
mae = mean_absolute_error(X_flat, X_reconstructed_flat)
r2 = r2_score(X_flat, X_reconstructed_flat)
print(
    f"Mean Absolute Error (MAE): {mae}",
    f"R^2 Score: {r2}",
    sep="\n",
)

Mean Absolute Error (MAE): 0.5047721862792969
R^2 Score: 0.334540992975235
