In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/optiver-trading-at-the-close/public_timeseries_testing_util.py
/kaggle/input/optiver-trading-at-the-close/train.csv
/kaggle/input/optiver-trading-at-the-close/example_test_files/sample_submission.csv
/kaggle/input/optiver-trading-at-the-close/example_test_files/revealed_targets.csv
/kaggle/input/optiver-trading-at-the-close/example_test_files/test.csv
/kaggle/input/optiver-trading-at-the-close/optiver2023/competition.cpython-310-x86_64-linux-gnu.so
/kaggle/input/optiver-trading-at-the-close/optiver2023/__init__.py


In [None]:
from typing import Sequence, Tuple

import pandas as pd
import os
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

class MockApi:
    """Local stand-in for the competition's time-series inference API.

    The configured CSV files are served one group at a time (all rows sharing a
    value in ``group_id_column``). After each served group the caller must hand
    a prediction frame to :meth:`predict`; once every group has been served the
    collected predictions are concatenated and written to ``submission_path``.

    Args:
        input_paths: Two or more CSV paths to serve. ``None`` (the default)
            selects ``DEFAULT_INPUT_PATHS``.
        group_id_column: Column, present in every input file, whose value
            identifies the group a row belongs to.
        export_group_id_column: If true, served frames keep the group id as a
            regular column; otherwise it is dropped.
        submission_path: Destination of the final submission CSV.

    Raises:
        ValueError: If fewer than two input paths are configured.
    """

    # Example files shipped with the competition data, ordered the way the
    # inference loop unpacks them: (test, revealed_targets, sample_submission).
    DEFAULT_INPUT_PATHS = (
        '/kaggle/input/optiver-trading-at-the-close/example_test_files/test.csv',
        '/kaggle/input/optiver-trading-at-the-close/example_test_files/revealed_targets.csv',
        '/kaggle/input/optiver-trading-at-the-close/example_test_files/sample_submission.csv',
    )

    def __init__(self, input_paths=None, group_id_column: str = "time_id",
                 export_group_id_column: bool = False,
                 submission_path: str = '/kaggle/working/submission.csv'):
        if input_paths is None:
            input_paths = self.DEFAULT_INPUT_PATHS
        self.input_paths: Sequence[str] = list(input_paths)
        # NOTE(review): the "time_id" default is assumed to be the column shared
        # by all three example files -- confirm against the CSV headers.
        self.group_id_column: str = group_id_column
        self.export_group_id_column: bool = export_group_id_column
        self.submission_path: str = submission_path
        # iter_test() yields one frame per path and is only meaningful with at
        # least two of them (e.g. test data plus a submission template).
        if len(self.input_paths) < 2:
            raise ValueError(
                f'MockApi needs at least two input paths, got {len(self.input_paths)}.'
            )

        self._status = 'initialized'
        self.predictions = []

    def iter_test(self) -> Tuple[pd.DataFrame]:
        """Generator yielding, per group, a tuple with one frame per input file.

        Groups are served in order of first appearance in the first input
        file. If the generator is advanced before :meth:`predict` has been
        called for the current group, a reminder is printed and ``None`` is
        yielded until a prediction arrives. When all groups are done the
        predictions are written to ``self.submission_path``.

        Raises:
            Exception: If iteration is started a second time.
        """
        if self._status != 'initialized':
            raise Exception('WARNING: the real API can only iterate over `iter_test()` once.')

        dataframes = [pd.read_csv(pth, low_memory=False) for pth in self.input_paths]
        # The first file defines which groups exist and in what order.
        group_order = dataframes[0][self.group_id_column].drop_duplicates().tolist()
        dataframes = [df.set_index(self.group_id_column) for df in dataframes]

        for group_id in group_order:
            self._status = 'prediction_needed'
            current_data = []
            for df in dataframes:
                cur_df = df.loc[group_id].copy()
                # A group with a single row comes back from .loc as a Series;
                # rebuild it as a one-row DataFrame.
                if not isinstance(cur_df, pd.DataFrame):
                    cur_df = pd.DataFrame({a: b for a, b in zip(cur_df.index.values, cur_df.values)}, index=[group_id])
                    cur_df.index.name = self.group_id_column
                cur_df = cur_df.reset_index(drop=not self.export_group_id_column)
                current_data.append(cur_df)
            yield tuple(current_data)

            while self._status != 'prediction_received':
                print('You must call `predict()` successfully before you can continue with `iter_test()`', flush=True)
                yield None

        # Let pandas open the file itself so it controls newline handling.
        pd.concat(self.predictions).to_csv(self.submission_path, index=False)
        self._status = 'finished'

    def predict(self, user_predictions: pd.DataFrame):
        """Record the prediction frame for the group most recently served.

        Raises:
            Exception: If all groups were already predicted, if no group is
                currently awaiting a prediction, or if ``user_predictions`` is
                not a DataFrame.
        """
        if self._status == 'finished':
            raise Exception('You have already made predictions for the full test set.')
        if self._status != 'prediction_needed':
            raise Exception('You must get the next test sample from `iter_test()` first.')
        if not isinstance(user_predictions, pd.DataFrame):
            raise Exception('You must provide a DataFrame.')

        self.predictions.append(user_predictions)
        self._status = 'prediction_received'


def make_env():
    """Create and return a fresh ``MockApi`` instance."""
    api = MockApi()
    return api

# Locations of the competition files used by this notebook
train_file = '/kaggle/input/optiver-trading-at-the-close/train.csv'
test_file = '/kaggle/input/optiver-trading-at-the-close/example_test_files/test.csv'
sample_submission_file = '/kaggle/input/optiver-trading-at-the-close/example_test_files/sample_submission.csv'
revealed_targets_file = '/kaggle/input/optiver-trading-at-the-close/example_test_files/revealed_targets.csv'

# Step 1: Data Preparation -- read every file into its own DataFrame
# Step 2: Preprocess Data -- missing values in train/test are replaced with 0
train_data = pd.read_csv(train_file).fillna(0)
test_data = pd.read_csv(test_file).fillna(0)
sample_submission = pd.read_csv(sample_submission_file)
revealed_targets = pd.read_csv(revealed_targets_file)

# Step 3: Feature Engineering
def calculate_synthetic_index(data, horizon=60):
    """Return ``data`` with its ``synthetic_index`` column recomputed from ``wap``.

    For every row the new value is the ``wap`` found ``horizon`` rows further
    down divided by the row's own ``wap``, multiplied by 10000. The final
    ``horizon`` rows have no later row to look at and therefore become NaN.

    Args:
        data: DataFrame with a ``wap`` column. Only frames that already carry
            a ``synthetic_index`` column are recomputed.
        horizon: Number of rows to look ahead (default 60).

    Returns:
        ``data`` itself when it has no ``synthetic_index`` column; otherwise a
        new DataFrame with the column replaced. The input is never modified.
    """
    if 'synthetic_index' not in data.columns:
        return data
    forward_ratio = data['wap'].shift(-horizon) / data['wap']
    # assign() builds a new frame: functions handed to groupby().apply()
    # should not mutate the group they receive.
    return data.assign(synthetic_index=forward_ratio * 10000)

# When a frame carries a 'synthetic_index' column, recompute it per
# (stock_id, date_id) group and then remove the column again.
if 'synthetic_index' in train_data.columns:
    train_data = (
        train_data
        .groupby(['stock_id', 'date_id'])
        .apply(calculate_synthetic_index)
        .drop(columns=['synthetic_index'])
    )

if 'synthetic_index' in test_data.columns:
    test_data = (
        test_data
        .groupby(['stock_id', 'date_id'])
        .apply(calculate_synthetic_index)
        .drop(columns=['synthetic_index'])
    )

# Step 4: Model Selection
SEED = 0
tf.keras.utils.set_random_seed(SEED)  # make weight init and shuffling repeatable

# The inference API is created before the model because the feature set has to
# be limited to columns that the served test frames will actually contain.
env = make_env()
served_columns = set(pd.read_csv(env.input_paths[0], nrows=0).columns)
if not env.export_group_id_column:
    # iter_test() moves the group id into the index and then drops it.
    served_columns.discard(env.group_id_column)

# MinMaxScaler only accepts numeric input, and the scaler/model must see the
# same columns, in the same order, during training and inference.
feature_columns = [
    column
    for column in train_data.select_dtypes(include='number').columns
    if column != 'target' and column in served_columns
]
if not feature_columns:
    raise ValueError('No numeric feature columns are shared by the training data and the served test data.')

X = train_data[feature_columns]
y = train_data['target']

model = tf.keras.Sequential([
    # Each feature is presented to the LSTM as one step with a single channel.
    tf.keras.layers.LSTM(32, activation='relu', input_shape=(len(feature_columns), 1)),
    tf.keras.layers.Dense(1)
])

model.compile(optimizer='adam', loss='mean_squared_error')

# Step 5: Training the Model
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=SEED)

scaler = MinMaxScaler()

# Fit the scaler on the training split only, then add the trailing channel axis.
X_train_scaled = scaler.fit_transform(X_train)[:, :, np.newaxis]
X_valid_scaled = scaler.transform(X_valid)[:, :, np.newaxis]

history = model.fit(X_train_scaled, y_train, epochs=10, validation_data=(X_valid_scaled, y_valid), batch_size=64)

# Step 6: Evaluate on the Hold-out Split
validation_loss = model.evaluate(X_valid_scaled, y_valid)
print(f'Validation Loss: {validation_loss}')

# Step 7: Prepare Output for Submission
iter_test = env.iter_test()

for (test, _, sample_prediction) in iter_test:
    # Served frames are read fresh from CSV, so they need the same zero-fill
    # that was applied to the training data; selecting feature_columns keeps
    # the row order aligned with sample_prediction.
    test_features = test[feature_columns].fillna(0)

    test_data_scaled = scaler.transform(test_features)[:, :, np.newaxis]

    # verbose=0: one progress bar per served group would flood the output.
    predictions = model.predict(test_data_scaled, verbose=0)

    sample_prediction['target'] = predictions.flatten()
    env.predict(sample_prediction)
