In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('C:\Users\Aaryan\Downloads\archive (5)'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape (635778173.py, line 12)

In [9]:
# --------------------------------------------------------------------------------
# FULL SCRIPT: Predicting jet1_x, jet1_y, jet2_x, jet2_y from a complex-valued
# "received_signal" sequence using a 1D-CNN regression model in Keras.
# --------------------------------------------------------------------------------

import pandas as pd
import numpy as np
import ast
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Conv1D, BatchNormalization, Activation,
    MaxPooling1D, GlobalMaxPool1D, Dense, Dropout
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tensorflow.keras.preprocessing.sequence import pad_sequences

# ---------------------------
# 1) Load and parse the CSV
# ---------------------------

# Replace "data.csv" with the actual filename (make sure it's uploaded in Kaggle).
df = pd.read_csv(r"C:\Users\Aaryan\Downloads\archive (5)\dataset.csv")

# We drop "secret_code" because it's constant (no variability).
if "secret_code" in df.columns:
    df = df.drop(columns=["secret_code"])

# Parse "received_signal" strings into Python lists of complex numbers.
# We assume each row in "received_signal" is something like:
#   "[(0.1193-0.1883j), (0.8577-0.5676j), ...]"
# If your CSV already stores them as real Python lists, you can skip literal_eval.

def parse_complex_list(cell):
    # e.g. cell = "[(0.1193-0.1883j), (0.8577-0.5676j), ...]"
    try:
        # ast.literal_eval will convert "1+2j" into a complex(1,2)
        return ast.literal_eval(cell)
    except Exception:
        # If already a Python list of complex numbers, return as-is
        return cell

df["received_signal"] = df["received_signal"].apply(parse_complex_list)

# Now split each complex-valued sequence into two parallel sequences:
#   one for real parts, one for imag parts.
real_seqs = []
imag_seqs = []

for seq in df["received_signal"]:
    reals = [c.real for c in seq]
    imags = [c.imag for c in seq]
    real_seqs.append(reals)
    imag_seqs.append(imags)

# Determine maximum sequence length
max_len = max(len(x) for x in real_seqs)

# Pad all sequences (both real and imag) to shape (max_len) with zeros at the end
real_padded = pad_sequences(real_seqs, maxlen=max_len, dtype="float32", padding="post", truncating="post")
imag_padded = pad_sequences(imag_seqs, maxlen=max_len, dtype="float32", padding="post", truncating="post")

# Stack real/imag along the last axis → shape = (n_samples, max_len, 2)
X = np.stack([real_padded, imag_padded], axis=-1)

# ----------------------------------------
# 2) Prepare regression targets and split
# ----------------------------------------

# Targets: jet1_x, jet1_y, jet2_x, jet2_y
y = df[["jet1_x", "jet1_y", "jet2_x", "jet2_y"]].values.astype("float32")

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

print("Training samples:", X_train.shape[0])
print("Test samples:    ", X_test.shape[0])
print("Input tensor shape:", X_train.shape)  # (n_samples, max_len, 2)
print("Output tensor shape:", y_train.shape)  # (n_samples, 4)

# -----------------------------
# 3) Build the 1D-CNN regression
# -----------------------------

input_layer = Input(shape=(max_len, 2), name="received_signal_input")

# Block 1: Conv1D → BatchNorm → ReLU → MaxPool
x = Conv1D(filters=32, kernel_size=5, padding="same")(input_layer)
x = BatchNormalization()(x)
x = Activation("relu")(x)
x = MaxPooling1D(pool_size=2)(x)

# Block 2
x = Conv1D(filters=64, kernel_size=5, padding="same")(x)
x = BatchNormalization()(x)
x = Activation("relu")(x)
x = MaxPooling1D(pool_size=2)(x)

# Block 3
x = Conv1D(filters=128, kernel_size=5, padding="same")(x)
x = BatchNormalization()(x)
x = Activation("relu")(x)
x = MaxPooling1D(pool_size=2)(x)

# Global pooling → Dense layers → Output
x = GlobalMaxPool1D()(x)
x = Dense(128, activation="relu")(x)
x = Dropout(0.3)(x)
x = Dense(64, activation="relu")(x)
x = Dropout(0.2)(x)

output_layer = Dense(4, activation="linear", name="jet_coordinates")(x)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss="mse",
    metrics=["mae"]
)

model.summary()

# -------------------
# 4) Train the model
# -------------------

early_stop = EarlyStopping(
    monitor="val_mae",
    patience=5,
    restore_best_weights=True,
    verbose=1
)

history = model.fit(
    X_train, y_train,
    validation_split=0.15,
    epochs=100,
    batch_size=32,
    callbacks=[early_stop],
    verbose=1
)

# ----------------------
# 5) Evaluate on test
# ----------------------

# Predict on the held-out test set
y_pred = model.predict(X_test)

# Compute regression metrics
test_mse = mean_squared_error(y_test, y_pred)
test_mae = mean_absolute_error(y_test, y_pred)
test_r2  = r2_score(y_test, y_pred)

print("\n--- TEST SET METRICS ---")
print(f"Test MSE: {test_mse:.6f}")
print(f"Test MAE: {test_mae:.6f}")
print(f"Test R² : {test_r2:.4f}")

# If you want per-coordinate errors, you can also do:
mse_per_coord = mean_squared_error(y_test, y_pred, multioutput="raw_values")
mae_per_coord = mean_absolute_error(y_test, y_pred, multioutput="raw_values")
print("\nMSE per coordinate (jet1_x, jet1_y, jet2_x, jet2_y):", mse_per_coord)
print("MAE per coordinate (jet1_x, jet1_y, jet2_x, jet2_y):", mae_per_coord)


Training samples: 12000
Test samples:     3000
Input tensor shape: (12000, 208, 2)
Output tensor shape: (12000, 4)


Epoch 1/100
[1m319/319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 14ms/step - loss: 367413440.0000 - mae: 11632.2803 - val_loss: 229696304.0000 - val_mae: 8940.7891
Epoch 2/100
[1m319/319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13ms/step - loss: 222545552.0000 - mae: 8769.2129 - val_loss: 219542208.0000 - val_mae: 8695.5898
Epoch 3/100
[1m319/319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - loss: 221875360.0000 - mae: 8700.5293 - val_loss: 219831968.0000 - val_mae: 8677.6416
Epoch 4/100
[1m319/319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - loss: 217783360.0000 - mae: 8589.8906 - val_loss: 224744784.0000 - val_mae: 8737.2109
Epoch 5/100
[1m319/319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13ms/step - loss: 213273728.0000 - mae: 8487.0400 - val_loss: 218935728.0000 - val_mae: 8655.2676
Epoch 6/100
[1m319/319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13ms/step - loss: 216653840.0000 - mae: 8