In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.utils import to_categorical

In [6]:
# --------------------------
# 1. Load and merge datasets
# --------------------------

import pandas as pd

# Load the datasets
sequences = pd.read_csv('train_sequences.csv')
labels = pd.read_csv('train_labels.csv')

# Convert ID column to match target_id format by removing the trailing "_1"
labels['target_id'] = labels['ID'].apply(lambda x: '_'.join(x.split('_')[:2]))

# Merge on matching target_id
merged = pd.merge(sequences, labels, on='target_id')

print("Merged shape:", merged.shape)
print(merged[['target_id', 'sequence', 'x_1', 'y_1', 'z_1']].head())


Merged shape: (137095, 11)
  target_id                       sequence     x_1        y_1     z_1
0    1SCL_A  GGGUGCUCAGUACGAGAGGAACCGCACCC  13.760 -25.974001   0.102
1    1SCL_A  GGGUGCUCAGUACGAGAGGAACCGCACCC   9.310 -29.638000   2.669
2    1SCL_A  GGGUGCUCAGUACGAGAGGAACCGCACCC   5.529 -27.813000   5.878
3    1SCL_A  GGGUGCUCAGUACGAGAGGAACCGCACCC   2.678 -24.900999   9.793
4    1SCL_A  GGGUGCUCAGUACGAGAGGAACCGCACCC   1.827 -20.136000  11.793


In [9]:
# -------------------------------
# 2. Sequence preprocessing (1-hot)
# -------------------------------

from sklearn.preprocessing import MinMaxScaler

# Define allowed nucleotides
valid_nucleotides = {'A', 'U', 'G', 'C'}
merged = merged[merged['sequence'].apply(lambda seq: set(seq).issubset(valid_nucleotides))]

# One-hot encode sequences
def one_hot_encode(seq, max_len=100):
    mapping = {'A': 0, 'U': 1, 'G': 2, 'C': 3}
    arr = np.zeros((max_len, 4))
    for i, nucleotide in enumerate(seq[:max_len]):
        if nucleotide in mapping:
            arr[i, mapping[nucleotide]] = 1
    return arr

# Create feature and label arrays
X = np.stack(merged['sequence'].apply(lambda x: one_hot_encode(x)))
y = merged[['x_1', 'y_1', 'z_1']].values

# Normalize targets
scaler = MinMaxScaler()
y_scaled = scaler.fit_transform(y)

In [10]:
# --------------------------
# 3. Split data
# --------------------------

X_train, X_val, y_train, y_val = train_test_split(X, y_scaled, test_size=0.2, random_state=42)

In [11]:
# --------------------------
# 4. Build ResNet-style model
# --------------------------

def residual_block(x, filters, kernel_size=3):
    shortcut = x
    x = layers.Conv1D(filters, kernel_size, padding='same', activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Conv1D(filters, kernel_size, padding='same')(x)
    x = layers.BatchNormalization()(x)
    x = layers.add([x, shortcut])
    x = layers.Activation('relu')(x)
    return x

input_layer = layers.Input(shape=(100, len(NUCLEOTIDES)))
x = layers.Conv1D(64, 7, padding='same', activation='relu')(input_layer)
x = layers.MaxPooling1D(pool_size=2)(x)

# Add several residual blocks
for _ in range(4):
    x = residual_block(x, 64)

x = layers.GlobalAveragePooling1D()(x)
x = layers.Dense(128, activation='relu')(x)
output = layers.Dense(3)(x)  # Predict x_1, y_1, z_1

model = models.Model(inputs=input_layer, outputs=output)
model.compile(optimizer='adam', loss='mse', metrics=['mae'])
model.summary()

In [12]:
# --------------------------
# 5. Train the model
# --------------------------

history = model.fit(X_train, y_train, validation_data=(X_val, y_val),
                    epochs=30, batch_size=32)



Epoch 1/30
[1m3418/3418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 57ms/step - loss: nan - mae: nan - val_loss: nan - val_mae: nan
Epoch 2/30
[1m3418/3418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 52ms/step - loss: nan - mae: nan - val_loss: nan - val_mae: nan
Epoch 3/30
[1m3418/3418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 52ms/step - loss: nan - mae: nan - val_loss: nan - val_mae: nan
Epoch 4/30
[1m3418/3418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m210s[0m 54ms/step - loss: nan - mae: nan - val_loss: nan - val_mae: nan
Epoch 5/30
[1m3418/3418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m204s[0m 55ms/step - loss: nan - mae: nan - val_loss: nan - val_mae: nan
Epoch 6/30
[1m3418/3418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m191s[0m 52ms/step - loss: nan - mae: nan - val_loss: nan - val_mae: nan
Epoch 7/30
[1m3418/3418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m212s[0m 55ms/step - loss: nan - mae: nan - val_loss: nan 

In [13]:
# --------------------------
# 6. Evaluate
# --------------------------

loss, mae = model.evaluate(X_val, y_val)
print(f"Validation MAE (original scale): {np.mean(np.abs(scaler.inverse_transform(model.predict(X_val)) - scaler.inverse_transform(y_val)))}")

[1m855/855[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 14ms/step - loss: nan - mae: nan
[1m855/855[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 13ms/step
Validation MAE (original scale): nan


In [17]:
print("First few predictions (y_pred):", y_pred[:10])


First few predictions (y_pred): [[nan nan nan]
 [nan nan nan]
 [nan nan nan]
 [nan nan nan]
 [nan nan nan]
 [nan nan nan]
 [nan nan nan]
 [nan nan nan]
 [nan nan nan]
 [nan nan nan]]


In [18]:
print("NaNs in X_val:", np.isnan(X_val).sum())


NaNs in X_val: 0


In [20]:
output_layer = model.layers[-1]
print("Output layer activation:", output_layer.get_config())


Output layer activation: {'name': 'dense_1', 'trainable': True, 'dtype': {'module': 'keras', 'class_name': 'DTypePolicy', 'config': {'name': 'float32'}, 'registered_name': None}, 'units': 3, 'activation': 'linear', 'use_bias': True, 'kernel_initializer': {'module': 'keras.initializers', 'class_name': 'GlorotUniform', 'config': {'seed': None}, 'registered_name': None}, 'bias_initializer': {'module': 'keras.initializers', 'class_name': 'Zeros', 'config': {}, 'registered_name': None}, 'kernel_regularizer': None, 'bias_regularizer': None, 'kernel_constraint': None, 'bias_constraint': None}


In [21]:
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])


In [22]:
# Test a small batch
sample_input = X_val[:5]  # Taking first 5 samples
sample_output = model.predict(sample_input)
print(sample_output)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[[nan nan nan]
 [nan nan nan]
 [nan nan nan]
 [nan nan nan]
 [nan nan nan]]


In [23]:
# Ensure scaling was applied properly
print("Sample of X_val after scaling:", X_val[:5])


Sample of X_val after scaling: [[[0. 0. 1. 0.]
  [0. 0. 0. 1.]
  [0. 0. 1. 0.]
  ...
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [0. 0. 0. 1.]]

 [[0. 0. 1. 0.]
  [0. 0. 1. 0.]
  [0. 1. 0. 0.]
  ...
  [0. 0. 0. 0.]
  [0. 0. 0. 0.]
  [0. 0. 0. 0.]]

 [[0. 0. 1. 0.]
  [0. 0. 0. 1.]
  [0. 0. 0. 1.]
  ...
  [0. 0. 0. 0.]
  [0. 0. 0. 0.]
  [0. 0. 0. 0.]]

 [[0. 0. 1. 0.]
  [0. 0. 1. 0.]
  [0. 0. 1. 0.]
  ...
  [0. 0. 0. 0.]
  [0. 0. 0. 0.]
  [0. 0. 0. 0.]]

 [[0. 0. 1. 0.]
  [0. 0. 1. 0.]
  [0. 1. 0. 0.]
  ...
  [0. 0. 1. 0.]
  [0. 1. 0. 0.]
  [0. 1. 0. 0.]]]


In [24]:
# Predict without scaling
y_pred_no_scaling = model.predict(X_val)
print("Predictions without scaling:", y_pred_no_scaling[:5])


[1m855/855[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 13ms/step
Predictions without scaling: [[nan nan nan]
 [nan nan nan]
 [nan nan nan]
 [nan nan nan]
 [nan nan nan]]


In [25]:
# Check NaNs and Infs in y_train
print("NaNs in y_train:", np.isnan(y_train).sum())
print("Infs in y_train:", np.isinf(y_train).sum())

# Check label distribution
print("Max y_train:", np.max(y_train))
print("Min y_train:", np.min(y_train))
print("y_train shape:", y_train.shape)


NaNs in y_train: 14418
Infs in y_train: 0
Max y_train: nan
Min y_train: nan
y_train shape: (109372, 3)


In [26]:
# Remove rows with NaNs in y_train
valid_mask = ~np.isnan(y_train).any(axis=1)
X_train_clean = X_train[valid_mask]
y_train_clean = y_train[valid_mask]


In [27]:
valid_mask_val = ~np.isnan(y_val).any(axis=1)
X_val_clean = X_val[valid_mask_val]
y_val_clean = y_val[valid_mask_val]


In [28]:
from keras.optimizers import Adam

model.compile(
    optimizer=Adam(learning_rate=1e-4),
    loss='mse',
    metrics=['mae']
)

history = model.fit(
    X_train_clean, y_train_clean,
    validation_data=(X_val_clean, y_val_clean),
    epochs=30,
    batch_size=64
)


Epoch 1/30
[1m1634/1634[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m162s[0m 94ms/step - loss: nan - mae: nan - val_loss: nan - val_mae: nan
Epoch 2/30
[1m1634/1634[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 93ms/step - loss: nan - mae: nan - val_loss: nan - val_mae: nan
Epoch 3/30
[1m1634/1634[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 92ms/step - loss: nan - mae: nan - val_loss: nan - val_mae: nan
Epoch 4/30
[1m1634/1634[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 91ms/step - loss: nan - mae: nan - val_loss: nan - val_mae: nan
Epoch 5/30
[1m1030/1634[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m52s[0m 87ms/step - loss: nan - mae: nan

KeyboardInterrupt: 

In [29]:
print("NaNs in y_train_clean:", np.isnan(y_train_clean).sum())
print("NaNs in X_train_clean:", np.isnan(X_train_clean).sum())
print("Max y_train_clean:", np.nanmax(y_train_clean))
print("Min y_train_clean:", np.nanmin(y_train_clean))
print("Shape y_train_clean:", y_train_clean.shape)


NaNs in y_train_clean: 0
NaNs in X_train_clean: 0
Max y_train_clean: 1.0
Min y_train_clean: 0.0
Shape y_train_clean: (104566, 3)


In [30]:
print("NaNs in X_train_clean:", np.isnan(X_train_clean).sum())
print("Infs in X_train_clean:", np.isinf(X_train_clean).sum())
print("Max X_train_clean:", np.nanmax(X_train_clean))
print("Min X_train_clean:", np.nanmin(X_train_clean))


NaNs in X_train_clean: 0
Infs in X_train_clean: 0
Max X_train_clean: 1.0
Min X_train_clean: 0.0


In [32]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.optimizers import Adam

model = Sequential([
    Input(shape=(X_train_clean.shape[1], X_train_clean.shape[2])),  # 👈 this replaces input_shape in the first layer
    Conv1D(64, kernel_size=3, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(3, activation='linear')  # Predicting 3D coordinates
])

model.compile(optimizer=Adam(learning_rate=1e-4), loss='mse', metrics=['mae'])


In [34]:
import numpy as np

print("NaNs in X_val:", np.isnan(X_val).sum())
print("NaNs in y_val:", np.isnan(y_val).sum())
print("Infs in X_val:", np.isinf(X_val).sum())
print("Infs in y_val:", np.isinf(y_val).sum())
print("Max y_val:", np.max(y_val))
print("Min y_val:", np.min(y_val))


NaNs in X_val: 0
NaNs in y_val: 3525
Infs in X_val: 0
Infs in y_val: 0
Max y_val: nan
Min y_val: nan


In [33]:
history = model.fit(
    X_train_clean, y_train_clean,
    validation_data=(X_val, y_val),
    epochs=30,
    batch_size=128,
    verbose=1
)


Epoch 1/30
[1m817/817[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 13ms/step - loss: 0.0490 - mae: 0.1590 - val_loss: nan - val_mae: nan
Epoch 2/30
[1m817/817[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 11ms/step - loss: 0.0104 - mae: 0.0789 - val_loss: nan - val_mae: nan
Epoch 3/30
[1m817/817[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 13ms/step - loss: 0.0087 - mae: 0.0716 - val_loss: nan - val_mae: nan
Epoch 4/30
[1m817/817[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 13ms/step - loss: 0.0076 - mae: 0.0665 - val_loss: nan - val_mae: nan
Epoch 5/30
[1m817/817[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 13ms/step - loss: 0.0067 - mae: 0.0620 - val_loss: nan - val_mae: nan
Epoch 6/30
[1m817/817[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 12ms/step - loss: 0.0059 - mae: 0.0583 - val_loss: nan - val_mae: nan
Epoch 7/30
[1m817/817[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 12ms/step - loss: 0.0053 - mae: 0.0

KeyboardInterrupt: 

In [43]:
print("X_val shape:", X_val.shape)
print("y_val shape:", y_val.shape)


X_val shape: (27344, 100, 4)
y_val shape: (27344, 3)


In [44]:
# Step 1: Reshape X_val from (27344, 100, 4) to (27344, 400)
X_val_flattened = X_val.reshape(X_val.shape[0], -1)

# Step 2: Concatenate X_val_flattened and y_val for easier NaN/Inf checking
X_y_val_combined = np.concatenate([X_val_flattened, y_val], axis=1)

# Step 3: Check for NaN or Inf in the combined dataset
valid_rows = np.isfinite(X_y_val_combined).all(axis=1)

# Step 4: Filter out the invalid rows
X_val_clean = X_val_flattened[valid_rows]
y_val_clean = y_val[valid_rows]

# Check the cleaned data shapes
print("Cleaned validation set shape:", X_val_clean.shape, y_val_clean.shape)


Cleaned validation set shape: (26169, 400) (26169, 3)


In [46]:
# Check the shape of the data
print("Shape of X_train_clean:", X_train_clean.shape)
print("Shape of y_train_clean:", y_train_clean.shape)


Shape of X_train_clean: (104566, 100, 4)
Shape of y_train_clean: (104566, 3)


In [49]:
# Reshape the data to match the expected input shape (None, 100, 4)
X_train_clean = X_train_clean.reshape(-1, 100, 4)
X_val_clean = X_val_clean.reshape(-1, 100, 4)

# Now proceed with model training
history = model.fit(
    X_train_clean, y_train_clean,
    validation_data=(X_val_clean, y_val_clean),
    epochs=30,
    batch_size=64
)


Epoch 1/30
[1m1634/1634[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - loss: 0.0030 - mae: 0.0375 - val_loss: 0.0029 - val_mae: 0.0365
Epoch 2/30
[1m1634/1634[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - loss: 0.0029 - mae: 0.0366 - val_loss: 0.0029 - val_mae: 0.0374
Epoch 3/30
[1m1634/1634[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - loss: 0.0028 - mae: 0.0361 - val_loss: 0.0028 - val_mae: 0.0355
Epoch 4/30
[1m1634/1634[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - loss: 0.0028 - mae: 0.0357 - val_loss: 0.0027 - val_mae: 0.0350
Epoch 5/30
[1m1634/1634[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - loss: 0.0027 - mae: 0.0353 - val_loss: 0.0027 - val_mae: 0.0349
Epoch 6/30
[1m1634/1634[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - loss: 0.0027 - mae: 0.0352 - val_loss: 0.0026 - val_mae: 0.0341
Epoch 7/30
[1m1634/1634[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

In [50]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Predict on the validation data
y_val_pred = model.predict(X_val_clean)

# Calculate MAE
mae = mean_absolute_error(y_val_clean, y_val_pred)

# Calculate MSE
mse = mean_squared_error(y_val_clean, y_val_pred)

# Calculate RMSE
rmse = np.sqrt(mse)

# Calculate R²
r2 = r2_score(y_val_clean, y_val_pred)

# Print the metrics
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R² Score: {r2}")


[1m818/818[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Mean Absolute Error (MAE): 0.03327726512096652
Mean Squared Error (MSE): 0.002569209700334332
Root Mean Squared Error (RMSE): 0.05068737219795806
R² Score: 0.747015344685854


In [51]:
# Save the model
model.save('rna_3d_model.h5')


