In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np

In [2]:
# Define parameters
num_samples = 1000
sequence_length = 100  # Length of DNA sequence
num_classes = 4  # A=0, C=1, G=2, T=3
dna_mapping = {0: 'A', 1: 'C', 2: 'G', 3: 'T'}

In [3]:
# Generate synthetic parental DNA sequences
parent1_data = np.random.randint(0, 4, size=(num_samples, sequence_length), dtype=np.int32)
parent2_data = np.random.randint(0, 4, size=(num_samples, sequence_length), dtype=np.int32)

print(parent1_data,parent2_data)

[[0 3 1 ... 2 3 2]
 [2 1 0 ... 1 2 3]
 [3 1 2 ... 2 3 1]
 ...
 [2 0 3 ... 2 1 1]
 [2 0 0 ... 0 3 0]
 [2 3 3 ... 3 1 2]] [[2 3 1 ... 0 1 2]
 [1 0 0 ... 3 1 1]
 [1 3 1 ... 3 2 1]
 ...
 [0 0 0 ... 1 2 2]
 [1 3 0 ... 0 2 2]
 [3 0 3 ... 1 1 0]]


In [4]:
# Simulate child DNA sequences (taking random mix from parents)
child_data = np.where(np.random.rand(num_samples, sequence_length) > 0.5, parent1_data, parent2_data)
print(child_data)


[[0 3 1 ... 0 1 2]
 [2 0 0 ... 3 1 3]
 [1 1 1 ... 3 2 1]
 ...
 [2 0 0 ... 1 2 2]
 [2 0 0 ... 0 3 2]
 [3 3 3 ... 1 1 2]]


In [5]:
# Convert to one-hot encoding
parent1_data = tf.one_hot(parent1_data, depth=num_classes, dtype=tf.float32)
parent2_data = tf.one_hot(parent2_data, depth=num_classes, dtype=tf.float32)
child_data = tf.one_hot(child_data, depth=num_classes, dtype=tf.float32)
print(parent1_data,parent2_data, child_data)

tf.Tensor(
[[[1. 0. 0. 0.]
  [0. 0. 0. 1.]
  [0. 1. 0. 0.]
  ...
  [0. 0. 1. 0.]
  [0. 0. 0. 1.]
  [0. 0. 1. 0.]]

 [[0. 0. 1. 0.]
  [0. 1. 0. 0.]
  [1. 0. 0. 0.]
  ...
  [0. 1. 0. 0.]
  [0. 0. 1. 0.]
  [0. 0. 0. 1.]]

 [[0. 0. 0. 1.]
  [0. 1. 0. 0.]
  [0. 0. 1. 0.]
  ...
  [0. 0. 1. 0.]
  [0. 0. 0. 1.]
  [0. 1. 0. 0.]]

 ...

 [[0. 0. 1. 0.]
  [1. 0. 0. 0.]
  [0. 0. 0. 1.]
  ...
  [0. 0. 1. 0.]
  [0. 1. 0. 0.]
  [0. 1. 0. 0.]]

 [[0. 0. 1. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  ...
  [1. 0. 0. 0.]
  [0. 0. 0. 1.]
  [1. 0. 0. 0.]]

 [[0. 0. 1. 0.]
  [0. 0. 0. 1.]
  [0. 0. 0. 1.]
  ...
  [0. 0. 0. 1.]
  [0. 1. 0. 0.]
  [0. 0. 1. 0.]]], shape=(1000, 100, 4), dtype=float32) tf.Tensor(
[[[0. 0. 1. 0.]
  [0. 0. 0. 1.]
  [0. 1. 0. 0.]
  ...
  [1. 0. 0. 0.]
  [0. 1. 0. 0.]
  [0. 0. 1. 0.]]

 [[0. 1. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  ...
  [0. 0. 0. 1.]
  [0. 1. 0. 0.]
  [0. 1. 0. 0.]]

 [[0. 1. 0. 0.]
  [0. 0. 0. 1.]
  [0. 1. 0. 0.]
  ...
  [0. 0. 0. 1.]
  [0. 0. 1. 0.]
  [0. 1. 0. 0.]]

In [6]:
# Concatenate parent data as input
x_data = tf.concat([parent1_data, parent2_data], axis=-1)
print(x_data)

tf.Tensor(
[[[1. 0. 0. ... 0. 1. 0.]
  [0. 0. 0. ... 0. 0. 1.]
  [0. 1. 0. ... 1. 0. 0.]
  ...
  [0. 0. 1. ... 0. 0. 0.]
  [0. 0. 0. ... 1. 0. 0.]
  [0. 0. 1. ... 0. 1. 0.]]

 [[0. 0. 1. ... 1. 0. 0.]
  [0. 1. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  ...
  [0. 1. 0. ... 0. 0. 1.]
  [0. 0. 1. ... 1. 0. 0.]
  [0. 0. 0. ... 1. 0. 0.]]

 [[0. 0. 0. ... 1. 0. 0.]
  [0. 1. 0. ... 0. 0. 1.]
  [0. 0. 1. ... 1. 0. 0.]
  ...
  [0. 0. 1. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 1. 0.]
  [0. 1. 0. ... 1. 0. 0.]]

 ...

 [[0. 0. 1. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 1. ... 1. 0. 0.]
  [0. 1. 0. ... 0. 1. 0.]
  [0. 1. 0. ... 0. 1. 0.]]

 [[0. 0. 1. ... 1. 0. 0.]
  [1. 0. 0. ... 0. 0. 1.]
  [1. 0. 0. ... 0. 0. 0.]
  ...
  [1. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 1. 0.]
  [1. 0. 0. ... 0. 1. 0.]]

 [[0. 0. 1. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 1.]
  ...
  [0. 0. 0. ... 1. 0. 0.]
  [0. 1. 0. ... 1. 0. 0.]
  [0. 0. 1. ... 0. 0. 0.]]]

In [7]:
# Split dataset into training and testing sets
split = int(0.8 * num_samples)
x_train, x_test = x_data[:split], x_data[split:]
y_train, y_test = child_data[:split], child_data[split:]

In [8]:
# Build the deep learning model
model = keras.Sequential([
    layers.Input(shape=(sequence_length, 8)),  # 4 channels each from 2 parents
    layers.Conv1D(filters=32, kernel_size=3, activation='relu'),
    layers.MaxPooling1D(pool_size=2),
    layers.Conv1D(filters=64, kernel_size=3, activation='relu'),
    layers.MaxPooling1D(pool_size=2),
    layers.Flatten(),
    layers.Dense(256, activation='relu'),
    layers.Dense(sequence_length * num_classes, activation='softmax'),
    layers.Reshape((sequence_length, num_classes))  # Reshape to match output format
])

In [9]:
# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [10]:
# Train the model
model.fit(x_train, y_train, epochs=100, batch_size=32, validation_data=(x_test, y_test))

Epoch 1/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 23ms/step - accuracy: 0.2510 - loss: 1.3904 - val_accuracy: 0.2471 - val_loss: 1.3866
Epoch 2/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.2718 - loss: 1.3846 - val_accuracy: 0.2546 - val_loss: 1.3873
Epoch 3/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.2890 - loss: 1.3799 - val_accuracy: 0.2628 - val_loss: 1.3878
Epoch 4/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.3119 - loss: 1.3708 - val_accuracy: 0.2687 - val_loss: 1.3866
Epoch 5/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.3519 - loss: 1.3473 - val_accuracy: 0.2887 - val_loss: 1.3808
Epoch 6/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.3949 - loss: 1.3069 - val_accuracy: 0.3017 - val_loss: 1.3792
Epoch 7/100
[1m25/25[0m [

<keras.src.callbacks.history.History at 0x286060c8f20>

In [11]:
# Evaluate the model
test_loss, test_acc = model.evaluate(x_test, y_test)
print(f"Test Accuracy: {test_acc:.4f}")

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.3464 - loss: 6.9145
Test Accuracy: 0.3489


In [12]:
# Predict a child's DNA based on given parents
sample_parent1 = np.random.randint(0, 4, size=(1, sequence_length), dtype=np.int32)
sample_parent2 = np.random.randint(0, 4, size=(1, sequence_length), dtype=np.int32)
sample_input = tf.concat([tf.one_hot(sample_parent1, depth=num_classes, dtype=tf.float32),
                          tf.one_hot(sample_parent2, depth=num_classes, dtype=tf.float32)], axis=-1)

In [13]:
predicted_child = model.predict(sample_input)
predicted_child_sequence = np.argmax(predicted_child, axis=-1)

# Convert encoded sequence back to DNA bases
decoded_child_dna = "".join([dna_mapping[nuc] for nuc in predicted_child_sequence[0]])
print(f"Predicted Child DNA Sequence: {decoded_child_dna}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 124ms/step
Predicted Child DNA Sequence: GAGGGCACAATGCCACACGAGCCTCCCTAATATACGTTTAGGTTGTTGGTTTTAATTCCCGCGCCGGGGAGAGTTGTGTATCCGATCCGGGCCAGCCTGT
