# 🎥 CNN-Encoder + LSTM-Decoder for Video Frame Prediction
This Colab-ready notebook implements a CNN encoder + LSTM decoder model in TensorFlow/Keras to predict future frames from an input video sequence. The Moving MNIST dataset is used as a sample.

**Architecture:**
- CNN Encoder: Extracts spatial features from each input frame.
- LSTM Decoder: Learns temporal dynamics across frames.
- Output: Predicts the next frame(s) in the sequence.

In [1]:
!pip install tensorflow matplotlib scipy



In [2]:
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras import layers, models, Input
from tensorflow.keras.datasets import mnist
from scipy.ndimage import zoom

In [5]:
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras import layers, models, Input
from tensorflow.keras.datasets import mnist
from scipy.ndimage import zoom

def generate_moving_mnist(seq_length=10, image_size=64):
    (x_train, _), _ = mnist.load_data()
    img = x_train[np.random.choice(len(x_train))]
    # Apply zoom to the image before using it
    img = zoom(img, (image_size / 28, image_size / 28))
    sequence = []
    # Starting position for the image within the larger frame
    x, y = np.random.randint(0, image_size - img.shape[0], size=2)
    dx, dy = np.random.choice([-2, 2], size=2)

    for _ in range(seq_length):
        frame = np.zeros((image_size, image_size), dtype=np.float32)
        # Use the zoomed image's shape for slicing
        frame[x:x+img.shape[0], y:y+img.shape[1]] = img / 255.0
        sequence.append(frame)
        x += dx
        y += dy
        # Use the zoomed image's shape for boundary checks
        if x < 0 or x > image_size - img.shape[0]: dx *= -1
        if y < 0 or y > image_size - img.shape[1]: dy *= -1

    return np.array(sequence)

In [6]:
sequence = generate_moving_mnist(10)
sequence = np.expand_dims(sequence, -1)  # (10, 64, 64, 1)

X_seq = sequence[:5]  # Input: first 5 frames
y_seq = sequence[5:]  # Target: next 5 frames

# Normalize and expand dims for batch training
X_seq = X_seq[np.newaxis, ...]  # (1, 5, 64, 64, 1)
y_seq = y_seq[np.newaxis, ...]

ValueError: high <= 0

In [None]:
# CNN Encoder for each frame
cnn_encoder = models.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', padding='same'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
    layers.MaxPooling2D((2, 2)),
    layers.Flatten(),
    layers.Dense(256, activation='relu')
])

# Apply encoder to each frame
time_steps = 5
encoded_inputs = layers.TimeDistributed(cnn_encoder)(Input(shape=(time_steps, 64, 64, 1)))

# LSTM Decoder
lstm = layers.LSTM(256, return_sequences=True)(encoded_inputs)
decoded = layers.TimeDistributed(layers.Dense(64*64, activation='sigmoid'))(lstm)
decoded = layers.TimeDistributed(layers.Reshape((64, 64, 1)))(decoded)

model = models.Model(inputs=encoded_inputs._keras_history[0].input, outputs=decoded)
model.compile(optimizer='adam', loss='mse')
model.summary()

In [None]:
model.fit(X_seq, y_seq, epochs=100, verbose=1)

In [None]:
pred_seq = model.predict(X_seq)

for i in range(5):
    plt.subplot(2, 5, i + 1)
    plt.imshow(y_seq[0, i, ..., 0], cmap='gray')
    plt.axis('off')
    plt.title('GT')

    plt.subplot(2, 5, i + 6)
    plt.imshow(pred_seq[0, i, ..., 0], cmap='gray')
    plt.axis('off')
    plt.title('Pred')

plt.tight_layout()
plt.show()