In [17]:
import boto3
import torch
import os
import tarfile
import numpy as np
from source.lipnet import LipNet

In [2]:
# Step 1: Download the model from S3
# Local destination for the archive, followed by its location in the bucket.
local_model_path = 'model.tar.gz'
model_key = 'models/pytorch-training-2025-05-09-14-46-05-805/output/model.tar.gz'
bucket_name = 'slip-ml'

# S3 client reused by the download cells further down.
s3 = boto3.client('s3')

In [3]:
# Copy the model archive from the bucket to local_model_path.
s3.download_file(Bucket=bucket_name, Key=model_key, Filename=local_model_path)

In [4]:
# Extract the model file into ./model without letting archive members escape
# that directory.
with tarfile.open(local_model_path, 'r:gz') as tar:
    if hasattr(tarfile, 'data_filter'):
        # Extraction filters are available: 'data' rejects absolute paths,
        # ".." escapes, links pointing outside the destination and device files.
        tar.extractall(path='model', filter='data')
    else:
        # Older Python without extraction filters: refuse any member whose
        # resolved path would land outside the destination directory.
        dest_root = os.path.realpath('model')
        for member in tar.getmembers():
            member_target = os.path.realpath(os.path.join(dest_root, member.name))
            if os.path.commonpath([dest_root, member_target]) != dest_root:
                raise ValueError(f"Unsafe path in archive: {member.name}")
        tar.extractall(path='model')

In [22]:
# Step 2: Load the model
model_path = 'model/model.pth'  # Replace with the actual path of the model file
model = LipNet(img_c=3, img_w=100, img_h=50, frames_n=90)

# map_location="cpu" lets a checkpoint that was saved from a GPU load on a
# CPU-only machine; load_state_dict below copies the values into the model's
# own parameters, so the result is the same on GPU hosts.
# NOTE(review): torch.load unpickles the file - only load checkpoints you trust.
state_dict = torch.load(model_path, map_location="cpu")

# Strip any leading "module." wrappers from the parameter names (such prefixes
# are typically added when a model is saved while wrapped in
# DataParallel / DistributedDataParallel). Only the start of the key is
# touched, and both "module." and "module.module." are handled.
wrapper_prefix = "module."
new_state_dict = {}
for key, value in state_dict.items():
    new_key = key
    while new_key.startswith(wrapper_prefix):
        new_key = new_key[len(wrapper_prefix):]
    new_state_dict[new_key] = value

# Load the modified state_dict into the model
model.load_state_dict(new_state_dict)

# Set the model to evaluation mode (disables dropout); as the last expression
# of the cell this also displays the model summary.
model.eval()

Estimated feature map size: 1728


LipNet(
  (conv1): Conv3d(3, 32, kernel_size=(3, 5, 5), stride=(1, 2, 2), padding=(1, 2, 2))
  (pool1): MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2), padding=0, dilation=1, ceil_mode=False)
  (dropout1): Dropout(p=0.5, inplace=False)
  (conv2): Conv3d(32, 64, kernel_size=(3, 5, 5), stride=(1, 1, 1), padding=(1, 2, 2))
  (pool2): MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2), padding=0, dilation=1, ceil_mode=False)
  (dropout2): Dropout(p=0.5, inplace=False)
  (conv3): Conv3d(64, 96, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
  (pool3): MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2), padding=0, dilation=1, ceil_mode=False)
  (dropout3): Dropout(p=0.5, inplace=False)
  (gru1): GRU(1728, 256, batch_first=True, bidirectional=True)
  (gru2): GRU(512, 256, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=512, out_features=31, bias=True)
)

In [8]:
# Step 3: Download test data from S3
# Local file name first, then the object key of the sample inside the bucket.
local_test_data_path = 'test_data.npz'
test_data_key = 'data/preprocessing-2/test/videos/_xTCtX0E8H4__numpy__0.npz'

In [9]:
# Copy the test sample from the bucket to local_test_data_path.
s3.download_file(Bucket=bucket_name, Key=test_data_key, Filename=local_test_data_path)

In [27]:
# Pickle support stays off: the frames array is plain numeric data, and
# allowing pickles would let a tampered .npz file execute arbitrary code.
with np.load(local_test_data_path, allow_pickle=False) as data:
    # Assuming the key for frames in the .npz file is 'frames'
    frames = data['frames']

In [28]:
# Sanity check: show the shape of the array loaded from the .npz file
# before it is rearranged for the model.
print("Shape of frames before permute:", frames.shape)

Shape of frames before permute: (90, 50, 100, 3)


In [29]:
# Convert the loaded frames to a float32 tensor in channel-first layout.
# Anything that is not 3-D or 4-D is rejected up front.
if frames.ndim not in (3, 4):
    raise ValueError(f"Unexpected shape for frames: {frames.shape}")

frames = (
    # 4-D input is expected as (T, H, W, C) and becomes (C, T, H, W)
    torch.tensor(frames, dtype=torch.float32).permute(3, 0, 1, 2)
    if frames.ndim == 4
    # 3-D input has no channel axis, so one is added in front
    else torch.tensor(frames, dtype=torch.float32).unsqueeze(0)
)

In [30]:
# Sanity check: show the tensor shape after the layout conversion above.
print("Shape of frames after permute:", frames.shape)

Shape of frames after permute: torch.Size([3, 90, 50, 100])


In [32]:
# Prepend a batch axis of size 1 so the single sample forms a batch.
test_frames = torch.unsqueeze(frames, dim=0)

In [33]:
# Step 4: Run inference
# Gradient tracking is switched off for the forward pass.
with torch.no_grad():
    results = model(test_frames)
# Print the raw output tensor returned by the model.
print("Inference results:", results)

After conv layers - Channels: 96, Height: 3, Width: 6
Inference results: tensor([[[-1.1817, -2.9836, -4.4086,  ..., -7.1071, -6.4708, -6.2199],
         [-0.7410, -3.1898, -4.8516,  ..., -7.0610, -6.8473, -6.7695],
         [-0.5099, -3.4486, -5.1885,  ..., -6.9988, -7.0927, -7.1037],
         ...,
         [-0.8665, -3.5289, -4.7868,  ..., -2.6968, -4.6994, -5.2443],
         [-1.3458, -3.7186, -4.7751,  ..., -1.4195, -3.9312, -4.5409],
         [-2.4514, -4.4250, -5.1537,  ..., -0.5200, -3.4915, -4.2173]]])


In [39]:
def decode_ctc(predictions, idx_to_char, blank=0):
    """
    Decode CTC output using greedy decoding.

    Consecutive repeats of the same index are collapsed into one, and every
    occurrence of the blank index is dropped.

    Args:
        predictions: Per-time-step class indices of shape (batch_size, T),
            given as a torch.Tensor, a NumPy array or nested Python sequences.
            A 3-D tensor/array of scores with shape
            (batch_size, T, num_classes) is also accepted; it is reduced with
            an argmax over the last axis first.
        idx_to_char (dict): Mapping from character indices to characters.
        blank (int): Index of the blank token.

    Returns:
        list: Decoded text for each sample in the batch.

    Raises:
        KeyError: If a non-blank index is missing from ``idx_to_char``.
    """
    # Raw scores: pick the most likely class at every time step.
    if getattr(predictions, "ndim", None) == 3:
        predictions = predictions.argmax(-1)

    # Convert once to plain Python ints instead of comparing 0-d tensors
    # element by element; this also lets plain lists pass straight through.
    if hasattr(predictions, "tolist"):
        predictions = predictions.tolist()

    decoded_texts = []
    for sequence in predictions:
        decoded_sequence = []
        prev_token = None
        for token in sequence:
            token = int(token)
            # Keep a token only if it is not blank and not a repeat of the
            # immediately preceding time step.
            if token != blank and token != prev_token:
                decoded_sequence.append(idx_to_char[token])
            prev_token = token
        decoded_texts.append("".join(decoded_sequence))
    return decoded_texts

# Example usage:
# `results` is taken to be the model output of shape (batch_size, T, num_classes).
# Example mapping: index 0 is a space, 1-26 are a-z, 27-30 are ' . ? !
idx_to_char = dict(enumerate(" abcdefghijklmnopqrstuvwxyz'.?!"))

# Most likely class index at every time step; shape (batch_size, T)
predicted_indices = results.argmax(dim=-1)

# Greedy CTC decoding of the index sequences
decoded_texts = decode_ctc(predicted_indices, idx_to_char)

# Report one line per sample, numbered from 1
for i, text in enumerate(decoded_texts):
    print(f"Sample {i + 1}: {text}")

Sample 1: .


In [41]:
# Display the per-time-step argmax indices that were passed to decode_ctc.
predicted_indices

tensor([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 28]])

In [42]:
import torch
import torch.nn.functional as F

# Width of each one-hot row (covers indices 0-31).
# NOTE(review): the model's final layer has 31 outputs and idx_to_char covers
# indices 0-30, so index 31 is never used here - confirm 32 is intended.
num_classes = 32

# Example input: a 1D tensor of label indices
labels = torch.tensor([9, 20, 27, 19, 0, 20, 15, 15, 0, 19, 12, 15, 23, 0, 9, 20, 27, 19, 0, 20, 15, 15, 0, 19, 12, 15, 23, 28])

# One-hot encode, then cast to float (e.g., for compatibility with loss functions)
one_hot_labels = F.one_hot(labels, num_classes=num_classes).float()

print("Original labels:", labels)
print("One-hot encoded labels shape:", one_hot_labels.shape)
print("One-hot encoded labels:\n", one_hot_labels)

Original labels: tensor([ 9, 20, 27, 19,  0, 20, 15, 15,  0, 19, 12, 15, 23,  0,  9, 20, 27, 19,
         0, 20, 15, 15,  0, 19, 12, 15, 23, 28])
One-hot encoded labels shape: torch.Size([28, 32])
One-hot encoded labels:
 tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.