In [1]:
# Importing the libraries for video classification
import torch, os, cv2
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torchvision import transforms
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from PIL import Image
from lipreading.model import Lipreading
from lipreading.optim_utils import CosineScheduler

# 2. Initialize the seed and the device

In [2]:
# Setting the seed for reproducibility
seed = 0
def reset_seed():
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

if torch.cuda.is_available():
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Setting the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 3. Dataset preparation

## 3.1. List of Classes

In [3]:
def extract_label(file):
    label = []
    diacritics = {
        '\u064B',  # Fathatan
        '\u064C',  # Dammatan
        '\u064D',  # Kasratan
        '\u064E',  # Fatha
        '\u064F',  # Damma
        '\u0650',  # Kasra
        '\u0651',  # Shadda
        '\u0652',  # Sukun
        '\u06E2',  # Small High meem
    }

    sentence = pd.read_csv(file)
    for word in sentence.word:
        for char in word:
            if char not in diacritics:
                label.append(char)
            else:
                label[-1] += char

    return label

classes = set()
for i in os.listdir('Dataset/Csv (with Diacritics)'):
    file = 'Dataset/Csv (with Diacritics)/' + i
    label = extract_label(file)
    classes.update(label)

mapped_classes = {}
for i, c in enumerate(sorted(classes, reverse=True), 1):
    mapped_classes[c] = i

print(mapped_classes)

{'ٱ': 1, 'يْ': 2, 'يّْ': 3, 'يِّ': 4, 'يُّ': 5, 'يَّ': 6, 'يٌّ': 7, 'يِ': 8, 'يُ': 9, 'يَ': 10, 'يٌ': 11, 'ي': 12, 'ى': 13, 'وْ': 14, 'وِّ': 15, 'وُّ': 16, 'وَّ': 17, 'وِ': 18, 'وُ': 19, 'وَ': 20, 'وً': 21, 'و': 22, 'هْ': 23, 'هُّ': 24, 'هِ': 25, 'هُ': 26, 'هَ': 27, 'نۢ': 28, 'نْ': 29, 'نِّ': 30, 'نُّ': 31, 'نَّ': 32, 'نِ': 33, 'نُ': 34, 'نَ': 35, 'ن': 36, 'مْ': 37, 'مّْ': 38, 'مِّ': 39, 'مُّ': 40, 'مَّ': 41, 'مِ': 42, 'مُ': 43, 'مَ': 44, 'مٍ': 45, 'مٌ': 46, 'مً': 47, 'لْ': 48, 'لّْ': 49, 'لِّ': 50, 'لُّ': 51, 'لَّ': 52, 'لِ': 53, 'لُ': 54, 'لَ': 55, 'لٍ': 56, 'لٌ': 57, 'لً': 58, 'ل': 59, 'كْ': 60, 'كِّ': 61, 'كَّ': 62, 'كِ': 63, 'كُ': 64, 'كَ': 65, 'ك': 66, 'قْ': 67, 'قَّ': 68, 'قِ': 69, 'قُ': 70, 'قَ': 71, 'قٍ': 72, 'قً': 73, 'ق': 74, 'فْ': 75, 'فِّ': 76, 'فَّ': 77, 'فِ': 78, 'فُ': 79, 'فَ': 80, 'غْ': 81, 'غِ': 82, 'غَ': 83, 'عْ': 84, 'عَّ': 85, 'عِ': 86, 'عُ': 87, 'عَ': 88, 'عٍ': 89, 'ظْ': 90, 'ظِّ': 91, 'ظَّ': 92, 'ظِ': 93, 'ظُ': 94, 'ظَ': 95, 'طْ': 96, 'طِّ': 97, 'طَّ': 98, 'طِ': 

## 3.2. Video Dataset Class

In [4]:
# Defining the video dataset class
class VideoDataset(torch.utils.data.Dataset):
    def __init__(self, video_paths, label_paths, transform=None):
        self.video_paths = video_paths
        self.label_paths = label_paths
        self.transform = transform
        
    def __len__(self):
        return len(self.video_paths)
    
    def __getitem__(self, index):
        video_path = self.video_paths[index]
        label_path = self.label_paths[index]
        frames = self.load_frames(video_path=video_path)
        label = torch.tensor(list(map(lambda x: mapped_classes[x], extract_label(label_path))))
        input_length = torch.tensor(len(frames), dtype=torch.long)
        label_length = torch.tensor(len(label), dtype=torch.long)
        return frames, input_length, label, label_length
    
    def load_frames(self, video_path):
        frames = []
        video = cv2.VideoCapture(video_path)
        total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        for i in range(total_frames):
            video.set(cv2.CAP_PROP_POS_FRAMES, i)
            ret, frame = video.read()
            if ret:
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                frame_pil = Image.fromarray(frame, 'L')
                frames.append(frame_pil)

        if self.transform is not None:
            frames = [self.transform(frame) for frame in frames] 
        frames = torch.stack(frames).permute(1, 0, 2, 3)
        return frames

# Defining the video transform
transforms = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
    transforms.Normalize(mean=0.421, std=0.165),
])

## 3.2. Load the dataset

In [5]:
videos_dir = "Dataset/Video"
labels_dir = "Dataset/Csv (with Diacritics)"
videos, labels = [], []
file_names = [file_name[:-4] for file_name in os.listdir(videos_dir)]
for file_name in file_names:
    videos.append(os.path.join(videos_dir, file_name + ".mp4"))
    labels.append(os.path.join(labels_dir, file_name + ".csv"))

## 3.3. Split the dataset

In [6]:
# Split the dataset into training, validation, test sets
X_temp, X_test, y_temp, y_test = train_test_split(videos, labels, test_size=0.1000, random_state=seed)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.1111, random_state=seed)

## 3.4. DataLoaders

In [7]:
def pad_packed_collate(batch):
    """Pads data and labels with different lengths in the same batch
    """
    data_list, input_lengths, labels_list, label_lengths = zip(*batch)
    c, max_len, h, w = max(data_list, key=lambda x: x.shape[1]).shape

    data = torch.zeros((len(data_list), c, max_len, h, w))
    
    # Only copy up to the actual sequence length
    for idx in range(len(data)):
        data[idx, :, :input_lengths[idx], :, :] = data_list[idx][:, :input_lengths[idx], :, :]
    
    # Flatten labels for CTC loss
    labels_flat = []
    for label_seq in labels_list:
        labels_flat.extend(label_seq)
    labels_flat = torch.LongTensor(labels_flat)
    
    # Convert lengths to tensor
    input_lengths = torch.LongTensor(input_lengths)
    label_lengths = torch.LongTensor(label_lengths)
    return data, input_lengths, labels_flat, label_lengths


# Defining the video dataloaders (train, validation, test)
train_dataset = VideoDataset(X_train, y_train, transform=transforms)
val_dataset = VideoDataset(X_val, y_val, transform=transforms)
test_dataset = VideoDataset(X_test, y_test, transform=transforms)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, pin_memory=True, collate_fn=pad_packed_collate)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, pin_memory=True, collate_fn=pad_packed_collate)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, pin_memory=True, collate_fn=pad_packed_collate)

# 4. Model

In [8]:
# Add the greedy CTC decoder functions
def greedy_ctc_decoder(logits, blank_index=0):
    """
    Greedy decoding for CTC.
    Assumes logits shape is (T, C) (log probabilities).
    Returns a list of predicted indices (for one sample).
    """
    # Convert to numpy if it's a tensor
    if isinstance(logits, torch.Tensor):
        logits = logits.detach().cpu().numpy()
    
    # Get the highest probability index at each timestep
    indices = np.argmax(logits, axis=1)  # (T,)
    
    # Remove duplicates and blanks
    filtered_indices = []
    prev_idx = -1
    for idx in indices:
        if idx != blank_index and idx != prev_idx:  # Skip blanks and duplicates
            filtered_indices.append(idx)
        prev_idx = idx
    
    return filtered_indices

def indices_to_text(indices, idx2char):
    """
    Converts a list of indices to text using the reverse vocabulary mapping.
    """
    return ''.join([idx2char.get(i, '') for i in indices])

def compute_cer(reference_indices, hypothesis_indices):
    """
    Computes Character Error Rate (CER) directly using token indices.
    Takes raw token indices from our vocabulary (class_mapping.txt) rather than Unicode text.
    
    Returns a tuple of (CER, reference_len, hypothesis_len, edit_distance)
    """
    # Use the indices directly - each index is one token in our vocabulary
    ref_tokens = reference_indices
    hyp_tokens = hypothesis_indices
    
    print(f"Debug - Reference tokens ({len(ref_tokens)} tokens): {ref_tokens}")
    print(f"Debug - Hypothesis tokens ({len(hyp_tokens)} tokens): {hyp_tokens}")
    
    m, n = len(ref_tokens), len(hyp_tokens)
    
    # Initialize the distance matrix
    dp = [[0 for _ in range(n + 1)] for _ in range(m + 1)]
    
    # Base cases: empty hypothesis or reference
    for i in range(m + 1):
        dp[i][0] = i
    for j in range(n + 1):
        dp[0][j] = j
    
    # Fill the distance matrix
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            # If tokens match, no operation needed
            if ref_tokens[i - 1] == hyp_tokens[j - 1]:
                dp[i][j] = dp[i - 1][j - 1]
            else:
                # Minimum of:
                # 1. Substitution: dp[i-1][j-1] + 1
                # 2. Insertion: dp[i][j-1] + 1
                # 3. Deletion: dp[i-1][j] + 1
                dp[i][j] = min(dp[i - 1][j - 1] + 1,  # substitution
                              dp[i][j - 1] + 1,      # insertion
                              dp[i - 1][j] + 1)      # deletion
    
    edit_distance = dp[m][n]
    cer = edit_distance / max(m, 1)  # Avoid division by zero
    
    return cer, edit_distance

In [9]:
# Initializing the hyper-parameters
densetcn_options = {
    'block_config': [3, 3, 3, 3],               # Number of layers in each dense block
    'growth_rate_set': [384, 384, 384, 384],    # Growth rate for each block (must be divisible by len(kernel_size_set))
    'reduced_size': 512,                        # Reduced size between blocks (must be divisible by len(kernel_size_set))
    'kernel_size_set': [3, 5, 7],               # Kernel sizes for multi-scale processing
    'dilation_size_set': [1, 2, 5],             # Dilation rates for increasing receptive field
    'squeeze_excitation': True,                 # Whether to use SE blocks for channel attention
    'dropout': 0.2                              # Dropout rate
}
initial_lr = 3e-4
total_epochs = 80
scheduler = CosineScheduler(initial_lr, total_epochs)

# Build reverse mapping for decoding
idx2char = {v: k for k, v in mapped_classes.items()}
idx2char[0] = ""  # Blank token for CTC

# Initializing the model
model = Lipreading(densetcn_options=densetcn_options, hidden_dim=512, num_classes=len(mapped_classes) + 1, relu_type='prelu').to(device)
print(model)

# Defining the loss function and optimizer
optimizer = optim.Adam(model.parameters(), lr=initial_lr)

Lipreading(
  (frontend): Sequential(
    (0): Conv3d(1, 64, kernel_size=(1, 7, 7), stride=(1, 2, 2), padding=(0, 3, 3), bias=False)
    (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1), dilation=1, ceil_mode=False)
  )
  (trunk): ResNet(
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu1): PReLU(num_parameters=64)
        (relu2): PReLU(num_parameters=64)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=

# 5. Training and Evaluation

In [10]:
# Training the model
def train_one_epoch():
    running_loss = 0.0
    model.train()
    ctc_loss = nn.CTCLoss(blank=0, zero_infinity=True)
    
    for batch_idx, (inputs, input_lengths, labels_flat, label_lengths) in enumerate(train_loader):
        # Print input shape for debugging
        print(f"Batch {batch_idx+1} - Input shape: {inputs.shape}")
        
        # Move data to device
        inputs = inputs.to(device)
        input_lengths = input_lengths.to(device)
        labels_flat = labels_flat.to(device)
        label_lengths = label_lengths.to(device)
        
        # Zero the gradients
        optimizer.zero_grad()
        
        # Forward pass - get sequence logits
        logits = model(inputs, input_lengths)
        output_lengths = torch.full((logits.size(0),), logits.size(1), dtype=torch.long, device=device)

        # Print shape to verify sequence output
        print(f"Batch {batch_idx+1} - Logits shape: {logits.shape}")
        
        # Apply log_softmax for CTC
        log_probs = F.log_softmax(logits, dim=2)  # (B, T, C)
        
        # Prepare for CTC loss - requires (T, B, C) format
        outputs_for_ctc = log_probs.transpose(0, 1)  # from (B, T, C) to (T, B, C)
        
        # Compute CTC loss
        loss = ctc_loss(outputs_for_ctc, labels_flat, output_lengths, label_lengths)
        print(f"Batch {batch_idx+1} - Loss: {loss.item()}")
        
        # Backward pass and optimize
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    return running_loss / len(train_loader)

def evaluate_model(data_loader):
    model.eval()
    ctc_loss = nn.CTCLoss(blank=0, zero_infinity=True)

    # Track statistics
    total_cer = 0
    total_loss = 0
    total_edit_distance = 0
    
    # Process all batches in the test loader
    with torch.no_grad():
        for i, (inputs, input_lengths, labels_flat, label_lengths) in enumerate(data_loader):
            # Move to device
            inputs = inputs.to(device)
            input_lengths = input_lengths.to(device)
            labels_flat = labels_flat.to(device)
            label_lengths = label_lengths.to(device)
            
            # Forward pass through the entire model
            batch_size = inputs.size(0)
            logits = model(inputs, input_lengths)
            
            # Apply log_softmax for CTC
            log_probs = F.log_softmax(logits, dim=2)  # (B, T, C)
            
            # For CTC loss we need (T, N, C) format
            log_probs_ctc = log_probs.transpose(0, 1)  # (T, B, C)
            
            # Make sure output_lengths are not greater than the input lengths
            output_lengths = torch.full((logits.size(0),), logits.size(1), dtype=torch.long, device=device)
            # if output_lengths.max() > input_lengths.min():
            #     scale_factor = input_lengths.min().float() / output_lengths.max().float()
            #     output_lengths = (output_lengths.float() * scale_factor).long()
            
            # Calculate CTC loss
            loss = ctc_loss(log_probs_ctc, labels_flat, output_lengths, label_lengths)
            
            # Decode predictions - we convert back to numpy for greedy decoding
            logits_np = log_probs.cpu().detach().numpy()  # (B, T, C)
            
            # Process each batch item
            for b in range(batch_size):
                # Get batch item logits
                batch_logits = logits_np[b]  # (T, C)
                
                # Decode using CTC
                pred_indices = greedy_ctc_decoder(batch_logits)
                
                # Get target indices
                start_idx = sum(label_lengths[:b].cpu().tolist()) if b > 0 else 0
                end_idx = start_idx + label_lengths[b].item()
                target_idx = labels_flat[start_idx:end_idx].cpu().numpy()
                
                # Convert indices to text
                pred_text = indices_to_text(pred_indices, idx2char)
                target_text = indices_to_text(target_idx, idx2char)
                
                # Compute CER directly using token indices
                cer, edit_distance = compute_cer(target_idx, pred_indices)
                
                # Update statistics
                total_cer += cer
                total_loss += loss.item()
                total_edit_distance += edit_distance
                
                # Print info
                print("-" * 50)
                print(f"Sample {i * batch_size + b + 1}:")
                print(f"Predicted text: {pred_text}")
                print(f"Target text: {target_text}")
                print(f"Edit Distance: {edit_distance}")
                print(f"CER: {cer:.4f}")
                print(f"CTC Loss: {loss.item():.4f}")
                print("-" * 50)
        
        # Write summary statistics
        n_samples = len(data_loader.dataset)
        avg_cer = total_cer / n_samples
        avg_loss = total_loss / n_samples
        avg_edit_distance = total_edit_distance / n_samples
        
        print("=== Summary Statistics ===")
        print(f"Total samples: {n_samples}")
        print(f"Average CER: {avg_cer:.4f}")
        print(f"Average Edit Distance: {avg_edit_distance:.2f}")
        print(f"Average Loss: {avg_loss:.4f}")

    return total_loss / len(data_loader)

In [11]:
def train_model():
    # Train and validate
    for epoch in range(total_epochs):
        train_one_epoch()
        scheduler.adjust_lr(optimizer, epoch)
        val_loss = evaluate_model(val_loader)
        print(f"Epoch {epoch + 1}/{total_epochs}, Val Loss: {val_loss:.4f}")

In [None]:
reset_seed()
train_model()

Batch 1 - Input shape: torch.Size([32, 1, 31, 128, 128])
Batch 1 - Logits shape: torch.Size([32, 31, 238])
Batch 1 - Loss: 10.8089599609375
Batch 2 - Input shape: torch.Size([32, 1, 37, 128, 128])
Batch 2 - Logits shape: torch.Size([32, 37, 238])
Batch 2 - Loss: 12.21811294555664
Batch 3 - Input shape: torch.Size([32, 1, 37, 128, 128])
Batch 3 - Logits shape: torch.Size([32, 37, 238])
Batch 3 - Loss: 6.976871490478516
Batch 4 - Input shape: torch.Size([32, 1, 37, 128, 128])
Batch 4 - Logits shape: torch.Size([32, 37, 238])
Batch 4 - Loss: 5.949538230895996
Batch 5 - Input shape: torch.Size([32, 1, 37, 128, 128])
Batch 5 - Logits shape: torch.Size([32, 37, 238])
Batch 5 - Loss: 5.508829116821289
Batch 6 - Input shape: torch.Size([32, 1, 37, 128, 128])
Batch 6 - Logits shape: torch.Size([32, 37, 238])
Batch 6 - Loss: 5.248185157775879
Batch 7 - Input shape: torch.Size([32, 1, 37, 128, 128])
Batch 7 - Logits shape: torch.Size([32, 37, 238])
Batch 7 - Loss: 5.169351577758789
Batch 8 - Inpu