In [305]:
# Importing the libraries for video classification
import torch, os, cv2
from torchvision.models.video.resnet import BasicBlock, Conv3DSimple
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from torchvision import transforms
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from PIL import Image
from lipreading.model import Lipreading
from lipreading.optim_utils import CosineScheduler
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# 2. Initialize the seed and the device

In [306]:
# Setting the seed for reproducibility
seed = 0
def reset_seed():
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

if torch.cuda.is_available():
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Setting the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 3. Dataset preparation

## 3.1. List of Classes

In [307]:
def extract_label(file):
    label = []
    diacritics = {
        '\u064B',  # Fathatan
        '\u064C',  # Dammatan
        '\u064D',  # Kasratan
        '\u064E',  # Fatha
        '\u064F',  # Damma
        '\u0650',  # Kasra
        '\u0651',  # Shadda
        '\u0652',  # Sukun
        '\u06E2',  # Small High meem
    }

    sentence = pd.read_csv(file)
    for word in sentence.word:
        for char in word:
            if char not in diacritics:
                label.append(char)
            else:
                label[-1] += char

    return label

classes = set()
for i in os.listdir('Dataset/Csv (with Diacritics)'):
    file = 'Dataset/Csv (with Diacritics)/' + i
    label = extract_label(file)
    classes.update(label)

mapped_classes = {}
for i, c in enumerate(sorted(classes, reverse=True), 1):
    mapped_classes[c] = i

print(mapped_classes)

{'ٱ': 1, 'يْ': 2, 'يّْ': 3, 'يِّ': 4, 'يُّ': 5, 'يَّ': 6, 'يٌّ': 7, 'يِ': 8, 'يُ': 9, 'يَ': 10, 'يٌ': 11, 'ي': 12, 'ى': 13, 'وْ': 14, 'وِّ': 15, 'وُّ': 16, 'وَّ': 17, 'وِ': 18, 'وُ': 19, 'وَ': 20, 'وً': 21, 'و': 22, 'هْ': 23, 'هُّ': 24, 'هِ': 25, 'هُ': 26, 'هَ': 27, 'نۢ': 28, 'نْ': 29, 'نِّ': 30, 'نُّ': 31, 'نَّ': 32, 'نِ': 33, 'نُ': 34, 'نَ': 35, 'ن': 36, 'مْ': 37, 'مّْ': 38, 'مِّ': 39, 'مُّ': 40, 'مَّ': 41, 'مِ': 42, 'مُ': 43, 'مَ': 44, 'مٍ': 45, 'مٌ': 46, 'مً': 47, 'لْ': 48, 'لّْ': 49, 'لِّ': 50, 'لُّ': 51, 'لَّ': 52, 'لِ': 53, 'لُ': 54, 'لَ': 55, 'لٍ': 56, 'لٌ': 57, 'لً': 58, 'ل': 59, 'كْ': 60, 'كِّ': 61, 'كَّ': 62, 'كِ': 63, 'كُ': 64, 'كَ': 65, 'ك': 66, 'قْ': 67, 'قَّ': 68, 'قِ': 69, 'قُ': 70, 'قَ': 71, 'قٍ': 72, 'قً': 73, 'ق': 74, 'فْ': 75, 'فِّ': 76, 'فَّ': 77, 'فِ': 78, 'فُ': 79, 'فَ': 80, 'غْ': 81, 'غِ': 82, 'غَ': 83, 'عْ': 84, 'عَّ': 85, 'عِ': 86, 'عُ': 87, 'عَ': 88, 'عٍ': 89, 'ظْ': 90, 'ظِّ': 91, 'ظَّ': 92, 'ظِ': 93, 'ظُ': 94, 'ظَ': 95, 'طْ': 96, 'طِّ': 97, 'طَّ': 98, 'طِ': 

## 3.2. Video Dataset Class

In [308]:
# Defining the video dataset class
class VideoDataset(torch.utils.data.Dataset):
    def __init__(self, video_paths, label_paths, transform=None, frame_count=38):
        self.video_paths = video_paths
        self.label_paths = label_paths
        self.transform = transform
        self.frame_count = frame_count
        
    def __len__(self):
        return len(self.video_paths)
    
    def __getitem__(self, index):
        video_path = self.video_paths[index]
        label_path = self.label_paths[index]
        frames = self.load_frames(video_path=video_path)
        label = list(map(lambda x: mapped_classes[x], extract_label(label_path)))
        return frames, label
    
    def load_frames(self, video_path):
        frames = []
        video = cv2.VideoCapture(video_path)
        total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        for i in range(total_frames):
            video.set(cv2.CAP_PROP_POS_FRAMES, i)
            ret, frame = video.read()
            if ret:
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                frame_pil = Image.fromarray(frame)
                frames.append(frame_pil)

        while len(frames) < self.frame_count:
            frames.append(frames[-1])

        if self.transform is not None:
            frames = [self.transform(frame) for frame in frames]
        frames = torch.stack(frames).permute(1, 0, 2, 3)

        return frames

# Defining the video transform
transforms = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
    transforms.Normalize(mean=0.449, std=0.226),
])

## 3.2. Load the dataset

In [309]:
videos_dir = "Dataset/Video"
labels_dir = "Dataset/Csv (with Diacritics)"
videos, labels = [], []
file_names = [file_name[:-4] for file_name in os.listdir(videos_dir)]
for file_name in file_names:
    videos.append(os.path.join(videos_dir, file_name + ".mp4"))
    labels.append(os.path.join(labels_dir, file_name + ".csv"))

## 3.3. Split the dataset

In [310]:
# Split the dataset into training, validation, test sets
X_temp, X_test, y_temp, y_test = train_test_split(videos, labels, test_size=0.10, random_state=seed)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.11, random_state=seed)

## 3.4. DataLoaders

In [325]:
def pad_packed_collate(batch):
    """Pads data and labels with different lengths in the same batch
    """
    data_tuple, label_tuple = zip(*batch)

    # Stack video frames
    if len(data_tuple) > 0:
        data = torch.stack(data_tuple)
    else:
        data = torch.empty(0)

    # Process labels - need to handle multiple sequences in a batch
    if len(label_tuple) > 0:
        # Convert each label sequence to a tensor
        label_tensors = []
        for label_seq in label_tuple:
            label_tensors.append(torch.tensor(label_seq, dtype=torch.long))
        
        # Pad sequences to the same length
        labels = pad_sequence(label_tensors, batch_first=True, padding_value=0)
    else:
        labels = torch.empty(0)
    
    return data, labels

# Defining the video dataloaders (train, validation, test)
train_dataset = VideoDataset(X_train, y_train, transform=transforms)
val_dataset = VideoDataset(X_val, y_val, transform=transforms)
test_dataset = VideoDataset(X_test, y_test, transform=transforms)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, pin_memory=True, collate_fn=pad_packed_collate)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, pin_memory=True, collate_fn=pad_packed_collate)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, pin_memory=True, collate_fn=pad_packed_collate)

mx = 0
for data, labels in train_loader:
    mx = max(mx, labels.size(1))

for data, labels in val_loader:
    mx = max(mx, labels.size(1))

for data, labels in test_loader:
    mx = max(mx, labels.size(1))

mx

24

# 4. Model

In [312]:
# Initializing the hyper-parameters
densetcn_options = {
    'block_config': [3, 3, 3, 3],               # Number of layers in each dense block
    'growth_rate_set': [384, 384, 384, 384],    # Growth rate for each block (must be divisible by len(kernel_size_set))
    'reduced_size': 512,                        # Reduced size between blocks (must be divisible by len(kernel_size_set))
    'kernel_size_set': [3, 5, 7],               # Kernel sizes for multi-scale processing
    'dilation_size_set': [1, 2, 5],             # Dilation rates for increasing receptive field
    'squeeze_excitation': True,                 # Whether to use SE blocks for channel attention
    'dropout': 0.2                              # Dropout rate
}
initial_lr = 3e-4
total_epochs = 80
scheduler = CosineScheduler(initial_lr, total_epochs)

# Initializing the model
model = Lipreading(densetcn_options=densetcn_options).to(device)
print(model)

# Defining the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=initial_lr)

Lipreading(
  (trunk): ResNet(
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu1): PReLU(num_parameters=64)
        (relu2): PReLU(num_parameters=64)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu1): PReLU(num_parameters=64)
        (relu2): PReLU(num_parameters=64)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, 

# 5. Training and Evaluation

In [None]:
# Training the model
def train_one_epoch():
    model.train()
    ctc_loss = nn.CTCLoss(blank=0, zero_infinity=True)
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        lengths = torch.full((inputs.size(0),), inputs.size(2), dtype=torch.long).to(device)
        optimizer.zero_grad()
        outputs = model(inputs, lengths)
        print(outputs.shape, labels.shape)
        print(outputs, labels)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

def evaluate_model(return_preds=False):
    model.eval()
    criterion = nn.CrossEntropyLoss()
    running_loss = 0.0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            running_loss += loss.item()
            preds = torch.argmax(outputs, dim=1)
            if return_preds:
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
    return running_loss / len(val_loader), all_preds, all_labels

In [314]:
def train_model():
    # Train and validate
    for epoch in range(total_epochs):
        train_one_epoch()
        scheduler.adjust_lr(optimizer, epoch)
        val_loss, _, _ = evaluate_model()
        print(f"Epoch {epoch+1}/{total_epochs}, Val Loss: {val_loss:.4f}")

In [315]:
train_model()

torch.Size([16, 500]) torch.Size([16, 21])
tensor([[ 6.1449e-01,  5.9690e-01,  5.1144e-01,  ...,  4.3322e-01,
         -8.4315e-01,  9.7986e-01],
        [ 1.1649e-01, -2.4263e-01, -2.8991e-01,  ..., -3.7814e-01,
          8.5668e-01, -7.2118e-01],
        [-6.6158e-01,  1.2242e-01,  2.6624e-01,  ...,  3.0973e-01,
         -2.8001e-01,  1.0593e+00],
        ...,
        [ 4.0884e-01, -6.2515e-01,  6.3304e-01,  ...,  2.5760e-01,
         -4.2663e-04,  3.7911e-01],
        [ 3.2163e-02,  3.3229e-01, -3.2084e-01,  ...,  1.0061e-01,
         -1.4393e-01, -6.1168e-01],
        [-5.9941e-02,  7.1031e-02, -2.5896e-01,  ...,  4.5861e-01,
          4.2672e-01, -1.1116e-01]], device='cuda:0', grad_fn=<AddmmBackward0>) tensor([[232,  84,  55,  35, 193,   1,  48,  18,  55, 219,  10, 219, 198,   1,
          48,  43, 196, 175, 158,   0,   0],
        [ 71, 219,  55, 193,  20,  65, 219,  55, 204,   1,  48, 232,  28, 215,
         219, 235,   1,  48, 224,  44, 219],
        [184,  37,  26,  22, 147, 

RuntimeError: 0D or 1D target tensor expected, multi-target not supported