In [16]:
!pip install kagglehub
!pip install scikit-learn
!pip install opencv-python

Collecting opencv-python
  Downloading opencv_python-4.11.0.86-cp37-abi3-win_amd64.whl.metadata (20 kB)
Downloading opencv_python-4.11.0.86-cp37-abi3-win_amd64.whl (39.5 MB)
   ---------------------------------------- 0.0/39.5 MB ? eta -:--:--
   --------- ------------------------------ 9.7/39.5 MB 46.5 MB/s eta 0:00:01
   ---------------------- ----------------- 22.5/39.5 MB 52.8 MB/s eta 0:00:01
   -------------------------------- ------- 32.2/39.5 MB 51.2 MB/s eta 0:00:01
   ---------------------------------------  38.8/39.5 MB 47.5 MB/s eta 0:00:01
   ---------------------------------------- 39.5/39.5 MB 40.5 MB/s eta 0:00:00
Installing collected packages: opencv-python
Successfully installed opencv-python-4.11.0.86


In [None]:
import kagglehub

# Download the dataset through kagglehub; the handle pins no version, so the
# latest published one is fetched.
DATASET_HANDLE = "mohamedbentalb/lipreading-dataset"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/mohamedbentalb/lipreading-dataset?dataset_version_number=1...


100%|██████████| 404M/404M [00:08<00:00, 51.4MB/s] 

Extracting files...





Path to dataset files: C:\Users\6\.cache\kagglehub\datasets\mohamedbentalb\lipreading-dataset\versions\1


In [22]:
import cv2
import torch
import torchvision
import torchvision.transforms as T
import torch_directml

In [25]:
import os
from sklearn.model_selection import train_test_split

# Dataset locations for speaker "s1". Built with os.path.join so the notebook
# is not tied to Windows path separators (on Windows the resulting strings are
# identical to the previous raw-string literals).
# NOTE(review): these are relative to the current working directory, not to the
# `path` returned by kagglehub.dataset_download - confirm the dataset is
# reachable from here.
dataset_root = os.path.join("mohamedbentalb", "lipreading-dataset", "versions", "1", "data")
video_dir = os.path.join(dataset_root, "s1")
align_dir = os.path.join(dataset_root, "alignments", "s1")

# Pair each video with its alignment file.
# os.listdir returns entries in arbitrary order, so sort them: without a stable
# input order the seeded train/test split below is not reproducible across
# machines or filesystems.
video_files = sorted(f for f in os.listdir(video_dir) if f.endswith('.mpg'))
video_align_pairs = []
for vf in video_files:
    base = os.path.splitext(vf)[0]
    align_path = os.path.join(align_dir, base + ".align")
    video_path = os.path.join(video_dir, vf)
    # Videos without a matching .align file are skipped.
    if os.path.exists(align_path):
        video_align_pairs.append((video_path, align_path))

print(f"Found {len(video_align_pairs)} video-align pairs.")

# 80/20 split with a fixed seed so reruns select the same videos.
train_pairs, test_pairs = train_test_split(video_align_pairs, test_size=0.2, random_state=42)

print(f"Training set size: {len(train_pairs)}")
print(f"Test set size: {len(test_pairs)}")

Found 1000 video-align pairs.
Training set size: 800
Test set size: 200


In [26]:
# Per-frame preprocessing pipeline: array -> PIL image -> 112x112 resize ->
# float tensor -> per-channel normalisation.
# NOTE(review): the mean/std values look like the Kinetics-400 statistics used
# by torchvision's video models - confirm against the model card.
NORM_MEAN = [0.43216, 0.394666, 0.37645]
NORM_STD = [0.22803, 0.22145, 0.216989]

frame_steps = [
    T.ToPILImage(),
    T.Resize((112, 112)),
    T.ToTensor(),
    T.Normalize(mean=NORM_MEAN, std=NORM_STD),
]
transform = T.Compose(frame_steps)

def preprocess_frames(frames):
    """Turn a sequence of frames into a single batched video tensor.

    Each frame is passed through the module-level ``transform``; the results
    are stacked along a new leading time axis, reordered so channels come
    before time, and given a batch dimension of size 1.

    Args:
        frames: iterable of frames accepted by ``transform``.

    Returns:
        Tensor laid out as (1, C, T, H, W).
    """
    return (
        torch.stack([transform(img) for img in frames])  # (T, C, H, W)
        .permute(1, 0, 2, 3)                             # (C, T, H, W)
        .unsqueeze(0)                                    # (1, C, T, H, W)
    )

def parse_align_file(align_path):
    """Read an alignment file into a list of ``(start, end, word)`` tuples.

    Every line is split on whitespace. Lines that do not consist of exactly
    three fields are ignored; for the rest, the first two fields are converted
    to ``int`` and the third is kept as a string.

    Args:
        align_path: path of the alignment text file.

    Returns:
        List of ``(int start, int end, str word)`` in file order.

    Raises:
        ValueError: if a three-field line has a non-integer start or end.
    """
    with open(align_path, 'r') as handle:
        rows = [raw.strip().split() for raw in handle]
    # Keep only well-formed "start end word" rows.
    return [(int(row[0]), int(row[1]), row[2]) for row in rows if len(row) == 3]

def extract_word_frames(video_path, alignments, fps=25, align_units_per_second=25000):
    """Cut a video into per-word frame sequences using its alignment entries.

    The whole video is decoded into memory (converted from BGR to RGB), then
    each alignment's start/end timestamp is converted to a frame index and the
    corresponding slice of frames is taken.

    Args:
        video_path: path handed to ``cv2.VideoCapture``.
        alignments: iterable of ``(start, end, word)`` tuples, e.g. the output
            of ``parse_align_file``.
        fps: frame rate of the video, in frames per second.
        align_units_per_second: number of alignment time units in one second.
            GRID-corpus ``.align`` files count in 1/25000 s ticks (a 3 s,
            75-frame clip ends near 75000), so at 25 fps one frame spans 1000
            ticks. Pass ``1000`` for millisecond timestamps.

    Returns:
        List of ``(word, frames)`` tuples, where ``frames`` is the list of RGB
        frames for that word. Words whose slice is empty are left out. If no
        frame can be read from the video the list is empty.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    finally:
        # Release the capture even if colour conversion raises.
        cap.release()

    word_frames = []
    for start, end, word in alignments:
        # Multiply before dividing: the integer product is exact, so tick
        # values that fall on a frame boundary are not rounded down.
        # The previous `start / 1000 * fps` treated ticks as milliseconds and
        # pushed every index 25x too far, past the end of the clip.
        start_idx = int(start * fps / align_units_per_second)
        end_idx = int(end * fps / align_units_per_second)
        word_seq = frames[start_idx:end_idx]
        if word_seq:
            word_frames.append((word, word_seq))
    return word_frames

In [37]:
import torchvision
import torch_directml
import numpy as np

# Run on CPU; the DirectML device is kept as a commented-out alternative.
# device = torch_directml.device()
device = torch.device("cpu")

# Use a single training video as a smoke test of the pipeline.
video_path, align_path = train_pairs[1]
alignments = parse_align_file(align_path)
word_frames = extract_word_frames(video_path, alignments, fps=25)

# Load model.
# `pretrained=True` is deprecated in torchvision >= 0.13; the explicit weights
# enum below selects the same Kinetics-400 checkpoint without the warning.
model = torchvision.models.video.r3d_18(
    weights=torchvision.models.video.R3D_18_Weights.KINETICS400_V1
)
# model = torchvision.models.video.r2plus1d_18(pretrained=True)
model = model.to(device)
model.eval()

# Clip length fed to the model: shorter word segments are skipped and longer
# ones are subsampled down to exactly this many frames.
MIN_FRAMES = 16

with torch.no_grad():
    for word, frames in word_frames:
        if len(frames) < MIN_FRAMES:
            continue
        # Sample exactly 16 frames evenly
        if len(frames) > MIN_FRAMES:
            idxs = np.linspace(0, len(frames)-1, MIN_FRAMES).astype(int)
            frames = [frames[i] for i in idxs]
        video_tensor = preprocess_frames(frames).to(device)
        print(video_tensor.shape)
        # NOTE(review): this is the output of the model's final layer (400
        # values per clip in the recorded run), i.e. presumably class logits
        # rather than an embedding. If embeddings are wanted, replace
        # `model.fc` with an identity layer - confirm intent.
        features = model(video_tensor)
        print(f"Word: {word}, Feature shape: {features.shape}")

torch.Size([1, 3, 16, 112, 112])
Word: sil, Feature shape: torch.Size([1, 400])


Here is another pretrained model, LipNet. Install it with pip as follows:

```bash
git clone https://github.com/astorfi/lipnet.git
cd lipnet
pip install -r requirements.txt
pip install .
```


In [None]:
# You must have LipNet installed and its modules accessible in your path
from lipnet.model import LipNet


# Build the LipNet model for 112x112 RGB input.
# NOTE(review): no checkpoint is loaded here, so the weights are whatever the
# constructor initialises - load a state dict if pretrained weights are wanted.
model = LipNet(img_c=3, img_w=112, img_h=112, absolute_max_string_len=32, output_size=28)
model = model.to(device)
model.eval()

MIN_FRAMES = 75  # LipNet expects 75 frames per input

# Relies on `device`, `word_frames`, `np` and `preprocess_frames` from the
# earlier cells.
with torch.no_grad():
    for word, frames in word_frames:
        if len(frames) < MIN_FRAMES:
            continue
        # Sample exactly 75 frames evenly
        if len(frames) > MIN_FRAMES:
            idxs = np.linspace(0, len(frames)-1, MIN_FRAMES).astype(int)
            frames = [frames[i] for i in idxs]
        # Move the clip to the same device as the model. The previous
        # `.to(dml)` referenced a name that is never defined in this notebook.
        video_tensor = preprocess_frames(frames).to(device)
        # Forward pass through encoder (LipNet's forward returns logits, you may want encoder features)
        # NOTE(review): assumes the installed LipNet class exposes an
        # `encoder` attribute - confirm against the package.
        features = model.encoder(video_tensor)
        print(f"Word: {word}, Feature shape: {features.shape}")