In [11]:
import os
import glob
import string
import cv2
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
from ml2en import ml2en # Import the Malayalam to Manglish library

# Import the LipFormer model from your script
from my_model import LipFormer

In [13]:
# --- 1. Configuration ---
# Central settings dictionary: dataset locations plus training hyper-parameters.
# NOTE(review): the dataset paths are absolute and machine-specific; adjust
# them before running on another machine.
CONFIG = dict(
    data=dict(
        landmarks="D:/ADARSH/extracted_landmarks_model_ready",
        lip_rois="D:/ADARSH/extracted_lip_crosssection",
        transcripts="D:/ADARSH/transcripts",
    ),
    checkpoint_dir="checkpoints",
    epochs=1,  # a single pass as configured; raise this for meaningful training
    batch_size=1,  # already the smallest possible batch
    learning_rate=1e-4,
    teacher_forcing_ratio=0.5,
    lambda_val=0.7,
    image_size=(80, 160),  # NOTE(review): axis order (H, W vs W, H) is not visible here - confirm
    validation_split=0.1,  # 10% of data for validation
)


2. Vocabulary Definitions 

In [14]:
# --- Vocabulary for Manglish ---
# Ids 0-3 are reserved for the special tokens; real characters start at 4.
MANGLISH_PAD_TOKEN, MANGLISH_SOS_TOKEN, MANGLISH_EOS_TOKEN, MANGLISH_UNK_TOKEN = 0, 1, 2, 3

# Supported alphabet: lowercase ASCII letters, digits and a few punctuation marks.
MANGLISH_CHARS = string.ascii_lowercase + string.digits + " .'-"

# Characters are inserted first (ids 4, 5, ...), then the special tokens.
manglish_to_int = dict(zip(MANGLISH_CHARS, range(4, 4 + len(MANGLISH_CHARS))))
manglish_to_int.update(
    {
        "<pad>": MANGLISH_PAD_TOKEN,
        "<sos>": MANGLISH_SOS_TOKEN,
        "<eos>": MANGLISH_EOS_TOKEN,
        "<unk>": MANGLISH_UNK_TOKEN,
    }
)

# Reverse lookup (id -> symbol) and the total number of symbols.
int_to_manglish = dict(zip(manglish_to_int.values(), manglish_to_int.keys()))
MANGLISH_VOCAB_SIZE = len(manglish_to_int)

# --- Vocabulary for Malayalam ---
# Reserved ids for the special tokens.
(
    MALAYALAM_PAD_TOKEN,
    MALAYALAM_SOS_TOKEN,
    MALAYALAM_EOS_TOKEN,
    MALAYALAM_UNK_TOKEN,
) = range(4)

# Seeded with the special tokens only; characters are added later by
# build_malayalam_vocab().
malayalam_to_int = dict(
    zip(
        ("<pad>", "<sos>", "<eos>", "<unk>"),
        (
            MALAYALAM_PAD_TOKEN,
            MALAYALAM_SOS_TOKEN,
            MALAYALAM_EOS_TOKEN,
            MALAYALAM_UNK_TOKEN,
        ),
    )
)

# Reverse lookup (id -> symbol); starts empty.
int_to_malayalam = dict()

In [15]:
def build_malayalam_vocab(transcript_dir):
    """Scan all transcript files and (re)build the Malayalam character vocabulary.

    Every ``*.txt`` file directly inside ``transcript_dir`` is read as UTF-8.
    Each line is split on whitespace, and lines with more than two fields
    contribute their last field. The contributed fields of one file are joined
    with single spaces, and every distinct character of that joined text
    becomes a vocabulary entry.

    Side effects:
        * ``malayalam_to_int`` is updated in place. Character entries left over
          from an earlier call are removed first, so repeated calls (for
          example with a different directory) never leave stale characters or
          duplicate indices behind. The special-token entries are kept.
        * ``int_to_malayalam`` is rebound to the inverse mapping.

    Args:
        transcript_dir: Directory containing the ``*.txt`` transcript files.

    Returns:
        The total vocabulary size, i.e. ``len(malayalam_to_int)`` including
        the special tokens.
    """
    global int_to_malayalam

    special_tokens = ("<pad>", "<sos>", "<eos>", "<unk>")

    vocab = set()
    transcript_files = glob.glob(os.path.join(transcript_dir, "*.txt"))
    for file_path in tqdm(transcript_files, desc="Building Malayalam Vocab"):
        with open(file_path, 'r', encoding='utf-8') as f:
            # str.split() with no argument already ignores leading/trailing
            # whitespace, so no separate strip() is needed.
            lines = [line.split() for line in f]
        full_text = " ".join(parts[-1] for parts in lines if len(parts) > 2)
        vocab.update(full_text)

    # Drop character entries from any previous run so indices cannot collide.
    for stale_char in [key for key in malayalam_to_int if key not in special_tokens]:
        del malayalam_to_int[stale_char]

    # Indices 0-3 are reserved for the special tokens; characters start at 4.
    # Sorting keeps the assignment deterministic regardless of glob order.
    for index, char in enumerate(sorted(vocab), start=4):
        malayalam_to_int[char] = index

    int_to_malayalam = {index: char for char, index in malayalam_to_int.items()}

    return len(malayalam_to_int)

In [16]:
# Populate the Malayalam vocabulary from the configured transcript directory.
# The returned size is len(malayalam_to_int), so it includes the special tokens.
MALAYALAM_VOCAB_SIZE = build_malayalam_vocab(CONFIG["data"]["transcripts"])

Building Malayalam Vocab:   0%|          | 0/4385 [00:00<?, ?it/s]

Building Malayalam Vocab: 100%|██████████| 4385/4385 [00:14<00:00, 292.35it/s]


In [18]:
# Inspect the id -> character lookup produced by build_malayalam_vocab().
# NOTE(review): the recorded output contains ':' and '萍', which look like
# transcript noise rather than Malayalam characters - confirm against the data.
int_to_malayalam

{0: '<pad>',
 1: '<sos>',
 2: '<eos>',
 3: '<unk>',
 4: ' ',
 5: ':',
 6: 'ം',
 7: 'അ',
 8: 'ആ',
 9: 'ഇ',
 10: 'ഈ',
 11: 'ഉ',
 12: 'ഊ',
 13: 'ഋ',
 14: 'എ',
 15: 'ഏ',
 16: 'ഐ',
 17: 'ഒ',
 18: 'ഓ',
 19: 'ഔ',
 20: 'ക',
 21: 'ഖ',
 22: 'ഗ',
 23: 'ഘ',
 24: 'ങ',
 25: 'ച',
 26: 'ഛ',
 27: 'ജ',
 28: 'ഞ',
 29: 'ട',
 30: 'ഠ',
 31: 'ഡ',
 32: 'ണ',
 33: 'ത',
 34: 'ഥ',
 35: 'ദ',
 36: 'ധ',
 37: 'ന',
 38: 'പ',
 39: 'ഫ',
 40: 'ബ',
 41: 'ഭ',
 42: 'മ',
 43: 'യ',
 44: 'ര',
 45: 'റ',
 46: 'ല',
 47: 'ള',
 48: 'ഴ',
 49: 'വ',
 50: 'ശ',
 51: 'ഷ',
 52: 'സ',
 53: 'ഹ',
 54: 'ാ',
 55: 'ി',
 56: 'ീ',
 57: 'ു',
 58: 'ൂ',
 59: 'ൃ',
 60: 'െ',
 61: 'േ',
 62: 'ൈ',
 63: 'ൊ',
 64: 'ോ',
 65: 'ൌ',
 66: '്',
 67: 'ൗ',
 68: 'ൺ',
 69: 'ൻ',
 70: 'ർ',
 71: 'ൽ',
 72: 'ൾ',
 73: '\u200d',
 74: '萍'}

In [17]:
# Inspect the character -> id mapping (special tokens first, then characters).
malayalam_to_int

{'<pad>': 0,
 '<sos>': 1,
 '<eos>': 2,
 '<unk>': 3,
 ' ': 4,
 ':': 5,
 'ം': 6,
 'അ': 7,
 'ആ': 8,
 'ഇ': 9,
 'ഈ': 10,
 'ഉ': 11,
 'ഊ': 12,
 'ഋ': 13,
 'എ': 14,
 'ഏ': 15,
 'ഐ': 16,
 'ഒ': 17,
 'ഓ': 18,
 'ഔ': 19,
 'ക': 20,
 'ഖ': 21,
 'ഗ': 22,
 'ഘ': 23,
 'ങ': 24,
 'ച': 25,
 'ഛ': 26,
 'ജ': 27,
 'ഞ': 28,
 'ട': 29,
 'ഠ': 30,
 'ഡ': 31,
 'ണ': 32,
 'ത': 33,
 'ഥ': 34,
 'ദ': 35,
 'ധ': 36,
 'ന': 37,
 'പ': 38,
 'ഫ': 39,
 'ബ': 40,
 'ഭ': 41,
 'മ': 42,
 'യ': 43,
 'ര': 44,
 'റ': 45,
 'ല': 46,
 'ള': 47,
 'ഴ': 48,
 'വ': 49,
 'ശ': 50,
 'ഷ': 51,
 'സ': 52,
 'ഹ': 53,
 'ാ': 54,
 'ി': 55,
 'ീ': 56,
 'ു': 57,
 'ൂ': 58,
 'ൃ': 59,
 'െ': 60,
 'േ': 61,
 'ൈ': 62,
 'ൊ': 63,
 'ോ': 64,
 'ൌ': 65,
 '്': 66,
 'ൗ': 67,
 'ൺ': 68,
 'ൻ': 69,
 'ർ': 70,
 'ൽ': 71,
 'ൾ': 72,
 '\u200d': 73,
 '萍': 74}

In [9]:
# Inspect the Manglish character -> id mapping (characters first, then special tokens).
manglish_to_int

{'a': 4,
 'b': 5,
 'c': 6,
 'd': 7,
 'e': 8,
 'f': 9,
 'g': 10,
 'h': 11,
 'i': 12,
 'j': 13,
 'k': 14,
 'l': 15,
 'm': 16,
 'n': 17,
 'o': 18,
 'p': 19,
 'q': 20,
 'r': 21,
 's': 22,
 't': 23,
 'u': 24,
 'v': 25,
 'w': 26,
 'x': 27,
 'y': 28,
 'z': 29,
 '0': 30,
 '1': 31,
 '2': 32,
 '3': 33,
 '4': 34,
 '5': 35,
 '6': 36,
 '7': 37,
 '8': 38,
 '9': 39,
 ' ': 40,
 '.': 41,
 "'": 42,
 '-': 43,
 '<pad>': 0,
 '<sos>': 1,
 '<eos>': 2,
 '<unk>': 3}