In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/numbers/NUMBERS/8_Eight.mp4
/kaggle/input/numbers/NUMBERS/7_Seven.mp4
/kaggle/input/numbers/NUMBERS/5_Five.mp4
/kaggle/input/numbers/NUMBERS/4_Four.mp4
/kaggle/input/numbers/NUMBERS/4G.mp4
/kaggle/input/numbers/NUMBERS/6_Six.mp4
/kaggle/input/numbers/NUMBERS/9_Nine.mp4
/kaggle/input/numbers/NUMBERS/3_Three.mp4
/kaggle/input/numbers/NUMBERS/10_Ten.mp4
/kaggle/input/numbers/NUMBERS/5G.mp4
/kaggle/input/numbers/NUMBERS/3G.mp4
/kaggle/input/numbers/NUMBERS/2_Two.mp4
/kaggle/input/numbers/NUMBERS/1_One.mp4


**EXTRACTING FRAMES FROM VIDEOS**

In [2]:
import cv2
import os

def extract_frames_from_videos(video_files, input_folder, output_folder, frame_rate=1):
    # Create the output directory if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)
    
    # Process each video file provided in the list
    for i, video_file in enumerate(video_files):
        video_path = os.path.join(input_folder, video_file)
        
        if os.path.exists(video_path):
            # Create a directory for the frames of this video
            video_output_folder = os.path.join(output_folder, f'video_{i+1}')
            os.makedirs(video_output_folder, exist_ok=True)
            
            # Open the video file
            cap = cv2.VideoCapture(video_path)
            count = 0
            success = True
            
            # Extract frames
            while success:
                success, frame = cap.read()
                if success and count % frame_rate == 0:
                    frame_filename = os.path.join(video_output_folder, f'frame_{count}.jpg')
                    cv2.imwrite(frame_filename, frame)
                count += 1
            
            cap.release()
            print(f'Extracted frames from {video_file} to {video_output_folder}')
        else:
            print(f'Video {video_file} not found in the input folder.')

# Example usage
input_folder = '/kaggle/input/numbers/NUMBERS'  # Replace with the path to your folder containing videos
output_folder = '/kaggle/working/'   # Replace with the path where you want to save the frames
frame_rate = 1  # Extract one frame per second, adjust according to your needs

# List of specific video filenames you want to process
video_files = [
    '1_One.mp4',
    '2_Two.mp4',
    '3_Three.mp4',
    '3G.mp4',
    '4_Four.mp4',
    '4G.mp4',
    '5_Five.mp4',
    '5G.mp4',
    '6_Six.mp4',
    '7_Seven.mp4',
    '8_Eight.mp4',
    '9_Nine.mp4',
    '10_Ten.mp4'
]

extract_frames_from_videos(video_files, input_folder, output_folder, frame_rate)


Extracted frames from 1_One.mp4 to /kaggle/working/video_1
Extracted frames from 2_Two.mp4 to /kaggle/working/video_2
Extracted frames from 3_Three.mp4 to /kaggle/working/video_3
Extracted frames from 3G.mp4 to /kaggle/working/video_4
Extracted frames from 4_Four.mp4 to /kaggle/working/video_5
Extracted frames from 4G.mp4 to /kaggle/working/video_6
Extracted frames from 5_Five.mp4 to /kaggle/working/video_7
Extracted frames from 5G.mp4 to /kaggle/working/video_8
Extracted frames from 6_Six.mp4 to /kaggle/working/video_9
Extracted frames from 7_Seven.mp4 to /kaggle/working/video_10
Extracted frames from 8_Eight.mp4 to /kaggle/working/video_11
Extracted frames from 9_Nine.mp4 to /kaggle/working/video_12
Extracted frames from 10_Ten.mp4 to /kaggle/working/video_13


**PRE-PROCESSING THE FRAMES(RE-SIZING)**

In [3]:
import os
import torch
from torchvision import transforms
from PIL import Image

# Define the transformation pipeline
preprocess = transforms.Compose([
    transforms.Resize(128),
    transforms.CenterCrop(128),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

def load_and_preprocess_image(image_path):
    image = Image.open(image_path).convert("RGB")
    return preprocess(image)

def load_and_preprocess_all_images(main_directory):
    all_images = []
    for subdir in sorted(os.listdir(main_directory)):
        subdir_path = os.path.join(main_directory, subdir)
        if os.path.isdir(subdir_path):
            for filename in os.listdir(subdir_path):
                if filename.endswith(".jpg") or filename.endswith(".png"):  # Ensure it's an image file
                    image_path = os.path.join(subdir_path, filename)
                    image_tensor = load_and_preprocess_image(image_path)
                    all_images.append(image_tensor)
    
    # Stack all images into a single tensor with shape (batch_size, channels, height, width)
    return torch.stack(all_images)

# Example: Preprocess all frames in all directories
main_directory = '/kaggle/working'  # Main directory containing video_1, video_2, etc.
all_frames_tensor = load_and_preprocess_all_images(main_directory)

# Print the shape of the resulting tensor
print(all_frames_tensor.shape)  # Should be (total_number_of_frames, 3, 224, 224)


torch.Size([1067, 3, 128, 128])


**TRAINING THE ViT MODEL**

In [4]:
# Mapping from video directories to their respective labels
label_mapping = {
    'video_1': 1,
    'video_2': 2,
    'video_3': 3,
    'video_4': 3,
    'video_5': 4,
    'video_6': 4,
    'video_7': 5,
    'video_8': 5,
    'video_9': 6,
    'video_10': 7,
    'video_11': 8,
    'video_12': 9,
    'video_13': 10
}


In [5]:
from torch.utils.data import Dataset, DataLoader

class FrameDataset(Dataset):
    def __init__(self, main_directory, label_mapping, transform=None):
        self.main_directory = main_directory
        self.label_mapping = label_mapping
        self.transform = transform
        self.image_paths = []
        self.labels = []

        for video_dir, label in label_mapping.items():
            video_path = os.path.join(main_directory, video_dir)
            if os.path.exists(video_path):
                for filename in os.listdir(video_path):
                    if filename.endswith(".jpg") or filename.endswith(".png"):  # Ensure it's an image file
                        self.image_paths.append(os.path.join(video_path, filename))
                        self.labels.append(label - 1)  # Subtract 1 for zero-indexed labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        image_path = self.image_paths[idx]
        image = Image.open(image_path).convert("RGB")
        label = self.labels[idx]
        
        if self.transform:
            image = self.transform(image)
        
        return image, label

# Define the transformation pipeline
transform = transforms.Compose([
    transforms.Resize(128),
    transforms.CenterCrop(128),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Create the dataset
dataset = FrameDataset(main_directory='/kaggle/working', label_mapping=label_mapping, transform=transform)


In [6]:
import torch

# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [7]:
# Define the DataLoader
batch_size = 4  # Adjust based on your GPU memory capacity
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=2)

# Example: Iterate through the DataLoader
for images, labels in train_loader:
    images, labels = images.to(device), labels.to(device)
    print(images.shape, labels.shape)  # Should be (batch_size, 3, 224, 224) and (batch_size,)


torch.Size([4, 3, 128, 128]) torch.Size([4])
torch.Size([4, 3, 128, 128]) torch.Size([4])
torch.Size([4, 3, 128, 128]) torch.Size([4])
torch.Size([4, 3, 128, 128]) torch.Size([4])
torch.Size([4, 3, 128, 128]) torch.Size([4])
torch.Size([4, 3, 128, 128]) torch.Size([4])
torch.Size([4, 3, 128, 128]) torch.Size([4])
torch.Size([4, 3, 128, 128]) torch.Size([4])
torch.Size([4, 3, 128, 128]) torch.Size([4])
torch.Size([4, 3, 128, 128]) torch.Size([4])
torch.Size([4, 3, 128, 128]) torch.Size([4])
torch.Size([4, 3, 128, 128]) torch.Size([4])
torch.Size([4, 3, 128, 128]) torch.Size([4])
torch.Size([4, 3, 128, 128]) torch.Size([4])
torch.Size([4, 3, 128, 128]) torch.Size([4])
torch.Size([4, 3, 128, 128]) torch.Size([4])
torch.Size([4, 3, 128, 128]) torch.Size([4])
torch.Size([4, 3, 128, 128]) torch.Size([4])
torch.Size([4, 3, 128, 128]) torch.Size([4])
torch.Size([4, 3, 128, 128]) torch.Size([4])
torch.Size([4, 3, 128, 128]) torch.Size([4])
torch.Size([4, 3, 128, 128]) torch.Size([4])
torch.Size

In [8]:
pip install einops

Collecting einops
  Downloading einops-0.8.0-py3-none-any.whl.metadata (12 kB)
Downloading einops-0.8.0-py3-none-any.whl (43 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: einops
Successfully installed einops-0.8.0
Note: you may need to restart the kernel to use updated packages.


In [9]:
import torch
import torch.nn as nn
from einops.layers.torch import Rearrange

class Attention(nn.Module):
    def __init__(self, dim, heads=8, dim_head=64, dropout=0.):
        super().__init__()
        inner_dim = dim_head * heads
        project_out = not (heads == 1 and dim_head == dim)

        self.heads = heads
        self.scale = dim_head ** -0.5

        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False)
        self.attend = nn.Softmax(dim=-1)
        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim),
            nn.Dropout(dropout)
        ) if project_out else nn.Identity()

    def forward(self, x):
        qkv = self.to_qkv(x).chunk(3, dim=-1)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h=self.heads), qkv)
        dots = torch.matmul(q, k.transpose(-1, -2)) * self.scale
        attn = self.attend(dots)
        out = torch.matmul(attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)')
        return self.to_out(out)

class FeedForward(nn.Module):
    def __init__(self, dim, hidden_dim, dropout=0.):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)

class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout=0.):
        super().__init__()
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
                Attention(dim, heads=heads, dim_head=dim_head, dropout=dropout),
                FeedForward(dim, mlp_dim, dropout=dropout)
            ]))

    def forward(self, x):
        for attn, ff in self.layers:
            x = attn(x) + x
            x = ff(x) + x
        return x

class ViT(nn.Module):
    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, pool='cls', channels=3, dim_head=64, dropout=0., emb_dropout=0.):
        super().__init__()
        image_height, image_width = pair(image_size)
        patch_height, patch_width = pair(patch_size)

        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'

        num_patches = (image_height // patch_height) * (image_width // patch_width)
        patch_dim = channels * patch_height * patch_width
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=patch_height, p2=patch_width),
            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
            nn.LayerNorm(dim),
        )

        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout)

        self.pool = pool
        self.to_latent = nn.Identity()

        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

    def forward(self, img):
        x = self.to_patch_embedding(img)
        b, n, _ = x.shape

        cls_tokens = self.cls_token.expand(b, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        x += self.pos_embedding[:, :(n + 1)]
        x = self.dropout(x)

        x = self.transformer(x)

        x = x.mean(dim=1) if self.pool == 'mean' else x[:, 0]

        x = self.to_latent(x)
        return self.mlp_head(x)

def pair(t):
    return t if isinstance(t, tuple) else (t, t)


In [10]:
model = ViT(
    image_size=128,  # Reduce input image size
    patch_size=16,
    num_classes=10,  # Adjust based on your dataset
    dim=256,         # Reduce the dimensionality
    depth=4,         # Fewer Transformer layers
    heads=4,         # Fewer attention heads
    mlp_dim=512,     # Smaller MLP size
    channels=3,
    dropout=0.1,
    emb_dropout=0.1
).to(device)


In [11]:
# Define the loss function
criterion = nn.CrossEntropyLoss()

# Define the optimizer, using AdamW which is well-suited for transformer models
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001)

# Learning rate scheduler (optional)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)


In [12]:
from torch.utils.data import random_split

# Assume `dataset` is your complete dataset
train_size = int(0.8 * len(dataset))  # 80% for training
val_size = len(dataset) - train_size  # 20% for validation

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])


In [13]:
from torch.utils.data import DataLoader, Dataset

# Assuming `dataset` is your dataset containing all frames and labels
batch_size = 4  # Small batch size to avoid memory issues
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=2)

# If you have a separate validation set:
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=2)


In [14]:
device = torch.device("cpu")
model.to(device)


ViT(
  (to_patch_embedding): Sequential(
    (0): Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=16, p2=16)
    (1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (2): Linear(in_features=768, out_features=256, bias=True)
    (3): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (transformer): Transformer(
    (layers): ModuleList(
      (0-3): 4 x ModuleList(
        (0): Attention(
          (to_qkv): Linear(in_features=256, out_features=768, bias=False)
          (attend): Softmax(dim=-1)
          (to_out): Sequential(
            (0): Linear(in_features=256, out_features=256, bias=True)
            (1): Dropout(p=0.1, inplace=False)
          )
        )
        (1): FeedForward(
          (net): Sequential(
            (0): Linear(in_features=256, out_features=512, bias=True)
            (1): GELU(approximate='none')
            (2): Dropout(p=0.1, inplace=False)
            (3): Linear(in_features=512, out_

In [15]:
import torch
import torch.nn as nn
from einops.layers.torch import Rearrange

class ViT(nn.Module):
    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, pool='cls', channels=3, dim_head=64, dropout=0., emb_dropout=0.):
        super().__init__()
        image_height, image_width = pair(image_size)
        patch_height, patch_width = pair(patch_size)

        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'

        num_patches = (image_height // patch_height) * (image_width // patch_width)
        patch_dim = channels * patch_height * patch_width
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=patch_height, p2=patch_width),
            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
            nn.LayerNorm(dim),
        )

        # Ensure the positional embedding is correctly sized
        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout)

        self.pool = pool
        self.to_latent = nn.Identity()

        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

    def forward(self, img):
        x = self.to_patch_embedding(img)
        b, n, _ = x.shape

        cls_tokens = self.cls_token.expand(b, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        x += self.pos_embedding[:, :(n + 1)]  # This line now correctly matches the size
        x = self.dropout(x)

        x = self.transformer(x)

        x = x.mean(dim=1) if self.pool == 'mean' else x[:, 0]

        x = self.to_latent(x)
        return self.mlp_head(x)

def pair(t):
    return t if isinstance(t, tuple) else (t, t)

# Re-create the model with the correct settings
model = ViT(
    image_size=128,  # This will create 64 patches
    patch_size=16,
    num_classes=10,  # Adjust based on your dataset
    dim=256,         # Reduce the dimensionality
    depth=4,         # Fewer Transformer layers
    heads=4,         # Fewer attention heads
    mlp_dim=512,     # Smaller MLP size
    channels=3,
    dropout=0.1,
    emb_dropout=0.1
).to(device)


In [16]:
import torch
import torch.nn as nn
from einops.layers.torch import Rearrange

class ViT(nn.Module):
    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, pool='cls', channels=3, dim_head=64, dropout=0., emb_dropout=0.):
        super().__init__()
        image_height, image_width = pair(image_size)
        patch_height, patch_width = pair(patch_size)

        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'

        num_patches = (image_height // patch_height) * (image_width // patch_width)
        patch_dim = channels * patch_height * patch_width
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=patch_height, p2=patch_width),
            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
            nn.LayerNorm(dim),
        )

        # Calculate the correct number of positional embeddings
        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout)

        self.pool = pool
        self.to_latent = nn.Identity()

        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

    def forward(self, img):
        x = self.to_patch_embedding(img)
        b, n, _ = x.shape

        cls_tokens = self.cls_token.expand(b, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        x += self.pos_embedding[:, :(n + 1)]
        x = self.dropout(x)

        x = self.transformer(x)

        x = x.mean(dim=1) if self.pool == 'mean' else x[:, 0]

        x = self.to_latent(x)
        return self.mlp_head(x)

def pair(t):
    return t if isinstance(t, tuple) else (t, t)

# Re-create the model with the correct settings
model = ViT(
    image_size=256,  # Adjust image size to create 64 patches
    patch_size=16,
    num_classes=10,  # Adjust based on your dataset
    dim=256,         # Reduce the dimensionality
    depth=4,         # Fewer Transformer layers
    heads=4,         # Fewer attention heads
    mlp_dim=512,     # Smaller MLP size
    channels=3,
    dropout=0.1,
    emb_dropout=0.1
).to(device)


In [17]:
import torch
import torch.nn as nn
from einops.layers.torch import Rearrange

class ViT(nn.Module):
    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, pool='cls', channels=3, dim_head=64, dropout=0., emb_dropout=0.):
        super().__init__()
        image_height, image_width = pair(image_size)
        patch_height, patch_width = pair(patch_size)

        # Ensure the image dimensions are divisible by the patch size
        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'

        num_patches = (image_height // patch_height) * (image_width // patch_width)
        patch_dim = channels * patch_height * patch_width
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=patch_height, p2=patch_width),
            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
            nn.LayerNorm(dim),
        )

        # Calculate and set the correct positional embedding size
        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout)

        self.pool = pool
        self.to_latent = nn.Identity()

        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

    def forward(self, img):
        x = self.to_patch_embedding(img)
        b, n, _ = x.shape

        cls_tokens = self.cls_token.expand(b, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        x += self.pos_embedding[:, :(n + 1)]  # Correct size based on n + 1
        x = self.dropout(x)

        x = self.transformer(x)

        x = x.mean(dim=1) if self.pool == 'mean' else x[:, 0]

        x = self.to_latent(x)
        return self.mlp_head(x)

def pair(t):
    return t if isinstance(t, tuple) else (t, t)

# Instantiate the model with the correct settings
model = ViT(
    image_size=128,  # Ensure correct image size
    patch_size=16,
    num_classes=10,  # Adjust based on your dataset
    dim=256,         # Dimensionality of the model
    depth=4,         # Transformer depth
    heads=4,         # Number of attention heads
    mlp_dim=512,     # MLP dimension size
    channels=3,
    dropout=0.1,
    emb_dropout=0.1
).to(device)


In [18]:
import torch.optim as optim

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)  # You can adjust the learning rate as needed

# Optional: Learning rate scheduler
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)  # Decays the learning rate every 7 epochs


In [19]:
import torch
import torch.nn as nn
from einops.layers.torch import Rearrange

class ViT(nn.Module):
    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, pool='cls', channels=3, dim_head=64, dropout=0., emb_dropout=0.):
        super().__init__()
        image_height, image_width = pair(image_size)
        patch_height, patch_width = pair(patch_size)

        # Ensure the image dimensions are divisible by the patch size
        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'

        num_patches = (image_height // patch_height) * (image_width // patch_width)
        patch_dim = channels * patch_height * patch_width
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'

        # Debugging: Print number of patches and patch dimensions
        print(f"Number of patches: {num_patches}")
        print(f"Patch dimensions: {patch_dim}")
        print(f"Expected positional embedding size: {num_patches + 1}")

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=patch_height, p2=patch_width),
            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
            nn.LayerNorm(dim),
        )

        # Calculate and set the correct positional embedding size
        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout)

        self.pool = pool
        self.to_latent = nn.Identity()

        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

    def forward(self, img):
        x = self.to_patch_embedding(img)
        b, n, _ = x.shape

        # Debugging: Print dimensions before concatenation
        print(f"Input dimensions: {x.shape}")
        print(f"Class token dimensions: {self.cls_token.shape}")
        print(f"Positional embedding dimensions: {self.pos_embedding.shape}")
        print(f"Expected sequence length: {n + 1}")

        cls_tokens = self.cls_token.expand(b, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        x += self.pos_embedding[:, :(n + 1)]  # Ensure the sizes match
        x = self.dropout(x)

        x = self.transformer(x)

        x = x.mean(dim=1) if self.pool == 'mean' else x[:, 0]

        x = self.to_latent(x)
        return self.mlp_head(x)

def pair(t):
    return t if isinstance(t, tuple) else (t, t)

# Instantiate the model with the correct settings
model = ViT(
    image_size=128,  # Ensure correct image size
    patch_size=16,
    num_classes=10,  # Adjust based on your dataset
    dim=256,         # Dimensionality of the model
    depth=4,         # Transformer depth
    heads=4,         # Number of attention heads
    mlp_dim=512,     # MLP dimension size
    channels=3,
    dropout=0.1,
    emb_dropout=0.1
).to(device)


Number of patches: 64
Patch dimensions: 768
Expected positional embedding size: 65


In [20]:
# Clear CUDA cache (optional but recommended if using GPU)
torch.cuda.empty_cache()

# Re-initialize the model with the correct settings
model = ViT(
    image_size=128,  # Image size
    patch_size=16,
    num_classes=10,  # Number of classes
    dim=256,         # Model dimensionality
    depth=4,         # Transformer depth
    heads=4,         # Number of attention heads
    mlp_dim=512,     # MLP dimension size
    channels=3,
    dropout=0.1,
    emb_dropout=0.1
).to(device)

# Re-define the optimizer, criterion, and scheduler if necessary
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

# Continue with the training loop
# (The training loop code provided earlier remains unchanged)


Number of patches: 64
Patch dimensions: 768
Expected positional embedding size: 65


In [21]:
import torch
import torch.nn as nn
import torch.optim as optim
from einops.layers.torch import Rearrange

# Clear CUDA cache (if using GPU)
torch.cuda.empty_cache()

# Define the ViT class (same as previously defined)
class ViT(nn.Module):
    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, pool='cls', channels=3, dim_head=64, dropout=0., emb_dropout=0.):
        super().__init__()
        image_height, image_width = pair(image_size)
        patch_height, patch_width = pair(patch_size)

        # Ensure the image dimensions are divisible by the patch size
        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'

        num_patches = (image_height // patch_height) * (image_width // patch_width)
        patch_dim = channels * patch_height * patch_width
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'

        # Debugging: Print number of patches and patch dimensions
        print(f"Number of patches: {num_patches}")
        print(f"Patch dimensions: {patch_dim}")
        print(f"Expected positional embedding size: {num_patches + 1}")

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=patch_height, p2=patch_width),
            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
            nn.LayerNorm(dim),
        )

        # Calculate and set the correct positional embedding size
        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout)

        self.pool = pool
        self.to_latent = nn.Identity()

        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

    def forward(self, img):
        x = self.to_patch_embedding(img)
        b, n, _ = x.shape

        cls_tokens = self.cls_token.expand(b, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        x += self.pos_embedding[:, :(n + 1)]  # Ensure the sizes match
        x = self.dropout(x)

        x = self.transformer(x)

        x = x.mean(dim=1) if self.pool == 'mean' else x[:, 0]

        x = self.to_latent(x)
        return self.mlp_head(x)

def pair(t):
    return t if isinstance(t, tuple) else (t, t)

# Re-instantiate the model
model = ViT(
    image_size=128,  # Image size
    patch_size=16,
    num_classes=10,  # Number of classes
    dim=256,         # Model dimensionality
    depth=4,         # Transformer depth
    heads=4,         # Number of attention heads
    mlp_dim=512,     # MLP dimension size
    channels=3,
    dropout=0.1,
    emb_dropout=0.1
).to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Optional: Learning rate scheduler
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

# Training loop code remains the same as before


Number of patches: 64
Patch dimensions: 768
Expected positional embedding size: 65


In [22]:
import torch
import torch.nn as nn
from einops.layers.torch import Rearrange

class ViT(nn.Module):
    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, pool='cls', channels=3, dim_head=64, dropout=0., emb_dropout=0.):
        super().__init__()
        image_height, image_width = pair(image_size)
        patch_height, patch_width = pair(patch_size)

        # Ensure the image dimensions are divisible by the patch size
        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'

        num_patches = (image_height // patch_height) * (image_width // patch_width)
        patch_dim = channels * patch_height * patch_width
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'

        # Calculate and set the correct positional embedding size
        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=patch_height, p2=patch_width),
            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
        )

        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout)

        self.pool = pool
        self.to_latent = nn.Identity()

        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

    def forward(self, img):
        x = self.to_patch_embedding(img)
        b, n, _ = x.shape

        cls_tokens = self.cls_token.expand(b, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        x += self.pos_embedding[:, :(n + 1)]
        x = self.dropout(x)

        x = self.transformer(x)
        x = x.mean(dim=1) if self.pool == 'mean' else x[:, 0]

        x = self.to_latent(x)
        return self.mlp_head(x)

def pair(t):
    return t if isinstance(t, tuple) else (t, t)

# Re-instantiate the model
model = ViT(
    image_size=128,  # Image size
    patch_size=16,
    num_classes=10,  # Number of classes
    dim=256,         # Model dimensionality
    depth=4,         # Transformer depth
    heads=4,         # Number of attention heads
    mlp_dim=512,     # MLP dimension size
    channels=3,
    dropout=0.1,
    emb_dropout=0.1
).to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

# Optional: Learning rate scheduler
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

# Training loop remains the same


**final**

In [23]:
import torch
import torch.nn as nn
from einops.layers.torch import Rearrange

class SimpleViT(nn.Module):
    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, channels=3, dim_head=64, dropout=0., emb_dropout=0.):
        super().__init__()
        image_height, image_width = pair(image_size)
        patch_height, patch_width = pair(patch_size)

        # Ensure image dimensions are divisible by patch size
        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'

        num_patches = (image_height // patch_height) * (image_width // patch_width)
        patch_dim = channels * patch_height * patch_width

        print(f"Number of patches: {num_patches}")
        print(f"Patch dimension: {patch_dim}")

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=patch_height, p2=patch_width),
            nn.Linear(patch_dim, dim),
        )

        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)

        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(dim, heads, mlp_dim, dropout, batch_first=True), num_layers=depth
        )

        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

    def forward(self, img):
        # Ensure img is the correct size
        print(f"Input image shape: {img.shape}")
        x = self.to_patch_embedding(img)
        b, n, _ = x.shape

        print(f"Patch embedding shape: {x.shape}")
        print(f"Positional embedding shape: {self.pos_embedding[:, :(n + 1)].shape}")

        cls_tokens = self.cls_token.expand(b, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        x += self.pos_embedding[:, :(n + 1)]
        x = self.dropout(x)

        x = self.transformer(x)
        x = x[:, 0]

        return self.mlp_head(x)

def pair(t):
    return t if isinstance(t, tuple) else (t, t)

# Re-initialize the model with parameters that match your input data
model = SimpleViT(
    image_size=128,  # 128x128 image size for 64 patches
    patch_size=16,
    num_classes=10,
    dim=128,          
    depth=2,          
    heads=4,          
    mlp_dim=256,      
    channels=3,
    dropout=0.1,
    emb_dropout=0.1
).to(device)

# Testing forward pass
for images, labels in train_loader:
    images = torch.nn.functional.interpolate(images, size=(128, 128))  # Ensure images are resized to 128x128
    images, labels = images.to(device), labels.to(device)
    with torch.no_grad():
        outputs = model(images)
        print("Forward pass successful.")
    break


Number of patches: 64
Patch dimension: 768
Input image shape: torch.Size([4, 3, 128, 128])
Patch embedding shape: torch.Size([4, 64, 128])
Positional embedding shape: torch.Size([1, 65, 128])
Forward pass successful.


In [24]:
# Before training, ensure the model is on the GPU
model.to(device)


SimpleViT(
  (to_patch_embedding): Sequential(
    (0): Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=16, p2=16)
    (1): Linear(in_features=768, out_features=128, bias=True)
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (transformer): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=256, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=256, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (mlp_head): Sequential(
    (0): LayerNorm((128,), eps=1e-

In [25]:
# Number of epochs to train
num_epochs = 10

for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    running_loss = 0.0
    correct_preds = 0
    total_preds = 0
    
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item() * images.size(0)
        _, predicted = torch.max(outputs, 1)
        correct_preds += (predicted == labels).sum().item()
        total_preds += labels.size(0)
    
    # Calculate and print epoch loss and accuracy
    epoch_loss = running_loss / len(train_loader.dataset)
    epoch_acc = correct_preds / total_preds
    
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.4f}')
    
    # Step the scheduler (if using)
    scheduler.step()

# Save the trained model
torch.save(model.state_dict(), 'vit_model.pth')
print('Model saved to vit_model.pth')


Input image shape: torch.Size([4, 3, 128, 128])
Patch embedding shape: torch.Size([4, 64, 128])
Positional embedding shape: torch.Size([1, 65, 128])
Input image shape: torch.Size([4, 3, 128, 128])
Patch embedding shape: torch.Size([4, 64, 128])
Positional embedding shape: torch.Size([1, 65, 128])
Input image shape: torch.Size([4, 3, 128, 128])
Patch embedding shape: torch.Size([4, 64, 128])
Positional embedding shape: torch.Size([1, 65, 128])
Input image shape: torch.Size([4, 3, 128, 128])
Patch embedding shape: torch.Size([4, 64, 128])
Positional embedding shape: torch.Size([1, 65, 128])
Input image shape: torch.Size([4, 3, 128, 128])
Patch embedding shape: torch.Size([4, 64, 128])
Positional embedding shape: torch.Size([1, 65, 128])
Input image shape: torch.Size([4, 3, 128, 128])
Patch embedding shape: torch.Size([4, 64, 128])
Positional embedding shape: torch.Size([1, 65, 128])
Input image shape: torch.Size([4, 3, 128, 128])
Patch embedding shape: torch.Size([4, 64, 128])
Positional

**ACCURACY IMPROVE**

In [26]:
import torch
import torch.nn as nn
import torch.optim as optim
from einops.layers.torch import Rearrange
from torchvision import transforms

# Define the SimpleViT class with increased capacity
class SimpleViT(nn.Module):
    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, channels=3, dim_head=64, dropout=0.2, emb_dropout=0.2):
        super().__init__()
        image_height, image_width = pair(image_size)
        patch_height, patch_width = pair(patch_size)

        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'

        num_patches = (image_height // patch_height) * (image_width // patch_width)
        patch_dim = channels * patch_height * patch_width

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=patch_height, p2=patch_width),
            nn.Linear(patch_dim, dim),
        )

        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)

        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(dim, heads, mlp_dim, dropout, batch_first=True), num_layers=depth
        )

        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

    def forward(self, img):
        x = self.to_patch_embedding(img)
        b, n, _ = x.shape

        cls_tokens = self.cls_token.expand(b, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        x += self.pos_embedding[:, :(n + 1)]
        x = self.dropout(x)

        x = self.transformer(x)
        x = x[:, 0]

        return self.mlp_head(x)

def pair(t):
    return t if isinstance(t, tuple) else (t, t)

# Data Augmentation
transform = transforms.Compose([
    transforms.Resize(128),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Re-initialize the model with increased capacity
model = SimpleViT(
    image_size=128,
    patch_size=16,
    num_classes=10,
    dim=512,          # Increased model dimensionality
    depth=6,          # Increased Transformer depth
    heads=8,          # More attention heads
    mlp_dim=1024,     # Larger MLP dimension size
    channels=3,
    dropout=0.2,      # Slightly increased dropout
    emb_dropout=0.2
).to(device)

# Define loss function and optimizer with weight decay
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001, weight_decay=1e-4)  # Adding weight decay

# Cosine Annealing LR Scheduler
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)

# Training Loop
num_epochs = 10

for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    running_loss = 0.0
    correct_preds = 0
    total_preds = 0
    
    for images, labels in train_loader:
        images = torch.nn.functional.interpolate(images, size=(128, 128))  # Ensure images are resized to 128x128
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item() * images.size(0)
        _, predicted = torch.max(outputs, 1)
        correct_preds += (predicted == labels).sum().item()
        total_preds += labels.size(0)
    
    # Calculate and print epoch loss and accuracy
    epoch_loss = running_loss / len(train_loader.dataset)
    epoch_acc = correct_preds / total_preds
    
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.4f}')
    
    # Step the scheduler
    scheduler.step()

# Save the trained model
torch.save(model.state_dict(), 'vit_model_improved.pth')
print('Improved model saved to vit_model_improved.pth')


Epoch 1/10, Loss: 2.0976, Accuracy: 0.2277
Epoch 2/10, Loss: 1.3207, Accuracy: 0.4236
Epoch 3/10, Loss: 1.0565, Accuracy: 0.4995
Epoch 4/10, Loss: 0.8946, Accuracy: 0.5886
Epoch 5/10, Loss: 0.7064, Accuracy: 0.7029
Epoch 6/10, Loss: 0.5877, Accuracy: 0.7685
Epoch 7/10, Loss: 0.3564, Accuracy: 0.8650
Epoch 8/10, Loss: 0.2637, Accuracy: 0.8978
Epoch 9/10, Loss: 0.2115, Accuracy: 0.9288
Epoch 10/10, Loss: 0.1840, Accuracy: 0.9381
Improved model saved to vit_model_improved.pth


In [27]:
import pickle
with open('vit_model_further_improved.pkl', 'wb') as f:
    pickle.dump(model.state_dict(), f)

print('Further improved model saved to vit_model_further_improved.pkl')

Further improved model saved to vit_model_further_improved.pkl


In [28]:
pip install mediapipe

Collecting mediapipe
  Downloading mediapipe-0.10.14-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.7 kB)
Collecting protobuf<5,>=4.25.3 (from mediapipe)
  Downloading protobuf-4.25.4-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting sounddevice>=0.4.4 (from mediapipe)
  Downloading sounddevice-0.5.0-py3-none-any.whl.metadata (1.4 kB)
Downloading mediapipe-0.10.14-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (35.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.7/35.7 MB[0m [31m46.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading protobuf-4.25.4-cp37-abi3-manylinux2014_x86_64.whl (294 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.6/294.6 kB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sounddevice-0.5.0-py3-none-any.whl (32 kB)
Installing collected packages: protobuf, sounddevice, mediapipe
  Attempting uninstall: protobuf
    Found existing installat

In [29]:
import torch

# Clear CUDA cache
torch.cuda.empty_cache()


In [31]:
import os
import cv2
import torch
import pickle
import numpy as np
from torchvision import transforms
from PIL import Image
import mediapipe as mp

# Define the ViT model structure (SimpleViT as an example)
class SimpleViT(nn.Module):
    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, channels=3, dim_head=64, dropout=0.2, emb_dropout=0.2):
        super().__init__()
        image_height, image_width = pair(image_size)
        patch_height, patch_width = pair(patch_size)

        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'

        num_patches = (image_height // patch_height) * (image_width // patch_width)
        patch_dim = channels * patch_height * patch_width

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=patch_height, p2=patch_width),
            nn.Linear(patch_dim, dim),
        )

        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)

        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(dim, heads, mlp_dim, dropout, batch_first=True), num_layers=depth
        )

        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

    def forward(self, img):
        x = self.to_patch_embedding(img)
        b, n, _ = x.shape

        cls_tokens = self.cls_token.expand(b, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        x += self.pos_embedding[:, :(n + 1)]
        x = self.dropout(x)

        x = self.transformer(x)
        x = x[:, 0]

        return self.mlp_head(x)

def pair(t):
    return t if isinstance(t, tuple) else (t, t)

# Load the trained ViT model from the .pkl file
with open('/kaggle/working/vit_model_further_improved.pkl', 'rb') as f:
    state_dict = pickle.load(f)

# Initialize the model
model = SimpleViT(
    image_size=128,  # Ensuring image size is exactly divisible by patch size (e.g., 128x128)
    patch_size=16,
    num_classes=10,  # Assuming your model has 10 classes
    dim=512,
    depth=6,
    heads=8,
    mlp_dim=1024,
    channels=3,
    dropout=0.2,
    emb_dropout=0.2
)
model.load_state_dict(state_dict)
model.eval()

# Define the preprocessing transformation
preprocess = transforms.Compose([
    transforms.Resize((128, 128)),  # Ensure the input size matches expected dimensions
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Initialize MediaPipe components
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=True, max_num_hands=2, min_detection_confidence=0.5)

# Directory containing the extracted frames
frame_directory = '/kaggle/working'  # Adjust based on your actual frame directory

# Iterate over each video directory
for video_dir in os.listdir(frame_directory):
    video_path = os.path.join(frame_directory, video_dir)
    if os.path.isdir(video_path):
        # Create output directory for processed frames
        output_dir = os.path.join(video_path, 'processed_frames')
        os.makedirs(output_dir, exist_ok=True)
        
        # Process each frame in the video directory
        for frame_filename in os.listdir(video_path):
            if frame_filename.endswith(".jpg") or frame_filename.endswith(".png"):
                frame_path = os.path.join(video_path, frame_filename)
                
                # Read the image
                frame = cv2.imread(frame_path)
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                
                # Convert numpy array (from OpenCV) to PIL Image
                pil_image = Image.fromarray(rgb_frame)

                # Preprocess the PIL image for the ViT model
                input_tensor = preprocess(pil_image)
                input_tensor = input_tensor.unsqueeze(0)  # Add batch dimension
                
                # Predict using ViT model
                with torch.no_grad():
                    output = model(input_tensor)
                predicted_class = output.argmax(dim=1).item()
                
                # Process the frame with MediaPipe (e.g., hand landmarks)
                hand_results = hands.process(rgb_frame)
                
                # Draw MediaPipe results
                if hand_results.multi_hand_landmarks:
                    for hand_landmarks in hand_results.multi_hand_landmarks:
                        for point in mp_hands.HandLandmark:
                            normalized_landmark = hand_landmarks.landmark[point]
                            x = int(normalized_landmark.x * frame.shape[1])
                            y = int(normalized_landmark.y * frame.shape[0])
                            cv2.circle(frame, (x, y), 5, (0, 0, 255), -1)
                
                # Annotate the predicted class on the frame
                cv2.putText(frame, f'Predicted Class: {predicted_class}', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
                
                # Save the processed frame
                output_frame_path = os.path.join(output_dir, frame_filename)
                cv2.imwrite(output_frame_path, frame)

print("Processing complete. The frames with ViT predictions and MediaPipe landmarks have been saved.")


W0000 00:00:1724332984.553713     506 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1724332984.588040     508 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


Processing complete. The frames with ViT predictions and MediaPipe landmarks have been saved.


In [33]:
import cv2
import os

# Mapping from user input (1-10) to the corresponding video directories
label_to_video_mapping = {
    1: "video_1",
    2: "video_2",
    3: ["video_3", "video_4"],
    4: ["video_5", "video_6"],
    5: ["video_7", "video_8"],
    6: "video_9",
    7: "video_10",
    8: "video_11",
    9: "video_12",
    10: "video_13"
}

# Function to display an animation of the sign language corresponding to a number
def display_sign_language(number, frame_directory):
    # Validate the number input
    if number not in label_to_video_mapping:
        print("Number out of range. Please input a number between 1 and 10.")
        return
    
    video_dirs = label_to_video_mapping[number]
    if not isinstance(video_dirs, list):
        video_dirs = [video_dirs]  # Ensure it's always a list for consistency
    
    output_video_path = f'sign_language_{number}.mp4'
    first_frame = None
    output_video = None

    # Loop through the directories corresponding to the number
    for video_dir in video_dirs:
        video_path = os.path.join(frame_directory, video_dir)
        if not os.path.exists(video_path):
            print(f"No frames found for video {video_dir}")
            continue
        
        frame_list = sorted(os.listdir(video_path))
        
        # Assuming all frames are of the same size, we can get the dimensions from the first frame
        first_frame_path = os.path.join(video_path, frame_list[0])
        first_frame = cv2.imread(first_frame_path)
        height, width, _ = first_frame.shape

        if output_video is None:
            output_video = cv2.VideoWriter(output_video_path, cv2.VideoWriter_fourcc(*'mp4v'), 10, (width, height))

        # Process each frame in sequence
        for frame_name in frame_list:
            frame_path = os.path.join(video_path, frame_name)
            frame = cv2.imread(frame_path)
            output_video.write(frame)  # Write each frame to the video file

    if output_video:
        output_video.release()
    print(f"Animation for number {number} saved to {output_video_path}.")

# Main code to run the animation
frame_directory = '/kaggle/working'  # Update this path to the directory containing the frames

# Take input from the user
try:
    user_input = int(input("Enter a number between 1 and 10: "))
    display_sign_language(user_input, frame_directory)
except ValueError:
    print("Invalid input. Please enter an integer between 1 and 10.")


Enter a number between 1 and 10:  2


Animation for number 2 saved to sign_language_2.mp4.


In [34]:
import cv2
import mediapipe as mp
import numpy as np
import os

# Initialize MediaPipe for hand and pose detection
mp_hands = mp.solutions.hands
mp_pose = mp.solutions.pose
hands = mp_hands.Hands(static_image_mode=True, max_num_hands=2, min_detection_confidence=0.5)
pose = mp_pose.Pose(static_image_mode=True, min_detection_confidence=0.5)

# Function to extract key points from a frame
def extract_keypoints_from_frame(frame):
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    
    # Process hand and body keypoints
    hand_results = hands.process(rgb_frame)
    pose_results = pose.process(rgb_frame)
    
    keypoints = {}
    
    # Extract relevant hand keypoints
    if hand_results.multi_hand_landmarks:
        for hand_landmarks in hand_results.multi_hand_landmarks:
            for point in mp_hands.HandLandmark:
                landmark = hand_landmarks.landmark[point]
                keypoints[str(point)] = (int(landmark.x * frame.shape[1]), int(landmark.y * frame.shape[0]))
    
    # Extract relevant body keypoints
    if pose_results.pose_landmarks:
        for point in [mp_pose.PoseLandmark.NOSE, mp_pose.PoseLandmark.LEFT_SHOULDER, mp_pose.PoseLandmark.RIGHT_SHOULDER,
                      mp_pose.PoseLandmark.LEFT_ELBOW, mp_pose.PoseLandmark.RIGHT_ELBOW, mp_pose.PoseLandmark.LEFT_WRIST,
                      mp_pose.PoseLandmark.RIGHT_WRIST]:
            landmark = pose_results.pose_landmarks.landmark[point]
            keypoints[str(point)] = (int(landmark.x * frame.shape[1]), int(landmark.y * frame.shape[0]))
    
    return keypoints

# Function to draw the stick figure based on key points
def draw_stick_figure(frame, keypoints):
    if str(mp_pose.PoseLandmark.NOSE) in keypoints:
        cv2.circle(frame, keypoints[str(mp_pose.PoseLandmark.NOSE)], 20, (0, 0, 255), -1)

    if (str(mp_pose.PoseLandmark.LEFT_SHOULDER) in keypoints and
        str(mp_pose.PoseLandmark.RIGHT_SHOULDER) in keypoints):
        cv2.line(frame, keypoints[str(mp_pose.PoseLandmark.LEFT_SHOULDER)],
                 keypoints[str(mp_pose.PoseLandmark.RIGHT_SHOULDER)], (255, 255, 255), 5)

    if (str(mp_pose.PoseLandmark.LEFT_SHOULDER) in keypoints and
        str(mp_pose.PoseLandmark.LEFT_ELBOW) in keypoints):
        cv2.line(frame, keypoints[str(mp_pose.PoseLandmark.LEFT_SHOULDER)],
                 keypoints[str(mp_pose.PoseLandmark.LEFT_ELBOW)], (255, 255, 255), 5)
    if (str(mp_pose.PoseLandmark.LEFT_ELBOW) in keypoints and
        str(mp_pose.PoseLandmark.LEFT_WRIST) in keypoints):
        cv2.line(frame, keypoints[str(mp_pose.PoseLandmark.LEFT_ELBOW)],
                 keypoints[str(mp_pose.PoseLandmark.LEFT_WRIST)], (255, 255, 255), 5)

    if (str(mp_pose.PoseLandmark.RIGHT_SHOULDER) in keypoints and
        str(mp_pose.PoseLandmark.RIGHT_ELBOW) in keypoints):
        cv2.line(frame, keypoints[str(mp_pose.PoseLandmark.RIGHT_SHOULDER)],
                 keypoints[str(mp_pose.PoseLandmark.RIGHT_ELBOW)], (255, 255, 255), 5)
    if (str(mp_pose.PoseLandmark.RIGHT_ELBOW) in keypoints and
        str(mp_pose.PoseLandmark.RIGHT_WRIST) in keypoints):
        cv2.line(frame, keypoints[str(mp_pose.PoseLandmark.RIGHT_ELBOW)],
                 keypoints[str(mp_pose.PoseLandmark.RIGHT_WRIST)], (255, 255, 255), 5)

    # Draw additional elements such as facial features if needed, to represent the gestures better
    if str(mp_pose.PoseLandmark.NOSE) in keypoints:
        cv2.circle(frame, keypoints[str(mp_pose.PoseLandmark.NOSE)], 5, (0, 255, 0), -1)  # Example for nose

# Function to create the animation for the given number
def create_sign_language_animation(number, frame_directory):
    label_to_video_mapping = {
        1: "video_1",
        2: "video_2",
        3: ["video_3", "video_4"],
        4: ["video_5", "video_6"],
        5: ["video_7", "video_8"],
        6: "video_9",
        7: "video_10",
        8: "video_11",
        9: "video_12",
        10: "video_13"
    }

    if number not in label_to_video_mapping:
        print("Number out of range. Please input a number between 1 and 10.")
        return
    
    video_dirs = label_to_video_mapping[number]
    if not isinstance(video_dirs, list):
        video_dirs = [video_dirs]
    
    output_video_path = f'sign_language_{number}_stick_figure.mp4'
    output_video = None

    for video_dir in video_dirs:
        video_path = os.path.join(frame_directory, video_dir)
        if not os.path.exists(video_path):
            print(f"No frames found for video {video_dir}")
            continue
        
        frame_list = sorted(os.listdir(video_path))
        
        for frame_name in frame_list:
            frame_path = os.path.join(video_path, frame_name)
            original_frame = cv2.imread(frame_path)
            if original_frame is None:
                print(f"Failed to read frame {frame_name}")
                continue
            
            keypoints = extract_keypoints_from_frame(original_frame)
            
            frame_height, frame_width, _ = original_frame.shape
            stick_figure_frame = np.zeros((frame_height, frame_width, 3), dtype=np.uint8)
            
            # Check if keypoints were detected
            if not keypoints:
                print(f"No keypoints detected in frame: {frame_name}")
                continue
            
            draw_stick_figure(stick_figure_frame, keypoints)

            if output_video is None:
                output_video = cv2.VideoWriter(output_video_path, cv2.VideoWriter_fourcc(*'mp4v'), 10, (frame_width, frame_height))
            
            output_video.write(stick_figure_frame)

    if output_video:
        output_video.release()
        print(f"Stick figure sign language animation for number {number} saved to {output_video_path}.")
    else:
        print("No video was created due to missing frames or keypoints.")

# Main code to run the animation
frame_directory = '/kaggle/working'  # Update this path to the directory containing the frames

# Take input from the user
try:
    user_input = int(input("Enter a number between 1 and 10: "))
    create_sign_language_animation(user_input, frame_directory)
except ValueError:
    print("Invalid input. Please enter an integer between 1 and 10.")


W0000 00:00:1724333114.111606     514 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1724333114.139820     512 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1724333114.266481     516 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1724333114.300146     516 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


Enter a number between 1 and 10:  2


Failed to read frame processed_frames
Stick figure sign language animation for number 2 saved to sign_language_2_stick_figure.mp4.


In [35]:
import cv2
import mediapipe as mp
import numpy as np
import os

# Initialize MediaPipe for hand and pose detection
mp_hands = mp.solutions.hands
mp_pose = mp.solutions.pose
mp_drawing = mp.solutions.drawing_utils
hands = mp_hands.Hands(static_image_mode=True, max_num_hands=2, min_detection_confidence=0.5)
pose = mp_pose.Pose(static_image_mode=True, min_detection_confidence=0.5)

# Function to extract key points from a frame
def extract_keypoints_from_frame(frame):
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    
    # Process hand and body keypoints
    hand_results = hands.process(rgb_frame)
    pose_results = pose.process(rgb_frame)
    
    keypoints = {}

    # Extract relevant hand keypoints
    if hand_results.multi_hand_landmarks:
        for hand_landmarks in hand_results.multi_hand_landmarks:
            hand_points = []
            for point in mp_hands.HandLandmark:
                landmark = hand_landmarks.landmark[point]
                hand_points.append((int(landmark.x * frame.shape[1]), int(landmark.y * frame.shape[0])))
            keypoints['hand'] = hand_points
    
    # Extract relevant body keypoints
    if pose_results.pose_landmarks:
        body_points = {}
        for point in [mp_pose.PoseLandmark.NOSE, mp_pose.PoseLandmark.LEFT_SHOULDER, mp_pose.PoseLandmark.RIGHT_SHOULDER,
                      mp_pose.PoseLandmark.LEFT_ELBOW, mp_pose.PoseLandmark.RIGHT_ELBOW, mp_pose.PoseLandmark.LEFT_WRIST,
                      mp_pose.PoseLandmark.RIGHT_WRIST]:
            landmark = pose_results.pose_landmarks.landmark[point]
            body_points[str(point)] = (int(landmark.x * frame.shape[1]), int(landmark.y * frame.shape[0]))
        keypoints['body'] = body_points
    
    return keypoints

# Function to draw the stick figure based on key points
def draw_stick_figure(frame, keypoints):
    if 'body' in keypoints:
        body_points = keypoints['body']
        if str(mp_pose.PoseLandmark.NOSE) in body_points:
            cv2.circle(frame, body_points[str(mp_pose.PoseLandmark.NOSE)], 20, (0, 0, 255), -1)

        if (str(mp_pose.PoseLandmark.LEFT_SHOULDER) in body_points and
            str(mp_pose.PoseLandmark.RIGHT_SHOULDER) in body_points):
            cv2.line(frame, body_points[str(mp_pose.PoseLandmark.LEFT_SHOULDER)],
                     body_points[str(mp_pose.PoseLandmark.RIGHT_SHOULDER)], (255, 255, 255), 5)

        if (str(mp_pose.PoseLandmark.LEFT_SHOULDER) in body_points and
            str(mp_pose.PoseLandmark.LEFT_ELBOW) in body_points):
            cv2.line(frame, body_points[str(mp_pose.PoseLandmark.LEFT_SHOULDER)],
                     body_points[str(mp_pose.PoseLandmark.LEFT_ELBOW)], (255, 255, 255), 5)
        if (str(mp_pose.PoseLandmark.LEFT_ELBOW) in body_points and
            str(mp_pose.PoseLandmark.LEFT_WRIST) in body_points):
            cv2.line(frame, body_points[str(mp_pose.PoseLandmark.LEFT_ELBOW)],
                     body_points[str(mp_pose.PoseLandmark.LEFT_WRIST)], (255, 255, 255), 5)

        if (str(mp_pose.PoseLandmark.RIGHT_SHOULDER) in body_points and
            str(mp_pose.PoseLandmark.RIGHT_ELBOW) in body_points):
            cv2.line(frame, body_points[str(mp_pose.PoseLandmark.RIGHT_SHOULDER)],
                     body_points[str(mp_pose.PoseLandmark.RIGHT_ELBOW)], (255, 255, 255), 5)
        if (str(mp_pose.PoseLandmark.RIGHT_ELBOW) in body_points and
            str(mp_pose.PoseLandmark.RIGHT_WRIST) in body_points):
            cv2.line(frame, body_points[str(mp_pose.PoseLandmark.RIGHT_ELBOW)],
                     body_points[str(mp_pose.PoseLandmark.RIGHT_WRIST)], (255, 255, 255), 5)

    if 'hand' in keypoints:
        hand_points = keypoints['hand']
        for i in range(len(hand_points) - 1):
            cv2.line(frame, hand_points[i], hand_points[i + 1], (0, 255, 0), 2)
        for point in hand_points:
            cv2.circle(frame, point, 5, (0, 255, 0), -1)

# Function to create the animation for the given number
def create_sign_language_animation(number, frame_directory):
    label_to_video_mapping = {
        1: "video_1",
        2: "video_2",
        3: ["video_3", "video_4"],
        4: ["video_5", "video_6"],
        5: ["video_7", "video_8"],
        6: "video_9",
        7: "video_10",
        8: "video_11",
        9: "video_12",
        10: "video_13"
    }

    if number not in label_to_video_mapping:
        print("Number out of range. Please input a number between 1 and 10.")
        return
    
    video_dirs = label_to_video_mapping[number]
    if not isinstance(video_dirs, list):
        video_dirs = [video_dirs]
    
    output_video_path = f'sign_language_{number}_stick_figure.mp4'
    output_video = None

    for video_dir in video_dirs:
        video_path = os.path.join(frame_directory, video_dir)
        if not os.path.exists(video_path):
            print(f"No frames found for video {video_dir}")
            continue
        
        frame_list = sorted(os.listdir(video_path))
        
        for frame_name in frame_list:
            frame_path = os.path.join(video_path, frame_name)
            original_frame = cv2.imread(frame_path)
            if original_frame is None:
                print(f"Failed to read frame {frame_name}")
                continue
            
            keypoints = extract_keypoints_from_frame(original_frame)
            
            frame_height, frame_width, _ = original_frame.shape
            stick_figure_frame = np.zeros((frame_height, frame_width, 3), dtype=np.uint8)
            
            # Check if keypoints were detected
            if not keypoints:
                print(f"No keypoints detected in frame: {frame_name}")
                continue
            
            draw_stick_figure(stick_figure_frame, keypoints)

            if output_video is None:
                output_video = cv2.VideoWriter(output_video_path, cv2.VideoWriter_fourcc(*'mp4v'), 10, (frame_width, frame_height))
            
            output_video.write(stick_figure_frame)

    if output_video:
        output_video.release()
        print(f"Stick figure sign language animation for number {number} saved to {output_video_path}.")
    else:
        print("No video was created due to missing frames or keypoints.")

# Main code to run the animation
frame_directory = '/kaggle/working'  # Update this path to the directory containing the frames

# Take input from the user
try:
    user_input = int(input("Enter a number between 1 and 10: "))
    create_sign_language_animation(user_input, frame_directory)
except ValueError:
    print("Invalid input. Please enter an integer between 1 and 10.")


W0000 00:00:1724333128.314905     520 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1724333128.343820     521 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1724333128.479578     524 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1724333128.505772     524 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


Enter a number between 1 and 10:  2


Failed to read frame processed_frames
Stick figure sign language animation for number 2 saved to sign_language_2_stick_figure.mp4.


In [36]:
import cv2
import mediapipe as mp
import numpy as np
import os

# Initialize MediaPipe for hand and pose detection
mp_hands = mp.solutions.hands
mp_pose = mp.solutions.pose
hands = mp_hands.Hands(static_image_mode=True, max_num_hands=2, min_detection_confidence=0.5)
pose = mp_pose.Pose(static_image_mode=True, min_detection_confidence=0.5)

# Function to extract key points from a frame
def extract_keypoints_from_frame(frame):
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    
    # Process hand and body keypoints
    hand_results = hands.process(rgb_frame)
    pose_results = pose.process(rgb_frame)
    
    keypoints = {}

    # Extract relevant hand keypoints
    if hand_results.multi_hand_landmarks:
        for hand_landmarks in hand_results.multi_hand_landmarks:
            hand_points = []
            for point in mp_hands.HandLandmark:
                landmark = hand_landmarks.landmark[point]
                hand_points.append((int(landmark.x * frame.shape[1]), int(landmark.y * frame.shape[0])))
            keypoints['hand'] = hand_points
    
    # Extract relevant body keypoints
    if pose_results.pose_landmarks:
        body_points = {}
        for point in [mp_pose.PoseLandmark.NOSE, mp_pose.PoseLandmark.LEFT_SHOULDER, mp_pose.PoseLandmark.RIGHT_SHOULDER,
                      mp_pose.PoseLandmark.LEFT_ELBOW, mp_pose.PoseLandmark.RIGHT_ELBOW, mp_pose.PoseLandmark.LEFT_WRIST,
                      mp_pose.PoseLandmark.RIGHT_WRIST]:
            landmark = pose_results.pose_landmarks.landmark[point]
            body_points[str(point)] = (int(landmark.x * frame.shape[1]), int(landmark.y * frame.shape[0]))
        keypoints['body'] = body_points
    
    return keypoints

# Function to draw the stick figure based on key points
def draw_stick_figure(frame, keypoints):
    if 'body' in keypoints:
        body_points = keypoints['body']
        if str(mp_pose.PoseLandmark.NOSE) in body_points:
            cv2.circle(frame, body_points[str(mp_pose.PoseLandmark.NOSE)], 20, (0, 0, 255), -1)

        if (str(mp_pose.PoseLandmark.LEFT_SHOULDER) in body_points and
            str(mp_pose.PoseLandmark.RIGHT_SHOULDER) in body_points):
            cv2.line(frame, body_points[str(mp_pose.PoseLandmark.LEFT_SHOULDER)],
                     body_points[str(mp_pose.PoseLandmark.RIGHT_SHOULDER)], (255, 255, 255), 5)

        if (str(mp_pose.PoseLandmark.LEFT_SHOULDER) in body_points and
            str(mp_pose.PoseLandmark.LEFT_ELBOW) in body_points):
            cv2.line(frame, body_points[str(mp_pose.PoseLandmark.LEFT_SHOULDER)],
                     body_points[str(mp_pose.PoseLandmark.LEFT_ELBOW)], (255, 255, 255), 5)
        if (str(mp_pose.PoseLandmark.LEFT_ELBOW) in body_points and
            str(mp_pose.PoseLandmark.LEFT_WRIST) in body_points):
            cv2.line(frame, body_points[str(mp_pose.PoseLandmark.LEFT_ELBOW)],
                     body_points[str(mp_pose.PoseLandmark.LEFT_WRIST)], (255, 255, 255), 5)

        if (str(mp_pose.PoseLandmark.RIGHT_SHOULDER) in body_points and
            str(mp_pose.PoseLandmark.RIGHT_ELBOW) in body_points):
            cv2.line(frame, body_points[str(mp_pose.PoseLandmark.RIGHT_SHOULDER)],
                     body_points[str(mp_pose.PoseLandmark.RIGHT_ELBOW)], (255, 255, 255), 5)
        if (str(mp_pose.PoseLandmark.RIGHT_ELBOW) in body_points and
            str(mp_pose.PoseLandmark.RIGHT_WRIST) in body_points):
            cv2.line(frame, body_points[str(mp_pose.PoseLandmark.RIGHT_ELBOW)],
                     body_points[str(mp_pose.PoseLandmark.RIGHT_WRIST)], (255, 255, 255), 5)

    if 'hand' in keypoints:
        hand_points = keypoints['hand']
        for i in range(len(hand_points) - 1):
            cv2.line(frame, hand_points[i], hand_points[i + 1], (0, 255, 0), 2)
        for point in hand_points:
            cv2.circle(frame, point, 5, (0, 255, 0), -1)

# Function to create the animation for the given number
def create_sign_language_animation(number, frame_directory):
    label_to_video_mapping = {
        1: "video_1",
        2: "video_2",
        3: ["video_3", "video_4"],
        4: ["video_5", "video_6"],
        5: ["video_7", "video_8"],
        6: "video_9",
        7: "video_10",
        8: "video_11",
        9: "video_12",
        10: "video_13"
    }

    if number not in label_to_video_mapping:
        print("Number out of range. Please input a number between 1 and 10.")
        return
    
    video_dirs = label_to_video_mapping[number]
    if not isinstance(video_dirs, list):
        video_dirs = [video_dirs]
    
    output_video_path = f'sign_language_{number}_stick_figure.mp4'
    output_video = None

    for video_dir in video_dirs:
        video_path = os.path.join(frame_directory, video_dir, 'processed_frames')
        if not os.path.exists(video_path):
            print(f"No frames found for video {video_dir}")
            continue
        
        frame_list = sorted(os.listdir(video_path))
        
        for frame_name in frame_list:
            frame_path = os.path.join(video_path, frame_name)
            original_frame = cv2.imread(frame_path)
            if original_frame is None:
                print(f"Failed to read frame {frame_path}")
                continue
            
            keypoints = extract_keypoints_from_frame(original_frame)
            
            frame_height, frame_width, _ = original_frame.shape
            stick_figure_frame = np.zeros((frame_height, frame_width, 3), dtype=np.uint8)
            
            # Check if keypoints were detected
            if not keypoints:
                print(f"No keypoints detected in frame: {frame_name}")
                continue
            
            draw_stick_figure(stick_figure_frame, keypoints)

            if output_video is None:
                output_video = cv2.VideoWriter(output_video_path, cv2.VideoWriter_fourcc(*'mp4v'), 10, (frame_width, frame_height))
            
            output_video.write(stick_figure_frame)

    if output_video:
        output_video.release()
        print(f"Stick figure sign language animation for number {number} saved to {output_video_path}.")
    else:
        print("No video was created due to missing frames or keypoints.")

# Main code to run the animation
frame_directory = '/kaggle/working'  # Update this path to the directory containing the frames

# Take input from the user
try:
    user_input = int(input("Enter a number between 1 and 10: "))
    create_sign_language_animation(user_input, frame_directory)
except ValueError:
    print("Invalid input. Please enter an integer between 1 and 10.")


W0000 00:00:1724333139.236359     530 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1724333139.267372     531 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1724333139.400454     534 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1724333139.426856     534 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


Enter a number between 1 and 10:  2


Stick figure sign language animation for number 2 saved to sign_language_2_stick_figure.mp4.


**WHITE BACKGROUND**

In [37]:
import cv2
import mediapipe as mp
import numpy as np
import os

# Initialize MediaPipe for hand and pose detection
mp_hands = mp.solutions.hands
mp_pose = mp.solutions.pose
hands = mp_hands.Hands(static_image_mode=True, max_num_hands=2, min_detection_confidence=0.5)
pose = mp_pose.Pose(static_image_mode=True, min_detection_confidence=0.5)

# Function to extract key points from a frame
def extract_keypoints_from_frame(frame):
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    
    # Process hand and body keypoints
    hand_results = hands.process(rgb_frame)
    pose_results = pose.process(rgb_frame)
    
    keypoints = {}

    # Extract relevant hand keypoints
    if hand_results.multi_hand_landmarks:
        for hand_landmarks in hand_results.multi_hand_landmarks:
            hand_points = []
            for point in mp_hands.HandLandmark:
                landmark = hand_landmarks.landmark[point]
                hand_points.append((int(landmark.x * frame.shape[1]), int(landmark.y * frame.shape[0])))
            keypoints['hand'] = hand_points
    
    # Extract relevant body keypoints
    if pose_results.pose_landmarks:
        body_points = {}
        for point in [mp_pose.PoseLandmark.NOSE, mp_pose.PoseLandmark.LEFT_SHOULDER, mp_pose.PoseLandmark.RIGHT_SHOULDER,
                      mp_pose.PoseLandmark.LEFT_ELBOW, mp_pose.PoseLandmark.RIGHT_ELBOW, mp_pose.PoseLandmark.LEFT_WRIST,
                      mp_pose.PoseLandmark.RIGHT_WRIST]:
            landmark = pose_results.pose_landmarks.landmark[point]
            body_points[str(point)] = (int(landmark.x * frame.shape[1]), int(landmark.y * frame.shape[0]))
        keypoints['body'] = body_points
    
    return keypoints

# Function to draw the stick figure based on key points
def draw_stick_figure(frame, keypoints):
    if 'body' in keypoints:
        body_points = keypoints['body']
        if str(mp_pose.PoseLandmark.NOSE) in body_points:
            cv2.circle(frame, body_points[str(mp_pose.PoseLandmark.NOSE)], 20, (0, 0, 255), -1)

        if (str(mp_pose.PoseLandmark.LEFT_SHOULDER) in body_points and
            str(mp_pose.PoseLandmark.RIGHT_SHOULDER) in body_points):
            cv2.line(frame, body_points[str(mp_pose.PoseLandmark.LEFT_SHOULDER)],
                     body_points[str(mp_pose.PoseLandmark.RIGHT_SHOULDER)], (255, 0, 0), 5)

        if (str(mp_pose.PoseLandmark.LEFT_SHOULDER) in body_points and
            str(mp_pose.PoseLandmark.LEFT_ELBOW) in body_points):
            cv2.line(frame, body_points[str(mp_pose.PoseLandmark.LEFT_SHOULDER)],
                     body_points[str(mp_pose.PoseLandmark.LEFT_ELBOW)], (255, 0, 0), 5)
        if (str(mp_pose.PoseLandmark.LEFT_ELBOW) in body_points and
            str(mp_pose.PoseLandmark.LEFT_WRIST) in body_points):
            cv2.line(frame, body_points[str(mp_pose.PoseLandmark.LEFT_ELBOW)],
                     body_points[str(mp_pose.PoseLandmark.LEFT_WRIST)], (255, 0, 0), 5)

        if (str(mp_pose.PoseLandmark.RIGHT_SHOULDER) in body_points and
            str(mp_pose.PoseLandmark.RIGHT_ELBOW) in body_points):
            cv2.line(frame, body_points[str(mp_pose.PoseLandmark.RIGHT_SHOULDER)],
                     body_points[str(mp_pose.PoseLandmark.RIGHT_ELBOW)], (255, 0, 0), 5)
        if (str(mp_pose.PoseLandmark.RIGHT_ELBOW) in body_points and
            str(mp_pose.PoseLandmark.RIGHT_WRIST) in body_points):
            cv2.line(frame, body_points[str(mp_pose.PoseLandmark.RIGHT_ELBOW)],
                     body_points[str(mp_pose.PoseLandmark.RIGHT_WRIST)], (255, 0, 0), 5)

    # Draw the hand keypoints if available
    if 'hand' in keypoints:
        hand_points = keypoints['hand']
        for i in range(len(hand_points)):
            cv2.circle(frame, hand_points[i], 5, (0, 255, 0), -1)
            if i > 0:
                cv2.line(frame, hand_points[i-1], hand_points[i], (0, 255, 0), 2)

# Function to create the animation for the given number
def create_sign_language_animation(number, frame_directory):
    label_to_video_mapping = {
        1: "video_1",
        2: "video_2",
        3: ["video_3", "video_4"],
        4: ["video_5", "video_6"],
        5: ["video_7", "video_8"],
        6: "video_9",
        7: "video_10",
        8: "video_11",
        9: "video_12",
        10: "video_13"
    }

    if number not in label_to_video_mapping:
        print("Number out of range. Please input a number between 1 and 10.")
        return
 
    video_dirs = label_to_video_mapping[number]
    if not isinstance(video_dirs, list):
        video_dirs = [video_dirs]
    
    output_video_path = f'sign_language_{number}_stick_figure.mp4'
    output_video = None

    for video_dir in video_dirs:
        video_path = os.path.join(frame_directory, video_dir)
        if not os.path.exists(video_path):
            print(f"No frames found for video {video_dir}")
            continue
        
        frame_list = sorted(os.listdir(video_path))
        
        for frame_name in frame_list:
            frame_path = os.path.join(video_path, frame_name)
            original_frame = cv2.imread(frame_path)
            if original_frame is None:
                print(f"Failed to read frame {frame_name}")
                continue
            
            keypoints = extract_keypoints_from_frame(original_frame)
            
            frame_height, frame_width, _ = original_frame.shape
            stick_figure_frame = np.ones((frame_height, frame_width, 3), dtype=np.uint8) * 255  # White background
            
            # Check if keypoints were detected
            if not keypoints:
                print(f"No keypoints detected in frame: {frame_name}")
                continue
            
            draw_stick_figure(stick_figure_frame, keypoints)

            if output_video is None:
                output_video = cv2.VideoWriter(output_video_path, cv2.VideoWriter_fourcc(*'mp4v'), 10, (frame_width, frame_height))
            
            output_video.write(stick_figure_frame)

    if output_video:
        output_video.release()
        print(f"Stick figure sign language animation for number {number} saved to {output_video_path}.")
    else:
        print("No video was created due to missing frames or keypoints.")

# Main code to run the animation
frame_directory = '/kaggle/working'  # Update this path to the directory containing the frames

# Take input from the user
try:
    user_input = int(input("Enter a number between 1 and 10: "))
    create_sign_language_animation(user_input, frame_directory)
except ValueError:
    print("Invalid input. Please enter an integer between 1 and 10.")


W0000 00:00:1724333147.306729     538 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1724333147.335170     537 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1724333147.474019     541 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1724333147.500201     541 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


Enter a number between 1 and 10:  3


Failed to read frame processed_frames
Failed to read frame processed_frames
Stick figure sign language animation for number 3 saved to sign_language_3_stick_figure.mp4.


In [43]:
import cv2
import mediapipe as mp
import numpy as np
import os

# Initialize MediaPipe for hand, pose, and face mesh detection
mp_hands = mp.solutions.hands
mp_pose = mp.solutions.pose
mp_face_mesh = mp.solutions.face_mesh
hands = mp_hands.Hands(static_image_mode=True, max_num_hands=2, min_detection_confidence=0.5)
pose = mp_pose.Pose(static_image_mode=True, min_detection_confidence=0.5)
face_mesh = mp_face_mesh.FaceMesh(static_image_mode=True, max_num_faces=1, min_detection_confidence=0.5)

# Function to extract key points from a frame
def extract_keypoints_from_frame(frame):
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    
    # Process hand, body, and face mesh keypoints
    hand_results = hands.process(rgb_frame)
    pose_results = pose.process(rgb_frame)
    face_mesh_results = face_mesh.process(rgb_frame)
    
    keypoints = {}

    # Extract relevant hand keypoints
    if hand_results.multi_hand_landmarks:
        for hand_landmarks in hand_results.multi_hand_landmarks:
            hand_points = []
            for point in mp_hands.HandLandmark:
                landmark = hand_landmarks.landmark[point]
                hand_points.append((int(landmark.x * frame.shape[1]), int(landmark.y * frame.shape[0])))
            keypoints['hand'] = hand_points
    
    # Extract relevant body keypoints
    if pose_results.pose_landmarks:
        body_points = {}
        for point in [mp_pose.PoseLandmark.NOSE, mp_pose.PoseLandmark.LEFT_SHOULDER, mp_pose.PoseLandmark.RIGHT_SHOULDER,
                      mp_pose.PoseLandmark.LEFT_ELBOW, mp_pose.PoseLandmark.RIGHT_ELBOW, mp_pose.PoseLandmark.LEFT_WRIST,
                      mp_pose.PoseLandmark.RIGHT_WRIST]:
            landmark = pose_results.pose_landmarks.landmark[point]
            body_points[str(point)] = (int(landmark.x * frame.shape[1]), int(landmark.y * frame.shape[0]))
        keypoints['body'] = body_points
    
    # Extract relevant face mesh keypoints for facial details
    if face_mesh_results.multi_face_landmarks:
        face_points = []
        for face_landmarks in face_mesh_results.multi_face_landmarks:
            for landmark in face_landmarks.landmark:
                face_points.append((int(landmark.x * frame.shape[1]), int(landmark.y * frame.shape[0])))
        keypoints['face'] = face_points
    
    return keypoints

# Function to draw the 2D character based on key points
def draw_2d_character(frame, keypoints):
    if 'body' in keypoints:
        body_points = keypoints['body']
        if str(mp_pose.PoseLandmark.NOSE) in body_points:
            # Draw the head
            cv2.circle(frame, body_points[str(mp_pose.PoseLandmark.NOSE)], 50, (0, 0, 255), 2)

        if (str(mp_pose.PoseLandmark.LEFT_SHOULDER) in body_points and
            str(mp_pose.PoseLandmark.RIGHT_SHOULDER) in body_points):
            # Draw the torso
            cv2.line(frame, body_points[str(mp_pose.PoseLandmark.LEFT_SHOULDER)],
                     body_points[str(mp_pose.PoseLandmark.RIGHT_SHOULDER)], (0, 0, 255), 8)
            torso_center = (
                (body_points[str(mp_pose.PoseLandmark.LEFT_SHOULDER)][0] + body_points[str(mp_pose.PoseLandmark.RIGHT_SHOULDER)][0]) // 2,
                (body_points[str(mp_pose.PoseLandmark.LEFT_SHOULDER)][1] + body_points[str(mp_pose.PoseLandmark.RIGHT_SHOULDER)][1]) // 2 + 100
            )
            cv2.line(frame, torso_center, body_points[str(mp_pose.PoseLandmark.LEFT_SHOULDER)], (0, 0, 255), 8)
            cv2.line(frame, torso_center, body_points[str(mp_pose.PoseLandmark.RIGHT_SHOULDER)], (0, 0, 255), 8)

        if (str(mp_pose.PoseLandmark.LEFT_ELBOW) in body_points and
            str(mp_pose.PoseLandmark.LEFT_WRIST) in body_points):
            # Draw the left arm
            cv2.line(frame, body_points[str(mp_pose.PoseLandmark.LEFT_SHOULDER)],
                     body_points[str(mp_pose.PoseLandmark.LEFT_ELBOW)], (0, 0, 255), 8)
            cv2.line(frame, body_points[str(mp_pose.PoseLandmark.LEFT_ELBOW)],
                     body_points[str(mp_pose.PoseLandmark.LEFT_WRIST)], (0, 0, 255), 8)

        if (str(mp_pose.PoseLandmark.RIGHT_ELBOW) in body_points and
            str(mp_pose.PoseLandmark.RIGHT_WRIST) in body_points):
            # Draw the right arm
            cv2.line(frame, body_points[str(mp_pose.PoseLandmark.RIGHT_SHOULDER)],
                     body_points[str(mp_pose.PoseLandmark.RIGHT_ELBOW)], (0, 0, 255), 8)
            cv2.line(frame, body_points[str(mp_pose.PoseLandmark.RIGHT_ELBOW)],
                     body_points[str(mp_pose.PoseLandmark.RIGHT_WRIST)], (0, 0, 255), 8)

    # Draw the hand keypoints if available
    if 'hand' in keypoints:
        hand_points = keypoints['hand']
        for i in range(len(hand_points)):
            cv2.circle(frame, hand_points[i], 10, (0, 255, 0), -1)
            if i > 0:
                cv2.line(frame, hand_points[i-1], hand_points[i], (0, 255, 0), 4)

    # Draw facial features
    if 'face' in keypoints:
        face_points = keypoints['face']

        if len(face_points) > 0:
            # Draw face outline
            center_x = face_points[1][0]
            center_y = face_points[1][1]
            cv2.circle(frame, (center_x, center_y), 50, (0, 0, 255), 2)

            # Draw eyes
            left_eye_center = face_points[362]
            right_eye_center = face_points[33]
            cv2.circle(frame, left_eye_center, 5, (0, 0, 255), -1)
            cv2.circle(frame, right_eye_center, 5, (0, 0, 255), -1)

            # Draw mouth
            left_mouth_corner = face_points[78]
            right_mouth_corner = face_points[308]
            cv2.line(frame, left_mouth_corner, right_mouth_corner, (0, 0, 255), 4)

# Function to create the animation for the given number
def create_sign_language_animation(number, frame_directory):
    label_to_video_mapping = {
        1: "video_1",
        2: "video_2",
        3: ["video_3", "video_4"],
        4: ["video_5", "video_6"],
        5: ["video_7", "video_8"],
        6: "video_9",
        7: "video_10",
        8: "video_11",
        9: "video_12",
        10: "video_13"
    }

    if number not in label_to_video_mapping:
        print("Number out of range. Please input a number between 1 and 10.")
        return
 
    video_dirs = label_to_video_mapping[number]
    if not isinstance(video_dirs, list):
        video_dirs = [video_dirs]
    
    output_video_path = f'sign_language_{number}_2d_character.mp4'
    output_video = None

    for video_dir in video_dirs:
        video_path = os.path.join(frame_directory, video_dir)
        if not os.path.exists(video_path):
            print(f"No frames found for video {video_dir}")
            continue
        
        frame_list = sorted(os.listdir(video_path))
        
        for frame_name in frame_list:
            frame_path = os.path.join(video_path, frame_name)
            original_frame = cv2.imread(frame_path)
            if original_frame is None:
                print(f"Failed to read frame {frame_name}")
                continue
            
            keypoints = extract_keypoints_from_frame(original_frame)
            
            frame_height, frame_width, _ = original_frame.shape
            character_frame = np.ones((frame_height, frame_width, 3), dtype=np.uint8) * 255  # White background
            
            # Check if keypoints were detected
            if not keypoints:
                print(f"No keypoints detected in frame: {frame_name}")
                continue
            
            draw_2d_character(character_frame, keypoints)

            if output_video is None:
                output_video = cv2.VideoWriter(output_video_path, cv2.VideoWriter_fourcc(*'mp4v'), 10, (frame_width, frame_height))
            
            output_video.write(character_frame)

    if output_video:
        output_video.release()
        print(f"2D character sign language animation for number {number} saved to {output_video_path}.")
    else:
        print("No video was created due to missing frames or keypoints.")

# Main code to run the animation
frame_directory = '/kaggle/working'  # Update this path to the directory containing the frames

# Take input from the user
try:
    user_input = int(input("Enter a number between 1 and 10: "))
    create_sign_language_animation(user_input, frame_directory)
except ValueError:
    print("Invalid input. Please enter an integer between 1 and 10.")


W0000 00:00:1724334442.711104     625 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1724334442.735801     633 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1724334442.744725     625 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1724334442.746114     633 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1724334442.876412     629 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1724334442.903222     629 inference_feedback_manager.cc:114] Feedback manager 

Enter a number between 1 and 10:  5


Failed to read frame processed_frames
Failed to read frame processed_frames
2D character sign language animation for number 5 saved to sign_language_5_2d_character.mp4.
