In [2]:
from math import isnan
from torch.utils.data import DataLoader
from dataloader import LipReadingDataset
from speak import Speaking_conv3d_layers, Speaking_words, Speaking_model, Speaking_conv3d_layers_trimmed
import torch
import os
from tqdm import tqdm
import argparse
from torch.utils.tensorboard import SummaryWriter
import datetime
from torch import nn
from transformers import get_constant_schedule_with_warmup, get_cosine_schedule_with_warmup
import torchvision
from transformers import AutoTokenizer
from main import validate

%load_ext autoreload
%autoreload 2

torch.manual_seed(42)



<torch._C.Generator at 0x2b9c81bc8f0>

In [11]:
%%bash
pwd

/mnt/d/classes/project/lip_reading/speaking


In [29]:
length_video = 125 
tokenizer=AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = Speaking_conv3d_layers(512, length_video)
weights = torch.load("../saved_weights/speak/best_weight_2_epochs.state")["state_dict"]
print(list(weights.keys()))
for key, value in list(weights.items()):  # model layer names are encoder.encoder instead of encoder.
    if "encoder" in key:
        weights.pop(key)
        weights[f"encoder.{key}"] = value

model.load_state_dict(weights)





['encoder.0.conv_block.0.weight', 'encoder.0.conv_block.0.bias', 'encoder.0.conv_block.1.weight', 'encoder.0.conv_block.1.bias', 'encoder.0.conv_block.1.running_mean', 'encoder.0.conv_block.1.running_var', 'encoder.0.conv_block.1.num_batches_tracked', 'encoder.1.conv_block.0.weight', 'encoder.1.conv_block.0.bias', 'encoder.1.conv_block.1.weight', 'encoder.1.conv_block.1.bias', 'encoder.1.conv_block.1.running_mean', 'encoder.1.conv_block.1.running_var', 'encoder.1.conv_block.1.num_batches_tracked', 'encoder.2.conv_block.0.weight', 'encoder.2.conv_block.0.bias', 'encoder.2.conv_block.1.weight', 'encoder.2.conv_block.1.bias', 'encoder.2.conv_block.1.running_mean', 'encoder.2.conv_block.1.running_var', 'encoder.2.conv_block.1.num_batches_tracked', 'encoder.3.conv_block.0.weight', 'encoder.3.conv_block.0.bias', 'encoder.3.conv_block.1.weight', 'encoder.3.conv_block.1.bias', 'encoder.3.conv_block.1.running_mean', 'encoder.3.conv_block.1.running_var', 'encoder.3.conv_block.1.num_batches_track

<All keys matched successfully>

In [3]:
base_path = "D:/classes/project/LRS2/extracted_data/mvlrs_v1/"
dataset_name="pretrain"

train_dataset = LipReadingDataset(directory=base_path,
                            transform=None,
                            length_video=length_video,
                            mode="speak",
                            tokenizer=tokenizer,
                            dataset=dataset_name,
                            useOriginalDataStructure=True,
                            h_w = (160,160)
                            )
print("Total samples loaded:", len(train_dataset))  
train_set, val_set = torch.utils.data.random_split(train_dataset, [int(len(train_dataset)*0.9), len(train_dataset) - int(len(train_dataset)*0.9)])

train_data_loader = DataLoader(
    train_set,
    batch_size=1,
    shuffle=True,
    num_workers=4,
    pin_memory=False,
    prefetch_factor=2
    
    )

val_data_loader = DataLoader(
    val_set,
    batch_size=1,
    shuffle=False,
    num_workers=4,
    pin_memory=False,
    prefetch_factor=2,
    )
print("Total samples train:", len(train_data_loader))  
print("Total samples val:", len(val_data_loader))  


Total samples loaded: 96318
Total samples train: 86686
Total samples val: 9632


In [4]:
device = "cuda"
model.to(device)
weight = torch.tensor([0.77, 0.23]).to(device)
criterion = nn.CrossEntropyLoss(weight=weight)

In [17]:
validate(val_data_loader, model, criterion)

100%|██████████| 9632/9632 [06:55<00:00, 23.19it/s, Ones_frac=75.41%, accuracy=77.63%, zeros_frac=24.59%]

ones: 907887, zeros: 296113, total: 1204000, one_frac: 0.7540589570999146 zeors_frac: 0.24594102799892426
total_correct: 934646 total_words: 1204000
accuracy: 0.7762840531561461
loss: 5224.454869732261





0.7762840531561461

Now lets trim the weights and layers

In [5]:
tokenizer=AutoTokenizer.from_pretrained("distilbert-base-uncased")
length_video = 125 
device = "cuda"
model = Speaking_conv3d_layers_trimmed(512, length_video)
weights = torch.load("../trimmed_shifted.state")["state_dict"]
model.load_state_dict(weights)
model.to(device)
model.to(device)
weight = torch.tensor([0.77, 0.23]).to(device)
criterion = nn.CrossEntropyLoss(weight=weight)

In [7]:
base_path = "D:/classes/project/LRS2/extracted_data/mvlrs_v1/"
dataset_name="pretrain"

train_dataset = LipReadingDataset(directory=base_path,
                            transform=None,
                            length_video=length_video,
                            mode="speak",
                            tokenizer=tokenizer,
                            dataset=dataset_name,
                            useOriginalDataStructure=True,
                            h_w = (160,160)
                            )
print("Total samples loaded:", len(train_dataset))  
train_set, val_set = torch.utils.data.random_split(train_dataset, [int(len(train_dataset)*0.9), len(train_dataset) - int(len(train_dataset)*0.9)])

train_data_loader = DataLoader(
    train_set,
    batch_size=1,
    shuffle=True,
    num_workers=4,
    pin_memory=False,
    prefetch_factor=2
    
    )

val_data_loader = DataLoader(
    val_set,
    batch_size=1,
    shuffle=False,
    num_workers=4,
    pin_memory=False,
    prefetch_factor=2,
    )
print("Total samples train:", len(train_data_loader))  
print("Total samples val:", len(val_data_loader))  


Total samples loaded: 96318
Total samples train: 86686
Total samples val: 9632


Average pooling (2) the time and H W domains to (45,4,4) makes the model suffer ataining a ~51.58% accuracy

`self.nums_int[:, :, :, :4, :4] = self.avg(x)[:, :, :45]
self.avg = nn.AvgPool3d((2))
self.nums_int = torch.zeros((1, 512, 45, 5, 5)).to("cuda")`

<!-- self.avg = nn.AvgPool3d(2)
self.nums_int = torch.zeros((1, 512, 45, 5, 5)).to("cuda")

self.linear = nn.Linear(d_model*3*3*length_of_video_in_frames, 2*length_of_video_in_frames) # output size is 1 or zero times the temporal dimension

def forward(self, x):

x = self.encoder(x)
self.nums_int[:, :, :, :4, :4] = self.avg(x)[:, :, :45] -->

In [89]:
validate(val_data_loader, model, criterion)

100%|██████████| 9632/9632 [06:24<00:00, 25.03it/s, Ones_frac=75.52%, accuracy=51.58%, zeros_frac=24.48%]

ones: 909272, zeros: 294728, total: 1204000, one_frac: 0.7552093267440796 zeors_frac: 0.2447907030582428
total_correct: 621072 total_words: 1204000
accuracy: 0.5158405315614618
loss: 19001.281693696976





0.5158405315614618

self.avg(x)[:, :, :, :3, :3]
self.avg = nn.AvgPool3d((1,2,2))
self.nums_int = torch.zeros((1, 512, 125, 3, 3)).to("cuda")

48%

In [93]:
validate(val_data_loader, model, criterion)

100%|██████████| 9632/9632 [06:26<00:00, 24.90it/s, Ones_frac=75.52%, accuracy=48.10%, zeros_frac=24.48%]

ones: 909272, zeros: 294728, total: 1204000, one_frac: 0.7552093267440796 zeors_frac: 0.2447907030582428
total_correct: 579106 total_words: 1204000
accuracy: 0.48098504983388707
loss: 31404.948693454266





0.48098504983388707

nn.AvgPool3d((2,1,1))
self.nums_int =    self.avg(x)[:, :, :45, :5, :5]
self.nums_int = torch.zeros((1, 512, 45, 8, 8)).to("cuda")

57.9%

In [96]:
validate(val_data_loader, model, criterion)

100%|██████████| 9632/9632 [06:27<00:00, 24.86it/s, Ones_frac=75.52%, accuracy=57.91%, zeros_frac=24.48%]

ones: 909272, zeros: 294728, total: 1204000, one_frac: 0.7552093267440796 zeors_frac: 0.2447907030582428
total_correct: 697188 total_words: 1204000
accuracy: 0.5790598006644518
loss: 31364.614983811975





0.5790598006644518

nn.maxPool3d((2,1,1))
self.nums_int =    self.avg(x)[:, :, :45, :5, :5]
self.nums_int = torch.zeros((1, 512, 45, 8, 8)).to("cuda")

same as before

In [98]:
validate(val_data_loader, model, criterion)

100%|██████████| 9632/9632 [06:28<00:00, 24.76it/s, Ones_frac=75.52%, accuracy=57.91%, zeros_frac=24.48%]

ones: 909272, zeros: 294728, total: 1204000, one_frac: 0.7552093267440796 zeors_frac: 0.2447907030582428
total_correct: 697188 total_words: 1204000
accuracy: 0.5790598006644518
loss: 31364.614983811975





0.5790598006644518

`self.nums_int[:, :, :, :4, :4] = self.avg(x)[:, :, :45]
self.avg = nn.maxPool3d((2))
self.nums_int = torch.zeros((1, 512, 45, 5, 5)).to("cuda")`

In [9]:
validate(val_data_loader, model, criterion)

100%|██████████| 9632/9632 [06:29<00:00, 24.73it/s, Ones_frac=75.47%, accuracy=47.94%, zeros_frac=24.53%]

ones: 908697, zeros: 295303, total: 1204000, one_frac: 0.7547317147254944 zeors_frac: 0.24526827037334442
total_correct: 577221 total_words: 1204000
accuracy: 0.4794194352159468
loss: 42005.11504986882





0.4794194352159468