In [2]:
import importlib
import numpy as np
import torch
from collections import OrderedDict
import utils

In [2]:
def import_class(name):
    """Import and return the object addressed by a dotted path.

    Args:
        name: Fully qualified name of the form ``"package.module.Attr"``. The
            text after the last dot is looked up as an attribute of the module
            named by the text before it.

    Returns:
        The attribute found on the imported module (e.g. a class).

    Raises:
        ValueError: If ``name`` contains no dot, i.e. no module part.
        ImportError: If the module part cannot be imported.
        AttributeError: If the module has no attribute with that name.
    """
    components = name.rsplit('.', 1)
    if len(components) != 2:
        # Without this check the failure would be an IndexError raised only
        # after the module had already been imported.
        raise ValueError(
            "expected a dotted path like 'module.ClassName', got {!r}".format(name)
        )
    print(components)
    module = importlib.import_module(components[0])
    target = getattr(module, components[1])
    print(target)
    return target

In [3]:
# Constructor arguments and loss weights passed to the SLR model below.
model_args = dict(num_classes=1296, c2d_type="resnet18", conv_type=2, use_bn=1)
loss_weights = dict(ConvCTC=1.0, SeqCTC=1.0, Dist=10.0)

# The file is loaded with pickle enabled and .item() extracts the stored
# Python object from the loaded array. Only load trusted files this way.
gloss_dict = np.load(
    '/home/aayush/Thesis/VAC_CSLR/preprocess/phoenix2014/gloss_dict.npy',
    allow_pickle=True,
).item()

# Resolve the model class by its dotted name, then instantiate it.
model_class = import_class("slr_network.SLRModel")
model = model_class(
    **model_args,
    gloss_dict=gloss_dict,
    loss_weights=loss_weights,
)

['slr_network', 'SLRModel']
<class 'slr_network.SLRModel'>


In [4]:
def modified_weights(state_dict, modified=False):
    """Return a copy of ``state_dict`` whose keys have ``'.module'`` removed.

    Args:
        state_dict: Mapping from parameter names to values.
        modified: When true, an empty ``dict`` is returned instead of the
            renamed weights (this branch is an unimplemented placeholder).

    Returns:
        An ``OrderedDict`` with the renamed keys in their original order, or an
        empty ``dict`` when ``modified`` is true.
    """
    renamed = OrderedDict()
    for key, value in state_dict.items():
        renamed[key.replace('.module', '')] = value
    # The renaming above always runs, even when its result is discarded here.
    return dict() if modified else renamed

In [5]:
# Read the checkpoint onto the CPU so that no GPU is needed to load it.
state_dict = torch.load(
    'resnet18_slr_pretrained_distill25.pt',
    map_location=torch.device('cpu'),
)
# Strip '.module' from the parameter names before loading them.
weights = modified_weights(state_dict['model_state_dict'], modified=False)
# strict=True: every key must match; the result is echoed as the cell output.
model.load_state_dict(weights, strict=True)

<All keys matched successfully>

In [6]:
# Switch the model to evaluation mode; the returned module is echoed as the cell output.
model.eval()

SLRModel(
  (conv2d): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_run

# Input format modification

In [7]:
import torch.utils.data as data
from utils import video_augmentation
import os
import glob
import cv2
import time
import re

In [22]:
def resize_img(img_path, dsize='256x256px'):
    """Load an image from disk and resize it with Lanczos interpolation.

    Args:
        img_path: Path of the image file to read.
        dsize: Target size as text containing two integers, e.g.
            ``'256x256px'``. The integers are passed to ``cv2.resize`` in the
            order they appear, which OpenCV interprets as ``(width, height)``.

    Returns:
        The resized image array returned by ``cv2.resize``.

    Raises:
        ValueError: If ``dsize`` does not contain exactly two integers.
        FileNotFoundError: If OpenCV could not read ``img_path``.
    """
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    target_size = tuple(int(res) for res in re.findall(r"\d+", dsize))
    if len(target_size) != 2:
        raise ValueError(
            "dsize must contain exactly two integers, got {!r}".format(dsize)
        )
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread signals a missing or unreadable file by returning None.
        raise FileNotFoundError("could not read image: {}".format(img_path))
    return cv2.resize(img, target_size, interpolation=cv2.INTER_LANCZOS4)

In [70]:
# Folder (one video) whose frames are used for the prediction further down.
frames_folder_name = '02October_2010_Saturday_tagesschau-1300'
# Glob pattern matching every PNG frame inside that folder.
frames_folder_path = './dataset/dataset/' + frames_folder_name + '/*.png'
img_list = glob.glob(frames_folder_path)
# NOTE: every frame is resized (resize_img default: '256x256px') and written
# back to the same path, so the files on disk are overwritten in place.
for img_path in img_list:
    rs_img = resize_img(img_path)
    cv2.imwrite(img_path, rs_img)

In [71]:
class SampleBaseFeeder(data.Dataset):
    """Dataset that serves exactly one video, read from a set of image frames.

    ``__len__`` is fixed to 1 and ``read_video`` ignores its index: every item
    is built from all files matched by the ``prefix`` glob pattern. No labels
    are read, so each sample carries an empty label.
    """

    def __init__(self, prefix, gloss_dict):
        """Store the inputs and build the transform.

        Args:
            prefix: Glob pattern of the frame images,
                e.g. ``./01April_2010_Thursday_heute_default-5/1/*.png``.
            gloss_dict: Gloss dictionary. It is stored on ``self.dict`` but not
                read by any method of this class.
        """
        self.prefix = prefix
        self.dict = gloss_dict
        self.data_aug = self.transform()

    def __getitem__(self, idx):
        """Return ``(video, label, info)`` for the single video.

        ``video`` is the transformed and normalised clip, ``label`` a
        ``torch.LongTensor`` built from the label list, and ``info`` an empty
        string.
        """
        input_data, label, _ = self.read_video(idx)
        input_data, label = self.normalize(input_data, label)
        # Info slot left empty (was: self.inputs_list[idx]['original_info']).
        return input_data, torch.LongTensor(label), ''

    def read_video(self, index, num_glosses=-1):
        """Read all frames matched by ``self.prefix`` as RGB images.

        Args:
            index: Unused; there is only one video.
            num_glosses: Unused.

        Returns:
            ``(frames, label_list, info)`` where ``frames`` is the list of RGB
            images in sorted path order, ``label_list`` is an empty list and
            ``info`` is an empty dict.

        Raises:
            FileNotFoundError: If the pattern matches no file or a matched file
                cannot be read by OpenCV.
        """
        # os.fspath mirrors the original single-argument os.path.join: it turns
        # a path-like prefix into a plain string for glob.
        img_list = sorted(glob.glob(os.fspath(self.prefix)))
        if not img_list:
            raise FileNotFoundError(
                "no frames match pattern: {}".format(self.prefix)
            )
        frames = []
        for img_path in img_list:
            frame = cv2.imread(img_path)
            if frame is None:
                # cv2.imread returns None instead of raising on failure.
                raise FileNotFoundError("could not read frame: {}".format(img_path))
            # OpenCV loads images as BGR; convert them to RGB.
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        label_list = []
        return frames, label_list, {}

    def normalize(self, video, label, file_id=None):
        """Apply the transform, then rescale the video values.

        The transformed video is cast to float, divided by 127.5 and shifted by
        -1, which maps the range 0..255 onto -1..1.

        Returns:
            ``(video, label)`` after transformation.
        """
        video, label = self.data_aug(video, label, file_id)
        video = video.float() / 127.5 - 1
        return video, label

    def transform(self):
        """Build the test-time pipeline: centre crop to 224, then to tensor."""
        print("Apply testing transform.")
        return video_augmentation.Compose([
            video_augmentation.CenterCrop(224),
            video_augmentation.ToTensor(),
        ])

    @staticmethod
    def collate_fn(batch):
        """Pad a list of ``(video, label, info)`` samples into one batch.

        Samples are sorted by video length, longest first.

        * Videos with more than three dimensions (frame tensors) get 6 copies
          of their first frame prepended and copies of their last frame
          appended until every clip has the length of the longest clip rounded
          up to a multiple of 4, plus 12. The reported length of each clip is
          its own length rounded up to a multiple of 4, plus 12.
        * Other inputs are padded at the end with their last row up to the
          longest length, stacked, and permuted with ``(0, 2, 1)``.

        Returns:
            ``(padded_video, video_length, padded_label, label_length, info)``.
            Labels are concatenated into one flat ``LongTensor``. When every
            label is empty, ``padded_label`` and ``label_length`` are both
            empty lists.
        """
        batch = sorted(batch, key=lambda x: len(x[0]), reverse=True)
        video, label, info = list(zip(*batch))
        if len(video[0].shape) > 3:
            max_len = len(video[0])
            # Convert to int before building the tensor: handing numpy floats to
            # LongTensor relies on a deprecated implicit float -> int conversion.
            video_length = torch.LongTensor(
                [int(np.ceil(len(vid) / 4.0)) * 4 + 12 for vid in video]
            )
            left_pad = 6
            right_pad = int(np.ceil(max_len / 4.0)) * 4 - max_len + 6
            max_len = max_len + left_pad + right_pad
            padded_video = [torch.cat(
                (
                    vid[0][None].expand(left_pad, -1, -1, -1),
                    vid,
                    vid[-1][None].expand(max_len - len(vid) - left_pad, -1, -1, -1),
                ),
                dim=0)
                for vid in video]
            padded_video = torch.stack(padded_video)
        else:
            max_len = len(video[0])
            video_length = torch.LongTensor([len(vid) for vid in video])
            padded_video = [torch.cat(
                (
                    vid,
                    vid[-1][None].expand(max_len - len(vid), -1),
                ),
                dim=0)
                for vid in video]
            padded_video = torch.stack(padded_video).permute(0, 2, 1)
        label_length = torch.LongTensor([len(lab) for lab in label])
        if max(label_length) == 0:
            return padded_video, video_length, [], [], info
        padded_label = []
        for lab in label:
            padded_label.extend(lab)
        padded_label = torch.LongTensor(padded_label)
        return padded_video, video_length, padded_label, label_length, info

    def __len__(self):
        """Return 1: prediction handles a single folder/video."""
        return 1

In [72]:
# Number of entries in the gloss dictionary.
len(gloss_dict)

1295

In [73]:
# Dataset over the frames matched by frames_folder_path; building it also
# creates the test-time transform, which prints a message.
pred_dataset = SampleBaseFeeder(frames_folder_path, gloss_dict)

Apply testing transform.


In [74]:
# Echo the dataset object to confirm that it was created.
pred_dataset

<__main__.SampleBaseFeeder at 0x7fd8facef2e0>

In [75]:
# Loader over the single-video dataset; batches are assembled (and padded) by
# the dataset's own collate_fn.
data_loader = torch.utils.data.DataLoader(
    dataset=pred_dataset,
    batch_size=1,
    shuffle=False,
    drop_last=False,
    num_workers=4,
    collate_fn=pred_dataset.collate_fn,
)

In [76]:
# Echo the loader object to confirm that it was created.
data_loader

<torch.utils.data.dataloader.DataLoader at 0x7fd8f9fd5070>

In [77]:
start = time.time()
loader = data_loader

# The device helper does not depend on the batch, so it is built once rather
# than on every iteration.
device = utils.GpuDataParallel()

for batch_idx, l_data in enumerate(loader):
    print(batch_idx)
    vid = device.data_to_device(l_data[0])
    vid_lgt = device.data_to_device(l_data[1])
    label = device.data_to_device(l_data[2])
    label_lgt = device.data_to_device(l_data[3])

    # Run the forward pass for every batch (previously it ran once after the
    # loop, i.e. only on the last batch). Gradients are not needed for
    # prediction. ret_dict keeps the result of the most recent batch; the
    # dataset yields exactly one.
    with torch.no_grad():
        ret_dict = model(vid, vid_lgt, label=label, label_lgt=label_lgt)

end = time.time()
print("Time taken to predict: ", round(end - start))

  video_length = torch.LongTensor([np.ceil(len(vid) / 4.0) * 4 + 12 for vid in video])


0
In slr_network.py forward:
torch.Size([1, 76, 3, 224, 224])
tensor([76])
<class 'torch.Tensor'>
Time taken to predict:  3


In [78]:
# Inspect the raw dictionary returned by the model.
ret_dict

{'framewise_features': tensor([[[1.3638, 1.3638, 1.3638,  ..., 0.0042, 0.0042, 0.0042],
          [1.8291, 1.8291, 1.8291,  ..., 0.7396, 0.7396, 0.7396],
          [0.6406, 0.6406, 0.6406,  ..., 0.0068, 0.0068, 0.0068],
          ...,
          [0.6928, 0.6928, 0.6928,  ..., 1.3487, 1.3487, 1.3487],
          [1.5674, 1.5674, 1.5674,  ..., 0.5900, 0.5900, 0.5900],
          [1.4145, 1.4145, 1.4145,  ..., 0.8316, 0.8316, 0.8316]]]),
 'visual_features': tensor([[[0.0000, 0.1916, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
 
         [[0.0000, 0.3052, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
 
         [[0.3343, 0.0000, 0.2555,  ..., 0.0000, 0.0000, 1.0071]],
 
         ...,
 
         [[1.2935, 3.4471, 0.0000,  ..., 0.0000, 0.8093, 0.0000]],
 
         [[0.9834, 3.4825, 0.0000,  ..., 0.0000, 0.6124, 0.0000]],
 
         [[0.4559, 2.6617, 0.5808,  ..., 0.0000, 0.0969, 0.0000]]]),
 'feat_len': tensor([16]),
 'conv_logits': tensor([[[ -9.6937, -25.9843, -25.8327,  ..., -23.8951, -23.4979, -24.33

In [79]:
# Build the recognised sentence from the first entry of 'recognized_sents'.
# Each item is unpacked as a pair; only its first element (the word) is used.
# The second element is bound to `_` so that the builtin `id` is not shadowed
# for the rest of the notebook. Every word is followed by a single space.
sentence = ''.join(word + " " for word, _ in ret_dict['recognized_sents'][0])
print(sentence)

__ON__ MILD WEHEN ICH 


In [None]:
# Remove the global name `model` (presumably to release the network's memory
# before the webcam section - confirm).
del model

In [3]:
# Show what the info record of one folder looks like: print the test-set entry
# whose 'fileid' equals frames_folder_name.
# NOTE: frames_folder_name is defined in the frame-resizing cell above; run
# that cell first, otherwise this one raises NameError.
# allow_pickle=True unpickles the file content - only load trusted files.
inputs_list = np.load("./preprocess/phoenix2014/test_info.npy", allow_pickle=True).item()
for key, info in inputs_list.items():
    # The 'prefix' entry holds a path string, not a per-video record.
    if key == 'prefix':
        continue
    if info['fileid'] == frames_folder_name:
        print(info)


NameError: name 'frames_folder_name' is not defined

In [4]:
# Display the whole loaded info dictionary.
inputs_list

{'prefix': 'VAC_CSLR/dataset/phoenix2014-release/phoenix-2014-multisigner/features/fullFrame-210x260px',
 0: {'fileid': '01April_2010_Thursday_heute_default-5',
  'folder': 'test/01April_2010_Thursday_heute_default-5/1/*.png',
  'signer': 'Signer04',
  'label': 'ABER FREUEN  MORGEN SONNE SELTEN REGEN',
  'num_frames': 132,
  'original_info': '01April_2010_Thursday_heute_default-5|01April_2010_Thursday_heute_default-5/1/*.png|Signer04|ABER FREUEN  MORGEN SONNE SELTEN REGEN'},
 1: {'fileid': '01April_2010_Thursday_tagesschau_default-7',
  'folder': 'test/01April_2010_Thursday_tagesschau_default-7/1/*.png',
  'signer': 'Signer04',
  'label': 'SAMSTAG WECHSELHAFT BESONDERS FREUNDLICH NORDOST BISSCHEN BEREICH',
  'num_frames': 144,
  'original_info': '01April_2010_Thursday_tagesschau_default-7|01April_2010_Thursday_tagesschau_default-7/1/*.png|Signer04|SAMSTAG WECHSELHAFT BESONDERS FREUNDLICH NORDOST BISSCHEN BEREICH'},
 2: {'fileid': '01April_2010_Thursday_tagesschau_default-8',
  'folder'

# For prediction

In [None]:
!pip install tensorflow==2.4.1 tensorflow-gpu==2.4.1 opencv-python mediapipe sklearn matplotlib

In [None]:
import os
from matplotlib import pyplot as plt
import mediapipe as mp
import cv2

In [None]:
# Short aliases for MediaPipe's holistic solution module and its drawing utilities.
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

In [None]:
def mediapipe_detection(image, model):
    """Run ``model.process`` on a BGR frame and return the frame with the results.

    Args:
        image: Frame in BGR channel order (it is converted with
            ``cv2.COLOR_BGR2RGB`` before processing).
        model: Object exposing ``process(rgb_image)``.

    Returns:
        ``(image, results)``: the frame converted back to BGR and whatever
        ``model.process`` returned.
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # The array is marked read-only only for the duration of the model call.
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True
    return cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR), results

In [None]:
def draw_styled_landmarks(image, results):
    """Draw face, pose and hand landmarks from ``results`` onto ``image``.

    Args:
        image: Image handed to ``mp_drawing.draw_landmarks`` for each landmark
            group; nothing is returned, the drawing is done on this array.
        results: Object exposing ``face_landmarks``, ``pose_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks``.
    """
    # One row per landmark group:
    # (landmarks, connections, first spec, second spec), each spec being
    # (colour, thickness, circle_radius). The two specs are passed in the same
    # positions as before (landmark style, then connection style).
    layers = (
        (results.face_landmarks, mp_holistic.FACEMESH_CONTOURS,
         ((80, 110, 10), 1, 1), ((80, 255, 121), 1, 1)),  # was 256: above the 8-bit maximum
        (results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
         ((80, 22, 10), 2, 4), ((80, 44, 121), 2, 2)),
        (results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
         ((121, 22, 76), 2, 4), ((121, 44, 250), 2, 2)),
        (results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
         ((245, 117, 66), 2, 4), ((245, 66, 230), 2, 2)),
    )
    for landmarks, connections, first_style, second_style in layers:
        first_spec, second_spec = (
            mp_drawing.DrawingSpec(color=color, thickness=thickness, circle_radius=radius)
            for color, thickness, radius in (first_style, second_style)
        )
        mp_drawing.draw_landmarks(image, landmarks, connections, first_spec, second_spec)

In [None]:
def extract_keypoints(results):
    """Flatten the landmarks in ``results`` into 1-D arrays.

    For each of pose, face, left hand and right hand, the coordinates of all
    landmarks are flattened into one vector (``x, y, z`` per landmark, plus
    ``visibility`` for pose). A missing group is replaced by zeros of length
    ``33*4`` (pose), ``468*3`` (face) or ``21*3`` (each hand).

    Returns:
        ``(all_keypoints, left_hand, right_hand)`` where ``all_keypoints`` is
        the concatenation pose + face + left hand + right hand.
    """
    def _flatten(group, fields, n_points):
        # Zero vector when the group is absent, flattened coordinates otherwise.
        if not group:
            return np.zeros(n_points * len(fields))
        rows = [[getattr(point, field) for field in fields] for point in group.landmark]
        return np.array(rows).flatten()

    xyz = ('x', 'y', 'z')
    pose = _flatten(results.pose_landmarks, xyz + ('visibility',), 33)
    face = _flatten(results.face_landmarks, xyz, 468)
    lh = _flatten(results.left_hand_landmarks, xyz, 21)
    rh = _flatten(results.right_hand_landmarks, xyz, 21)
    return np.concatenate([pose, face, lh, rh]), lh, rh

In [None]:
sequence = []     # keypoint vectors of the most recent frames (at most 30)
sentence = []     # words overlaid on the preview window
threshold = 0.6   # confidence threshold for the prediction step (not wired in yet)
count = 0         # running index of the saved frames

# cv2.imwrite does not create the target folder, so make sure it exists.
os.makedirs("frame_collection", exist_ok=True)

cap = cv2.VideoCapture(0)
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

try:
    # Set mediapipe model
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():

            # Read feed; stop when no frame could be grabbed (frame is None then).
            ret, frame = cap.read()
            if not ret:
                break

            # Make detections
            image, results = mediapipe_detection(frame, holistic)

            # Draw landmarks
            draw_styled_landmarks(image, results)

            # Keep a sliding window of the last 30 keypoint vectors.
            keypoints, lh_keypoints, rh_keypoints = extract_keypoints(results)
            sequence.append(keypoints)
            sequence = sequence[-30:]

            # extract_keypoints returns all-zero hand arrays when no hand was detected.
            hands_visible = not (np.sum(lh_keypoints) == 0 and np.sum(rh_keypoints) == 0)
            if len(sequence) == 30 and hands_visible:
                # Save the raw frame (without the drawn landmarks).
                cv2.imwrite("frame_collection/frame%d.jpg" % count, frame)
                count += 1

                # TODO: run the recognition model here and append its prediction
                # to `sentence` when its confidence exceeds `threshold`. The
                # former visualisation code read `res` and `actions`, which are
                # never defined in this notebook, and raised NameError as soon
                # as this branch was reached.

                if len(sentence) > 5:
                    sentence = sentence[-3:]

            cv2.putText(image, ' '.join(sentence), (3, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

            # Show to screen
            cv2.imshow('Live SLR Detection', image)

            # Break gracefully
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the window, even if the loop raised.
    cap.release()
    cv2.destroyAllWindows()

In [None]:
# Manual cleanup: release the camera and close any OpenCV windows, e.g. after
# the capture cell above was interrupted.
cap.release()
cv2.destroyAllWindows()