# Load model

In [1]:
import importlib
import numpy as np
import torch
from collections import OrderedDict
import utils

In [2]:
def import_class(name):
    components = name.rsplit('.', 1)
    print(components)
    mod = importlib.import_module(components[0])
    mod = getattr(mod, components[1])
    print(mod)
    return mod

In [3]:
model_args = {"num_classes": 1296, "c2d_type": "resnet18", "conv_type": 2, "use_bn": 1}
loss_weights = {"ConvCTC": 1.0, "SeqCTC": 1.0, "Dist": 10.0}
gloss_dict = np.load('/home/aayush/Thesis/VAC_CSLR/preprocess/phoenix2014/gloss_dict.npy', allow_pickle=True)\
                .item()


model_class = import_class("slr_network.SLRModel")
model = model_class(**model_args,
                    gloss_dict=gloss_dict,
                    loss_weights=loss_weights)

['slr_network', 'SLRModel']
<class 'slr_network.SLRModel'>


In [4]:
def modified_weights(state_dict, modified=False):
    state_dict = OrderedDict([(k.replace('.module', ''), v) for k, v in state_dict.items()])
    if not modified:
        return state_dict
    modified_dict = dict()
    return modified_dict

In [5]:
state_dict = torch.load('resnet18_slr_pretrained_distill25.pt', map_location=torch.device('cpu'))
weights = modified_weights(state_dict['model_state_dict'], False)
model.load_state_dict(weights, strict=True)

<All keys matched successfully>

In [6]:
model.eval()

SLRModel(
  (conv2d): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_run

# Input format modification

In [7]:
import torch.utils.data as data
from utils import video_augmentation
import os
import glob
import cv2
import time
import re

In [8]:
def resize_img(img_path, dsize='256x256px'):
    dsize = tuple(int(res) for res in re.findall("\d+", dsize))
    img = cv2.imread(img_path)
    img = cv2.resize(img, dsize, interpolation=cv2.INTER_LANCZOS4)
    return img


def resize_img_folder(frames_folder_path):
    img_list = glob.glob(frames_folder_path)
    for img_path in img_list:
        rs_img = resize_img(img_path)
        cv2.imwrite(img_path, rs_img)

In [19]:
frames_folder_name = '07February_2011_Monday_heute-4659'
frames_folder_path = './dataset/dataset/' + frames_folder_name + '/*.png'
#frames_folder_path = './frames_collection/recording_0/*.png'
resize_img_folder(frames_folder_path)

In [9]:
class SampleBaseFeeder(data.Dataset):
    
    def __init__(self, prefix, gloss_dict):
        self.prefix = prefix # image frames path : ./01April_2010_Thursday_heute_default-5/1/*.png
        self.dict = gloss_dict
        self.data_aug = self.transform()
        
    def __getitem__(self, idx):
        
        input_data, label, fi = self.read_video(idx)
        input_data, label = self.normalize(input_data, label)
        
        return input_data, torch.LongTensor(label), ''#self.inputs_list[idx]['original_info']

    
    def read_video(self, index, num_glosses=-1):
        # load file info        
        img_folder = os.path.join(self.prefix)
        img_list = sorted(glob.glob(img_folder))
        label_list = []
                
        return [cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB) for img_path in img_list],\
            label_list, {}

    
    def normalize(self, video, label, file_id=None):
        video, label = self.data_aug(video, label, file_id)
        video = video.float() / 127.5 - 1
        return video, label

    
    def transform(self):
        print("Apply testing transform.")
        return video_augmentation.Compose([
            video_augmentation.CenterCrop(224),
            video_augmentation.ToTensor(),
        ]) 
    
    
    @staticmethod
    def collate_fn(batch):
        batch = [item for item in sorted(batch, key=lambda x: len(x[0]), reverse=True)]
        video, label, info = list(zip(*batch))
        if len(video[0].shape) > 3:
            max_len = len(video[0])
            video_length = torch.LongTensor([np.ceil(len(vid) / 4.0) * 4 + 12 for vid in video])
            left_pad = 6
            right_pad = int(np.ceil(max_len / 4.0)) * 4 - max_len + 6
            max_len = max_len + left_pad + right_pad
            padded_video = [torch.cat(
                (
                    vid[0][None].expand(left_pad, -1, -1, -1),
                    vid,
                    vid[-1][None].expand(max_len - len(vid) - left_pad, -1, -1, -1),
                )
                , dim=0)
                for vid in video]
            padded_video = torch.stack(padded_video)
        else:
            max_len = len(video[0])
            video_length = torch.LongTensor([len(vid) for vid in video])
            padded_video = [torch.cat(
                (
                    vid,
                    vid[-1][None].expand(max_len - len(vid), -1),
                )
                , dim=0)
                for vid in video]
            padded_video = torch.stack(padded_video).permute(0, 2, 1)
        label_length = torch.LongTensor([len(lab) for lab in label])
        if max(label_length) == 0:
            return padded_video, video_length, [], [], info
        else:
            padded_label = []
            for lab in label:
                padded_label.extend(lab)
            padded_label = torch.LongTensor(padded_label)
            return padded_video, video_length, padded_label, label_length, info
        
    def __len__(self):
        return 1 # as for prediction we just have one folder/video

In [10]:
gloss_dict

{'A': [1, 13],
 'AACHEN': [2, 3],
 'AB': [3, 79],
 'AB-JETZT': [4, 1],
 'AB-PLUSPLUS': [5, 1],
 'AB-SO': [6, 1],
 'ABEND': [7, 500],
 'ABER': [8, 439],
 'ABFALLEN': [9, 1],
 'ABKUEHLEN': [10, 1],
 'ABSCHIED': [11, 1],
 'ABSCHNITT': [12, 3],
 'ABSINKEN': [13, 2],
 'ABWECHSELN': [14, 8],
 'ACH': [15, 16],
 'ACHT': [16, 163],
 'ACHTE': [17, 16],
 'ACHTHUNDERT': [18, 5],
 'ACHTUNG': [19, 14],
 'ACHTZEHN': [20, 97],
 'ACHTZIG': [21, 3],
 'AEHNLCH': [22, 1],
 'AEHNLICH': [23, 22],
 'AENDERN': [24, 1],
 'AFRIKA': [25, 1],
 'AKTIV': [26, 2],
 'AKTUELL': [27, 3],
 'ALLE': [28, 12],
 'ALLGAEU': [29, 32],
 'ALLGEMEIN': [30, 2],
 'ALPEN': [31, 207],
 'ALPENRAND': [32, 1],
 'ALPENTAL': [33, 1],
 'ALS': [34, 15],
 'ALSO': [35, 3],
 'ALT': [36, 2],
 'AM': [37, 8],
 'AM-KUESTE': [38, 2],
 'AM-MEER': [39, 1],
 'AM-RAND': [40, 1],
 'AM-TAG': [41, 12],
 'AMERIKA': [42, 1],
 'AN': [43, 3],
 'ANDERE': [44, 22],
 'ANDERE-MOEGLICHKEIT': [45, 1],
 'ANDERS': [46, 5],
 'ANFANG': [47, 149],
 'ANGEMESSEN': [48, 3

In [11]:
def get_data_loader(frames_folder_path, gloss_dict):
    
    pred_dataset = SampleBaseFeeder(frames_folder_path, gloss_dict)
    data_loader = torch.utils.data.DataLoader(
                pred_dataset,
                batch_size=8,
                shuffle=False,
                drop_last=False,
                num_workers=4, 
                collate_fn=pred_dataset.collate_fn,
            )
    
    return data_loader

In [20]:
def get_prediction_from_frames(frames_folder_path, gloss_dict):
    start = time.time()
    loader = get_data_loader(frames_folder_path, gloss_dict)
    for batch_idx, l_data in enumerate(loader):
        print(l_data[0].shape)
        device = utils.GpuDataParallel()
        vid = device.data_to_device(l_data[0])
        vid_lgt = device.data_to_device(l_data[1])
        label = device.data_to_device(l_data[2])
        label_lgt = device.data_to_device(l_data[3])

    with torch.no_grad():
        ret_dict = model(vid, vid_lgt, label=label, label_lgt=label_lgt)

    end = time.time()
    print("Time taken to predict (total): ", round(end - start))
    return ret_dict

In [None]:
temploader = get_data_loader(frames_folder_path, gloss_dict)
len(temploader)

In [None]:
ret_dict = await get_prediction_from_frames(frames_folder_path, gloss_dict)

In [None]:
sentence = ''
for word, id in ret_dict['recognized_sents'][0]:
    sentence += word + " "
print(sentence)

In [None]:
ret_dict

In [None]:
del model

In [None]:
# to know a folder info looks like.

inputs_list = np.load(f"./preprocess/phoenix2014/test_info.npy", allow_pickle=True).item()
for x in inputs_list:
    if x == 'prefix':
        continue
    if inputs_list[x]['fileid'] == frames_folder_name:
        print(inputs_list[x])


In [None]:
inputs_list

# For live prediction

In [None]:
#!pip install tensorflow==2.4.1 tensorflow-gpu==2.4.1 opencv-python mediapipe sklearn matplotlib

In [13]:
from matplotlib import pyplot as plt
import mediapipe as mp

In [14]:
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands

In [15]:
def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image.flags.writeable = False
    results = model.process(image)
    image.flags.writeable = True
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    return image, results

In [16]:
def draw_styled_landmarks(image, results):
    #mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_CONTOURS,
                              #mp_drawing.DrawingSpec(color=(80,110,10), thickness=1, circle_radius=1), 
                             #mp_drawing.DrawingSpec(color=(80,256,121), thickness=1, circle_radius=1)
                             #)
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
                              mp_drawing.DrawingSpec(color=(80,22,10), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(80,44,121), thickness=2, circle_radius=2)
                             )
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                              mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2)
                             )
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                              mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2)
                             )

In [17]:
def extract_keypoints(results):
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
    face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    return np.concatenate([pose, face, lh, rh]), lh, rh

In [23]:
lhsequence = rhsequence = []
sentence = ''
vcount = 0
fcount = 0
recordingDone = False
frames_base_path = 'frames_collection'
frames_path = ''

cap = cv2.VideoCapture('output1.mp4')
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 852)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)
#cap.set(cv2.CAP_PROP_FPS, 25)

with mp_holistic.Holistic(min_detection_confidence=0.1,
                          min_tracking_confidence=0.1,
                          model_complexity=0) as holistic:
#with mp_hands.Hands(max_num_hands=2, min_detection_confidence=0.1, min_detection_confidence=0.1, min_tracking_confidence=0.1) as hands
    while cap.isOpened():

        # READ CAMERA FEED
        ret, frame = cap.read()

        if frame is not None:
            image, results = mediapipe_detection(frame, holistic)


            # DRAW LANDMARKS
            draw_styled_landmarks(image, results)

            # EXTRACT HANDS KEYPOINTS
            keypoints, lh_keypoints, rh_keypoints = extract_keypoints(results)        
            lhsequence.append(lh_keypoints)
            rhsequence.append(rh_keypoints)
            

        if frame is None and not recordingDone: 
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
                
        elif not (np.sum(lhsequence) == 0 and np.sum(rhsequence) == 0) and frame is not None:
        #elif results.multi_hand_landmarks and frame is not None:
            frames_path = os.path.join(frames_base_path, 'recording_' + str(vcount))
            img_path = frames_path + '/frame' + str(fcount) + '.png'
            if not os.path.exists(frames_path):
                os.makedirs(frames_path)
                cv2.imwrite(img_path, frame)
            else:
                cv2.imwrite(img_path, frame) 

            fcount += 1
            recordingDone = True
            
            if len(lhsequence) > 15:
                lhsequence = []
            if len(rhsequence) > 15:
                rhsequence = []

        elif recordingDone:
            # MAKE PREDICTIONS
            resize_img_folder(frames_path+'/*.png')
            prediction_dict = get_prediction_from_frames(frames_path+'/*.png', gloss_dict)
            sentence = ''
            for word, id in prediction_dict['recognized_sents'][0]:
                sentence += word + " "
            print(sentence)
            vcount += 1
            fcount = 0
            recordingDone = False


        cv2.rectangle(image, (0,0), (1280, 30), (0, 0, 0), -1)
        cv2.putText(image, str(re.sub('__[A-Z]*__', '', str(sentence))), (3,20), 
                       cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1, cv2.LINE_AA)

        
        cv2.imshow('Live SLR Detection', image)

        # Break gracefully
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break
                
    cap.release()
    cv2.destroyAllWindows()

Apply testing transform.


  video_length = torch.LongTensor([np.ceil(len(vid) / 4.0) * 4 + 12 for vid in video])


torch.Size([1, 36, 3, 224, 224])
In slr_network.py forward:
torch.Size([1, 36, 3, 224, 224])
tensor([36])
<class 'torch.Tensor'>
Time taken to predict (total):  2
WECHSELHAFT 


In [None]:
cap.release()
cv2.destroyAllWindows()

In [None]:
'SONNE poss-SEIN DANN KOMMEN REGEN MORGEN MONTAG IX AUCH MEHR WOLKE WIE HEUTE FREUNDLICH VIEL SONNE '