In [1]:
import torchvision
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.models import squeezenet1_1
from torchvision.models.feature_extraction import create_feature_extractor

# Prefer the first CUDA GPU when one is visible, otherwise run on the CPU.
# Kept as a plain string (not a torch.device) because the checkpoint-loading
# cell concatenates it into the model file names.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

In [7]:
import torchvision
from torchvision.models import squeezenet1_1
from torchvision.models.feature_extraction import create_feature_extractor

class EncoderRNN(nn.Module):
    """Video-frame encoder: SqueezeNet 1.1 feature maps fed through an LSTM.

    Every frame is passed through a pretrained SqueezeNet 1.1 feature
    extractor (node ``features.12.cat``), each frame's feature map is
    flattened to one vector, and the resulting sequence of vectors is run
    through a single-layer LSTM as one batch of size 1.

    Args:
        input_size: Length of one flattened per-frame feature vector, i.e.
            channels * height * width of the ``features.12.cat`` output.
        hidden_size: Base hidden width. The LSTM itself is built with
            ``hidden_size * D`` units per direction.
        device: Device (string or ``torch.device``) that holds the feature
            extractor, the LSTM and the tensors created by ``initHidden``.
        biDirectional: When True the LSTM is bidirectional and ``D`` is 2,
            otherwise ``D`` is 1.
    """

    def __init__(self, input_size, hidden_size, device, biDirectional = False):
        super(EncoderRNN, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.device = device
        self.D = 2 if biDirectional else 1

        # Use the `device` argument everywhere. The previous code read
        # `config.device`, but no `config` object exists in this notebook, so
        # building a fresh encoder raised NameError.
        # NOTE(review): recent torchvision releases deprecate `pretrained=`
        # in favour of `weights=`; kept as-is for older installs.
        model = squeezenet1_1(pretrained=True).to(self.device)
        return_nodes = {
            'features.12.cat': 'layer12'
        }
        self.pretrained_model = create_feature_extractor(model, return_nodes=return_nodes).to(self.device)
        # Inference mode for the backbone. Its parameters are NOT frozen
        # (requires_grad is left untouched).
        self.pretrained_model.eval()

        self.rnn = nn.LSTM(
                input_size = self.input_size,
                hidden_size = self.hidden_size*self.D,
                num_layers = 1,
                dropout = 0,
                bidirectional = biDirectional,
                batch_first = True).to(self.device)

    def forward(self, input, hidden):
        """Encode one clip.

        Args:
            input: Frames shaped ``(T, C, H, W)``. Extra leading axes are
                accepted as long as at most one of them is larger than 1,
                e.g. ``(1, T, C, H, W)`` or ``(T, 1, C, H, W)``.
            hidden: ``(h_0, c_0)`` tuple for the LSTM, e.g. from ``initHidden``.

        Returns:
            ``(output, (h_n, c_n))`` exactly as returned by the LSTM; the
            sequence axis of ``output`` has one entry per frame.

        Raises:
            ValueError: If more than one leading axis is larger than 1, i.e.
                the input looks like a batch of several clips.
        """
        leading = input.shape[:-3]
        if sum(1 for size in leading if size != 1) > 1:
            raise ValueError(
                'EncoderRNN expects a single clip of frames, got input of shape '
                + str(tuple(input.shape)))

        # Collapse only the leading axes into the frame axis. A bare
        # `input.squeeze()` also removed the frame axis of a one-frame clip
        # and then crashed when the feature map was indexed as 4-D.
        frames = torch.reshape(input, (-1,) + tuple(input.shape[-3:]))

        features = self.pretrained_model(frames)['layer12'].to(device=self.device)
        feat_shape = features.shape

        # (T, C, H, W) -> (1, T, C*H*W): one batch holding the frame sequence.
        feat_flat = torch.reshape(
            features,
            (1, feat_shape[0], feat_shape[1]*feat_shape[2]*feat_shape[3])).to(device=self.device)

        output, hidden = self.rnn(feat_flat, hidden)
        return output, hidden

    def initHidden(self):
        """Return zero ``(h_0, c_0)`` tensors of shape ``(D, 1, hidden_size * D)``."""
        state_shape = (self.D, 1, self.hidden_size*self.D)
        return (torch.zeros(*state_shape, device=self.device),
                torch.zeros(*state_shape, device=self.device))

In [8]:
class AttnDecoderRNN(nn.Module):
    """Single-step LSTM decoder with learned attention over encoder outputs.

    Args:
        hidden_size: Embedding width and base LSTM width.
        output_size: Vocabulary size (embedding rows and output logits).
        device: Device used for intermediate results and ``initHidden``.
        dropout_p: Dropout probability applied to the token embedding.
        max_length: Number of attention slots. ``forward`` multiplies the
            attention weights with ``encoder_outputs``, so the encoder
            sequence length has to equal this value.
        biDirectional: Builds a bidirectional LSTM with ``D = 2``.
            NOTE(review): ``forward`` hands the LSTM states with a leading
            axis of 1, which only fits the unidirectional setup; confirm
            before enabling.
        debug: Stored on the instance; not read anywhere in this class.
    """

    def __init__(self, hidden_size, output_size, device, dropout_p=0.1, max_length=64, biDirectional = False, debug=False):
        super().__init__()

        # Plain configuration.
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length
        self.debug = debug
        self.device = device
        self.D = 2 if biDirectional else 1

        # Layers. Names and creation order are kept so that saved
        # checkpoints and seeded initialisation stay compatible.
        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        # Attention scores are computed from [embedding ; previous hidden].
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        # Mixes [embedding ; attended context]; the 3x input width means the
        # context vector must be 2 * hidden_size wide.
        self.attn_combine = nn.Linear(self.hidden_size * 3, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.rnn = nn.LSTM(
            input_size=self.hidden_size,
            hidden_size=self.hidden_size * self.D,
            num_layers=1,
            dropout=0,
            bidirectional=biDirectional,
            batch_first=True,
        )
        self.out = nn.Linear(self.hidden_size * self.D, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Run one decoding step.

        Only the first element along the leading axis of ``input`` and of
        ``hidden`` is used, so this effectively handles a batch of one.

        Args:
            input: Integer token ids, 2-D.
            hidden: Previous hidden-state tensor; ``hidden[0]`` is used both
                for the attention query and as the LSTM's initial hidden
                AND cell state.
            encoder_outputs: 3-D encoder output; see ``max_length`` above.

        Returns:
            ``(log_probs, (h_n, c_n), attn_weights)``.
        """
        rows, cols = input.shape[0], input.shape[1]
        token_vecs = self.dropout(self.embedding(input).view(rows, cols, self.hidden_size))
        query_token = token_vecs[0]
        prev_state = hidden[0]

        # Attention distribution over the encoder time steps.
        scores = self.attn(torch.cat((query_token, prev_state), 1))
        attn_weights = F.softmax(scores, dim=1).to(device=self.device)
        context = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs).to(device=self.device)

        # Fuse the token embedding with the attended context.
        fused = torch.cat((query_token, context[0]), 1).to(device=self.device)
        fused = self.attn_combine(fused).unsqueeze(0).to(device=self.device)
        rnn_input = F.relu(fused)

        # The same tensor seeds both the hidden and the cell state.
        seed_state = prev_state.unsqueeze(0)
        output, hidden = self.rnn(rnn_input, (seed_state, seed_state))

        output = F.log_softmax(self.out(output[0]), dim=1).to(device=self.device)
        return output, hidden, attn_weights

    def initHidden(self):
        """Return zero ``(h_0, c_0)`` tensors of shape ``(D, 1, hidden_size * D)``."""
        dims = (self.D, 1, self.hidden_size * self.D)
        h0 = torch.zeros(*dims, device=self.device)
        c0 = torch.zeros(*dims, device=self.device)
        return (h0, c0)

In [4]:
def evaluate(encoder, decoder, frames, max_length = 64):
    """Greedily decode a sentence for one clip of frames.

    Relies on two notebook-level globals being defined before the call:
    ``encodings`` (must contain the ``'SOS'`` and ``'EOS'`` token ids) and
    ``word_idx`` (maps a token id to its word string).

    Args:
        encoder: Object exposing ``initHidden()`` and callable as
            ``encoder(frames, hidden) -> (encoder_output, (h, c))``.
        decoder: Callable as ``decoder(token, h, encoder_output) ->
            (log_probs, (h, c), attention)``.
        frames: Input handed to the encoder unchanged.
        max_length: Maximum number of decoding steps.

    Returns:
        ``(sentence, attentions)``. ``sentence`` is the decoded words, each
        followed by a space, ending with ``'.'`` if EOS was produced.
        ``attentions`` has one row per executed decoding step.
    """
    with torch.no_grad():
        encoder_output, encoder_hidden = encoder(frames, encoder.initHidden())

        # Follow the encoder's device instead of a hidden global `device`.
        run_device = encoder_output.device

        decoder_input = torch.tensor([[encodings['SOS']]], device=run_device)  # Start of sentence
        decoder_hidden = encoder_hidden

        pieces = []
        decoder_attentions = None
        steps_done = 0

        for di in range(max_length):
            # Only the hidden state h is forwarded; the decoder re-derives
            # its cell state from it.
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden[0], encoder_output)

            if decoder_attentions is None:
                # Size the buffer from the decoder's actual attention width
                # rather than assuming it equals `max_length`.
                decoder_attentions = torch.zeros(
                    max_length, decoder_attention.shape[-1], device=run_device)
            decoder_attentions[di] = decoder_attention.detach()
            steps_done = di + 1

            topv, topi = decoder_output.detach().topk(1)
            token_id = topi.item()

            if token_id == encodings['EOS']:
                pieces.append('.')
                break
            pieces.append(word_idx[token_id] + ' ')

            decoder_input = topi.detach()

        if decoder_attentions is None:
            # No step ran (max_length <= 0): return an empty attention
            # matrix instead of failing on the unbound loop variable.
            decoder_attentions = torch.zeros(0, max(max_length, 0), device=run_device)

        return ''.join(pieces), decoder_attentions[:steps_done]

In [5]:
from google.colab import drive
# Mount Google Drive inside the Colab VM.
mount_point = '/content/drive'
drive.mount(mount_point)

# Build the data path from the mount point so it is absolute. The previous
# relative 'drive/MyDrive/...' path only resolved while the working directory
# happened to be /content.
data_folder = mount_point + '/MyDrive/SLR/Data/'

Mounted at /content/drive


In [10]:

#data_folder = '/home/sign-lang/jamal/Dataset'

# Vocabulary lookups: `encodings` holds the special-token ids ('SOS', 'EOS')
# and `word_idx` maps a token id back to its word; both are read by evaluate().
encodings = torch.load(data_folder+'/jamal/encodings.dict')
word_idx = torch.load(data_folder+'/jamal/word_idx.dict')

# The checkpoints are whole pickled modules, one file per device string.
# NOTE(review): torch.load unpickles arbitrary objects, so only load trusted
# files; the EncoderRNN / AttnDecoderRNN classes must be defined beforehand.
checkpoint_device = torch.device(device)
encoder = torch.load(data_folder+'/jamal/encoder_' + device + '.model', map_location=checkpoint_device).eval()
decoder = torch.load(data_folder+'/jamal/decoder_' + device + '.model', map_location=checkpoint_device).eval()

# Placeholder clip: 64 all-zero RGB frames of 224x224 pixels.
# NOTE(review): the earlier comment spoke of "25 frames"; presumably real
# clips are padded up to 64 frames; confirm against the data pipeline.
video_frames = torch.zeros((64,3,224,224), device=device)

output_words, attentions = evaluate(encoder, decoder, video_frames)
print(output_words)

mən bu .
