In [30]:

# Imports

import torch
from torchvision import datasets, models, transforms # All torchvision modules
import torch.nn as nn  # All neural network modules, nn.Linear, nn.Conv2d, Loss functions,..
import torch.optim as optim  # For all Optimization algorithms, SGD, Adam,...
import torch.nn.functional as F  # All functions that don't have any parameters
from torch.utils.data import (DataLoader,Dataset)  # Gives easier dataset managment and creates mini batches
import torchvision.datasets as datasets  # Has standard datasets we can import in a nice way
import torchvision.transforms as transforms  # Transformations we can perform on our dataset
import torchtext # Makes it easy to work with sequence data 
from torchtext.data import get_tokenizer

import re # regex library
import os # Doing operating system operations
import cv2 # Computer vision tasks with OpenCV
import numpy as np # Powerful arrray computation library
from PIL import Image # WOrking with image files
import pandas # Extracting data from csv
import math # Math package
import pickle # Saving variables for later usage.

from torchsummary import summary # Make understanding of models easier
import torch # PyTorch library
from time import time # Using timer in code


# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Use Cuda if GPU available!


In [6]:
print(device) # Test for device type

cuda


In [2]:
class Utils:
    '''
    This class contains methods which help in processing our dataset
    Args: No arguments
    
    '''
    def __init__(self):
        pass

    def output_text(self, train_corpus, video = None):
        
        '''
        Purpose: Generate all text present in video using the csv file which contains all the text in the videos
        Input(s):
            train_corpus: The file path to the csv file
            video: A video file whose text is to be generated
        Outputs(s):
            Final description: the text representing a video caption
        
        '''
    
        df = pandas.read_csv(train_corpus)
        if (video):
            video_id,start,end = self.get_video_id(video) # 

            final_description=''
            for i in range(len(df)):

                if df['VideoID'][i]==str(video_id) and df['Start'][i]==int(start) and df['End'][i]==int(end) and df['Language'][i]=='English':

                    final_description=df['Description'][i]
        else:
            
            final_description = []
            for i in range(len(df)):
                if (df['Language'][i]=='English'):
                    final_description.append(df['Description'][i])
        return final_description
            

    def get_video_id(self, video_path):
            
        '''
        Purpose: Extract video name (without extension) and also remove the start and end tags from the video file name
        Input(s): video file path EX: videoname_xx_yy.avi
        Outputs(s): extracted videoname, xx = start tag, yy=end tag
        
        '''
        video_id=None
        start=None
        end=None
        video_path=video_path[0:len(video_path)-4]
        counter=0
        for i in reversed(range(len(video_path))):
            if (video_path[i]=='_' and counter<2):

                if (counter == 0):
                    end=video_path[i+1:]
                    video_path=video_path[0:i]
                else:
                    start=video_path[i+1:]
                    video_path=video_path[0:i]
                counter+=1
        video_id=video_path

        return video_id,start,end
    @staticmethod
    def tagger_input(text):    
            
        '''
        Purpose: Add the beginning of sentence tag on a text
        Input(s): 
            text: a String which represents a sentence from a video
        Outputs(s): 
            text: A tagged String
        
        '''
    
        bos="<bos> "
        text= bos+text 
        return text
    
    @staticmethod
    def tagger_output(text):  
           
        '''
        Purpose: Add the end of sentence tag on a text
        Input(s): 
            text: a String which represents a sentence from a video
        Outputs(s): 
            text: A tagged String
        
        '''
        eos=" <eos>"
        text= text+eos
        
        return text

    @staticmethod
    def clean_text(texts):
            
        '''
        Purpose:Clean text by removing unnecessary characters and altering the format of words.
        Input(s):
            texts: Texts which contain several symbols which aren't used by our model
        Outputs(s):
            texts: Texts which have been cleaned
        
        '''
        for i in range(len(texts)):

            if(texts=="Commands[195]part4 of 9"):
                texts="commands 195 part 4 of 9"

            texts = texts.lower()
            texts = re.sub(r"i'm", "i am", texts)
            texts = re.sub(r"he's", "he is", texts)
            texts = re.sub(r"she's", "she is", texts)
            texts = re.sub(r"it's", "it is", texts)
            texts = re.sub(r"that's", "that is", texts)
            texts = re.sub(r"what's", "that is", texts)
            texts = re.sub(r"where's", "where is", texts)
            texts = re.sub(r"how's", "how is", texts)
            texts = re.sub(r"\'ll", " will", texts)
            texts = re.sub(r"\'ve", " have", texts)
            texts = re.sub(r"\'re", " are", texts)
            texts = re.sub(r"\'d", " would", texts)
            texts = re.sub(r"\'re", " are", texts)
            texts = re.sub(r"won't", "will not", texts)
            texts = re.sub(r"\n","",texts)
            texts = re.sub(r"\r","",texts)
            texts = re.sub(r"_"," ",texts)
            texts = re.sub(r"can't", "cannot", texts)
            texts = re.sub(r"n't", " not", texts)
            texts = re.sub(r"n'", "ng", texts)
            texts = re.sub(r"'bout", "about", texts)
            texts = re.sub(r"'til", "until", texts)
            texts = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,&]", "", texts)

        return texts
    
    def video_to_frames(self, video_path,frame_number, device, INPUT_SIZE , model, transform):
         
        '''
        Purpose: Take a video file and produce coded frames out of it
        Input(s):
            video_path: The video file to be processed
            frame_number: The number of frames we want to extract (In our example, it is 40)
            device: The device on which the inference will be done
            INPUT_SIZE: The dimension of the output array of each frame
            model: The CNN Model used for inference
            transform: The transform object, which will process all images before they are passed to the model
        Outputs(s):
            The coded frames of dimension frame_number X  INPUT_SIZE (Ex: 40 X 2850)
        
        '''
        cap=cv2.VideoCapture(video_path) # read the video file
        number_of_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT) # get the number of frames
        get_filter=int(number_of_frames/frame_number) # obtain the factor of video number of frames to the number of frames
        #we want to extract, so that There is equal spacing between the frames which make up the videos 
        
        current_frame=0
        total_features = torch.zeros([frame_number, INPUT_SIZE]) # initialize the total_features 
        total_features.to(dtype = torch.float16)
        t=0
        while (current_frame<number_of_frames):
            ret,frame = cap.read()
            
            if ((current_frame%get_filter) == 0 and t<frame_number):
                with torch.no_grad(): 
                    
                    cv2_im = cv2.cvtColor(frame,cv2.COLOR_BGR2RGB) # read the image using OpenCV library
                    frame = Image.fromarray(cv2_im)

                    frame = transform(frame) # Use transform to process the image before inference
                    
                    frame = frame.to(device) # set frame to be inferred using the set device
                    model = model.to(device) # set model to infer using the set device
                    
                    model.eval() # put model in evaluation mode

                    frame_feature = model(frame[None])
                    
                    frame_feature = torch.squeeze(frame_feature,0)
                    
                    total_features[t] = frame_feature

                
                t+=1
            current_frame+=1
            
        cap.release()
        cv2.destroyAllWindows()
        
        return total_features

    def get_pre_data(self, train_dir, frame_number, INPUT_SIZE, model , transform ):
         
        '''
        Purpose: Could be used to obtain the coded frames, and stored in a pickle file, such that training can be faster
        Input(s): 
            train_dir: The directory containing all  video files to be processed
            frame_number: The number of frames we want to extract (In our example, it is 40)
            INPUT_SIZE: The dimension of the output array of each frame
            model: The CNN Model used for inference
            transform: The transform object, which will process all images before they are passed to the model
        Outputs(s):
            All the coded frames in one output
        
        '''
        print(train_dir)
        train_video_list=os.listdir(train_dir) # get list of all files in the train directory
        i=0
        all_output = torch.zeros([len(train_video_list), frame_number, INPUT_SIZE])
        
        for video_path in train_video_list:
            
            video_path=train_dir+video_path
            
            output=self.video_to_frames(video_path,frame_number, 'cuda', INPUT_SIZE, model, transform)
            
            all_output[i] = output
            i += 1
            
        return all_output

    
class TextProcessor:
    '''
    This class contains methods which help in processing text data
    Args: 
        freq_threshold: Get the maximum frequency above which a word is not considered to be part of our vocabulary
        VOCAB_SIZE: the vocabulary size

    '''
    
    def __init__(self, freq_threshold = None, VOCAB_SIZE = None):
        
        self.word_to_index = {"<unk>":0, "<pad>":1, "<bos>": 2, "<eos>": 3}
        self.freq_threshold = freq_threshold
        self.VOCAB_SIZE = VOCAB_SIZE
        self.get_tokenizer = get_tokenizer("basic_english")

    def __len__(self):
        
        return len(self.itos)

    def get_output(self, sentence_to_indices, NUMBER_OF_WORDS):
         
        '''
        Purpose: Generate one - hot representation of sentence, ready for model training
        Input(s): 
            sentence_to_indices: A dictionary which contains the words and indices as key, value pairs
            NUMBER_OF_WORDS: The maximum number of words a sentence can contain
        Outputs(s):
            One-hot vectors stacked into an array
        
        '''
        
        arr = np.zeros((NUMBER_OF_WORDS, self.VOCAB_SIZE))
        pad_number = 1 # The pad in sentence to index is seen as 1
        for i in range(len(arr)):
            if(i<len(sentence_to_indices)):
                arr[i][sentence_to_indices[list(sentence_to_indices.keys())[i]]] = 1 # set a given key to 1, while leaving the others at zero
            else:
                arr[i][pad_number] = 1 # pad to complete the remaining words to make up the NUMBER OF WORDS needed for the model
                
        return arr
    def sentence_to_indices(self, sentence, dictionary):
         
        '''
        Purpose: Take an input sentence and convert it to a dictionary which has words and their corresponding indices in the vocabulary as key, value pairs
        Input(s):
            sentence: The sentence whose words have to be linked to indices
            dictionary: The dictionary which will contain the word to indices
        Outputs(s):
            sentence_to_index: word to index pair dictionary
        
        '''
        
        sentence_to_index = {}
        
        tokenizer = self.get_tokenizer 
        for word in tokenizer(sentence):# go tgrough all the words formed after tokenizing the sentence
            try:
                if dictionary[word] < self.VOCAB_SIZE: # if word is part of vocabulary

                    sentence_to_index[word] = dictionary[word] 
                else: # else it isn't added to the sentence_to_index
                    continue
            except:
                sentence_to_index[word] = 0 # in case the word isn't found in the dictionary, we consider it to be unknown
        return sentence_to_index
    
    
    def vocab_creator(self,sentence_list):
         
        '''
        Purpose: From a give corpus, generate a WORD vocabulary which maps a givenn word to a given index
        Input(s): 
            sentence_list: A corpus of all sentences extracted from the videos in the dataset
        Outputs(s):
            word_to_index: the word to index of all words contained in the textual corpus
        
        '''
        frequencies = {}
        idx = 4
        stoi = {}

        tokenizer = self.get_tokenizer
        for sentence in sentence_list:
            try:
                for word in tokenizer(sentence):
                    if word not in frequencies:
                        frequencies[word] = 1

                    else:
                        frequencies[word] += 1

                    if frequencies[word] == self.freq_threshold:
                        self.word_to_index[word] = idx
                        idx += 1
            except:
                pass 
        return self.word_to_index


In [20]:
class CustomDataset(Dataset):

    def __init__(self, train_dir, train_corpus, device, dictionary, VOCAB_SIZE, NUMBER_OF_WORDS, INPUT_SIZE, number_of_frames, transform, model = None, pre_data = None):
        
        self.train_dir = train_dir
        self.train_dir_list = os.listdir(train_dir)
        self.model = model
        self.transform = transform
        self.number_of_frames = number_of_frames
        self.utils = Utils()
        self.word_to_index = dictionary
        self.VOCAB_SIZE = VOCAB_SIZE
        self.NUMBER_OF_WORDS = NUMBER_OF_WORDS
        self.INPUT_SIZE = INPUT_SIZE
        self.pre_data = pre_data
        self.device = device
        self.train_corpus = train_corpus
        
    def __len__(self):
        return len(self.train_dir_list)
    

    def __getitem__(self, idx):
        
        textprocessor = TextProcessor(VOCAB_SIZE = self.VOCAB_SIZE)
        utils = Utils()
        
        
        video_file = self.train_dir_list[idx] # get video file corresponding to the id, idx
        
        
        output_text = self.utils.output_text(self.train_corpus, video_file) # get the text contained in the video file
        
        
        #### generate input 2,  from the output_text
        sentence_to_index = textprocessor.sentence_to_indices(utils.tagger_input(utils.clean_text(output_text)), self.word_to_index)
        X_2 = textprocessor.get_output(sentence_to_index, NUMBER_OF_WORDS)
        
        #### generate output,  from the output_text
        sentence_to_index = textprocessor.sentence_to_indices(utils.tagger_output(utils.clean_text(output_text)), self.word_to_index) 
        y = textprocessor.get_output(sentence_to_index, NUMBER_OF_WORDS)
        
        video_path = self.train_dir + video_file
        
        # generate input 1
        X_1 = utils.video_to_frames(video_path, self.number_of_frames, self.device, self.INPUT_SIZE, self.model, self.transform)
        #X_1 = pre_data[idx]
        return (X_1,torch.tensor(X_2)), torch.tensor(y)

In [21]:
model_vgg = models.vgg16(pretrained=True)# obtain pretrained VGG16 model


In [22]:
model_vgg.classifier = nn.Sequential(*list(model_vgg.classifier.children())[:-2]) # remove last linear layer of VGG16 model


In [23]:
### parametres

LEARNING_RATE = 1e-3
NUMBER_OF_FRAMES = 40
BATCH_SIZE = 1
EPOCH = 10
TRAINING_DEVICE = 'cuda'
VOCAB_SIZE = 200
NUMBER_OF_WORDS = 10
HIDDEN_SIZE = 300
INPUT_SIZE = 4096
NUMBER_OF_LAYERS = 1
tsfm = transforms.Compose([
    transforms.Resize([224, 224]),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])
train_dir = 'D:/Machine_Learning/datasets/YouTubeClips_2/YouTubeClips/'
train_corpus = 'D:/Machine_Learning/datasets/video_corpus/video_corpus.csv'
utils = Utils()
all_text = utils.output_text(train_corpus)
text_processor = TextProcessor(freq_threshold = 10)
dictionary = text_processor.vocab_creator(all_text)


In [24]:
### training data preparation
train_ds = CustomDataset(train_dir, train_corpus, device, dictionary, VOCAB_SIZE, NUMBER_OF_WORDS, INPUT_SIZE,  NUMBER_OF_FRAMES, tsfm, model = model_vgg)
train_dl = DataLoader(train_ds, batch_size = BATCH_SIZE)

In [33]:
### Sequence to sequence model

class Encoder_LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(Encoder_LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)

    def forward(self, x):
        out, (h_n, c_n) = self.lstm(x)  # out: tensor of shape (batch_size, seq_length, hidden_size)
                                    
        return h_n, c_n
    
class Decoder_LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, number_of_words):
        super(Decoder_LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, input_size)
    def forward(self, x, h_n, c_n):
        output, _ = self.lstm(x.float(),(h_n,c_n))  # out: tensor of shape (batch_size, seq_length, hidden_size)
        output = self.fc(output)                            
        
        return output
    
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, X_1, X_2):
        h_n, c_n = self.encoder(X_1)
        output = self.decoder(X_2, h_n, c_n)
        return output
    
encoder = Encoder_LSTM(input_size = INPUT_SIZE, hidden_size = HIDDEN_SIZE , num_layers = NUMBER_OF_LAYERS)
decoder = Decoder_LSTM(input_size = VOCAB_SIZE, hidden_size = HIDDEN_SIZE , num_layers = NUMBER_OF_LAYERS, number_of_words = NUMBER_OF_WORDS)
model_seq_to_seq = Seq2Seq(encoder, decoder).to(device)
model = model_seq_to_seq
print(model)
### load the state_dict of model if model has been pretrained.
model.load_state_dict(torch.load('model_lstm_best_loss.pth'))

Seq2Seq(
  (encoder): Encoder_LSTM(
    (lstm): LSTM(4096, 300, batch_first=True)
  )
  (decoder): Decoder_LSTM(
    (lstm): LSTM(200, 300, batch_first=True)
    (fc): Linear(in_features=300, out_features=200, bias=True)
  )
)


<All keys matched successfully>

In [34]:
### optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()

In [35]:
#### Model Training



EPOCH = 10
import time
print_feq = 100
best_loss = np.inf
for epoch in range(1, EPOCH+1):
    model.train()
    epoch_loss = 0
    
    for step, (img,label) in enumerate(train_dl):
        
        
        time_1 = time.time() ## timing
        
        X_1, X_2 = img ### get inputs
        
        X_1 = X_1.to(device) # Set device 
        X_2 = X_2.to(device) # Set device
        
        
        label = label.to(device) # Set output device
        
        ### zero the parameter gradients
        optimizer.zero_grad()
        
        ### forward
        prediction = model(X_1, X_2)
        
        ### Optimize
        prediction = prediction.to(device)
        prediction = torch.squeeze(prediction,0)
        label = torch.squeeze(label,0)
        
        new_label = torch.zeros([label.shape[0]])
        for l in range(label.shape[0]):
            new_label[l] = np.argmax(label[l].cpu())
        new_label = new_label.to(device)
        loss = criterion(prediction, new_label.long())
        
        # Backward prop.
        loss.backward()
        optimizer.step()
        
        ### print out statistics
        epoch_loss += loss.item()
        if step % print_feq == 0:
            print('epoch:', epoch,
                  '\tstep:', step+1, '/', len(train_dl) + 1,
                  '\ttrain loss:', '{:.4f}'.format(loss.item()),
                  '\ttime:', '{:.4f}'.format((time.time()-time_1)*print_feq), 's')
        torch.save(model.state_dict(), 'model_lstm_2.pth')
    ### save best model
    if(epoch_loss < best_loss):
        best_loss = epoch_loss
        torch.save(model.state_dict(), 'model_lstm_best_loss.pth')
    print("The loss for this epoch is = :", epoch_loss/lent(train_dl))

D:/Machine_Learning/datasets/YouTubeClips_2/YouTubeClips/-4wsuPCjDBc_5_15.avi
when we getfilter = : 7
epoch: 1 	step: 1 / 1001 	train loss: 0.9817 	time: 1.2964 s
D:/Machine_Learning/datasets/YouTubeClips_2/YouTubeClips/-7KMZQEsJW4_205_208.avi
when we getfilter = : 1


KeyboardInterrupt: 

In [55]:
#### Model Testing
model.eval();
from random import randint
import matplotlib.pyplot as plt

utils = Utils()

video_path = 'D:/Machine_Learning/datasets/YouTubeClips_2/validation/NFxWwI0J3As_78_84.avi'

#print(video_path)
video_pre_data = utils.video_to_frames(video_path,frame_number = NUMBER_OF_FRAMES, device = 'cuda', INPUT_SIZE = INPUT_SIZE , model = model_vgg, transform = tsfm)
print(video_pre_data.shape)
print(video_pre_data)
X_2  = torch.zeros([NUMBER_OF_WORDS,VOCAB_SIZE])

for i in range(NUMBER_OF_WORDS):
    if (i == 0):
        
        X_2[i][2] = 1
    else:
        X_2[i][1] = 1

input_data = video_pre_data.unsqueeze(0)# pre_data[200].unsqueeze(0)

final_sentence = []

X_2 = X_2.unsqueeze(0)
X_2 = X_2.to(device)
input_data = input_data.to(device)


D:/Machine_Learning/datasets/YouTubeClips_2/validation/NFxWwI0J3As_78_84.avi
when we getfilter = : 3
torch.Size([40, 4096])
tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0761, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.1460,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.1150,  ..., 0.0000, 0.0000, 0.0000]])


In [56]:

for i in range(NUMBER_OF_WORDS-1):
    with torch.no_grad():
        predicted = model(input_data, X_2)
        predicted = predicted.squeeze(0)
        #print(torch.argmax(predicted[i]))
        final_sentence.append(next((key for key, value in dictionary.items() if value == torch.argmax(predicted[i])), None))
        X_2[0][i+1][torch.argmax(predicted[i])] = 1
        X_2[0][i+1][1] = 0
print(final_sentence)

['a', 'man', 'is', 'playing', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>']


In [None]:
_o1UXSxTjfo_68_80

video_path = 'D:/Machine_Learning/datasets/YouTubeClips_2/validation/IAvBB2lv8iw_142_148.avi'