# Key Frames Extraction

In [None]:
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
%matplotlib inline
import cv2
import datetime
import torch
import tensorflow as tf
import os
from PIL import Image

In [None]:
# Decode the whole video, keep every frame in RGB order, and describe each frame
# by the concatenated colour histograms of its 3x3 grid of blocks
# (9 blocks * 6*6*6 bins = 1944 values per frame).
cap = cv2.VideoCapture('v18.mp4')  # intentionally left open: a later cell reads its FPS

D = dict()          # frame index -> original RGB frame, used later to export the key frames
feature_rows = []   # one 1944-dimensional feature vector per decoded frame
count = 0           # number of frames decoded so far
start_time = time.time()
while cap.isOpened():

    # Read the next frame; ret is False once the stream is exhausted.
    ret, frame = cap.read()
    if not ret:
        break

    # OpenCV decodes in BGR order, so reorder the channels to RGB.
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    D[count] = frame_rgb

    # Block size is rounded up so that the 3x3 grid always covers the whole frame.
    height, width, channels = frame_rgb.shape
    h_chunk = -(-height // 3)
    w_chunk = -(-width // 3)

    feature_vector = []
    for a in range(3):
        for b in range(3):
            block = frame_rgb[h_chunk * a : h_chunk * (a + 1),
                              w_chunk * b : w_chunk * (b + 1), :]
            # calcHist expects a *list* of images. Passing the bare 3-D array makes
            # OpenCV treat its rows as separate images, so the histogram would not
            # describe the block at all.
            hist = cv2.calcHist([np.ascontiguousarray(block)], [0, 1, 2], None,
                                [6, 6, 6], [0, 256, 0, 256, 0, 256])
            feature_vector.extend(hist.flatten())  # 216 values per block

    feature_rows.append(feature_vector)
    count += 1

print("--- %s seconds ---" % (time.time() - start_time))

# Build the N x 1944 matrix once instead of re-stacking it for every frame.
arr = np.array(feature_rows, dtype=float).reshape(-1, 1944)

# Transpose so that every frame is a column: M x N with M = 1944, N = number of frames.
final_arr = arr.transpose()
print(final_arr.shape)
print(count)

--- 5.644730567932129 seconds ---
(1944, 570)
570


In [None]:
from scipy.sparse import csc_matrix
from scipy.sparse.linalg import svds, eigs
# Wrap the (1944 x N) histogram matrix in a sparse container for the truncated SVD.
A = csc_matrix(final_arr, dtype=float)

# Keep only the 63 largest singular triplets; the printed values of s in the next
# cells come back in ascending order.
# NOTE(review): 63 is hard-coded here and in the following cells, and the video must
# have more frames than that for the decomposition to be possible - confirm.
u, s, vt = svds(A, k = 63)

In [None]:
# Sanity check of the factor shapes: u is (1944, k), s is (k,), vt is (k, N).
print(u.shape, s.shape, vt.shape)

(1944, 63) (63,) (63, 570)


In [None]:
# Inspect the retained singular values.
print(list(s))

[46.037526249144136, 46.56319905092684, 46.832172190903506, 47.782836063356434, 48.76244153710375, 49.95496799661359, 50.854404855036805, 52.416194324050664, 53.57531537139949, 55.37421639070441, 55.865763166357524, 57.61659739349478, 58.71967948501821, 61.05288377957271, 62.936405947657775, 63.90003166713214, 66.12001789399193, 68.71991574223081, 69.99154125925496, 71.98601857584845, 73.65127928891839, 74.1959782503641, 77.32922820770243, 78.43901293025912, 83.26829984390396, 87.27639540882306, 87.85312400511836, 90.11493849664488, 92.20384530979874, 95.79652962785829, 107.79924641938736, 108.67886587428218, 112.96949701101029, 114.42932887513881, 119.2193755803186, 122.98501149796529, 130.67668419370358, 141.1236938303202, 147.6376899468303, 156.3250037615294, 164.44946816822937, 176.76388834430236, 191.3242465275119, 193.95119064545653, 207.51372402542734, 220.0569253254723, 252.54846635708228, 268.74338828150087, 306.8042303688005, 386.30061925944796, 414.34767763880654, 422.512877

In [None]:
# Express every frame in the orthonormal basis formed by the columns of u:
# row i of V @ diag(s) holds the coordinates of frame i, so each frame is now
# described by 63 numbers instead of 1944.
v1_t = vt.T
projections = np.matmul(v1_t, np.diag(s))
print(projections.shape)

(570, 63)


In [None]:
# Dynamic (sequential) clustering of the projected frame histograms: consecutive
# frames that stay similar to the running centroid are grouped into one cluster,
# which is later interpreted as a shot.
#
# A frame joins the most recent cluster while its cosine similarity to that
# cluster's centroid is at least the threshold. Other thresholds were tried
# (0.85, 0.875, 0.95, 0.98): lower values produced many key frames for the same
# event, higher values produced fewer key frames and missed events, so 0.9 is kept.
SIMILARITY_THRESHOLD = 0.9

f = projections
n_frames, n_dims = f.shape  # derived from the data instead of hard-coding 63

# One (initially empty) slot per frame. Later cells rely on every index being
# present and on unused slots staying empty.
C = {i: np.empty((0, n_dims), int) for i in range(n_frames)}  # frames of each cluster
E = {i: np.empty((0, n_dims), int) for i in range(n_frames)}  # centroid of each cluster

# Initialisation: the first two projected frames seed the first cluster.
C[0] = np.vstack((C[0], f[0]))
C[0] = np.vstack((C[0], f[1]))
E[0] = np.mean(C[0], axis=0)

count = 0  # index of the most recently created cluster
for i in range(2, n_frames):
    # Cosine similarity between the frame and the latest centroid:
    # 1 means same direction, 0 means orthogonal (nothing in common).
    similarity = np.dot(f[i], E[count]) / (
        (np.dot(f[i], f[i]) ** .5) * (np.dot(E[count], E[count]) ** .5))

    # Not similar enough: the frame opens a new cluster.
    if similarity < SIMILARITY_THRESHOLD:
        count += 1

    # In both cases the frame is added to the current cluster and the centroid is refreshed.
    C[count] = np.vstack((C[count], f[i]))
    E[count] = np.mean(C[count], axis=0)

In [None]:
# Number of frames held by every cluster slot.
#
# Sparse clusters are assumed to be transitions between shots and are ignored
# later; densely populated clusters are treated as shots, and the last frame of
# each one is taken as its summary.
b = [C[i].shape[0] for i in range(f.shape[0])]

# The first empty slot marks the end of the clusters that were actually formed.
last = b.index(0)
b1 = b[:last]  # size of each formed cluster

In [None]:
# A cluster needs at least 25 frames to be considered dense enough to form a shot.
res = [cluster_id for cluster_id, size in enumerate(b1) if size >= 25]
print(len(res))  # number of shots retained

4


In [None]:
# Label every formed cluster by appending its cluster index as an extra column,
# so that frames can later be traced back to their cluster.
# dict(C) makes a real (shallow) copy: a plain `GG = C` only aliases the dict, which
# would overwrite C and append another label column each time the cell is re-run.
GG = dict(C)
for i in range(last):
    p1 = np.repeat(i, b1[i]).reshape(b1[i], 1)  # column vector filled with the cluster id
    GG[i] = np.hstack((GG[i], p1))

In [None]:
# Concatenate the labelled clusters into one N x 64 array (63 coordinates + cluster label),
# N being the number of frames. The empty seed keeps the result well-formed when no
# cluster exists.
F = np.vstack([np.empty((0, 64), int)] + [GG[i] for i in range(last)])

In [None]:
# Wrap F in a DataFrame whose columns are named v1 ... v64 (v64 is the cluster label).
colnames = ["v" + str(i) for i in range(1, 65)]
print(colnames)

df = pd.DataFrame(F, columns=colnames)

['v1', 'v2', 'v3', 'v4', 'v5', 'v6', 'v7', 'v8', 'v9', 'v10', 'v11', 'v12', 'v13', 'v14', 'v15', 'v16', 'v17', 'v18', 'v19', 'v20', 'v21', 'v22', 'v23', 'v24', 'v25', 'v26', 'v27', 'v28', 'v29', 'v30', 'v31', 'v32', 'v33', 'v34', 'v35', 'v36', 'v37', 'v38', 'v39', 'v40', 'v41', 'v42', 'v43', 'v44', 'v45', 'v46', 'v47', 'v48', 'v49', 'v50', 'v51', 'v52', 'v53', 'v54', 'v55', 'v56', 'v57', 'v58', 'v59', 'v60', 'v61', 'v62', 'v63', 'v64']


In [None]:
# Convert the cluster label column from float to int.
df['v64']= df['v64'].astype(int)

In [None]:
# Keep only the frames that belong to a retained cluster, i.e. one of the clusters
# listed in res (those holding at least 25 frames).
df1 =  df[df.v64.isin(res)]

In [None]:
# For each retained cluster take its last row: that frame summarises the shot (key frame).
new = df1.groupby('v64').tail(1)['v64']

In [None]:
# The index labels of the selected rows are the key-frame numbers, which lets us
# fetch the original pictures again.
new1 = new.index

In [None]:
# Write every key frame to disk as frame<N>.png and remember the frame numbers (as strings).
images = []
for c in new1:
    frame_num_chr = str(c)
    file_name = 'frame' + frame_num_chr + '.png'
    # cv2.imwrite expects BGR channel order, so convert back from RGB first.
    frame_rgb1 = cv2.cvtColor(D[c], cv2.COLOR_RGB2BGR)
    images.append(frame_num_chr)
    cv2.imwrite(file_name, frame_rgb1)

In [None]:
# Base name (without extension) of every exported key frame.
# No pixel data is needed here, so the colour conversion done per frame was dropped.
fileNames = ['frame' + str(c) for c in new1]
fileNames

['frame157', 'frame362', 'frame516', 'frame569']

In [None]:
# Frame rate reported by the capture object opened in the extraction cell.
fps=cap.get(cv2.CAP_PROP_FPS)

In [None]:
# Show the frame rate used for the timestamp computation below.
print("Frames per second using cap.get(cv2.CAP_PROP_FPS) : {0}".format(fps))

Frames per second using cap.get(cv2.CAP_PROP_FPS) : 30.0


In [None]:
# Duration of a single frame in seconds.
# NOTE(review): raises ZeroDivisionError if the capture reports an FPS of 0.
frame_duration = (1/fps)
frame_duration

0.03333333333333333

In [None]:
# Map every key-frame file name to a one-element list holding its timestamp,
# formatted as HH:MM:SS:ffffff. Later cells append more entries to each list.
diccionario = {}
for name, k in zip(fileNames, new1):
    seconds = round(frame_duration * k, 2)
    # Interpret the offset in UTC: without tz, fromtimestamp applies the machine's
    # local timezone, which shifts the hour field on any host that is not on UTC.
    stamp = datetime.datetime.fromtimestamp(seconds, tz=datetime.timezone.utc)
    diccionario[name] = [stamp.strftime('%H:%M:%S:%f')]

In [None]:
# Display the key-frame -> [timestamp] mapping.
diccionario

{'frame157': ['00:00:05:230000'],
 'frame362': ['00:00:12:070000'],
 'frame516': ['00:00:17:200000'],
 'frame569': ['00:00:18:970000']}

# Crops with Object Detection

In [None]:
# Model: load the small YOLOv5 detector through torch.hub.
# NOTE(review): the name `model` is rebound by later cells (image classifier, caption
# model), so this cell has to be re-run before the crop cell if those have been executed.
model = torch.hub.load('ultralytics/yolov5', 'yolov5s')  # or yolov5n - yolov5x6, custom

  "You are about to download and run code from an untrusted repository. In a future release, this won't "
Downloading: "https://github.com/ultralytics/yolov5/zipball/master" to /root/.cache/torch/hub/master.zip
INFO:yolov5:[31m[1mrequirements:[0m YOLOv5 requirement "ipython" not found, attempting AutoUpdate...
[31m[1mrequirements:[0m YOLOv5 requirement "ipython" not found, attempting AutoUpdate...
INFO:yolov5:Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting jedi>=0.10
  Downloading jedi-0.18.1-py2.py3-none-any.whl (1.6 MB)
Installing collected packages: jedi
Successfully installed jedi-0.18.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting jedi>=0.10
  Downloading jedi-0.18.1-py2.py3-none-any.whl (1.6 MB)
Installing collected packages: jedi
Successfully installed jedi-0.18.1

INFO:yolov5:[31m[1mrequirements:[0m 1 package updated per /root/.cache/torch/hub

  0%|          | 0.00/14.1M [00:00<?, ?B/s]

INFO:yolov5:

INFO:yolov5:Fusing layers... 
Fusing layers... 
INFO:yolov5:YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients
INFO:yolov5:Adding AutoShape... 
Adding AutoShape... 


In [None]:
import shutil

# Start from a clean output directory so crops from a previous run are not mixed in.
if os.path.isdir('data'):
  shutil.rmtree('data')

# Run the detector on every key frame and save the detected objects as crops under
# data/<frame name>/crops/<class>/.
for img in diccionario:
  img_path = img + '.png'
  result = model(img_path)
  result.crop(save=True, save_dir='data/' + img)  # return value is not needed

  # When people were detected, record the crop file names next to the timestamp.
  person_dir = 'data/' + img + '/crops/person'
  if os.path.isdir(person_dir):
    diccionario[img].append(os.listdir(person_dir))

INFO:yolov5:Saved 1 image to [1mdata/frame157[0m
Saved 1 image to [1mdata/frame157[0m
INFO:yolov5:Saved results to data/frame157

Saved results to data/frame157

INFO:yolov5:Saved 1 image to [1mdata/frame362[0m
Saved 1 image to [1mdata/frame362[0m
INFO:yolov5:Saved results to data/frame362

Saved results to data/frame362

INFO:yolov5:Saved 1 image to [1mdata/frame516[0m
Saved 1 image to [1mdata/frame516[0m
INFO:yolov5:Saved results to data/frame516

Saved results to data/frame516

INFO:yolov5:Saved 1 image to [1mdata/frame569[0m
Saved 1 image to [1mdata/frame569[0m
INFO:yolov5:Saved results to data/frame569

Saved results to data/frame569



In [None]:
# Example of a key-frame entry in the dictionary once the crops have been added.
diccionario['frame157']

['00:00:05:230000',
 ['frame1574.jpg',
  'frame1575.jpg',
  'frame157.jpg',
  'frame1579.jpg',
  'frame1577.jpg',
  'frame1573.jpg',
  'frame1578.jpg',
  'frame1576.jpg',
  'frame1572.jpg']]

# Image Classification

In [None]:
# Unpack the trained classifier, overwriting existing files (-o).
# saved_model.zip has to be uploaded to the working directory first.
!unzip -o saved_model.zip

unzip:  cannot find or open saved_model.zip, saved_model.zip.zip or saved_model.zip.ZIP.


In [None]:
# Load the Keras classifier; this rebinds `model`, replacing the YOLOv5 detector.
# NOTE(review): this path is relative ('content/...') while the image loaded below
# uses an absolute '/content/...' path - confirm which one is intended.
model = tf.keras.models.load_model('content/saved_model/my_model')

OSError: ignored

In [None]:
# Target size the crop is resized to when it is loaded.
image_size = (180, 180)

# Load one person crop produced by the detection cell.
img = tf.keras.preprocessing.image.load_img("/content/data/frame157/crops/person/frame1572.jpg", target_size=image_size)

In [None]:
# Display the loaded crop.
img

In [None]:
# Turn the crop into a batch of one image and classify it.
img_array = tf.keras.preprocessing.image.img_to_array(img)
img_array = tf.expand_dims(img_array, 0)  # Create batch axis

predictions = model.predict(img_array)
score = predictions[0]
print(score)

# score is a one-element array; extract a plain float before formatting, because
# implicit conversion of a non-0-d array to a scalar is deprecated in NumPy.
sleeping_prob = float(np.ravel(score)[0])
print("This image is %.2f percent raising_hand and %.2f percent sleeping."
    % (100 * (1 - sleeping_prob), 100 * sleeping_prob))

[    0.99992]
This image is 0.01 percent raising_hand and 99.99 percent sleeping.


# Inference notebook for [CLIP prefix captioning](https://github.com/rmokady/CLIP_prefix_caption/)

Disclaimer: the authors do not own any rights for the code or data.

In [None]:
#@title Install
# Hugging Face transformers (GPT-2 model and tokenizer) and OpenAI CLIP from source.
!pip install transformers
! pip install git+https://github.com/openai/CLIP.git


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-z8wsjlpn
  Running command git clone -q https://github.com/openai/CLIP.git /tmp/pip-req-build-z8wsjlpn


In [None]:
#@title Drive Downloader

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

download_with_pydrive = True #@param {type:"boolean"}  

class Downloader(object):
    """Download Google Drive files either through PyDrive or through the gdown CLI."""

    def __init__(self, use_pydrive):
        # use_pydrive: True -> authenticated PyDrive client, False -> gdown command.
        self.use_pydrive = use_pydrive

        if self.use_pydrive:
            self.authenticate()
        
    def authenticate(self):
        """Authenticate the Colab user and create the Drive client in self.drive."""
        auth.authenticate_user()
        gauth = GoogleAuth()
        gauth.credentials = GoogleCredentials.get_application_default()
        self.drive = GoogleDrive(gauth)
    
    def download_file(self, file_id, file_dst):
        """Save the Drive file identified by file_id to the local path file_dst."""
        if self.use_pydrive:
            downloaded = self.drive.CreateFile({'id':file_id})
            downloaded.FetchMetadata(fetch_all=True)
            downloaded.GetContentFile(file_dst)
        else:
            # IPython shell escape: this line only works inside a notebook.
            !gdown --id $file_id -O $file_dst

# Shared instance used by the download cells below.
downloader = Downloader(download_with_pydrive)

In [None]:
#@title Imports

import clip
import os
from torch import nn
import numpy as np
import torch
import torch.nn.functional as nnf
import sys
from typing import Tuple, List, Union, Optional
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
from google.colab import files
import skimage.io as io
import PIL.Image
from IPython.display import Image 


# Short aliases used in the type annotations of the cells below.
N = type(None)
V = np.array  # note: this is the np.array constructor function, not the ndarray type
ARRAY = np.ndarray
ARRAYS = Union[Tuple[ARRAY, ...], List[ARRAY]]
VS = Union[Tuple[V, ...], List[V]]
VN = Union[V, N]
VNS = Union[VS, N]
T = torch.Tensor
TS = Union[Tuple[T, ...], List[T]]
TN = Optional[T]
TNS = Union[Tuple[TN, ...], List[TN]]
TSN = Optional[TS]
TA = Union[T, ARRAY]


# NOTE(review): this rebinds D, which the key-frame cells at the top of the notebook
# use as the frame dictionary; those cells cannot be re-run after this one without
# re-running the extraction cell first.
D = torch.device
CPU = torch.device('cpu')


def get_device(device_id: int) -> D:
    """Return the CUDA device with index device_id, or the CPU device without CUDA.

    The index is capped at the highest available CUDA device index.
    """
    if torch.cuda.is_available():
        highest_index = torch.cuda.device_count() - 1
        return torch.device(f'cuda:{min(highest_index, device_id)}')
    return CPU


# Alias so that a device can be requested as CUDA(0).
CUDA = get_device

# The weights are stored in a "pretrained_models" directory next to (not inside)
# the current working directory; it is created if missing.
current_directory = os.getcwd()
save_path = os.path.join(os.path.dirname(current_directory), "pretrained_models")
os.makedirs(save_path, exist_ok=True)
# The file name spelling ('wieghts') is part of the path used by the download and
# load cells, so it must stay consistent.
model_path = os.path.join(save_path, 'model_wieghts.pt')


In [None]:
#@title Model

class MLP(nn.Module):
    """Stack of linear layers with an activation between consecutive layers.

    sizes lists the layer widths from input to output; no activation follows
    the final linear layer.
    """

    def __init__(self, sizes: Tuple[int, ...], bias=True, act=nn.Tanh):
        super().__init__()
        final_index = len(sizes) - 2  # index of the last linear layer
        layers = []
        for index, (in_features, out_features) in enumerate(zip(sizes[:-1], sizes[1:])):
            layers.append(nn.Linear(in_features, out_features, bias=bias))
            if index != final_index:
                layers.append(act())
        self.model = nn.Sequential(*layers)

    def forward(self, x: T) -> T:
        """Apply the layer stack to x."""
        return self.model(x)


class ClipCaptionModel(nn.Module):
    """GPT-2 language model conditioned on a prefix projected from a CLIP embedding."""

    def __init__(self, prefix_length: int, prefix_size: int = 512):
        super().__init__()
        self.prefix_length = prefix_length
        self.gpt = GPT2LMHeadModel.from_pretrained('gpt2')
        self.gpt_embedding_size = self.gpt.transformer.wte.weight.shape[1]
        projected_size = self.gpt_embedding_size * prefix_length
        if prefix_length > 10:  # not enough memory
            # Long prefixes use a single linear projection.
            self.clip_project = nn.Linear(prefix_size, projected_size)
        else:
            # Short prefixes use an MLP with a hidden layer of half the output width.
            self.clip_project = MLP((prefix_size, projected_size // 2, projected_size))

    #@functools.lru_cache #FIXME
    def get_dummy_token(self, batch_size: int, device: D) -> T:
        """Return zero token ids of shape (batch_size, prefix_length) on device."""
        shape = (batch_size, self.prefix_length)
        return torch.zeros(*shape, dtype=torch.int64, device=device)

    def forward(self, tokens: T, prefix: T, mask: Optional[T] = None, labels: Optional[T] = None):
        """Run GPT-2 on the projected prefix followed by the token embeddings.

        When labels is given its content is not used: the labels passed to GPT-2
        are rebuilt as dummy prefix tokens followed by tokens.
        """
        token_embeddings = self.gpt.transformer.wte(tokens)
        prefix_embeddings = self.clip_project(prefix).view(
            -1, self.prefix_length, self.gpt_embedding_size)
        gpt_inputs = torch.cat((prefix_embeddings, token_embeddings), dim=1)
        if labels is not None:
            padding = self.get_dummy_token(tokens.shape[0], tokens.device)
            labels = torch.cat((padding, tokens), dim=1)
        return self.gpt(inputs_embeds=gpt_inputs, labels=labels, attention_mask=mask)


class ClipCaptionPrefix(ClipCaptionModel):
    """Variant that exposes only the projection's parameters and keeps GPT-2 in eval mode."""

    def parameters(self, recurse: bool = True):
        """Return the parameters of the CLIP projection only (recurse is not used)."""
        projection = self.clip_project
        return projection.parameters()

    def train(self, mode: bool = True):
        """Switch the training mode, then force the GPT-2 sub-module back to eval."""
        super().train(mode)
        self.gpt.eval()
        return self

In [None]:
#@title Caption prediction

def generate_beam(model, tokenizer, beam_size: int = 5, prompt=None, embed=None,
                  entry_length=67, temperature=1., stop_token: str = '.'):
    """Generate captions with beam search and return them ordered best first.

    Generation starts from embed when it is given, otherwise from the embedded
    prompt. At most entry_length tokens are produced per beam, and a beam is
    finished once it emits the first token id of the encoded stop_token. Returns
    a list of beam_size decoded strings sorted by descending length-normalised
    log-probability.
    """

    model.eval()
    stop_token_index = tokenizer.encode(stop_token)[0]
    tokens = None   # token ids generated so far, one row per beam
    scores = None   # cumulative log-probability of each beam
    device = next(model.parameters()).device
    seq_lengths = torch.ones(beam_size, device=device)
    is_stopped = torch.zeros(beam_size, device=device, dtype=torch.bool)
    with torch.no_grad():
        if embed is not None:
            generated = embed
        else:
            if tokens is None:
                tokens = torch.tensor(tokenizer.encode(prompt))
                tokens = tokens.unsqueeze(0).to(device)
                generated = model.gpt.transformer.wte(tokens)
        for i in range(entry_length):
            outputs = model.gpt(inputs_embeds=generated)
            logits = outputs.logits
            # Log-probabilities of the next token; a non-positive temperature is treated as 1.
            logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)
            logits = logits.softmax(-1).log()
            if scores is None:
                # First step: the beam_size best tokens start the beams, and the
                # input embeddings are replicated once per beam.
                scores, next_tokens = logits.topk(beam_size, -1)
                generated = generated.expand(beam_size, *generated.shape[1:])
                next_tokens, scores = next_tokens.permute(1, 0), scores.squeeze(0)
                if tokens is None:
                    tokens = next_tokens
                else:
                    tokens = tokens.expand(beam_size, *tokens.shape[1:])
                    tokens = torch.cat((tokens, next_tokens), dim=1)
            else:
                # A finished beam keeps exactly one candidate (token id 0) whose
                # score is unchanged, so it can survive without growing.
                logits[is_stopped] = -float(np.inf)
                logits[is_stopped, 0] = 0
                scores_sum = scores[:, None] + logits
                seq_lengths[~is_stopped] += 1
                # Candidates are ranked by their length-normalised score.
                scores_sum_average = scores_sum / seq_lengths[:, None]
                scores_sum_average, next_tokens = scores_sum_average.view(-1).topk(beam_size, -1)
                # Decode the flat index into (source beam, token id).
                next_tokens_source = next_tokens // scores_sum.shape[1]
                seq_lengths = seq_lengths[next_tokens_source]
                next_tokens = next_tokens % scores_sum.shape[1]
                next_tokens = next_tokens.unsqueeze(1)
                # Reorder the per-beam state to follow the surviving candidates.
                tokens = tokens[next_tokens_source]
                tokens = torch.cat((tokens, next_tokens), dim=1)
                generated = generated[next_tokens_source]
                scores = scores_sum_average * seq_lengths
                is_stopped = is_stopped[next_tokens_source]
            next_token_embed = model.gpt.transformer.wte(next_tokens.squeeze()).view(generated.shape[0], 1, -1)
            generated = torch.cat((generated, next_token_embed), dim=1)
            # On boolean tensors "+" acts as a logical OR: a beam stays stopped once stopped.
            is_stopped = is_stopped + next_tokens.eq(stop_token_index).squeeze()
            if is_stopped.all():
                break
    scores = scores / seq_lengths
    output_list = tokens.cpu().numpy()
    # Each beam is cut at its own recorded length before decoding.
    output_texts = [tokenizer.decode(output[:int(length)]) for output, length in zip(output_list, seq_lengths)]
    order = scores.argsort(descending=True)
    output_texts = [output_texts[i] for i in order]
    return output_texts


def generate2(
        model,
        tokenizer,
        tokens=None,
        prompt=None,
        embed=None,
        entry_count=1,
        entry_length=67,  # maximum number of generated tokens
        top_p=0.8,
        temperature=1.,
        stop_token: str = '.',
):
    """Generate a caption token by token and return it as a string.

    Generation starts from embed when it is given, otherwise from the embedded
    tokens (or the encoded prompt when tokens is None). Tokens outside the top_p
    nucleus are masked out and the most likely remaining token is then taken, so
    the result is deterministic. Generation stops after entry_length tokens or
    when the first token id of the encoded stop_token is produced.

    Only the first generated entry is returned, whatever entry_count is.
    NOTE(review): tokens is not reset between entries, so with entry_count > 1
    later entries would continue from the previous entry's tokens.
    """
    model.eval()
    generated_num = 0
    generated_list = []
    stop_token_index = tokenizer.encode(stop_token)[0]
    filter_value = -float("Inf")
    device = next(model.parameters()).device

    with torch.no_grad():

        for entry_idx in trange(entry_count):
            if embed is not None:
                generated = embed
            else:
                if tokens is None:
                    tokens = torch.tensor(tokenizer.encode(prompt))
                    tokens = tokens.unsqueeze(0).to(device)

                generated = model.gpt.transformer.wte(tokens)

            for i in range(entry_length):

                outputs = model.gpt(inputs_embeds=generated)
                logits = outputs.logits
                # Logits of the next token; a non-positive temperature is treated as 1.
                logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)
                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(nnf.softmax(sorted_logits, dim=-1), dim=-1)
                sorted_indices_to_remove = cumulative_probs > top_p
                # Shift the mask right by one so the token that crosses top_p is kept,
                # and always keep the most likely token.
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
                                                    ..., :-1
                                                    ].clone()
                sorted_indices_to_remove[..., 0] = 0

                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                logits[:, indices_to_remove] = filter_value
                # Greedy choice among the tokens that were not masked out.
                next_token = torch.argmax(logits, -1).unsqueeze(0)
                next_token_embed = model.gpt.transformer.wte(next_token)
                if tokens is None:
                    tokens = next_token
                else:
                    tokens = torch.cat((tokens, next_token), dim=1)
                generated = torch.cat((generated, next_token_embed), dim=1)
                # .item() requires a single element, i.e. a batch of one.
                if stop_token_index == next_token.item():
                    break

            output_list = list(tokens.squeeze().cpu().numpy())
            output_text = tokenizer.decode(output_list)
            generated_list.append(output_text)

    return generated_list[0]

In [None]:
#@title Choose pretrained model - COCO or Conceptual captions


pretrained_model = 'Conceptual captions'  # @param ['COCO', 'Conceptual captions']

# Download the weights of the selected model to model_path; any value other than
# 'Conceptual captions' selects the second file id.
if pretrained_model == 'Conceptual captions':
  downloader.download_file("14pXWwB4Zm82rsDdvbGguLfx9F8aM7ovT", model_path)
else:
  downloader.download_file("1IdaBtMSvtyzF0ByVaBHtvM0JYSXRExRX", model_path)

In [None]:
#@title GPU/CPU


# True requests a CUDA device (CUDA(0) falls back to the CPU when none is available).
is_gpu = True #@param {type:"boolean"}  


In [None]:
#@title CLIP model + GPT2 tokenizer

# clip.load returns the CLIP model together with its image preprocessing transform.
device = CUDA(0) if is_gpu else "cpu"
clip_model, preprocess = clip.load("ViT-B/32", device=device, jit=False)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [None]:
#@title Load model weights


# Number of prefix embeddings fed to GPT-2; also used by the inference cell.
prefix_length = 10

# Rebinds `model` once more: from here on it is the caption model.
model = ClipCaptionModel(prefix_length)

# The weights are first loaded on the CPU and moved to the target device afterwards.
model.load_state_dict(torch.load(model_path, map_location=CPU)) 

model = model.eval() 
device = CUDA(0) if is_gpu else "cpu"
model = model.to(device)


In [None]:
# File name of the first exported key frame.
'frame'+ images[0] +'.png'

'frame157.png'

In [None]:
'''#@title Or download random samples form COCO test set (Karpathy et al. split)

IMAGE_NAME = '354533'  # @param ['562207', '579664', '060623', '165547', '334321', '483108', '386164', '354533']

name_ = "COCO_val2014_000000" + IMAGE_NAME + ".jpg"
images_path = os.path.join(os.path.dirname(current_directory), "images")
os.makedirs(images_path, exist_ok=True)
UPLOADED_FILE = os.path.join(images_path, name_)

if not os.path.isfile(UPLOADED_FILE):
  download_path = os.path.join(images_path, "images.zip")
  downloader.download_file("1BwJeBME-dpwcCT8IXYeWz7uaPkbexjNB", download_path)

  !unzip {download_path} -d {images_path}

'''

'#@title Or download random samples form COCO test set (Karpathy et al. split)\n\nIMAGE_NAME = \'354533\'  # @param [\'562207\', \'579664\', \'060623\', \'165547\', \'334321\', \'483108\', \'386164\', \'354533\']\n\nname_ = "COCO_val2014_000000" + IMAGE_NAME + ".jpg"\nimages_path = os.path.join(os.path.dirname(current_directory), "images")\nos.makedirs(images_path, exist_ok=True)\nUPLOADED_FILE = os.path.join(images_path, name_)\n\nif not os.path.isfile(UPLOADED_FILE):\n  download_path = os.path.join(images_path, "images.zip")\n  downloader.download_file("1BwJeBME-dpwcCT8IXYeWz7uaPkbexjNB", download_path)\n\n  !unzip {download_path} -d {images_path}\n\n'

Conceptual captions examples:
https://drive.google.com/file/d/1mzH3b0LQrGEWjEva4hI6HE_fIYRIgtBT/view?usp=sharing

In [None]:
#@title Inference
def creating_sentence(UPLOADED_FILE):
  """Return a caption for the image stored at UPLOADED_FILE.

  The image is encoded with CLIP, projected to a GPT-2 prefix and decoded with
  generate2 (or with generate_beam when use_beam_search is enabled, in which
  case the best beam is returned).
  """
  use_beam_search = False #@param {type:"boolean"}  

  image = io.imread(UPLOADED_FILE)
  pil_image = PIL.Image.fromarray(image)

  # Preprocess into a batch of one image on the target device.
  image = preprocess(pil_image).unsqueeze(0).to(device)
  with torch.no_grad():
      prefix = clip_model.encode_image(image).to(device, dtype=torch.float32)
      prefix_embed = model.clip_project(prefix).reshape(1, prefix_length, -1)

  if use_beam_search:
      return generate_beam(model, tokenizer, embed=prefix_embed)[0]
  return generate2(model, tokenizer, embed=prefix_embed)

In [None]:
# Caption every exported key frame; the caption is appended to the frame's entry in
# diccionario and collected in videoSentences.
print(diccionario)
videoSentences = []
for i in images:
  file_image = 'frame' + i + '.png'
  print(file_image)
  # Run the caption model once per frame and reuse the result; the previous version
  # ran the full inference twice for every frame.
  sentence = creating_sentence(file_image)
  diccionario['frame' + i].append(sentence)
  videoSentences.append(sentence)
print(diccionario)

{'frame157': ['00:00:05:230000'], 'frame362': ['00:00:12:070000'], 'frame516': ['00:00:17:200000'], 'frame569': ['00:00:18:970000']}
frame157.png


100%|██████████| 1/1 [00:00<00:00,  2.27it/s]
100%|██████████| 1/1 [00:00<00:00,  3.10it/s]


frame362.png


100%|██████████| 1/1 [00:00<00:00,  5.22it/s]
100%|██████████| 1/1 [00:00<00:00,  5.71it/s]


frame516.png


100%|██████████| 1/1 [00:00<00:00,  1.78it/s]
100%|██████████| 1/1 [00:00<00:00,  1.64it/s]


frame569.png


100%|██████████| 1/1 [00:00<00:00,  3.78it/s]
100%|██████████| 1/1 [00:00<00:00,  3.33it/s]

{'frame157': ['00:00:05:230000', 'person, a teacher, gives a lesson to his class.'], 'frame362': ['00:00:12:070000', 'person, a teacher, teaches students.'], 'frame516': ['00:00:17:200000', 'person, left, and person, right, are among the students who were suspended from the school.'], 'frame569': ['00:00:18:970000', 'person talks to a class of high school students.']}





In [None]:
def _to_srt_time(stamp):
    """Convert an 'HH:MM:SS:ffffff' timestamp to the SubRip form 'HH:MM:SS,mmm'."""
    clock, micros = stamp.rsplit(':', 1)
    return f"{clock},{micros[:3]}"


# Build one subtitle cue per key frame: [number, "start --> end", caption, ""].
# Each cue starts where the previous one ended; the first starts at 0.
a = []
start = "00:00:00,000"
for number, entry in enumerate(diccionario.values(), start=1):  # SRT cues are numbered from 1
    end = _to_srt_time(entry[0])
    # The caption is the entry appended last. Index 1 would hold the list of crop
    # file names whenever the detection cell added one before captioning.
    caption = entry[-1]
    a.append([number, f"{start} --> {end}", f"{caption}", ""])
    start = end
print(a)


[[0, '00:00:00.00 --> 00:00:05:230000', 'person, a teacher, gives a lesson to his class.', ''], [1, '00:00:05:230000 --> 00:00:12:070000', 'person, a teacher, teaches students.', ''], [2, '00:00:12:070000 --> 00:00:17:200000', 'person, left, and person, right, are among the students who were suspended from the school.', ''], [3, '00:00:17:200000 --> 00:00:18:970000', 'person talks to a class of high school students.', '']]


In [None]:

# Write every field of every cue on its own line.
with open('test.srt', 'w') as filehandle:
    filehandle.writelines(f'{field}\n' for cue in a for field in cue)


# Summarization

In [None]:
# Hand-written sample sentences for trying out the summarizer.
# Every sentence is its own list element: without the separating commas Python
# silently concatenates adjacent string literals into a single element.
test = [
    'person, a teacher, gives a lesson to his class.',
    'There are 2 people sleeping ',
    'There are 3 people raising hand ',
    'person, a teacher, teaches students.',
    'There are 1 students sleeping in the classroom',
    'There are 2 students raising hand in the classroom',
    'person, left, and person, right, are among the students who were suspended from the school.',
    'person talks to a class of high school students.',
    'There are 2 people sleeping ',
    'There are 3 people raising hand ',
    ]

In [None]:
# Join the generated captions into a single text for the summarizer.
# NOTE(review): the output printed below matches the hand-written `test` list
# rather than videoSentences - confirm which input is intended.
text = ' '.join(videoSentences)

In [None]:
# Show the text that will be summarized.
print(text)

person, a teacher, gives a lesson to his class. There are 2 people sleeping There are 3 people raising hand person, a teacher, teaches students. There are 1 students sleeping in the classroomThere are 2 students raising hand in the classroomperson, left, and person, right, are among the students who were suspended from the school. person talks to a class of high school students.There are 2 people sleeping There are 3 people raising hand 


In [None]:
# Extractive summary limited to roughly 30 words.
# NOTE(review): the gensim.summarization module is, to my knowledge, no longer
# shipped with gensim 4.x, so this cell likely needs gensim < 4 - confirm.
from gensim.summarization.summarizer import summarize
print(summarize(text, word_count=30))



There are 2 people sleeping There are 3 people raising hand person, a teacher, teaches students.
person talks to a class of high school students.There are 2 people sleeping There are 3 people raising hand 
