In [1]:
# import torch and other libraries
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from torch.autograd import Variable
from PIL import Image
from IPython.display import display # to display images

print(torch.__version__)

0.3.1.post2


In [2]:
# other imports
import time
import numpy as np
import pickle

In [3]:
# Load the pretrained model
resnet_model = models.resnet18(pretrained=True)
alexnet_model = models.alexnet(pretrained=True)
# Use the model object to select the desired layer
resnet_layer = resnet_model._modules.get('avgpool')
alexnet_layer = alexnet_model._modules.get('classifier')

In [4]:
resnet_model.eval()

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True)
  (relu): ReLU(inplace)
  (maxpool): MaxPool2d(kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), dilation=(1, 1), ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True)
      (relu): ReLU(inplace)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True)
      (relu): ReLU(inplace)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (b

In [30]:
# Set model to evaluation mode
resnet_model.eval()
alexnet_model.eval()

AlexNet(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU(inplace)
    (2): MaxPool2d(kernel_size=(3, 3), stride=(2, 2), dilation=(1, 1), ceil_mode=False)
    (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU(inplace)
    (5): MaxPool2d(kernel_size=(3, 3), stride=(2, 2), dilation=(1, 1), ceil_mode=False)
    (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace)
    (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace)
    (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace)
    (12): MaxPool2d(kernel_size=(3, 3), stride=(2, 2), dilation=(1, 1), ceil_mode=False)
  )
  (classifier): Sequential(
    (0): Dropout(p=0.5)
    (1): Linear(in_features=9216, out_features=4096, bias=True)
    (2): ReLU(inplace)
    (3): Dropout(p=0.5)
    (4): Linear(in_features=4096,

In [31]:
# image scaler to 224 x 224 pixels
scaler = transforms.Scale((224, 224))
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
to_tensor = transforms.ToTensor()

In [32]:
def get_vector(image_name, model):
    # 1. Load the image with Pillow library
    img = Image.open(image_name)
    # 2. Create a PyTorch Variable with the transformed image
    t_img = Variable(normalize(to_tensor(scaler(img))).unsqueeze(0))
    if model == 'resnet':
        # 3. Create a vector of zeros that will hold our feature vector
        #    The 'avgpool' layer has an output size of 512
        my_embedding = torch.zeros(512)
        # 4. Define a function that will copy the output of a layer
        def copy_data(m, i, o):
            my_embedding.copy_(o.data)
        # 5. Attach that function to our selected layer
        h = resnet_layer.register_forward_hook(copy_data)
        # 6. Run the model on our transformed image
        resnet_model(t_img)
        h.remove()
    elif model == 'alexnet':
        print('using alexnet...')
        # 3. Create a vector of zeros that will hold our feature vector
        #    The 'classifier' layer has an output size of 1000
        my_embedding = torch.zeros(1000)
        def copy_data(m, i, o):
            my_embedding.copy_(o.data)
        # 5. Attach that function to our selected layer
        h = alexnet_layer.register_forward_hook(copy_data)
        # 6. Run the model on our transformed image
        alexnet_model(t_img)
        h.remove()
    # 8. Return the feature vector
    return my_embedding

In [33]:
DATA_PATH = 'data/images/'
img1 = DATA_PATH + 'golden1.jpg'
# img2 = DATA_PATH + 'golden2.jpg'
img2 = DATA_PATH + 'cat1.jpg'

In [36]:
# get feature vectors from resnet18
n_trials = 10
img1_resnet_times = []
img2_resnet_times = []

# get feature vectors n_trial times each and average
for i in range(n_trials):
    # image 1
    t_start_img1 = time.time()
    img1_vector = get_vector(img1, 'resnet')
    img1_resnet_times.append(time.time() - t_start_img1)
    # image 2
    t_start_img2 = time.time()
    img2_vector = get_vector(img2, 'resnet')
    img2_resnet_times.append(time.time() - t_start_img2)

# output results
t_mean_img1_resnet = round(np.mean(img1_resnet_times)/n_trials, 3)
t_mean_img2_resnet = round(np.mean(img2_resnet_times)/n_trials, 3)
print('Number of Trials: {}'.format(n_trials))
print('Image 1 Feature Vector Genration Time: {}s'.format(t_mean_img1_resnet))
print('Image 2 Feature Vector Genration Time: {}s'.format(t_mean_img2_resnet))

n_frames = 10
n_corpus = 3e6
t_corpus = round(t_mean_img1_resnet*n_frames*n_corpus/3600, 3)
print('\nCorpus Size={}, Frames per Video={}'.format(int(n_corpus), n_frames))
print('Corpus Processing Time: {} hrs, or {} days'.format(t_corpus, round(t_corpus/24, 3)))

  if sys.path[0] == '':


Number of Trials: 10
Image 1 Feature Vector Genration Time: 0.012s
Image 2 Feature Vector Genration Time: 0.011s

Corpus Size=3000000, Frames per Video=10
Corpus Processing Time: 100.0 hrs, or 4.167 days


In [37]:
# Using PyTorch Cosine Similarity
cos = nn.CosineSimilarity(dim=1, eps=1e-6)
cos_sim = cos(img1_vector.unsqueeze(0),
              img2_vector.unsqueeze(0))
print('\nCosine similarity: {0}\n'.format(cos_sim))


Cosine similarity: 
 0.5390
[torch.FloatTensor of size 1]




In [42]:
features_test = [np.array(img1_vector), np.array(img2_vector)]
features_test = np.array(features_test)

In [44]:
features_test

array([[2.264336  , 1.0769418 , 0.7329236 , ..., 0.42827195, 3.0381756 ,
        0.05954078],
       [0.10493794, 0.26243308, 0.761945  , ..., 1.0433315 , 0.18960439,
        0.7961156 ]], dtype=float32)

Resnet 18 experiments show that feature vector image generation takes aobut 1/100th of a second. This means over 45 days to process 3 million videos with 100 sampled images each. 

In [11]:
# get feature vectors from alexnet
# img1_vector_a = get_vector(img1, 'alexnet')
# img2_vector_a = get_vector(img2, 'alexnet')

In [12]:
img1_vec = np.array(img1_vector)
img2_vec = np.array(img2_vector)
vec = np.array([img1_vec, img2_vec])
print(vec.shape)

(2, 512)


## Frames extraction from videos

In [48]:
import os
import argparse
import FFMPEGFrames
import shutil
import importlib
from ffprobe3 import FFProbe
importlib.reload(FFMPEGFrames)

<module 'FFMPEGFrames' from '/Users/ryanbae/Dropbox/uw_data_science/fall_2018/data590_capstone1/VideoSimilarity/FFMPEGFrames.py'>

In [49]:
videos_path = 'data/videos'
delete = True
features = {}

# loop through all the videos
i = 0
for path, subdirs, files in os.walk(videos_path):
    for name in files:
        if i < 2:
            # extract frames from videos
            video_length = int(float(FFProbe(os.path.join(path, name)).video[0].duration)) + 1
            n_frames = 10
            fps = n_frames/video_length
            video_path = os.path.join(path, name)
            print(os.path.join(path, name) + ' ' + str(video_length) + 's ' + '(' + str(fps) + ')')
            f = FFMPEGFrames.FFMPEGFrames("data/video_frames/")
            f.extract_frames(os.path.join(path, name), fps)
            i += 1
            # get feature vectors for each frame image
            frames_path = f.full_output
            frames = os.listdir(frames_path)
            model = 'resnet'
            features[video_path] = [get_vector(os.path.join(frames_path, frame), model) for frame in frames]
            # delete rest of the files after extracting image features
            if delete:
                shutil.rmtree(f.full_output)

data/videos/work/7.mp4 13s (0.7692307692307693)


NameError: name 'get_vector' is not defined

In [76]:
test_dict = {'data/videos/0.mp4':np.array([1, 9, 1])}
test_dict

{'data/videos/0.mp4': array([1, 9, 1])}

In [80]:
video_path = 'data/videos/0.mp4'
data = np.array([1, 1, 1])

In [81]:
f = h5py.File("test.hdf5", "w")
f.create_dataset(video_path, data)

<HDF5 dataset "0.mp4": shape (1, 1, 1), type "<f4">

In [82]:
list(f.keys())

['data']

## Check the extracted features

In [109]:
import h5py

In [110]:
f1 = h5py.File('features.hdf5', 'r+')

In [111]:
# List all groups
print("Keys: %s" % f1.keys())
a_group_key = list(f1.keys())[0]

# # Get the data
data = list(f1[a_group_key])
data

Keys: KeysView(<HDF5 file "features.hdf5" (mode r+)>)


['videos']

In [112]:
data_names = f1.keys()
data_names

KeysView(<HDF5 file "features.hdf5" (mode r+)>)

In [113]:
f1['data/videos/work/0.mp4/output000003.png'][:]

array([1.32818490e-01, 2.65119839e+00, 2.25517464e+00, 5.31009197e-01,
       2.40465418e-01, 7.56578565e-01, 2.76097804e-01, 8.12232673e-01,
       1.53614259e+00, 4.39417481e-01, 1.83384049e+00, 1.24493694e+00,
       8.32031786e-01, 8.92312974e-02, 8.66587996e-01, 1.57974100e+00,
       4.57243264e-01, 1.04864717e+00, 1.51076984e+00, 4.40074623e-01,
       4.61099893e-02, 7.44705975e-01, 5.53882003e-01, 8.90647709e-01,
       1.29319811e+00, 1.43117678e+00, 3.04760993e-01, 4.97556180e-01,
       5.35472557e-02, 9.15293813e-01, 6.32018328e-01, 9.95632410e-01,
       2.99119726e-02, 5.26447408e-02, 2.33933151e-01, 7.16903269e-01,
       2.32425952e+00, 1.82374448e-01, 2.23342061e+00, 1.13279068e+00,
       7.23410726e-01, 1.07103217e+00, 7.12132573e-01, 1.27874923e+00,
       3.76490474e+00, 2.29532570e-01, 1.01108634e+00, 4.89228576e-01,
       2.04637671e+00, 3.19082618e-01, 1.20602441e+00, 6.69650495e-01,
       6.35978222e-01, 1.92972556e-01, 1.11094221e-01, 1.06720829e+00,
      