In [6]:
import cv2
import numpy as np
import librosa


def video_prepare(video_path, width, height):
    vidcap = cv2.VideoCapture(video_path)
    success, image = vidcap.read()
    image = cv2.resize(image,(width, height),interpolation=cv2.INTER_CUBIC)
    fps = int(vidcap.get(cv2.CAP_PROP_FPS))
    res = []
    count = 0
    while success:
        if count % fps == 0:
            image = cv2.resize(image,(width, height),interpolation=cv2.INTER_CUBIC)
            res.append(image)
        print('Process %dth seconds: ' % int(count / fps), success)
        success, image = vidcap.read()
        count += 1
    
    for i in range(60 - len(res)):
        res.append(np.zeros(shape=(width, height, 3)))

    return res


def bgm_prepare(bgm_path, config):
    data, _ = librosa.core.load(bgm_path, sr=config.sampling_rate, res_type="kaiser_fast")
    res = []
    max_len = len(data)
    for idx in range(60):
        if idx * config.sampling_rate <= max_len:
            pre_idx = idx * config.sampling_rate - config.sampling_rate / 2
            post_idx = idx * config.sampling_rate + config.sampling_rate / 2
            pre_idx = pre_idx if pre_idx >= 0 else 0
            post_idx = post_idx if post_idx < max_len else max_len - 1
            idx_part = data[pre_idx : post_idx + 1]
            idx_part = np.pad(idx_part,
                              (-pre_idx if pre_idx < 0 else 0,
                               post_idx - max_len + 1 if (post_idx - max_len + 1) > 0 else 0),
                              "constant", constant_values=(0, 0))
            idx_melspec = librosa.feature.melspectrogram(idx_part, sr=config.sampling_rate, n_mels=200)
            idx_logspec = librosa.core.power_to_db(idx_melspec)
            idx_logspec = np.expand_dims(idx_logspec, axis=-1)
            res.append(idx_logspec)

        else:
            res.append(np.zeros(shape=(config.dim[0], config.dim[1], 1)))

    return res

In [34]:
class Config(object):
    def __init__(self,
                 sampling_rate=44100, audio_duration=1, n_classes=41,
                 use_mfcc=False, n_folds=10, learning_rate=0.0001,
                 max_epochs=50, n_mfcc=20):
        self.sampling_rate = sampling_rate
        self.audio_duration = audio_duration
        self.n_classes = n_classes
        self.use_mfcc = use_mfcc
        self.n_mfcc = n_mfcc
        self.n_folds = n_folds
        self.learning_rate = learning_rate
        self.max_epochs = max_epochs

        self.audio_length = self.sampling_rate * self.audio_duration
        if self.use_mfcc:
            self.dim = (self.n_mfcc, 1 + int(np.floor(self.audio_length/512)), 1)
        else:
            self.dim = (self.audio_length, 1)

In [2]:
bgm_file = "data/1574688556018260600.mp4"
video_file = "data/1586567597094659207.mp4"

In [7]:
x_video = video_prepare(video_file, 256, 256)

Process 0th seconds:  True
Process 0th seconds:  True
Process 0th seconds:  True
Process 0th seconds:  True
Process 0th seconds:  True
Process 0th seconds:  True
Process 0th seconds:  True
Process 0th seconds:  True
Process 0th seconds:  True
Process 0th seconds:  True
Process 0th seconds:  True
Process 0th seconds:  True
Process 0th seconds:  True
Process 0th seconds:  True
Process 0th seconds:  True
Process 0th seconds:  True
Process 0th seconds:  True
Process 0th seconds:  True
Process 0th seconds:  True
Process 0th seconds:  True
Process 0th seconds:  True
Process 0th seconds:  True
Process 0th seconds:  True
Process 0th seconds:  True
Process 0th seconds:  True
Process 0th seconds:  True
Process 0th seconds:  True
Process 0th seconds:  True
Process 0th seconds:  True
Process 0th seconds:  True
Process 1th seconds:  True
Process 1th seconds:  True
Process 1th seconds:  True
Process 1th seconds:  True
Process 1th seconds:  True
Process 1th seconds:  True
Process 1th seconds:  True
P

Process 10th seconds:  True
Process 10th seconds:  True
Process 10th seconds:  True
Process 10th seconds:  True
Process 10th seconds:  True
Process 10th seconds:  True
Process 10th seconds:  True
Process 10th seconds:  True
Process 10th seconds:  True
Process 10th seconds:  True
Process 10th seconds:  True
Process 10th seconds:  True
Process 10th seconds:  True
Process 10th seconds:  True
Process 10th seconds:  True
Process 10th seconds:  True
Process 10th seconds:  True
Process 10th seconds:  True
Process 10th seconds:  True
Process 10th seconds:  True
Process 10th seconds:  True
Process 10th seconds:  True
Process 10th seconds:  True
Process 10th seconds:  True
Process 10th seconds:  True
Process 10th seconds:  True
Process 11th seconds:  True
Process 11th seconds:  True
Process 11th seconds:  True
Process 11th seconds:  True
Process 11th seconds:  True
Process 11th seconds:  True
Process 11th seconds:  True
Process 11th seconds:  True
Process 11th seconds:  True
Process 11th seconds

Process 21th seconds:  True
Process 21th seconds:  True
Process 21th seconds:  True
Process 21th seconds:  True
Process 21th seconds:  True
Process 21th seconds:  True
Process 21th seconds:  True
Process 21th seconds:  True
Process 21th seconds:  True
Process 21th seconds:  True
Process 21th seconds:  True
Process 21th seconds:  True
Process 21th seconds:  True
Process 21th seconds:  True
Process 21th seconds:  True
Process 22th seconds:  True
Process 22th seconds:  True
Process 22th seconds:  True
Process 22th seconds:  True
Process 22th seconds:  True
Process 22th seconds:  True
Process 22th seconds:  True
Process 22th seconds:  True
Process 22th seconds:  True
Process 22th seconds:  True
Process 22th seconds:  True
Process 22th seconds:  True
Process 22th seconds:  True
Process 22th seconds:  True
Process 22th seconds:  True
Process 22th seconds:  True
Process 22th seconds:  True
Process 22th seconds:  True
Process 22th seconds:  True
Process 22th seconds:  True
Process 22th seconds

Process 34th seconds:  True
Process 34th seconds:  True
Process 34th seconds:  True
Process 34th seconds:  True
Process 34th seconds:  True
Process 34th seconds:  True
Process 34th seconds:  True
Process 34th seconds:  True
Process 34th seconds:  True
Process 34th seconds:  True
Process 34th seconds:  True
Process 34th seconds:  True
Process 34th seconds:  True
Process 34th seconds:  True
Process 34th seconds:  True
Process 34th seconds:  True
Process 34th seconds:  True
Process 35th seconds:  True
Process 35th seconds:  True
Process 35th seconds:  True
Process 35th seconds:  True
Process 35th seconds:  True
Process 35th seconds:  True
Process 35th seconds:  True
Process 35th seconds:  True
Process 35th seconds:  True
Process 35th seconds:  True
Process 35th seconds:  True
Process 35th seconds:  True
Process 35th seconds:  True
Process 35th seconds:  True
Process 35th seconds:  True
Process 35th seconds:  True
Process 35th seconds:  True
Process 35th seconds:  True
Process 35th seconds

Process 46th seconds:  True
Process 46th seconds:  True
Process 46th seconds:  True
Process 46th seconds:  True
Process 46th seconds:  True
Process 46th seconds:  True
Process 46th seconds:  True
Process 46th seconds:  True
Process 46th seconds:  True
Process 46th seconds:  True
Process 46th seconds:  True
Process 46th seconds:  True
Process 46th seconds:  True
Process 46th seconds:  True
Process 46th seconds:  True
Process 46th seconds:  True
Process 46th seconds:  True
Process 46th seconds:  True
Process 46th seconds:  True
Process 46th seconds:  True
Process 46th seconds:  True
Process 46th seconds:  True
Process 47th seconds:  True
Process 47th seconds:  True
Process 47th seconds:  True
Process 47th seconds:  True
Process 47th seconds:  True
Process 47th seconds:  True
Process 47th seconds:  True
Process 47th seconds:  True
Process 47th seconds:  True
Process 47th seconds:  True
Process 47th seconds:  True
Process 47th seconds:  True
Process 47th seconds:  True
Process 47th seconds

In [13]:
config = Config(sampling_rate=44100, audio_duration=1, n_folds=10,
                    learning_rate=0.001, use_mfcc=True, n_mfcc=128)
x_bgm = bgm_prepare(bgm_file, config)



TypeError: slice indices must be integers or None or have an __index__ method

In [18]:
def bgm_prepare(bgm_path, config):
    data, _ = librosa.core.load(bgm_path, sr=config.sampling_rate, res_type="kaiser_fast")
    res = []
    max_len = len(data)
    for idx in range(60):
        if idx * config.sampling_rate <= max_len:
            pre_idx = idx * config.sampling_rate - config.sampling_rate / 2
            post_idx = idx * config.sampling_rate + config.sampling_rate / 2
            pre_idx = pre_idx if pre_idx >= 0 else 0
            post_idx = post_idx if post_idx < max_len else max_len - 1
            idx_part = data[pre_idx : post_idx + 1]
            idx_part = np.pad(idx_part,
                              (-pre_idx if pre_idx < 0 else 0,
                               post_idx - max_len + 1 if (post_idx - max_len + 1) > 0 else 0),
                              "constant", constant_values=(0, 0))
            idx_melspec = librosa.feature.melspectrogram(idx_part, sr=config.sampling_rate, n_mels=200)
            idx_logspec = librosa.core.power_to_db(idx_melspec)
            idx_logspec = np.expand_dims(idx_logspec, axis=-1)
            res.append(idx_logspec)

        else:
            res.append(np.zeros(shape=(config.dim[0], config.dim[1], 1)))

    return res

(256, 256, 3)

In [37]:
config = Config(sampling_rate=44100, audio_duration=1, n_folds=10,
                    learning_rate=0.001, use_mfcc=True, n_mfcc=200)
data, _ = librosa.core.load(bgm_file, sr=config.sampling_rate, res_type="kaiser_fast")
res = []
max_len = len(data)
for idx in range(60):
    if idx * config.sampling_rate <= max_len:
        pre_idx_ = int(idx * config.sampling_rate - config.sampling_rate / 2)
        post_idx_ = int(idx * config.sampling_rate + config.sampling_rate / 2)
        pre_idx = int(pre_idx_) if pre_idx_ >= 0 else 0
        post_idx = int(post_idx_) if post_idx_ < max_len else max_len - 1
#         print(pre_idx_, post_idx_)
        idx_part = data[pre_idx : post_idx + 1]
        idx_part = np.pad(idx_part,
                          (-pre_idx_ if pre_idx_ < 0 else 0,
                           post_idx_ - max_len + 1 if (post_idx_ - max_len + 1) > 0 else 0),
                          "constant", constant_values=(0, 0))
#         print(np.array(idx_part).shape)
        idx_melspec = librosa.feature.melspectrogram(idx_part, sr=config.sampling_rate, n_mels=200)
        idx_logspec = librosa.core.power_to_db(idx_melspec)
        idx_logspec = np.expand_dims(idx_logspec, axis=-1)
#         print(idx_logspec.shape)
        res.append(idx_logspec)

    else:
#         print(config.dim[0], config.dim[1])
        res.append(np.zeros(shape=(config.dim[0], config.dim[1], 1)))



In [38]:
np.array(res).shape

(60, 200, 87, 1)

In [39]:
import torch
import torch.nn as nn
from torchvision import models


def set_parameter_requires_grad(model, requires_grad):
    if requires_grad:
        for param in model.parameters():
            param.requires_grad = False


class ResnetExtractor(nn.Module):
    def __init__(self, out_dim, use_pretrained=True,  requires_grad=False):
        super(ResnetExtractor, self).__init__()
        self.model_resnet50 = models.resnet50(pretrained=use_pretrained)
        set_parameter_requires_grad(self.model_resnet50, requires_grad)
        self.num_ftrs = self.model_resnet50.fc.in_features
        self.model_fc_video = nn.Sequential(
            nn.Linear(self.num_ftrs, 256),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(256, out_dim),
            nn.ReLU()
        )
        self.model_fc_bgm = nn.Sequential(
            nn.Linear(self.num_ftrs, 256),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(256, out_dim),
            nn.ReLU()
        )

    def forward(self, input_video, input_bgm):
        x_video = self.model_resnet50(input_video)
        x_video = self.model_fc_video(x_video)

        x_bgm = self.model_resnet50(input_bgm)
        x_bgm = self.model_fc_bgm(x_bgm)

        return x_video, x_bgm


class VMBModel(nn.Module):
    def __init__(self, pos_nums, fea_dim, out_dim, res_use_pretrained=True,  res_requires_grad=False):
        super(VMBModel, self).__init__()
        self.pos_nums = pos_nums
        self.fea_dim = fea_dim
        self.out_dim = out_dim

        """
            init feature extractor layer
        """
        self.model_resnet50 = ResnetExtractor(self.fea_dim, res_use_pretrained, res_requires_grad)

        """
            init beat merge layer
        """
        self.video_att_layer = nn.Linear(self.fea_dim * self.pos_nums, self.pos_nums)
        self.video_att_activation = nn.Softmax(dim=1)
        self.video_att_bn1d = nn.BatchNorm1d(self.pos_nums)
        self.bgm_att_layer = nn.Linear(self.fea_dim * self.pos_nums, self.pos_nums)
        self.bgm_att_activation = nn.Softmax(dim=1)
        self.bgm_att_bn1d = nn.BatchNorm1d(self.pos_nums)


        """
            init beat encoding layer
        """
        self.video_fc = nn.Linear(self.pos_nums, self.pos_nums)
        self.video_fc_activation = nn.ReLU()
        self.video_bn1d = nn.BatchNorm1d(self.pos_nums)
        self.bgm_fc = nn.Linear(self.pos_nums, self.pos_nums)
        self.bgm_fc_activation = nn.ReLU()
        self.bgm_bn1d = nn.BatchNorm1d(self.pos_nums)

        """
            init tower layer
        """
        self.video_tower_fc = nn.Linear(self.fea_dim, self.out_dim)
        self.bgm_tower_fc = nn.Linear(self.fea_dim, self.out_dim)

    def forward(self, input_video, input_bgm):
        video_fea = []
        bgm_fea = []

        """
           feature extractor part with resnet
        """
        for tmp_v, tmp_b in zip(input_video, input_bgm):
            tmp_v_out, tmp_b_out = self.model_resnet50(tmp_v, tmp_b)
            video_fea.append(tmp_v_out)
            bgm_fea.append(tmp_b_out)

        """
           beat merge part
        """
        video_fea_t = torch.tensor(video_fea, dtype=torch.float32).view(-1)
        bgm_fea_t = torch.tensor(bgm_fea, dtype=torch.float32).view(-1)

        video_att = self.video_att_activation(self.video_att_layer(video_fea_t))
        bgm_att = self.bgm_att_activation(self.bgm_att_layer(bgm_fea_t))

        video_fea_merge = 0
        bgm_fea_merge = 0
        for ind in range(self.pos_nums):
            video_fea_merge += video_fea[ind] * video_att[:, ind].view((video_att.size()[0], 1))
            bgm_fea_merge += bgm_fea[ind] * bgm_att[:, ind].view((bgm_att.size()[0], 1))

        video_fea_merge = self.video_att_bn1d(video_fea_merge)
        bgm_fea_merge = self.bgm_att_bn1d(bgm_fea_merge)

        """
           beat encoding part
        """
        video_att_encoding = self.video_bn1d(self.video_fc_activation(self.video_fc(video_att)))
        bgm_att_encoding = self.bgm_bn1d(self.bgm_fc_activation(self.bgm_fc(bgm_att)))

        """
           tower part
        """
        video_tower_out = self.video_tower_fc(video_fea_merge)
        bgm_tower_out = self.bgm_tower_fc(bgm_fea_merge)

        return video_att_encoding, bgm_att_encoding, video_tower_out, bgm_tower_out

ImportError: dlopen(/Users/zhaoheng/dataology/anaconda/anaconda3/envs/python36/lib/python3.6/site-packages/torchvision/_C.cpython-36m-darwin.so, 2): Symbol not found: _THPVariable_Wrap
  Referenced from: /Users/zhaoheng/dataology/anaconda/anaconda3/envs/python36/lib/python3.6/site-packages/torchvision/_C.cpython-36m-darwin.so
  Expected in: flat namespace
 in /Users/zhaoheng/dataology/anaconda/anaconda3/envs/python36/lib/python3.6/site-packages/torchvision/_C.cpython-36m-darwin.so