## 整体思路
赛题要求利用BROAD视频精彩片段数据集，预测视频中精彩片段的位置。
整体解决方案有以下三步：
1. 多尺度滑窗的视频分类模型
2. 利用非极大值抑制对多尺度滑窗预测结果进行合并
3. 利用小尺度滑窗分类模型进行结果边界修正

### 多尺度滑窗的视频分类模型
#### 分类模型
复赛中提供了视频的音频和图像特征。本方案利用音频和图像特征对小段视频进行精彩/不精彩的二分类。
**视频特征:** 将音频特征与图像特征拼接成4096维向量，作为视频每帧的输入特征。
**分类网络:** 视频数据本身属于时序数据。经过选择，本方案选择5层栈式双向LSTM网络作为分类网络。
网络结构如图：
![image description](https://cdn.kesci.com/images/lab_upload/1521105403986_32600.png)
#### 滑窗尺寸
本方案选择以下几个尺寸的滑窗，分别训练二分类模型（与下方训练代码中的 window_lens 一致）：
80,100,120,130,150,160,180,210,240,270
#### 预测
用上述不同尺寸的滑窗模型，对测试集视频进行滑动分类，得到所有滑窗的预测结果。


In [None]:
### 网络模型
import cPickle
import sys
import json
import pandas as pd
import numpy as np
import gzip
import os
import random
import paddle.v2 as paddle
def lstm_net(input_dim,
             class_dim=2,
             hid_dim=512,
             stacked_num=5,
             is_infer=False):
    """Build the stacked LSTM network used to classify a window of frames.

    The network is: an fc layer followed by an lstmemory layer, then
    ``stacked_num - 1`` further (fc, lstmemory) pairs whose LSTM direction
    alternates (even-numbered layers run in reverse). The outputs of the
    last fc and the last LSTM are max-pooled and fed to a softmax fc layer
    with ``class_dim`` outputs.

    Args:
        input_dim: dimensionality of each frame's feature vector.
        class_dim: number of output classes.
        hid_dim: size of every hidden fc layer.
        stacked_num: number of stacked LSTM layers; must be odd.
        is_infer: when True only the prediction layer is built.

    Returns:
        ``output`` (the softmax layer) when ``is_infer`` is True, otherwise
        the tuple ``(cost, output, lbl)``.
    """
    assert stacked_num % 2 == 1

    fc_para_attr = paddle.attr.Param(learning_rate=1e-3)
    lstm_para_attr = paddle.attr.Param(initial_std=0., learning_rate=1.)
    # One parameter attribute per input of the two-input fc layers below.
    para_attr = [fc_para_attr, lstm_para_attr]
    bias_attr = paddle.attr.Param(initial_std=0., l2_rate=0.)
    relu = paddle.activation.Relu()
    linear = paddle.activation.Linear()

    data = paddle.layer.data("video",
                             paddle.data_type.dense_vector_sequence(input_dim))

    # First layer pair: projection of the raw features, then a forward LSTM.
    fc1 = paddle.layer.fc(input=data,
                          size=hid_dim,
                          act=linear,
                          bias_attr=bias_attr)
    lstm1 = paddle.layer.lstmemory(
        input=fc1, act=relu, bias_attr=bias_attr)

    # Remaining pairs: each fc consumes the previous (fc, lstm) outputs and
    # the LSTM direction flips on every even-numbered layer.
    inputs = [fc1, lstm1]
    for i in range(2, stacked_num + 1):
        fc = paddle.layer.fc(input=inputs,
                             size=hid_dim,
                             act=linear,
                             param_attr=para_attr,
                             bias_attr=bias_attr)
        lstm = paddle.layer.lstmemory(
            input=fc,
            reverse=(i % 2) == 0,
            act=relu,
            bias_attr=bias_attr)
        inputs = [fc, lstm]

    fc_last = paddle.layer.pooling(input=inputs[0], pooling_type=paddle.pooling.Max())
    lstm_last = paddle.layer.pooling(input=inputs[1], pooling_type=paddle.pooling.Max())
    output = paddle.layer.fc(input=[fc_last, lstm_last],
                             size=class_dim,
                             act=paddle.activation.Softmax(),
                             bias_attr=bias_attr,
                             param_attr=para_attr)

    if is_infer:
        return output

    # The label range follows class_dim so it always matches the softmax
    # width (it used to be hard-coded to 2).
    lbl = paddle.layer.data("label", paddle.data_type.integer_value(class_dim))
    cost = paddle.layer.classification_cost(input=output, label=lbl)
    return cost, output, lbl

In [None]:
# 数据reader
def load_json(file):
    """Read the file at path ``file`` and return its parsed JSON content."""
    with open(file) as handle:
        parsed = json.load(handle)
    return parsed

    
# Read the ground-truth annotations
def getLabelSet(data_type):
    """Return ``{video_name: annotations}`` for every video in one subset.

    Only videos whose "subset" field equals ``data_type`` are included.
    Each annotation entry is replaced by the first value of its dict
    (presumably the [start, end] pair -- verify against meta.json).
    """
    meta = load_json("/mnt/BROAD-datasets/video/meta.json")
    labels = {}
    for name, info in meta['database'].items():
        if info["subset"] != data_type:
            continue
        annotations = info['annotations']
        # Unwrap every annotation dict in place.
        for idx in range(len(annotations)):
            annotations[idx] = annotations[idx].values()[0]
        labels[name] = annotations
    return labels


def getLabel(video, pos_st, pos_ed, data_dict):
    """Label the window [pos_st, pos_ed) of ``video``.

    Returns 1 when the window's IoU with any annotated segment in
    ``data_dict[video]`` exceeds 0.8. Otherwise the window is a negative:
    it is returned as 0 with roughly 11% probability and as -1 (meaning
    "discard this sample") the rest of the time.
    """
    for seg_st, seg_ed in data_dict[video]:
        inter = max(0, min(seg_ed, pos_ed) - max(seg_st, pos_st))
        # Overlapping intervals: union == covered span.
        # Disjoint intervals: union == sum of both lengths (the smaller value).
        span = max(seg_ed, pos_ed) - min(seg_st, pos_st)
        both = seg_ed - seg_st + pos_ed - pos_st
        iou = float(inter) / (min(span, both) + 1e-8)
        if iou > 0.8:
            return 1
    # Drop most negatives so positives and negatives are not too imbalanced.
    keep_negative = not (int(random.random() * 100) > 10)
    return 0 if keep_negative else -1

def reader_creator(data_type, window_len=0, class_num=2):
    """Create a reader that yields ``(window_features, label)`` samples.

    Args:
        data_type: subset name matched against each video's "subset" field
            in meta.json (the callers in this file pass 'training' and
            'validation').
        window_len: iterable of window lengths (in frames) to slide over
            every video. The default 0 is not iterable, so callers must
            pass a list.
        class_num: unused; kept for interface compatibility.

    Returns:
        A zero-argument generator function suitable for ``paddle.batch``.
    """
    def multi_window_reader():
        json_data = load_json("/mnt/BROAD-datasets/video/meta.json")
        database = json_data['database']
        data_dict = getLabelSet(data_type)
        cnt = 0
        for video in database.keys():
            dataSet = database[video]["subset"]
            if dataSet != data_type:
                continue
            # Use only a random ~40% of the videos per pass to speed up training.
            if int(random.random() * 100) > 40:
                continue
            feature_root = "/mnt/BROAD-datasets/video/" + dataSet
            # Skip videos whose feature files are missing or unreadable.
            # `except Exception` (instead of a bare except) lets Ctrl-C through.
            # NOTE(review): cPickle.load is only safe on trusted files.
            try:
                with open(feature_root + "/image_resnet50_feature/" + str(video) + ".pkl", 'rb') as f:
                    image_fea = np.array(cPickle.load(f))
                with open(feature_root + "/audio_feature/" + str(video) + ".pkl", 'rb') as f:
                    audio_fea = np.array(cPickle.load(f))
            except Exception:
                continue

            print("%s %s" % (cnt, video))
            cnt += 1
            # Align the audio and image sequences by truncating the longer one.
            image_len = np.shape(image_fea)[0]
            audio_len = np.shape(audio_fea)[0]
            if image_len < audio_len:
                audio_fea = audio_fea[:image_len]
            if audio_len < image_len:
                image_fea = image_fea[:audio_len]
            video_fea = np.append(image_fea, audio_fea, axis=1)

            for wl in window_len:
                # Slide with a stride of 15% of the window; never let the
                # stride collapse to 0 for very small windows (range() would
                # raise ValueError).
                stride = max(1, int(wl * 0.15))
                for pos in range(0, (np.shape(video_fea)[0] - wl), stride):
                    label = getLabel(video, pos, pos + wl, data_dict)
                    # A negative label means getLabel discarded this sample.
                    if label < 0:
                        continue
                    yield video_fea[pos:pos + wl], label
    return multi_window_reader

In [None]:
#模型训练
def _train(now_wl, cost, prob, label):
    """Train the classifier for a single sliding-window length.

    Args:
        now_wl: window length (in frames) to train for.
        cost, prob, label: the layers returned by ``lstm_net``.

    Parameters are loaded from ``lstm_av_<now_wl>.tar.gz`` when that file
    exists, otherwise created fresh. They are written back to the same file
    at the end of every pass, followed by an evaluation on the validation
    reader.
    """
    num_passes = 10
    window_len = [now_wl]
    model_path = "/home/kesci/work/lstm_av_" + str(window_len[0]) + ".tar.gz"

    # Start from the saved parameters if a checkpoint is already on disk.
    if not os.path.exists(model_path):
        parameters = paddle.parameters.create(cost)
    else:
        with gzip.open(model_path, 'r') as f:
            parameters = paddle.parameters.Parameters.from_tar(f)

    adam_optimizer = paddle.optimizer.Adam(
        learning_rate=1e-3,
        regularization=paddle.optimizer.L2Regularization(rate=1e-3),
        model_average=paddle.optimizer.ModelAverage(average_window=0.5))

    trainer = paddle.trainer.SGD(
        cost=cost,
        extra_layers=paddle.evaluator.auc(input=prob, label=label),
        parameters=parameters,
        update_equation=adam_optimizer)

    # Position of each data layer inside a reader sample.
    feeding = {"video": 0, "label": 1}

    def _event_handler(event):
        """Log every 10th batch; checkpoint and evaluate after each pass."""
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 10 == 0:
                print("Pass %d, Batch %d, Cost %f, %s\n" % (
                    event.pass_id, event.batch_id, event.cost, event.metrics))

        if isinstance(event, paddle.event.EndPass):
            print('model save')
            with gzip.open(model_path, "w") as f:
                parameters.to_tar(f)
            print('start test')
            validation_reader = paddle.batch(
                reader_creator('validation', window_len, class_num=2), 256)
            result = trainer.test(reader=validation_reader, feeding=feeding)
            print("Test at Pass %d, %s \n" % (event.pass_id, result.metrics))

    training_reader = paddle.batch(
        paddle.reader.shuffle(
            reader_creator('training', window_len, class_num=2), 1280),
        256)
    trainer.train(
        reader=training_reader,
        event_handler=_event_handler,
        feeding=feeding,
        num_passes=num_passes)
def train():
    """Train one classifier per sliding-window length."""
    feature_dim = 4096
    paddle.init(use_gpu=True, trainer_count=1)
    # The network layers are built once and reused for every window size.
    cost, prob, label = lstm_net(feature_dim)
    # Multi-scale sliding windows.
    for size in [80, 100, 120, 130, 150, 160, 180, 210, 240, 270]:
        _train(size, cost, prob, label)

In [None]:
#预测
def infer():
    """Score every sliding window of the testing videos with each model.

    For each window length the matching ``lstm_av_<len>.tar.gz`` model is
    loaded, all windows of every 'testing' video are classified, and each
    window whose truncated score is non-zero becomes a proposal
    ``{'score': ..., 'segment': [start, end]}``. Proposals from all window
    lengths are accumulated per video and written to
    ``/home/kesci/work/res/testing.json``.

    Raises:
        IOError: when the trained model for a window length is missing.
    """
    prob_layer = lstm_net(4096, is_infer=True)
    paddle.init(use_gpu=True, trainer_count=1)

    def _infer(data_type, window_len, proposal_data, window_stride):
        """Append the proposals of one window length to ``proposal_data``."""
        print(window_len)
        json_data = load_json("/mnt/BROAD-datasets/video/meta.json")
        database = json_data['database']
        model_path = "/home/kesci/work/lstm_av_" + str(window_len) + ".tar.gz"
        # Fail with a clear message instead of hitting an unbound
        # `parameters` variable later on.
        if not os.path.exists(model_path):
            raise IOError("trained model not found: " + model_path)
        with gzip.open(model_path, 'r') as f:
            parameters = paddle.parameters.Parameters.from_tar(f)

        index = 0
        for video in database.keys():
            dataSet = database[video]["subset"]
            if dataSet != data_type:
                continue
            feature_root = "/mnt/BROAD-datasets/video/" + dataSet
            # Skip videos whose feature files are missing or unreadable.
            # NOTE(review): cPickle.load is only safe on trusted files.
            try:
                with open(feature_root + "/image_resnet50_feature/" + str(video) + ".pkl", 'rb') as f:
                    image_fea = np.array(cPickle.load(f))
                with open(feature_root + "/audio_feature/" + str(video) + ".pkl", 'rb') as f:
                    audio_fea = np.array(cPickle.load(f))
            except Exception:
                continue
            print("%s %s" % (index, video))
            # Truncate both streams to the shorter one, then join per frame.
            try:
                audio_fea = audio_fea[:np.shape(image_fea)[0]]
                image_fea = image_fea[:np.shape(audio_fea)[0]]
                video_fea = np.append(image_fea, audio_fea, axis=1)
            except Exception:
                continue
            index += 1
            video_len = np.shape(video_fea)[0]
            inputs = [[video_fea[pos:pos + window_len]]
                      for pos in range(0, video_len - window_len, window_stride)]
            # A video shorter than the window has no windows; do not call
            # paddle.infer with empty input.
            if inputs:
                probs = paddle.infer(
                    output_layer=prob_layer, parameters=parameters, input=inputs, field="value")
            else:
                probs = []

            this_vid_proposals = []
            for stride_index, prob in enumerate(probs):
                # Truncate the positive-class probability to two decimals
                # and drop windows that end up at 0.
                score = int(prob[1] * 100) / 100.0
                if score == 0.0:
                    continue
                pos = stride_index * window_stride
                this_vid_proposals.append({
                    'score': score,
                    'segment': [pos, pos + window_len],
                })
            proposal_data['results'].setdefault(video, []).extend(this_vid_proposals)

    data_type = 'testing'
    window_lens = [80, 100, 120, 130, 150, 160, 180, 210, 240, 270]
    # One stride per window length, in the same order.
    window_strides = [9, 12, 16, 16, 16, 16, 16, 16, 16, 16]
    proposal_data = {'results': {}, 'version': "VERSION 1.0"}
    out_dir = "/home/kesci/work/res/"
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    for window_len, window_stride in zip(window_lens, window_strides):
        _infer(data_type, window_len, proposal_data, window_stride)
        # The file is rewritten after every window length, so it always
        # holds the proposals accumulated so far.
        with open(out_dir + data_type + ".json", 'w') as fobj:
            json.dump(proposal_data, fobj)


In [None]:
#Run train() first
train()
#After training, restart the kernel and then run infer()
infer()

### 利用非极大值抑制对多尺度滑窗预测结果进行合并
利用非极大值抑制算法，对多尺度滑窗的预测结果进行处理，得到最终的预测结果。
此时的结果在验证集以及测试集上，可以达到31分左右。

In [None]:
### nms合并结果
def nms_detections(props, scores, overlap=0.7):
    """Greedy 1-D non-maximum suppression over temporal proposals.

    Args:
        props: sequence of ``[start, end]`` pairs.
        scores: one score per proposal.
        overlap: a proposal is suppressed when its IoU with an already
            picked, higher-scoring proposal is greater than this value.

    Returns:
        ``(nms_props, nms_scores)`` as NumPy arrays holding the surviving
        proposals and their scores, highest score first. Empty input
        yields arrays of shape ``(0, 2)`` and ``(0,)``.
    """
    props = np.array(props)
    scores = np.array(scores)
    # Nothing to suppress; also avoids indexing a 1-D empty array below.
    if props.size == 0:
        return np.empty((0, 2)), np.empty((0,))
    t1 = props[:, 0]
    t2 = props[:, 1]
    # Ascending order: the best remaining proposal is always the last index.
    ind = np.argsort(scores)
    # Lengths use the inclusive "+1" convention.
    area = (t2 - t1 + 1).astype(float)
    pick = []
    while len(ind) > 0:
        i = ind[-1]
        pick.append(i)
        ind = ind[:-1]
        # IoU between the picked proposal and every remaining one.
        tt1 = np.maximum(t1[i], t1[ind])
        tt2 = np.minimum(t2[i], t2[ind])
        wh = np.maximum(0., tt2 - tt1 + 1.0)
        o = wh / (area[i] + area[ind] - wh)
        # Keep only the proposals that do not overlap too much.
        ind = ind[np.nonzero(o <= overlap)[0]]
    nms_props, nms_scores = props[pick, :], scores[pick]
    return nms_props, nms_scores

def refine(data_type):
    """Merge the multi-scale proposals of every video with NMS.

    Reads ``<data_type>.json`` from the result directory, applies
    ``nms_detections`` with an overlap threshold of 0.05 to each video's
    proposals and writes the survivors to ``<data_type>_refine.json``.
    Videos without any proposal are left out of the output.

    The result directory is the absolute path that ``infer`` writes to and
    ``mini_refine`` reads from, so the step no longer depends on the
    current working directory.
    """
    res_dir = "/home/kesci/work/res/"
    proposal_data = {'results': {}, 'version': "VERSION 1.0"}
    json_data = load_json(res_dir + data_type + ".json")
    database = json_data['results']
    for video in database.keys():
        segments = []
        scores = []
        for item in database[video]:
            st, ed = item['segment']
            segments.append([st, ed])
            scores.append(item['score'])
        if not segments:
            continue
        nms_props, nms_scores = nms_detections(segments, scores, 0.05)
        # tolist() converts NumPy scalars to plain Python numbers, which
        # json.dump can always serialise.
        proposal_data['results'][video] = [
            {'score': score, 'segment': [seg[0], seg[1]]}
            for seg, score in zip(nms_props.tolist(), nms_scores.tolist())
        ]
    with open(res_dir + data_type + "_refine.json", 'w') as fobj:
        json.dump(proposal_data, fobj)
#Merge the results produced by infer()
refine('testing')

### 利用小尺度滑窗分类模型进行结果边界修正
利用小尺寸滑窗（例如8、12、16等）进行二分类，通过小滑窗的预测模型，对上步得到的结果进行边界修正。
具体修正策略详见代码。具体步骤如下
1. 利用小尺度滑窗训练二分类模型 
2. 利用小尺度滑窗二分类模型预测视频每个小段得分
3. 利用得到小尺度预测结果和上步中得到的预测结果，根据两者的重合度，修正边界

在本次比赛中，由于时间的限制，只使用16长的小滑窗，经过小尺度分类模型边界修正，在验证集和测试集上可以达到36分。
使用不同尺度的滑窗，可能得到更优化的结果。

In [None]:
# 小尺度滑窗分类reader
def mini_getLabel(video, pos_st, pos_ed, data_dict):
    """Label a small window: 1 if an annotation covers over 90% of it, else 0.

    Coverage is the intersection with a single annotated segment of
    ``data_dict[video]`` divided by the window length ``pos_ed - pos_st``.
    """
    for seg_st, seg_ed in data_dict[video]:
        covered = max(0, min(seg_ed, pos_ed) - max(seg_st, pos_st))
        if float(covered) / (pos_ed - pos_st) > 0.9:
            return 1
    return 0

def mini_reader_creator(data_type, window_len=0, class_num=2):
    """Create a reader of non-overlapping small windows and their labels.

    Args:
        data_type: subset name matched against each video's "subset" field
            in meta.json.
        window_len: iterable of window lengths (in frames); each video is
            cut into consecutive windows of that length. The default 0 is
            not iterable, so callers must pass a list.
        class_num: unused; kept for interface compatibility.

    Returns:
        A zero-argument generator function yielding
        ``(window_features, label)`` with the label from ``mini_getLabel``.
    """
    def multi_window_reader():
        json_data = load_json("/mnt/BROAD-datasets/video/meta.json")
        database = json_data['database']
        data_dict = getLabelSet(data_type)
        # The old random "> 100" sub-sampling test could never be true, so
        # every video of the subset is used; the dead branch is gone.
        for video in database.keys():
            dataSet = database[video]["subset"]
            if dataSet != data_type:
                continue
            feature_root = "/mnt/BROAD-datasets/video/" + dataSet
            # Skip videos whose feature files are missing or unreadable.
            # `except Exception` (instead of a bare except) lets Ctrl-C through.
            # NOTE(review): cPickle.load is only safe on trusted files.
            try:
                with open(feature_root + "/image_resnet50_feature/" + str(video) + ".pkl", 'rb') as f:
                    image_fea = np.array(cPickle.load(f))
                with open(feature_root + "/audio_feature/" + str(video) + ".pkl", 'rb') as f:
                    audio_fea = np.array(cPickle.load(f))
            except Exception:
                continue
            # Align the audio and image sequences by truncating the longer one.
            image_len = np.shape(image_fea)[0]
            audio_len = np.shape(audio_fea)[0]
            if image_len < audio_len:
                audio_fea = audio_fea[:image_len]
            if audio_len < image_len:
                image_fea = image_fea[:audio_len]
            video_fea = np.append(image_fea, audio_fea, axis=1)
            for wl in window_len:
                # Stride equals the window length: windows do not overlap.
                for pos in range(0, (np.shape(video_fea)[0] - wl), wl):
                    label = mini_getLabel(video, pos, pos + wl, data_dict)
                    yield video_fea[pos:pos + wl], label
    return multi_window_reader

In [None]:
#小尺度滑窗训练
def _mini_train(now_wl, cost, prob, label):
    """Train the small-window classifier for one window length.

    Args:
        now_wl: window length (in frames) to train for.
        cost, prob, label: the layers returned by ``lstm_net``.

    Parameters are loaded from ``lstm_av_<now_wl>.tar.gz`` when that file
    exists, otherwise created fresh. They are written back to the same file
    at the end of every pass, followed by an evaluation on the validation
    reader.
    """
    num_passes = 6
    window_len = [now_wl]
    model_path = "/home/kesci/work/lstm_av_" + str(window_len[0]) + ".tar.gz"

    # Start from the saved parameters if a checkpoint is already on disk.
    if not os.path.exists(model_path):
        parameters = paddle.parameters.create(cost)
    else:
        with gzip.open(model_path, 'r') as f:
            parameters = paddle.parameters.Parameters.from_tar(f)

    adam_optimizer = paddle.optimizer.Adam(
        learning_rate=1e-3,
        regularization=paddle.optimizer.L2Regularization(rate=1e-3),
        model_average=paddle.optimizer.ModelAverage(average_window=0.5))

    trainer = paddle.trainer.SGD(
        cost=cost,
        extra_layers=paddle.evaluator.auc(input=prob, label=label),
        parameters=parameters,
        update_equation=adam_optimizer)

    # Position of each data layer inside a reader sample.
    feeding = {"video": 0, "label": 1}

    def _event_handler(event):
        """Log every 10th batch; checkpoint and evaluate after each pass."""
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 10 == 0:
                print("Pass %d, Batch %d, Cost %f, %s\n" % (
                    event.pass_id, event.batch_id, event.cost, event.metrics))

        if isinstance(event, paddle.event.EndPass):
            print('model save')
            with gzip.open(model_path, "w") as f:
                parameters.to_tar(f)
            print('start test')
            validation_reader = paddle.batch(
                mini_reader_creator('validation', window_len, class_num=2), 256)
            result = trainer.test(reader=validation_reader, feeding=feeding)
            print("Test at Pass %d, %s \n" % (event.pass_id, result.metrics))

    training_reader = paddle.batch(
        paddle.reader.shuffle(
            mini_reader_creator('training', window_len, class_num=2), 1280),
        256)
    trainer.train(
        reader=training_reader,
        event_handler=_event_handler,
        feeding=feeding,
        num_passes=num_passes)
def mini_train():
    """Train the small-window (16-frame) classifier."""
    feature_dim = 4096
    paddle.init(use_gpu=True, trainer_count=1)
    # Same network definition as the multi-scale models.
    cost, prob, label = lstm_net(feature_dim)
    for size in [16]:
        _mini_train(size, cost, prob, label)

In [None]:
#小尺度滑窗预测
def mini_infer():
    """Score consecutive 16-frame windows of the testing videos.

    Loads ``lstm_av_16.tar.gz``, classifies every window (stride 16) of
    each 'testing' video and stores each window whose truncated score is
    non-zero as ``{'score': ..., 'segment': [start, end]}``. The result is
    written to ``/home/kesci/work/res/testing_mini.json``.

    Raises:
        IOError: when the trained model for a window length is missing.
    """
    prob_layer = lstm_net(4096, is_infer=True)
    paddle.init(use_gpu=True, trainer_count=1)

    def _infer(data_type, window_len, proposal_data, window_stride):
        """Append the proposals of one window length to ``proposal_data``."""
        print(window_len)
        json_data = load_json("/mnt/BROAD-datasets/video/meta.json")
        database = json_data['database']
        model_path = "/home/kesci/work/lstm_av_" + str(window_len) + ".tar.gz"
        # Fail with a clear message instead of hitting an unbound
        # `parameters` variable later on.
        if not os.path.exists(model_path):
            raise IOError("trained model not found: " + model_path)
        with gzip.open(model_path, 'r') as f:
            parameters = paddle.parameters.Parameters.from_tar(f)

        index = 0
        for video in database.keys():
            dataSet = database[video]["subset"]
            if dataSet != data_type:
                continue
            feature_root = "/mnt/BROAD-datasets/video/" + dataSet
            # Skip videos whose feature files are missing or unreadable.
            # NOTE(review): cPickle.load is only safe on trusted files.
            try:
                with open(feature_root + "/image_resnet50_feature/" + str(video) + ".pkl", 'rb') as f:
                    image_fea = np.array(cPickle.load(f))
                with open(feature_root + "/audio_feature/" + str(video) + ".pkl", 'rb') as f:
                    audio_fea = np.array(cPickle.load(f))
            except Exception:
                continue
            print("%s %s" % (index, video))
            # Truncate both streams to the shorter one, then join per frame.
            try:
                audio_fea = audio_fea[:np.shape(image_fea)[0]]
                image_fea = image_fea[:np.shape(audio_fea)[0]]
                video_fea = np.append(image_fea, audio_fea, axis=1)
            except Exception:
                # Show the mismatching shapes to help diagnose the bad file.
                print(np.shape(audio_fea))
                print(np.shape(image_fea))
                continue
            index += 1
            video_len = np.shape(video_fea)[0]
            inputs = [[video_fea[pos:pos + window_len]]
                      for pos in range(0, video_len - window_len, window_stride)]
            # A video shorter than the window has no windows; do not call
            # paddle.infer with empty input.
            if inputs:
                probs = paddle.infer(
                    output_layer=prob_layer, parameters=parameters, input=inputs, field="value")
            else:
                probs = []

            this_vid_proposals = []
            for stride_index, prob in enumerate(probs):
                # Truncate the positive-class probability to two decimals
                # and drop windows that end up at 0.
                score = int(prob[1] * 100) / 100.0
                if score == 0.0:
                    continue
                pos = stride_index * window_stride
                this_vid_proposals.append({
                    'score': score,
                    'segment': [pos, pos + window_len],
                })
            proposal_data['results'].setdefault(video, []).extend(this_vid_proposals)

    data_type = 'testing'
    window_lens = [16]
    window_strides = [16]
    proposal_data = {'results': {}, 'version': "VERSION 1.0"}
    out_dir = "/home/kesci/work/res/"
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    for window_len, window_stride in zip(window_lens, window_strides):
        _infer(data_type, window_len, proposal_data, window_stride)
        # The file is rewritten after every window length, so it always
        # holds the proposals accumulated so far.
        with open(out_dir + data_type + "_mini.json", 'w') as fobj:
            json.dump(proposal_data, fobj)

In [None]:
def mini_refine(data_type):
    """Snap the NMS proposals to the boundaries of small-window detections.

    Reads ``<data_type>_refine.json`` (proposals after NMS) and
    ``<data_type>_mini.json`` (small-window predictions). For every
    proposal, the small windows that intersect it and have a positive
    score are collected. If their total length is more than 20% of the
    proposal length, the proposal is kept with its boundaries replaced by
    the earliest start and latest end of those windows; otherwise it is
    dropped. The result is written to ``<data_type>_mini_refine.json``.
    """
    res_dir = "/home/kesci/work/res/"
    proposal_data = {'results': {}, 'version': "VERSION 1.0"}
    av = load_json(res_dir + data_type + "_refine.json")['results']
    mini = load_json(res_dir + data_type + "_mini.json")['results']

    def getOverlap(segment, mini_windows):
        """Return (coverage ratio, first start, last end) of the hit windows.

        Full window lengths are summed, so the ratio can exceed 1. The
        start and end are None when no window is hit.
        """
        pos_st, pos_ed = segment
        length = pos_ed - pos_st
        # A degenerate segment cannot be covered (and would divide by zero).
        if length <= 0:
            return 0.0, None, None
        hits = []
        for item in mini_windows:
            st, ed = item['segment']
            intersection = min(ed, pos_ed) - max(st, pos_st)
            if intersection > 0 and item['score'] > 0.0:
                hits.append((st, ed))
        if not hits:
            return 0.0, None, None
        # Use each window's real length rather than a hard-coded 16 so other
        # small-window sizes work as well.
        covered = float(sum(ed - st for st, ed in hits))
        first_st = min(st for st, _ in hits)
        last_ed = max(ed for _, ed in hits)
        return covered / length, first_st, last_ed

    for video in av.keys():
        # A video without small-window predictions keeps no proposal.
        mini_windows = mini.get(video, [])
        this_vid_proposals = []
        for item in av[video]:
            overlap, new_st, new_ed = getOverlap(item['segment'], mini_windows)
            if overlap > 0.2:
                this_vid_proposals.append({
                    'score': item['score'],
                    'segment': [new_st, new_ed],
                })
        proposal_data['results'][video] = this_vid_proposals
    with open(res_dir + data_type + "_mini_refine.json", 'w') as fobj:
        json.dump(proposal_data, fobj)

In [None]:
#First train the small-window model
mini_train()
#After training, predict with the small-window model
mini_infer()
#Then merge the small-window predictions with the previous step's results.
#mini_refine requires the subset name; calling it without one raised TypeError.
mini_refine('testing')