In [2]:
import csv
import datetime
import glob
import json
import os
import shutil
import subprocess
import time
import xml.etree.ElementTree as ET
from collections import defaultdict
from itertools import cycle

import cv2
import numpy as np
import pandas as pd
import torch
from matplotlib import pyplot as plt
from scipy.io import wavfile

# Project-local star imports stay last so that any names they export keep
# taking precedence over the explicit imports above, exactly as before.
from syncnet_python.facetrack import *
from syncnet_python.syncnet import *

In [3]:
class VideoIterableDataset(torch.utils.data.IterableDataset):
    """Iterable dataset yielding ``(utterance_dir, frames)`` pairs.

    Every directory matched by ``data_dir + '*'`` is treated as one utterance
    that contains ``pyavi/video.avi``. Constructing the dataset (re)creates an
    empty ``pyframes/`` directory inside each utterance; iterating extracts the
    video into JPEG frames with ffmpeg and loads them with OpenCV.
    """

    def __init__(self, data_dir):
        """Collect utterance directories and reset their ``pyframes`` folders.

        Args:
            data_dir: Path prefix that is globbed as ``data_dir + '*'``; pass
                a directory path with a trailing slash to list its children.
        """
        # ``super(VideoIterableDataset).__init__()`` only built an unbound
        # super object and never ran the parent initialiser.
        super().__init__()
        self.utts = []
        self.avis = []  # not used by this class; kept so the attribute still exists
        # Sorted so the order (and the per-worker sharding) is reproducible.
        for utt in sorted(glob.glob(data_dir + '*')):
            if not os.path.isdir(utt):
                # Stray files next to the utterance folders cannot hold frames.
                continue
            self.utts.append(utt)
            frames_dir = os.path.join(utt, 'pyframes')
            if os.path.exists(frames_dir):
                shutil.rmtree(frames_dir)
            os.makedirs(frames_dir)

    def __iter__(self):
        """Yield ``(utterance_dir, frames)`` for this worker's share of utterances.

        With DataLoader workers, worker ``k`` of ``n`` handles utterances
        ``k, k + n, k + 2n, ...``; without workers every utterance is yielded.
        """
        worker_info = torch.utils.data.get_worker_info()
        if worker_info is None:
            offset, shift = 0, 1
        else:
            offset, shift = worker_info.id, worker_info.num_workers
        for i in range(offset, len(self.utts), shift):
            yield self.utts[i], self.load_frames(self.utts[i])

    def load_frames(self, utt):
        """Extract ``<utt>/pyavi/video.avi`` to JPEGs and return them as an array.

        Args:
            utt: Utterance directory.

        Returns:
            ``np.array`` built from the ``cv2.imread`` result of every
            ``pyframes/*.jpg`` file in sorted filename order (empty when ffmpeg
            produced no frames).
        """
        videofile = os.path.join(utt, 'pyavi', 'video.avi')
        pattern = os.path.join(utt, 'pyframes', '%06d.jpg')
        # Argument list instead of a shell string: paths containing spaces or
        # shell metacharacters are passed to ffmpeg verbatim.
        command = [
            "ffmpeg", "-loglevel", "quiet", "-y", "-i", videofile,
            "-qscale:v", "2", "-threads", "1", "-f", "image2", pattern,
        ]
        returncode = subprocess.call(command)
        if returncode != 0:
            print(f"WARNING: ffmpeg exited with code {returncode} while extracting frames from {videofile}")
        frames_glob = os.path.join(glob.escape(os.path.join(utt, 'pyframes')), '*.jpg')
        flist = sorted(glob.glob(frames_glob))
        return np.array([cv2.imread(fname) for fname in flist])

def cut_into_utterances(filename, output_dir, maxWMER=1000,
                        xmldir="/afs/inf.ed.ac.uk/group/cstr/datawww/asru/MGB1/data/xml"):
    """Cut the aligned segments of one recording into per-utterance videos.

    Reads ``<xmldir>/<filename>.xml``, selects every ``segment`` whose id ends
    in ``_align``, whose ``WMER`` is at most ``maxWMER`` and which lasts at
    least 2 seconds, prepares ``output_dir + <segment id>`` for each of them,
    writes its transcript and finally runs a single ffmpeg process that
    stream-copies all selected spans to ``<location>/pyavi/video.avi``.

    Args:
        filename: Recording name (no extension) used for both the XML file and
            the ``.ts`` video lookup.
        output_dir: Prefix the segment id is appended to; callers pass a
            directory path ending in ``/``.
        maxWMER: Upper bound (inclusive) on a segment's ``WMER`` attribute.
        xmldir: Directory that holds the per-recording XML files.

    Returns:
        List of the XML ``segment`` elements that were selected for cutting
        (segments whose directory was already complete are not included).

    Raises:
        FileNotFoundError: If no ``.ts`` file matching ``filename`` is found.
    """
    xmlfile = os.path.join(xmldir, filename + '.xml')
    root = ET.parse(xmlfile).getroot()

    # NOTE(review): without recursive=True the '**' component matches exactly
    # one directory level (same as '*'); confirm that is the intended layout.
    paths = glob.glob(f"/afs/inf.ed.ac.uk/group/project/nst/bbcdata/ptn*/**/{filename}*.ts") \
        + glob.glob(f"/afs/inf.ed.ac.uk/group/project/nst/bbcdata/raw/{filename}*.ts")
    if not paths:
        # Previously this surfaced as a bare IndexError on paths[0].
        raise FileNotFoundError(f"No source .ts video found for {filename}")
    inputVideo = paths[0]

    # Argument list (no shell) so paths are never re-parsed by a shell.
    command = ["ffmpeg", "-loglevel", "quiet", "-y", "-i", inputVideo]
    utterance_items = []
    for item in root.findall('./body/segments/segment'):
        data = item.attrib
        if data['id'].split('_')[-1] != 'align' or not (float(data['WMER']) <= maxWMER):
            continue
        start_seconds = float(data['starttime'])
        end_seconds = float(data['endtime'])
        if end_seconds - start_seconds < 2:
            continue  # too short to be useful
        location = output_dir + data['id']
        if not prepare_output_directory(location):
            continue  # already processed on an earlier run
        utterance_items.append(item)
        start = datetime.timedelta(seconds=start_seconds)
        end = datetime.timedelta(seconds=end_seconds)
        output = os.path.join(location, 'pyavi', 'video.avi')
        command += ["-ss", str(start), "-to", str(end), "-c", "copy", output]  # -c:a mp3 -c:v mpeg4
        create_transcript_from_XML(location, item)

    if not utterance_items:
        # ffmpeg with an input but no outputs exits with an error, which used
        # to be reported as a trimming failure even though nothing was wrong.
        print(f"No new utterances to trim for {filename}")
        return utterance_items

    s = time.time()
    result = subprocess.run(command)
    if result.returncode != 0:
        print(f"ERROR: ffmpeg failed to trim video: {filename}")
        print(f"result: {result}")
    t = time.time() - s
    print(f"Took {t} seconds to trim {len(utterance_items)} utterances")
    return utterance_items

def getGenre(filename, xmldir="/afs/inf.ed.ac.uk/group/cstr/datawww/asru/MGB1/data/xml/"):
    """Return the ``genre`` attribute of a recording's XML metadata.

    Args:
        filename: Recording name without extension; ``<xmldir>/<filename>.xml``
            is parsed.
        xmldir: Directory that holds the per-recording XML files.

    Returns:
        The value of the ``genre`` attribute on ``./head/recording``.

    Raises:
        ValueError: If the XML has no ``./head/recording`` element.
        KeyError: If that element has no ``genre`` attribute.
    """
    xmlfile = os.path.join(xmldir, filename + '.xml')
    root = ET.parse(xmlfile).getroot()
    recording = root.find('./head/recording')
    if recording is None:
        # Previously an opaque AttributeError on None.
        raise ValueError(f"{xmlfile} has no ./head/recording element")
    return recording.attrib["genre"]
    
def prepare_output_directory(location):
    """Make ``location`` ready to receive a freshly cut utterance.

    * ``location`` exists and contains ``utterance_info.csv``: it is treated as
      already processed and left untouched.
    * ``location`` exists without that file: it is treated as incomplete and
      deleted before being recreated.
    * Otherwise it is created.

    Args:
        location: Utterance output directory.

    Returns:
        ``False`` when the utterance was already processed, ``True`` when
        ``<location>/pyavi/`` has been (re)created and cutting may proceed.
    """
    if os.path.isdir(location):
        if os.path.exists(os.path.join(location, "utterance_info.csv")):
            return False  # This utterance has been processed already.
        shutil.rmtree(location)
    # os.makedirs instead of a shelled-out ``mkdir -p``: no shell parsing, so
    # paths with spaces or metacharacters create exactly one directory tree.
    os.makedirs(os.path.join(location, "pyavi"), exist_ok=True)
    return True

def create_transcript_from_XML(location, item):
    """Write ``<location>/transcript.txt`` for one XML segment.

    The file contains ``str()`` of a dict holding the segment's attributes plus
    an ``"utterance"`` key: the text of every child element, each followed by a
    single space. (This is a Python dict repr, not JSON.)

    Args:
        location: Existing utterance directory to write into.
        item: XML ``segment`` element whose children carry the words.
    """
    # TODO: the transcript should only contain the words spoken in the final cropped video.
    # Children without text (e.g. ``<element/>``) are skipped; they used to
    # raise TypeError when concatenated.
    utterance = "".join(f"{child.text} " for child in item if child.text is not None)
    # Copy the attributes so the caller's XML element is not modified.
    data = dict(item.attrib)
    data["utterance"] = utterance
    with open(os.path.join(location, 'transcript.txt'), 'w', encoding='utf-8') as outfile:
        outfile.write(str(data))

In [4]:
# Output root: one sub-directory per cut utterance is created here.
# The trailing slash is required because paths are built as `data_dir + name`.
data_dir = '/disk/scratch/s1768177/pipeline/output_data/'

# Whitespace-separated list of recording names to process.
filelist = "/afs/inf.ed.ac.uk/group/cstr/datawww/asru/MGB1/scripts/dev.full"

# Only recordings whose XML genre is in this list are cut into utterances.
desired_genres = [
    "drama",
    "childrens",
    "news",
    "documentary",
]

In [6]:
# Read the recording names and keep only the two-file slice [15:17].
with open(filelist, "r") as f:
    files = f.read().split()[15:17]
print(files)
print(f"\n{datetime.datetime.now()}. Cutting utterances from raw videos.")

# `count` is the 1-based number printed next to each recording that passes the
# genre filter, so `count - 1` is the number of recordings actually cut.
count = 1
total_utterances_processed = 0
for filename in files:
    genre = getGenre(filename)
    if genre not in desired_genres:
        continue
    print(f"{count}. {filename}. ({genre}) ")
    count += 1
    utterance_items = cut_into_utterances(filename, data_dir)
    total_utterances_processed += len(utterance_items)

print(f"\nFinished Cutting total {total_utterances_processed} utterances from {count-1} videos")

['20080508_173000_bbctwo_great_british_menu', '20080509_013000_bbcone_bill_oddie_s_wild_side']

2021-07-29 00:10:57.132481. Cutting utterances from raw videos.
1. 20080509_013000_bbcone_bill_oddie_s_wild_side. (documentary) 
Took 95.59221959114075 seconds to trim 212 utterances

Finished Cutting total 212 utterances from 1 videos


In [None]:
# Face-track every utterance; wall-clock time for the whole pass is reported.
start = time.time()
dataset = VideoIterableDataset(data_dir)
# batch_size=None: each (utterance, frames) item is delivered on its own.
dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=None,
    shuffle=False,
    num_workers=24,
)
facetrack = FaceTrack('cuda:3')

for i, (utt, frames) in enumerate(dataloader):
    print(i, utt.rsplit('/', 1)[-1], len(frames))
    facetrack.run(data_dir=utt, frames=frames)
    # An empty pycrop/ directory means no face track was written, so the
    # whole utterance directory is discarded.
    crop_dir = utt + "/pycrop/"
    if not os.listdir(crop_dir):
        shutil.rmtree(utt)

elapsed_minutes = (time.time() - start) / 60
print(f"Time taken: {elapsed_minutes:.2f} minutes")

In [None]:
# class SyncNetIterableDataset(torch.utils.data.IterableDataset):    
#     def __init__(self, path):
#         super(SyncNetIterableDataset).__init__()
#         self.avis = []
#         for avi in glob.glob(data_dir+'*/pycrop/*.avi'):
#             self.avis.append(avi)
    
#     def __iter__(self):
#         worker_info = torch.utils.data.get_worker_info()
#         if worker_info is None:
#             offset = 0
#             shift = 1
#         else:
#             offset = worker_info.id
#             shift = worker_info.num_workers
#         for i in range(offset, len(self.avis), shift):
#             utt = self.avis[i].split('/pycrop/')[0]
#             yield utt, self.avis[i], self.load_frames(self.avis[i])
    
#     def load_frames(self, videofile):
#         cap = cv2.VideoCapture(videofile)
#         frame_num = 1;
#         frames = []
#         while frame_num:
#             frame_num += 1
#             ret, image = cap.read()
#             if ret == 0:
#                 break
#             frames.append(cv2.resize(image, (224, 224)))
#         cap.release()
#         cv2.destroyAllWindows()
#         frames = [frames[0], frames[0]] + frames + [frames[-1], frames[-1]]
#         frames = np.stack(frames, axis=3)
#         frames = np.transpose(frames, (2,3,0,1))
#         frames = np.array([frames[:,i:i+5,:,:] for i in range(0, frames.shape[1] - 4)], dtype='float32')
#         return frames
    
class SyncNetIterableDataset(torch.utils.data.IterableDataset):
    """Iterable dataset over cropped face tracks for SyncNet evaluation.

    Every file matching ``path + '*/pycrop/*.avi'`` is one track. Iteration
    yields ``(utterance_dir, avi_path, frames, (sample_rate, audio))``.
    Constructing the dataset (re)creates an empty ``pytmp/`` directory in each
    utterance that has at least one track.
    """

    def __init__(self, path):
        """Collect the track videos and reset each utterance's ``pytmp`` folder.

        Args:
            path: Path prefix globbed as ``path + '*/pycrop/*.avi'``; pass a
                directory path with a trailing slash.
        """
        # ``super(SyncNetIterableDataset).__init__()`` only built an unbound
        # super object and never ran the parent initialiser.
        super().__init__()
        # Sorted so the order (and the per-worker sharding) is reproducible.
        self.avis = sorted(glob.glob(path + '*/pycrop/*.avi'))
        # Reset the scratch directory once per utterance, not once per track.
        for utt in sorted({self._utterance_dir(avi) for avi in self.avis}):
            tmp_dir = os.path.join(utt, 'pytmp')
            if os.path.exists(tmp_dir):
                shutil.rmtree(tmp_dir)
            os.makedirs(tmp_dir)

    @staticmethod
    def _utterance_dir(avi):
        """Return the utterance directory of ``<utt>/pycrop/<track>.avi``."""
        return os.path.dirname(os.path.dirname(avi))

    def __iter__(self):
        """Yield this worker's share of tracks.

        With DataLoader workers, worker ``k`` of ``n`` handles tracks
        ``k, k + n, k + 2n, ...``; without workers every track is yielded.
        """
        worker_info = torch.utils.data.get_worker_info()
        if worker_info is None:
            offset, shift = 0, 1
        else:
            offset, shift = worker_info.id, worker_info.num_workers
        for i in range(offset, len(self.avis), shift):
            avi = self.avis[i]
            utt = self._utterance_dir(avi)
            yield utt, avi, self.load_frames(avi), self.load_audio(utt, avi)

    def load_audio(self, utt, videofile):
        """Extract the track's audio as 16 kHz mono PCM and load it.

        Args:
            utt: Utterance directory (its ``pytmp/`` folder receives the wav).
            videofile: Track video to read the audio from.

        Returns:
            ``(sample_rate, audio)`` as returned by ``wavfile.read``.

        Raises:
            RuntimeError: If ffmpeg exits with a non-zero status.
        """
        # One wav per track: all tracks of an utterance used to share
        # ``pytmp/audio.wav``, so workers handling sibling tracks in parallel
        # could overwrite each other's audio between write and read.
        # NOTE(review): nothing visible in this notebook reads
        # ``pytmp/audio.wav`` by name; confirm SyncNet does not either.
        track_name = os.path.splitext(os.path.basename(videofile))[0]
        wav_path = os.path.join(utt, 'pytmp', f'{track_name}.wav')
        # Argument list instead of a shell string: paths are passed verbatim.
        command = [
            "ffmpeg", "-loglevel", "quiet", "-y", "-i", videofile,
            "-async", "1", "-ac", "1", "-vn",
            "-acodec", "pcm_s16le", "-ar", "16000", wav_path,
        ]
        returncode = subprocess.call(command)
        if returncode != 0:
            raise RuntimeError(f"ffmpeg exited with code {returncode} while extracting audio from {videofile}")
        sample_rate, audio = wavfile.read(wav_path)
        return (sample_rate, audio)

    def load_frames(self, videofile):
        """Load a track as overlapping 5-frame windows.

        Frames are resized to 224x224, the first and last frame are each
        repeated twice as padding, and one window is produced per original
        frame.

        Args:
            videofile: Track video to read.

        Returns:
            ``float32`` array of shape ``(T, C, 5, 224, 224)`` where ``T`` is
            the number of frames read and ``C`` the channel count of the
            decoded images.

        Raises:
            ValueError: If no frame can be read from ``videofile``.
        """
        cap = cv2.VideoCapture(videofile)
        frames = []
        try:
            while True:
                ret, image = cap.read()
                if not ret:
                    break
                frames.append(cv2.resize(image, (224, 224)))
        finally:
            # Release the decoder even if resize/read raises.
            cap.release()
        if not frames:
            # Previously an IndexError on frames[0] with no hint of the cause.
            raise ValueError(f"No frames could be read from {videofile}")
        frames = [frames[0], frames[0]] + frames + [frames[-1], frames[-1]]
        frames = np.stack(frames, axis=3)            # (H, W, C, T + 4)
        frames = np.transpose(frames, (2, 3, 0, 1))  # (C, T + 4, H, W)
        frames = np.array(
            [frames[:, i:i + 5, :, :] for i in range(0, frames.shape[1] - 4)],
            dtype='float32',
        )
        return frames
    
    




In [None]:
# dataset = SyncNetIterableDataset(data_dir)
# dataloader = torch.utils.data.DataLoader(dataset, batch_size=None, shuffle=False, num_workers=16)
# syncnet = SyncNet()
# for i, (utt, avi, frames) in enumerate(dataloader):
#     print(i, utt.split('/')[-1], avi.split('/')[-1], len(frames))
#     syncnet.setup(utt)
#     offset, conf, dist = syncnet.evaluate(avi,frames)
#     print(offset, conf)

# Evaluate audio-visual sync for every cropped face track.
dataset = SyncNetIterableDataset(data_dir)
# batch_size=None: each track is delivered on its own, not collated.
dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=None,
    shuffle=False,
    num_workers=24,
)
syncnet = SyncNet()

for i, (utt, avi, frames, wav) in enumerate(dataloader):
    sample_rate, audio = wav
    print(i, utt, avi.rsplit('/', 1)[-1], len(frames))
    syncnet.setup(utt)
    offset, conf, dist = syncnet.evaluate(avi, frames, sample_rate, audio)
    print(offset, conf)


In [None]:
def cleanup(dataset_dir):
    """Flatten each utterance directory and delete its ``py*`` working folders.

    For every sub-directory of ``dataset_dir`` the contents of ``pycrop/`` are
    moved up into the utterance directory, then every directory matching
    ``py*`` inside it is removed recursively.

    Safe to run repeatedly: utterances without a ``pycrop/`` folder (for
    example ones that were already cleaned) are no longer an error, and plain
    files are never passed to ``rmtree``.

    Args:
        dataset_dir: Directory whose children are utterance directories.
    """
    for utterance in os.listdir(dataset_dir):
        dest = os.path.join(dataset_dir, utterance)
        if not os.path.isdir(dest):
            continue  # stray file next to the utterance folders
        source = os.path.join(dest, 'pycrop')
        if os.path.isdir(source):
            for f in os.listdir(source):
                shutil.move(os.path.join(source, f), os.path.join(dest, f))
        for path in glob.glob(os.path.join(glob.escape(dest), 'py*')):
            if os.path.isdir(path):
                shutil.rmtree(path)
            
# Destructive: moves each utterance's pycrop/ contents up one level and
# deletes the py* working directories under data_dir.
cleanup(data_dir)

In [None]:

import subprocess

# Scratch check: compare how many frames OpenCV decodes from a video with how
# many JPEGs ffmpeg writes for the same file.
videofile = "/disk/scratch/s1768177/pipeline/output_data/ID20080505_000500_bbcone_weatherview_utt_8_align/pyavi/video.avi"

cap = cv2.VideoCapture(videofile)
frames = []
try:
    while True:
        ret, image = cap.read()
        if not ret:
            break
        frames.append(image)
finally:
    # The capture was previously never released.
    cap.release()
print(len(frames))

# ffmpeg does not create the output directory itself.
images_dir = 'playground/images'
os.makedirs(images_dir, exist_ok=True)
# NOTE(review): JPEGs left over from an earlier, longer video are not removed
# and would inflate the count printed below.
command = [
    "ffmpeg", "-y", "-i", videofile,
    "-threads", "1", "-f", "image2", os.path.join(images_dir, '%06d.jpg'),
]
output = subprocess.call(command)
flist = sorted(glob.glob(os.path.join(images_dir, '*.jpg')))
print(len(flist))

In [None]:
# flist = glob.glob('playground/images/*.jpg')
# flist.sort()
# for image in flist:
#     print(cv2.imread(image).shape)
#     print(frames[50].shape)
#     if (cv2.imread(image) == frames[50]).all():
#         print("got it!")

    
