In [1]:
import os 
import json
import pickle
import datetime

import pandas as pd
from pprint import pprint
from bisect import bisect_left as find_prev
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook as tqdm

import xml.etree.ElementTree as xml

strptime = datetime.datetime.strptime

In [3]:
episodes_numbers = list(map(str, range(60, 70+1))) + list(map(str, range(79, 89+1)))
videos_data      = xml.parse('../facerec_segment/eastenders.collection.xml').getroot().findall("./VideoFile")

episodes_filenames = {v.find('id').text : v.find('filename').text
                      for v in videos_data
                      if v.find('id').text in episodes_numbers}
episodes_filenames

{'60': '5245830105934359183.mp4',
 '61': '5248439298566680341.mp4',
 '62': '5251033029316736209.mp4',
 '63': '5253646087419594905.mp4',
 '64': '5256228221757951173.mp4',
 '65': '5258818087037440299.mp4',
 '66': '5264050645694159353.mp4',
 '67': '5266645664934403562.mp4',
 '68': '5269245838135402929.mp4',
 '69': '5271844722846213628.mp4',
 '70': '5274434588125702436.mp4',
 '79': '5300410550331962313.mp4',
 '80': '5300433743159556513.mp4',
 '81': '5303026185415228226.mp4',
 '82': '5305617339184877785.mp4',
 '83': '5308226531817199474.mp4',
 '84': '5310822839547632763.mp4',
 '85': '5313398531435045858.mp4',
 '86': '5316007724067368375.mp4',
 '87': '5318609185758554501.mp4',
 '88': '5321170704253895214.mp4',
 '89': '5323778608396028180.mp4'}

In [4]:
data = []

for line in open('../facerec_segment/eastenders.masterShotReferenceTable.txt', 'r'):
    line = line.strip()
    if not line: continue
    file = line.split()[0].strip()
    if file in episodes_numbers:
        file = episodes_filenames[file]
        _, shot_id, start, end = line.split()
        ref = datetime.datetime(1900, 1, 1)
        start_s = (strptime(start[1:12], '%H:%M:%S:%f') - ref).total_seconds()
        end_s   = (strptime(end[1:12], '%H:%M:%S:%f') - ref).total_seconds()

        data.append({'file':file.replace('mp4', 'xml'), 
                     'shot': shot_id.split('_')[1], 
                     'start_s': start_s, 'end_s': end_s,
                     'start': start[1:12], 'end': end[1:12]})

In [5]:
df = pd.DataFrame(data)

In [6]:
df

Unnamed: 0,file,shot,start_s,end_s,start,end
0,5245830105934359183.xml,1,0.00,20.21,00:00:00:00,00:00:20:21
1,5245830105934359183.xml,2,20.22,23.20,00:00:20:22,00:00:23:20
2,5245830105934359183.xml,3,23.21,25.14,00:00:23:21,00:00:25:14
3,5245830105934359183.xml,4,25.15,26.20,00:00:25:15,00:00:26:20
4,5245830105934359183.xml,5,26.21,30.03,00:00:26:21,00:00:30:03
...,...,...,...,...,...,...
43431,5323778608396028180.xml,2032,6900.20,6901.08,01:55:00:20,01:55:01:08
43432,5323778608396028180.xml,2033,6901.09,6902.01,01:55:01:09,01:55:02:01
43433,5323778608396028180.xml,2034,6902.02,6903.04,01:55:02:02,01:55:03:04
43434,5323778608396028180.xml,2035,6903.05,6905.13,01:55:03:05,01:55:05:13


In [7]:
df.to_csv('mastershot_timing.csv')

In [8]:
captions = []
transcripts = []

for filename in os.listdir('transcripts'):
    paragraphs = xml.parse('transcripts/' + filename).getroot().findall('{http://www.w3.org/2006/10/ttaf1}body/*/*')
    for p in paragraphs:
        t = ' '.join(['' if not x.text else x.text.strip() for x in p.findall('*')])
        b = p.attrib['begin']
        e = p.attrib['end']
        if t and (t == t.upper()):
            captions.append({'filename': filename, 'begin':b, 'end':e, 'caption':t})
        else:
            transcripts.append({'filename': filename, 'begin':b, 'end':e, 'transcript':t})

df_captions = pd.DataFrame(captions) 
df_tts = pd.DataFrame(transcripts) 

In [9]:
df_tts

Unnamed: 0,filename,begin,end,transcript
0,5552368364300855101.xml,00:00:30.023,00:00:31.713,"Well, turn it round then."
1,5552368364300855101.xml,00:00:33.773,00:00:35.183,"Like it? Yeah, nice one."
2,5552368364300855101.xml,00:00:35.183,00:00:37.993,"Cheers, Phil. Well, you better not go showing..."
3,5552368364300855101.xml,00:00:37.993,00:00:39.773,"hit the canvas before the first bell goes, eh..."
4,5552368364300855101.xml,00:00:42.543,00:00:44.273,I'll give you a lift to the Deed Poll Office ...
...,...,...,...,...
16329,5544620672795594434.xml,00:29:06.038,00:29:10.768,You realise this is who they are and it's nev...
16330,5544620672795594434.xml,00:29:10.768,00:29:12.218,Snap.
16331,5544620672795594434.xml,00:29:18.928,00:29:20.988,Subtitles by Red Bee Media Ltd
16332,5544620672795594434.xml,00:29:20.988,00:29:23.048,E-mail subtitling@bbc.co.uk


In [10]:
df_captions

Unnamed: 0,filename,begin,end,caption
0,5552368364300855101.xml,00:00:31.713,00:00:33.773,HE CHUCKLES
1,5552368364300855101.xml,00:05:53.583,00:05:55.603,MOBILE RINGS
2,5552368364300855101.xml,00:08:31.473,00:08:33.063,PHIL CHUCKLES
3,5552368364300855101.xml,00:08:36.863,00:08:40.093,"RADIO PLAYS ""CHRISTMAS WRAPPING"" BY THE WAITR..."
4,5552368364300855101.xml,00:09:00.953,00:09:03.353,SHE CLEARS HER THROAT
...,...,...,...,...
337,5544620672795594434.xml,00:08:00.069,00:08:01.569,OK.
338,5544620672795594434.xml,00:10:12.364,00:10:13.354,DOORBELL RINGS
339,5544620672795594434.xml,00:13:33.894,00:13:35.394,DOOR CLOSES
340,5544620672795594434.xml,00:16:22.191,00:16:23.361,PHONE RINGS


In [11]:
df_tts

Unnamed: 0,filename,begin,end,transcript
0,5552368364300855101.xml,00:00:30.023,00:00:31.713,"Well, turn it round then."
1,5552368364300855101.xml,00:00:33.773,00:00:35.183,"Like it? Yeah, nice one."
2,5552368364300855101.xml,00:00:35.183,00:00:37.993,"Cheers, Phil. Well, you better not go showing..."
3,5552368364300855101.xml,00:00:37.993,00:00:39.773,"hit the canvas before the first bell goes, eh..."
4,5552368364300855101.xml,00:00:42.543,00:00:44.273,I'll give you a lift to the Deed Poll Office ...
...,...,...,...,...
16329,5544620672795594434.xml,00:29:06.038,00:29:10.768,You realise this is who they are and it's nev...
16330,5544620672795594434.xml,00:29:10.768,00:29:12.218,Snap.
16331,5544620672795594434.xml,00:29:18.928,00:29:20.988,Subtitles by Red Bee Media Ltd
16332,5544620672795594434.xml,00:29:20.988,00:29:23.048,E-mail subtitling@bbc.co.uk


In [12]:
ref = datetime.datetime(1900, 1, 1)
df_tts['begin_s'] = df_tts['begin'].apply(lambda t: (strptime(t, '%H:%M:%S.%f') - ref).total_seconds())
df_tts['end_s'] = df_tts['end'].apply(lambda t: (strptime(t, '%H:%M:%S.%f') - ref).total_seconds())
df_captions['begin_s'] = df_captions['begin'].apply(lambda t: (strptime(t, '%H:%M:%S.%f') - ref).total_seconds())
df_captions['end_s'] = df_captions['end'].apply(lambda t: (strptime(t, '%H:%M:%S.%f') - ref).total_seconds())

In [13]:
df_tts.head()

Unnamed: 0,filename,begin,end,transcript,begin_s,end_s
0,5552368364300855101.xml,00:00:30.023,00:00:31.713,"Well, turn it round then.",30.023,31.713
1,5552368364300855101.xml,00:00:33.773,00:00:35.183,"Like it? Yeah, nice one.",33.773,35.183
2,5552368364300855101.xml,00:00:35.183,00:00:37.993,"Cheers, Phil. Well, you better not go showing...",35.183,37.993
3,5552368364300855101.xml,00:00:37.993,00:00:39.773,"hit the canvas before the first bell goes, eh...",37.993,39.773
4,5552368364300855101.xml,00:00:42.543,00:00:44.273,I'll give you a lift to the Deed Poll Office ...,42.543,44.273


In [14]:
df_captions.head()

Unnamed: 0,filename,begin,end,caption,begin_s,end_s
0,5552368364300855101.xml,00:00:31.713,00:00:33.773,HE CHUCKLES,31.713,33.773
1,5552368364300855101.xml,00:05:53.583,00:05:55.603,MOBILE RINGS,353.583,355.603
2,5552368364300855101.xml,00:08:31.473,00:08:33.063,PHIL CHUCKLES,511.473,513.063
3,5552368364300855101.xml,00:08:36.863,00:08:40.093,"RADIO PLAYS ""CHRISTMAS WRAPPING"" BY THE WAITR...",516.863,520.093
4,5552368364300855101.xml,00:09:00.953,00:09:03.353,SHE CLEARS HER THROAT,540.953,543.353


In [15]:
df.head()

Unnamed: 0,file,shot,start_s,end_s,start,end
0,5245830105934359183.xml,1,0.0,20.21,00:00:00:00,00:00:20:21
1,5245830105934359183.xml,2,20.22,23.2,00:00:20:22,00:00:23:20
2,5245830105934359183.xml,3,23.21,25.14,00:00:23:21,00:00:25:14
3,5245830105934359183.xml,4,25.15,26.2,00:00:25:15,00:00:26:20
4,5245830105934359183.xml,5,26.21,30.03,00:00:26:21,00:00:30:03


In [16]:
df_tts[(df_tts.filename == '5552368364300855101.xml') & \
                         (df_tts.begin_s >= 30) & \
                         (df_tts.begin_s < 31)]

Unnamed: 0,filename,begin,end,transcript,begin_s,end_s
0,5552368364300855101.xml,00:00:30.023,00:00:31.713,"Well, turn it round then.",30.023,31.713


In [17]:
df[df.file == '5552368364300855101.xml']

Unnamed: 0,file,shot,start_s,end_s,start,end


In [18]:
shot_tts = []
shot_captions = []
for i, shot in tqdm(df.iterrows(), total=len(df)):
    shot_df_captions = df_captions[(df_captions.filename == shot['file']) & \
                                    (df_captions.begin_s >= shot['start']) & \
                                    (df_captions.begin_s < shot['end'])]
    shot_captions.append({'filename': shot['file'], 'shot_id': shot['shot'], 
                     'begin':shot['start'], 'end': shot['end'], 
                     'caption': ' '.join(shot_df_captions['caption'].values)})
    
    
    shot_df_tts = df_tts[(df_tts.filename == shot['file']) & \
                         (df_tts.begin_s >= shot['start']) & \
                         (df_tts.begin_s < shot['end'])]
    shot_tts.append({'filename': shot['file'], 'shot_id': shot['shot'], 
                     'begin':shot['start'], 'end': shot['end'], 
                     'transcript': ' '.join(shot_df_tts['transcript'].values)})

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=43436.0), HTML(value='')))




TypeError: Invalid comparison between dtype=float64 and str

In [30]:
df_shot_tts = pd.DataFrame(shot_tts)
df_shot_captions = pd.DataFrame(shot_captions)

In [31]:
df_shot_captions[df_shot_captions.caption != '']

Unnamed: 0,filename,shot_id,begin,end,caption
119,5531550228324592939.xml,120,313.11,315.23,DOOR SLAMS
237,5531550228324592939.xml,238,679.15,688.00,BABY CRIES
246,5531550228324592939.xml,247,746.20,750.16,THEY LAUGH
262,5531550228324592939.xml,263,777.11,779.17,THEY LAUGH
345,5531550228324592939.xml,346,990.16,991.16,CRASHING AND GLASS BREAKING
...,...,...,...,...,...
21152,5555360238519252381.xml,1677,5081.22,5083.10,MACHINE BLEEPS
21154,5555360238519252381.xml,1679,5087.08,5091.13,MACHINE BLEEPS FASTER
21158,5555360238519252381.xml,1683,5095.23,5098.11,SHE SCREAMS ALARM ON MACHINE BLEEPS
21159,5555360238519252381.xml,1684,5098.12,5105.03,SHE STRUGGLES FOR BREATH


In [36]:
df_shot_captions.to_csv('shot-aligned_captions.csv')

In [37]:
df_shot_tts[df_shot_tts.transcript != '']

Unnamed: 0,filename,shot_id,begin,end,transcript
6,5531550228324592939.xml,7,25.06,28.07,"Getting all dolled up, going somewhere nice to..."
9,5531550228324592939.xml,10,29.08,31.15,"Yeah, curry night at Dad's."
10,5531550228324592939.xml,11,31.16,34.15,Wicked. If you like yesterday's chicken with ...
12,5531550228324592939.xml,13,35.01,36.10,In your condition it's nice to have someone c...
13,5531550228324592939.xml,14,36.11,44.08,"My condition? I'm pregnant, not paralysed. Ev..."
...,...,...,...,...,...
21160,5555360238519252381.xml,1685,5105.04,5106.14,Bye-bye.
21164,5555360238519252381.xml,1689,5112.16,5156.10,Subtitles by Red Bee Media Ltd E-mail subtitl...
21200,5555360238519252381.xml,1725,5214.07,5216.23,You take control in Shadows Of The Vashta Ner...
21201,5555360238519252381.xml,1726,5216.24,5218.03,the latest online adventure game.


In [38]:
df_shot_tts.to_csv('shot-aligned_transcripts.csv')