# SRT-file to CSV-file
## Input: SRT-file
## Output: CSV-file

In [22]:
import pandas as pd
import pysrt
import time

In [23]:
# read the subtitle file
subs = pysrt.open('data/les_mis.srt')

# create an empty dataframe holding the annotation columns and the subtitle data
df = pd.DataFrame(
    columns=[
        'speaker', 'addressed_recipients', 'all_recipients', 'text',
        'start_hhmmss', 'end_hhmmss', 'start', 'end',
    ]
)

In [24]:
# fill dataframe
def _to_hhmmss(seconds):
    """Format seconds as HH:MM:SS, rounded to the nearest whole second."""
    return time.strftime('%H:%M:%S', time.gmtime(round(seconds)))


def _count_letters(s):
    """Return the number of alphabetic characters in s."""
    return sum(1 for ch in s if ch.isalpha())


def _make_row(text, start, end):
    """Build one dataframe row; the annotation columns are left unset."""
    return {'text': text, 'start': start, 'end': end,
            'start_hhmmss': _to_hhmmss(start), 'end_hhmmss': _to_hhmmss(end)}


# rows are collected in a list and turned into a dataframe once at the end,
# instead of growing the dataframe one row at a time
rows = []

for sub in subs:

    # get sub start- and end-time in seconds
    start = sub.start.ordinal / 1000
    end = sub.end.ordinal / 1000

    # check if text-block only has one speaker
    # (https://www.medietextarna.se/wp-content/uploads/2020/11/Guidelines-for-subtitling-in-Sweden-2020.pdf)
    text_lines = sub.text.split('\n')

    # two speakers are assumed only when the block has more than one line
    # and the first two lines both start with a dash; startswith() is used
    # so that an empty line does not raise an IndexError
    two_speakers = (len(text_lines) > 1
                    and text_lines[0].startswith('-')
                    and text_lines[1].startswith('-'))

    # one speaker: keep the whole block as one row, joined onto a single line
    if not two_speakers:
        rows.append(_make_row(sub.text.replace('\n', ' '), start, end))
        continue

    # two speakers: split the timespan of the sub between the lines,
    # proportionally to the number of letters in each line
    timespan = end - start
    text_len = _count_letters(sub.text)

    line_start = start
    for line in text_lines:

        # calc the line's text-fraction; if the block has no letters at all,
        # split the timespan evenly to avoid dividing by zero
        if text_len:
            line_frac = _count_letters(line) / text_len
        else:
            line_frac = 1 / len(text_lines)

        # calc end-time for line
        line_end = line_start + line_frac * timespan

        # clean text: drop the leading speaker dash, but only if there is one
        text = line[1:] if line.startswith('-') else line

        rows.append(_make_row(text, line_start, line_end))

        # set next line start-time
        line_start = line_end

# add the collected rows to the dataframe, keeping its column order
new_rows = pd.DataFrame(rows, columns=df.columns)
df = new_rows if df.empty else pd.concat([df, new_rows], ignore_index=True)


In [26]:
# preview the first rows of the dataframe (at most 11)
df.head(11)

Unnamed: 0,speaker,addressed_recipients,all_recipients,text,start_hhmmss,end_hhmmss,start,end
0,,,,You can't sleep here.,00:03:01,00:03:03,181.348,182.821103
1,,,,Get away from me.,00:03:03,00:03:04,182.821103,184.018
2,,,,Why don't you go to an inn?,00:03:04,00:03:06,184.226,185.812
3,,,,Why do you think?,00:03:06,00:03:07,186.02,187.063575
4,,,,Did you knock on doors? Ask people?,00:03:07,00:03:09,187.063575,189.231
5,,,,I asked. I asked everywhere. Leave me alone.,00:03:09,00:03:13,189.439,193.235
6,,,,You didn't ask there.,00:03:13,00:03:15,193.443,194.779
7,,,,Knock on that door.,00:03:19,00:03:21,199.282,200.785
8,,,,Who can that be?,00:03:31,00:03:33,211.253,212.546
9,,,,Do you have any food you can spare?,00:03:38,00:03:40,218.218,220.162818


In [27]:
# save the dataframe to 'x.csv' as a semicolon-separated csv, without the index column
df.to_csv('x.csv', sep=';', index=False)

# More

# Annotation - calculate t

In [3]:
def calc_t(string, substring, start, end):
    """Estimate the point in time at which `substring` ends within `string`.

    Only alphabetic characters are counted. The returned value is `start`
    plus the part of the `start`..`end` span given by the ratio of letters
    in `substring` to letters in `string`.

    Raises ZeroDivisionError if `string` contains no alphabetic characters.
    """
    def letter_count(s):
        # str.isalpha yields True/False per character; summing counts the letters
        return sum(map(str.isalpha, s))

    fraction = letter_count(substring) / letter_count(string)
    return start + (end - start) * fraction

In [6]:
# example: estimate the time at which the first sentence of a subtitle ends
complete_string = 'I forget things. What have I forgotten?'
substring = 'I forget things. '
t = calc_t(string=complete_string, substring=substring,
           start=3760.6228, end=3762.801)
print(t)

3761.5362387096775


# Merge rows
## Input: Annotated csv
## Output: Merged annotated csv

In [None]:
# load data: read the semicolon-separated csv 'x.csv' into a dataframe
df = pd.read_csv("x.csv", sep=";")

In [None]:
# merge csv rows
#
# Consecutive rows with the same speakers, addressed and recipients values
# are merged into a single row: the texts are joined with a space, the start
# time of the first row is kept and the end time of the last row is used.
#
# NOTE: missing values (NaN) never compare equal to each other, so rows with
# a missing value in any of the three key columns are never merged.
# NOTE(review): the x.csv written earlier in this notebook has the columns
# 'speaker', 'addressed_recipients' and 'all_recipients', while this cell reads
# 'speakers', 'addressed' and 'recipients' - presumably the columns are renamed
# during annotation; confirm against the annotated file.

key_columns = ['speakers', 'addressed', 'recipients']
output_columns = key_columns + ['text', 'start_hhmmss', 'end_hhmmss', 'start', 'end']

# merged rows are collected in a list and converted to a dataframe once
merged_rows = []

# the row currently being built (None until the first input row is seen)
current = None

for record in df.to_dict('records'):

    same_group = current is not None and all(
        current[key] == record[key] for key in key_columns
    )

    if same_group:
        # extend the current row with the text and end time of this row
        current['text'] = current['text'] + ' ' + record['text']
        current['end_hhmmss'] = record['end_hhmmss']
        current['end'] = record['end']
    else:
        # speakers, addressed or recipients changed: store the finished row
        # and start a new one
        if current is not None:
            merged_rows.append(current)
        current = {column: record[column] for column in output_columns}

# store the last row as well, so it is included even when it was not merged
# into the row before it
if current is not None:
    merged_rows.append(current)

df_merged = pd.DataFrame(merged_rows, columns=output_columns)

df_merged.tail()

In [None]:
# save the merged dataframe to 'x_merged.csv' as a semicolon-separated csv, without the index column
df_merged.to_csv('x_merged.csv', sep=';', index=False)