`!pip install webvtt-py` (https://github.com/glut23/webvtt-py)

`!pip install XlsxWriter`

Before running this notebook, delete any entries in the sbv files whose timestamp is followed by empty text.

In [1]:
import pandas as pd
import numpy as np
import webvtt
from datetime import datetime
from difflib import SequenceMatcher

# Input: Chinese sbv file (loaded into the "Chinese" column)
InOriginal = "C:\\Users\\Jiachen\\OneDrive\\YouTube Subtitles\\Youtube LeTV Published\\4509 Templates with Chinese subs\\Empress In The Palace (YouTube Template) - E20 - converted.sbv"
# Input: English translation sbv file (loaded into the "Translation" column)
InTranslation = "C:\\Users\\Jiachen\\OneDrive\\YouTube Subtitles\\Youtube LeTV Published\\EP20volunteers.sbv"
# Input: revised English translation sbv file (loaded into the "Revised" column)
InRevised = "C:\\Users\\Jiachen\\OneDrive\\YouTube Subtitles\\Youtube LeTV Published\\EP20Carsen.sbv"
# Output: path of the Excel workbook that receives the side-by-side comparison
OutFile = "C:\\Users\\Jiachen\\OneDrive\\YouTube Subtitles\\测试 培训\\Carsen修改前后对比\\EP20前后对照(汉英完整版).xlsx"

In [2]:
def sbv2df(sbv,textCol):
    """
    Load an sbv subtitle file into a pandas DataFrame, one row per caption.

    Input args
        sbv (string): the file path of an sbv file
        textCol (string): the name of the text column

    Returns
        A DataFrame with the columns 'start', 'end' and textCol, in that
        order. 'start' and 'end' are datetime.time values parsed from the
        caption timestamps ('%H:%M:%S.%f'); line breaks inside the caption
        text are replaced by spaces. A file without captions gives an empty
        DataFrame that still has the three columns.
    """
    columns = ['start', 'end', textCol]
    timeFormat = '%H:%M:%S.%f'
    # Keep the parsed captions in a local name. Rebinding the global `webvtt`
    # here would replace the imported module with the parsed result.
    captions = webvtt.from_sbv(sbv)
    data = [{'start': datetime.strptime(caption.start, timeFormat).time(),
             'end': datetime.strptime(caption.end, timeFormat).time(),
             textCol: caption.text}
            for caption in captions]
    # Passing `columns` fixes the column order and keeps the three columns
    # present even when the file contains no captions.
    df = pd.DataFrame(data, columns=columns)
    return df.replace('\n', ' ', regex=True)

In [3]:
# Parse the three subtitle files, giving each text column its own name.
original, translation, revised = (
    sbv2df(path, column)
    for path, column in (
        (InOriginal, "Chinese"),
        (InTranslation, "Translation"),
        (InRevised, "Revised"),
    )
)

In [4]:
# Index every frame by its (start, end) time segment so rows can be aligned.
segmentKeys = ['start', 'end']
o, t, r = (frame.set_index(segmentKeys) for frame in (original, translation, revised))
# Outer join: keep segments that exist in only one of the two English files.
output = t.join(r, how='outer')
output[13:23]

Unnamed: 0_level_0,Unnamed: 1_level_0,Translation,Revised
start,end,Unnamed: 2_level_1,Unnamed: 3_level_1
00:02:08,00:02:17,Who can break the spell of romance?,Who can break the spell of romance?
00:02:17,00:02:22,[contact translators] carsenwei@yahoo.com [sub...,[contact translators] carsenwei@yahoo.com [sub...
00:02:25.180000,00:02:28.620000,I bow in respect before Lady Sourire.,
00:02:26.420000,00:02:29.860000,,I bow in respect before Lady Sourire.
00:02:30.820000,00:02:32.580000,"What are you carrying, Mr Jiang?",
00:02:30.820000,00:02:33.480000,,"What is it, sir, that you have to carry person..."
00:02:34.100000,00:02:36.720000,His majesty has specially bestowed this to you.,
00:02:34.100000,00:02:38.440000,,"His Majesty specially sends this to you, My Lady."
00:02:39.900000,00:02:40.840000,"Look, my Lady!",
00:02:39.900000,00:02:41.760000,,"My Lady, have a look if you please."


```python
# Print every segment, labelled by whether its start time differs from the
# start time of the previous printed group.
firstStart = output.index[0][0]
lastStart = firstStart
for index, row in output.iterrows():
    start, end = index
    if start == firstStart:  #segments sharing the first row's start time are skipped
        continue
    if start != lastStart: #different start time from last time segment
        lastStart = start
        print("different start time")
    else: #same start time with last time segment
        print("same start time")
    print(start,end)
    # .iloc makes the positional access explicit; row[0] on a string-labelled
    # Series relies on a deprecated fallback.
    print(row.iloc[0],row.iloc[1],'\n')
```
Example output below

different start time
00:02:25.180000 00:02:28.620000
I bow in respect before Lady Sourire. nan 

different start time
00:02:26.420000 00:02:29.860000
nan I bow in respect before Lady Sourire. 

different start time
00:02:30.820000 00:02:32.580000
What are you carrying, Mr Jiang? nan 

same start time
00:02:30.820000 00:02:33.480000
nan What is it, sir, that you have to carry personally? 

In [5]:
# For each time segment that has the same start time as the segment before it,
# fill a missing Translation (column 0) or Revised (column 1) with the value
# taken from the first segment of that start time.

firstStart = output.index[0][0]
lastStart = firstStart
for pos, start in enumerate(output.index.get_level_values(0)):
    if start == firstStart:  #segments sharing the first row's start time are left untouched
        continue
    if start != lastStart: #different start time from last time segment
        lastStart = start
        lastTranslation = output.iloc[pos, 0]
        lastRevised = output.iloc[pos, 1]
        continue
    #same start time with last time segment
    # Write through output.iloc: assigning to the row yielded by iterrows()
    # is not guaranteed to change the DataFrame.
    # `not` instead of `~`: ~ applied to a Python bool gives -1 or -2, which
    # are both truthy, so the check would never filter anything.
    if pd.isna(output.iloc[pos, 0]) and not pd.isna(lastTranslation):
        output.iloc[pos, 0] = lastTranslation
    if pd.isna(output.iloc[pos, 1]) and not pd.isna(lastRevised):
        output.iloc[pos, 1] = lastRevised
output[13:23]

Unnamed: 0_level_0,Unnamed: 1_level_0,Translation,Revised
start,end,Unnamed: 2_level_1,Unnamed: 3_level_1
00:02:08,00:02:17,Who can break the spell of romance?,Who can break the spell of romance?
00:02:17,00:02:22,[contact translators] carsenwei@yahoo.com [sub...,[contact translators] carsenwei@yahoo.com [sub...
00:02:25.180000,00:02:28.620000,I bow in respect before Lady Sourire.,
00:02:26.420000,00:02:29.860000,,I bow in respect before Lady Sourire.
00:02:30.820000,00:02:32.580000,"What are you carrying, Mr Jiang?",
00:02:30.820000,00:02:33.480000,"What are you carrying, Mr Jiang?","What is it, sir, that you have to carry person..."
00:02:34.100000,00:02:36.720000,His majesty has specially bestowed this to you.,
00:02:34.100000,00:02:38.440000,His majesty has specially bestowed this to you.,"His Majesty specially sends this to you, My Lady."
00:02:39.900000,00:02:40.840000,"Look, my Lady!",
00:02:39.900000,00:02:41.760000,"Look, my Lady!","My Lady, have a look if you please."


In [6]:
# Clean output (df): when a start time has exactly two segments, drop the first
# of the two, like the following rows in the above output
    # 00:02:30.820000	00:02:32.580000
    # 00:02:34.100000	00:02:36.720000
#But keep the rows whose start time occurs only once, like
    # 00:02:25.180000	00:02:28.620000
    # 00:02:26.420000	00:02:29.860000

firstStart = output.index[0][0]
lastStart = firstStart
toDrop = []
for index in output.index:
    start = index[0]
    # Only the first segment of each start time is examined; segments sharing
    # the first row's start time are never dropped.
    if start == firstStart or start == lastStart:
        continue
    lastStart = start
    if output.loc[start].shape[0] == 2:   #one start (index) matches 2 end (index)
        toDrop.append(index)
# Collect first and drop afterwards, so the frame is not modified while it is
# being iterated. drop() returns a new frame, so output keeps all its rows.
clean = output.drop(index=toDrop)
clean[13:23]

Unnamed: 0_level_0,Unnamed: 1_level_0,Translation,Revised
start,end,Unnamed: 2_level_1,Unnamed: 3_level_1
00:02:17,00:02:22,[contact translators] carsenwei@yahoo.com [sub...,[contact translators] carsenwei@yahoo.com [sub...
00:02:25.180000,00:02:28.620000,I bow in respect before Lady Sourire.,
00:02:26.420000,00:02:29.860000,,I bow in respect before Lady Sourire.
00:02:30.820000,00:02:33.480000,"What are you carrying, Mr Jiang?","What is it, sir, that you have to carry person..."
00:02:34.100000,00:02:38.440000,His majesty has specially bestowed this to you.,"His Majesty specially sends this to you, My Lady."
00:02:39.900000,00:02:41.760000,"Look, my Lady!","My Lady, have a look if you please."
00:02:41.900000,00:02:47.900000,,The sole is made of a famous variety of Lantia...
00:02:42.080000,00:02:47.020000,The jade that makes the sole is the famous var...,
00:02:48.340000,00:02:53.220000,The inside of the shoe is hollow and contains ...,The shoes were hollowed and filled with assort...
00:02:52.040000,00:02:54.120000,,"His Majesty calls it ""fragrant steps""."


In [7]:
#Calculate word change ratio (0-1) in clean (df)
# 0 means the two texts are identical, 1 means they have nothing in common.

# Compare on copies in which a missing text counts as an empty string;
# the Translation and Revised columns of clean keep their NaN values.
translationText = clean['Translation'].fillna('')
revisedText = clean['Revised'].fillna('')
# One assignment for the whole column instead of a .loc write per row while
# iterating over clean.
clean['WordChange'] = np.array(
    [1 - SequenceMatcher(None, before, after).ratio()
     for before, after in zip(translationText, revisedText)],
    dtype=float)
clean[13:23]

Unnamed: 0_level_0,Unnamed: 1_level_0,Translation,Revised,WordChange
start,end,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
00:02:17,00:02:22,[contact translators] carsenwei@yahoo.com [sub...,[contact translators] carsenwei@yahoo.com [sub...,0.0
00:02:25.180000,00:02:28.620000,I bow in respect before Lady Sourire.,,1.0
00:02:26.420000,00:02:29.860000,,I bow in respect before Lady Sourire.,1.0
00:02:30.820000,00:02:33.480000,"What are you carrying, Mr Jiang?","What is it, sir, that you have to carry person...",0.542169
00:02:34.100000,00:02:38.440000,His majesty has specially bestowed this to you.,"His Majesty specially sends this to you, My Lady.",0.25
00:02:39.900000,00:02:41.760000,"Look, my Lady!","My Lady, have a look if you please.",0.755102
00:02:41.900000,00:02:47.900000,,The sole is made of a famous variety of Lantia...,1.0
00:02:42.080000,00:02:47.020000,The jade that makes the sole is the famous var...,,1.0
00:02:48.340000,00:02:53.220000,The inside of the shoe is hollow and contains ...,The shoes were hollowed and filled with assort...,0.657459
00:02:52.040000,00:02:54.120000,,"His Majesty calls it ""fragrant steps"".",1.0


In [8]:
# Outer-join the Chinese captions (o) with the cleaned English comparison
# (clean) on their shared index, keeping segments found in only one of them.
output = o.join(clean, how='outer')
# Preview rows 13-22 of the combined frame.
output[13:23]

Unnamed: 0_level_0,Unnamed: 1_level_0,Chinese,Translation,Revised,WordChange
start,end,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
00:02:17,00:02:22,[contact translators] carsenwei@yahoo.com [sub...,[contact translators] carsenwei@yahoo.com [sub...,[contact translators] carsenwei@yahoo.com [sub...,0.0
00:02:25.180000,00:02:28.620000,,I bow in respect before Lady Sourire.,,1.0
00:02:25.760000,00:02:27.350000,奴才给莞贵人请安,,,
00:02:26.420000,00:02:29.860000,,,I bow in respect before Lady Sourire.,1.0
00:02:29.600000,00:02:30.870000,什么了不得的东西,,,
00:02:30.820000,00:02:33.480000,,"What are you carrying, Mr Jiang?","What is it, sir, that you have to carry person...",0.542169
00:02:31.120000,00:02:32.110000,要公公这样端着,,,
00:02:33.040000,00:02:35.030000,这是皇上特意赐予小主的,,,
00:02:34.100000,00:02:38.440000,,His majesty has specially bestowed this to you.,"His Majesty specially sends this to you, My Lady.",0.25
00:02:35.480000,00:02:36.710000,小主一看便知,,,


In [9]:
#Reset index so that "start" and "end" will appear in the Excel file
df = output.reset_index(level=['start','end'])

# Write to Excel file with formats.
# The with-block saves and closes the workbook on exit, also when an error
# occurs; ExcelWriter.save() is no longer available in pandas 2.x.
with pd.ExcelWriter(OutFile, engine='xlsxwriter') as writer: #https://xlsxwriter.readthedocs.io/index.html
    df.to_excel(writer, sheet_name='Sheet1', index=False)
    # Get the xlsxwriter objects from the dataframe writer object.
    workbook  = writer.book
    worksheet = writer.sheets['Sheet1']

    # Set the column width and format.
    # A:B = start/end, C:E = the three text columns (wrapped), F = WordChange.
    format1 = workbook.add_format({'text_wrap': True})
    worksheet.set_column('A:B', 12)
    worksheet.set_column('C:E', 38, format1)
    worksheet.set_column('F:F', 5)

    # Conditional formatting based on word change %
        # https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html
        #colors https://xlsxwriter.readthedocs.io/working_with_colors.html

    # Green fill with dark green text.
    format2 = workbook.add_format({'bg_color':   '#C6EFCE',
                                   'font_color': '#006100'})
    # Light red fill with dark red text.
    format3 = workbook.add_format({'bg_color':   '#FFC7CE',
                                   'font_color': '#9C0006'})

    # Light yellow fill with dark yellow text.
    format4 = workbook.add_format({'bg_color':   '#FFEB9C',
                                   'font_color': '#9C6500'})

    # Row 1 holds the column header, so the rules start at row 2.
    ratioCells = 'F2:F1048576'

    # Small change (< 0.4): green.
    worksheet.conditional_format(ratioCells, {'type':     'cell',
                                              'criteria': '<',
                                              'value':    0.4,
                                              'format':   format2})

    # Substantial change (0.4 - 0.99): red.
    worksheet.conditional_format(ratioCells, {'type':     'cell',
                                              'criteria': 'between',
                                              'minimum':  0.4,
                                              'maximum':  0.99,
                                              'format':   format3})

    # Above 0.99 (e.g. one of the two texts is missing): yellow.
    worksheet.conditional_format(ratioCells, {'type':     'cell',
                                              'criteria': '>',
                                              'value':    0.99,
                                              'format':   format4})



Other snippets that might be useful for the cleaning step

```python
# Non-null count of every column per start time.
# groupby(level=...).count() replaces DataFrame.count(level=...), which was
# removed in pandas 2.0.
startCounts = output.groupby(level='start').count()
startCounts[12:19] #a df of 1s and 0s
startCounts.iloc[14] #This is a pandas series
startCounts.iloc[14].name #datetime.time(0, 2, 25, 180000)
0 in startCounts.iloc[14].values #True

idx022518 = startCounts.iloc[14].name
output.loc[idx022518] #df with 1 row

idx023082 = startCounts.iloc[16].name
output.loc[idx023082] #df with 2 rows

df = output.loc[idx023082]
df.ffill() #forward fill; df.bfill() for backward fill (fillna(method=...) is deprecated)
```

Iterate a MultiIndex DataFrame by index and row

```python
# index is the (start, end) tuple of the MultiIndex; row is a Series holding
# the column values of that segment.
for index, row in output.iterrows():
    # .iloc makes the positional access explicit; row[0] on a string-labelled
    # Series relies on a deprecated fallback.
    firstValue, secondValue = row.iloc[0], row.iloc[1]
    print(index[0])
    print(type(firstValue),type(secondValue))
    print(firstValue)
    print(secondValue,"\n")
```
