In [3]:
# low-level modules
class RawSubtitle:
  """
  Loads a .vtt subtitle file and splits its content into per-cue line groups.

  The first `offset` lines of the file are discarded, the first remaining line
  is overwritten with a blank line so that it acts as the opening separator,
  and every line consisting of a single newline is treated as a cue boundary.

  Args:
    subtitle_path: Path of the .vtt file to read. The default of None is kept
      for backward compatibility only; `open` raises TypeError for it.
    offset: Number of leading lines to skip before tokenizing.
  """
  def __init__(self, subtitle_path: str = None, offset: int = 14):
    self.subtitle_path = subtitle_path
    self._lines = None
    self._breaks = None
    self._set_lines(offset)
    self._set_breaks()

  def _set_lines(self, offset):
    """Read the file, drop the first `offset` lines and store the remainder."""
    # The context manager closes the handle; WebVTT files are UTF-8 by
    # specification, so do not depend on the platform default encoding.
    with open(self.subtitle_path, encoding='utf-8') as subtitle_file:
      _lines = subtitle_file.readlines()[offset:]
    if _lines:
      # Force the first kept line to be a separator so that the first cue
      # has an opening break. Skipped when nothing is left after the offset.
      _lines[0] = '\n'
    self._lines = _lines

  def _set_breaks(self):
    """Record the index of every blank (newline-only) line."""
    self._breaks = [index for index, line in enumerate(self._lines) if line == '\n']

  def tokenized(self) -> list:
    """
    Return the cues as a list of line lists.

    Each cue is the run of lines strictly between two consecutive blank lines.
    A final cue that is not followed by a blank line is still returned, and
    empty groups produced by consecutive blank lines are left out.
    """
    boundaries = list(self._breaks)
    if self._lines and self._lines[-1] != '\n':
      # No trailing blank line: treat end-of-file as the closing separator,
      # otherwise the last cue would be dropped.
      boundaries.append(len(self._lines))
    cues = (self._lines[start + 1:stop] for start, stop in zip(boundaries, boundaries[1:]))
    return [cue for cue in cues if cue]


def get_numbers(string: str) -> int:
  """
  Concatenate every decimal digit found in `string` and return it as an int.

  Only characters accepted by `int()` are kept (`str.isdecimal`); characters
  such as superscripts or fractions are numeric but not parseable, so they
  are ignored instead of making the conversion fail.

  Raises:
    ValueError: if `string` contains no decimal digit at all.
  """
  digits = ''.join(char for char in string if char.isdecimal())
  if not digits:
    raise ValueError(f'no decimal digits found in {string!r}')
  return int(digits)


class Dialog:
  """
  A Dialog object represents a single subtitle cue and encapsulates its data.

  Args:
    raw_dialog: Lines of one cue. Item 0 holds the cue number, item 1 the
      timing line and all remaining items the text lines.

  Attributes are sliced from the timing line at fixed positions, which
  matches the layout `HH:MM:SS.mmm --> HH:MM:SS.mmm<metadata>`:
  characters 0-11 are the start, 17-28 the end, and everything from 29 on
  (including the trailing newline) is kept verbatim as metadata.
  """
  def __init__(self, raw_dialog: list):
    self.number = get_numbers(raw_dialog[0])
    timing_line = raw_dialog[1]
    self.start = timing_line[:12]
    self.end = timing_line[17:29]
    self.metadata = timing_line[29:]
    self.text = raw_dialog[2:]

  # NOTE: this method shadows the built-in instance `__dict__` attribute, so
  # `vars(dialog)` no longer yields the attribute mapping. It is kept as a
  # method because DialogFactory.to_dataframe calls `dialog.__dict__()`.
  def __dict__(self):
    """Return the cue as a flat dict with the text joined into one string without newlines."""
    return {
        "number": self.number,
        "start": self.start,
        "end": self.end,
        "text": "".join(self.text).replace('\n', '')
    }

  def to_dict(self) -> dict:
    """Return the cue as a dict; `text` is the list of raw text lines."""
    return {
        "number": self.number,
        "start": self.start,
        "end": self.end,
        "text": self.text
    }

  def __repr__(self) -> str:
    """Render the cue as number, timing line and text, as written by write_to_vtt."""
    return f'\n{self.number}\n {self.start} --> {self.end} {self.metadata}\n {"".join(self.text)}'


class DialogFactory:
  """
  Factory that turns a subtitle file into a list of Dialog objects.

  Args:
    subtitle_file_path: Path of the .vtt file; it is read immediately and the
      resulting dialogs are stored in `self.dialogs`.
  """
  def __init__(self, subtitle_file_path: str) -> None:
    self.subtitle_file_path = subtitle_file_path
    self.dialogs = self.factory()

  def factory(self) -> list:
    """Tokenize the subtitle file and build one Dialog per token."""
    _tokens = RawSubtitle(self.subtitle_file_path).tokenized()
    return [Dialog(token) for token in _tokens]

  def find_unique_times(self) -> dict:
    """
    Group the dialogs by start time.

    Returns a dict mapping each start time to the first dialog that has it,
    in order of first appearance. The text of every later dialog with the
    same start time is appended to that first dialog.

    Note: the merge happens in place on the dialogs held by this factory, so
    calling this twice without replacing `self.dialogs` appends the repeated
    texts again.
    """
    time_dict = {}
    for dialog in self.dialogs:
      # Direct dict membership keeps this a single O(n) pass.
      if dialog.start in time_dict:
        time_dict[dialog.start].text += dialog.text
      else:
        time_dict[dialog.start] = dialog
    return time_dict

  def write_to_vtt(self, file_name) -> None:
    """Write every dialog to `<file_name>.vtt`, each preceded by a newline."""
    # WebVTT is UTF-8 by specification; do not rely on the platform default.
    with open(f'{file_name}.vtt', 'w', encoding='utf-8') as f:
      for dialog in self.dialogs:
        f.write(f'\n{dialog}')

  def to_dataframe(self):
    """Return a pandas DataFrame with one row per dialog (number, start, end, text)."""
    # Imported here so the method does not depend on a global `pd` that a
    # later notebook cell may or may not have created yet.
    import pandas as pd
    return pd.DataFrame([dialog.__dict__() for dialog in self.dialogs])



# renumbering: change the number attr in the dialogs
def renumber_dialogs(factory: 'DialogFactory'):
  """
  Collapse dialogs that share a start time and renumber the result from 1.

  Uses `factory.find_unique_times()` to obtain one dialog per start time,
  assigns consecutive numbers in that order and replaces `factory.dialogs`
  with the collapsed list. Returns None; the factory is modified in place.
  """
  unique_dialogs = list(factory.find_unique_times().values())
  # enumerate gives each dialog its position directly instead of searching
  # the list again for every element.
  for position, dialog in enumerate(unique_dialogs, start=1):
    dialog.number = position
  factory.dialogs = unique_dialogs

In [4]:
#main module

# reset the start and end times in the de version
def reset_de_time(factory: 'DialogFactory', reference: 'DialogFactory' = None, skip: int = 1):
  """
  Copy start and end times from a reference track onto the dialogs of `factory`.

  Args:
    factory: Track whose dialogs receive the new times (the German track).
    reference: Track that supplies the times. When omitted, the module-level
      `en_dialogs` is used, which is what the original one-argument call did.
    skip: Number of leading reference dialogs to drop before pairing. The
      default of 1 reflects the original note that the English subtitle has
      one excessive subtitle.

  Side effect: `reference.dialogs` is replaced by the list without the first
  `skip` entries, so repeated calls keep dropping entries.

  Raises:
    IndexError: if the reference has fewer dialogs left than `factory` has.
      This is checked before anything is modified.
  """
  if reference is None:
    reference = en_dialogs  # backward-compatible fallback to the notebook global
  trimmed = reference.dialogs[skip:]
  if len(trimmed) < len(factory.dialogs):
    raise IndexError(
        f'reference has {len(trimmed)} dialogs after skipping {skip}, '
        f'but {len(factory.dialogs)} are needed'
    )
  reference.dialogs = trimmed
  for target, source in zip(factory.dialogs, trimmed):
    target.start = source.start
    target.end = source.end

if __name__ == '__main__':
  # Imported before any work starts so a missing dependency fails before
  # files are written; it also provides the global `pd` used by to_dataframe.
  import pandas as pd

  print('working...')

  # Load both subtitle tracks.
  de_dialogs = DialogFactory('de_70105212.vtt')
  en_dialogs = DialogFactory('en_70105212.vtt')

  # Merge cues that share a start time, then renumber each track.
  renumber_dialogs(en_dialogs)
  renumber_dialogs(de_dialogs)

  # Give the German cues the English timings (also trims the English track).
  reset_de_time(de_dialogs)

  en_dialogs.write_to_vtt('enEdited')
  de_dialogs.write_to_vtt('deEdited')

  # Side-by-side sheet: English columns first, cut to the German row count.
  df_de = de_dialogs.to_dataframe().drop(axis=1, labels='end')
  df_en = en_dialogs.to_dataframe().drop(axis=1, labels='end').loc[:len(df_de)-1]
  pd.concat([df_en, df_de],axis=1).to_excel('subtitle.xlsx')

working...


In [5]:
import pandas as pd

# Build one frame per track without the 'end' column, cut the English frame
# to the German row count, and save both side by side as an Excel sheet.
df_de = de_dialogs.to_dataframe().drop(columns='end')
df_en = en_dialogs.to_dataframe().drop(columns='end').loc[:len(df_de) - 1]
pd.concat(
    [df_en, df_de],
    axis='columns',
).to_excel('subtitle.xlsx')

In [6]:
# Show the English and German frames side by side (bare last expression, so
# the notebook renders it as a table).
pd.concat([df_en, df_de], axis='columns')

Unnamed: 0,number,start,text,number.1,start.1,text.1
0,2,00:00:32.700,"<c.white><c.mono_sans>All right, Jim,</c.mono_...",1,00:00:32.700,<c.bg_transparent>SCRANTON BEGRÜSST SIE</c.bg_...
1,3,00:00:33.784,<c.white><c.mono_sans>your quarterlies</c.mono...,2,00:00:33.784,"<c.bg_transparent>Jim, deine Quartalszahlen se..."
2,4,00:00:36.161,<c.white><c.mono_sans>How are things going</c....,3,00:00:36.161,<c.bg_transparent>Wie läuft es mit der Büchere...
3,5,00:00:38.330,"<c.white><c.mono_sans>Oh, I told you.</c.mono_...",4,00:00:38.330,<c.bg_transparent>Ich konnte den Vertrag nicht...
4,6,00:00:40.958,<c.white><c.mono_sans>So you've come</c.mono_s...,5,00:00:40.958,<c.bg_transparent>Also bist du zum Meister gek...
...,...,...,...,...,...,...
471,473,00:17:47.107,<c.white><c.mono_sans>Right?</c.mono_sans></c....,472,00:17:47.107,"<c.bg_transparent>Ja, klar, du auch.</c.bg_tra..."
472,474,00:17:48.317,"<c.white><c.mono_sans>Um, I guess the atmosphe...",473,00:17:48.317,<c.bg_transparent>Viel Spaß dabei.</c.bg_trans...
473,475,00:17:51.821,<c.white><c.mono_sans>is that I'm a friend fir...,474,00:17:51.821,<c.bg_transparent>Kommt mal mit.</c.bg_transpa...
474,476,00:17:53.823,<c.white><c.mono_sans>and a boss second.</c.mo...,475,00:17:53.823,<c.bg_transparent>DER BESTE BOSS DER WELT</c.b...
