In [12]:
import pandas as pd
import numpy as np

# Segment table, read as JSON Lines (one record per line). Later cells read its
# `file`, `kaldi_transcript` and `asr_transcription` columns.
mer = pd.read_json("021_JV_segments_with_metadata.jsonl", orient="records", lines=True)
# Drop every segment whose `file` value contains "_BzZf0fGg0E".
# NOTE(review): the reason for excluding this file is not recorded here - confirm.
mer = mer[~mer.file.str.contains("_BzZf0fGg0E")]
# Corpus table, read as JSON Lines. Later cells look rows up by `path` and read
# the `transcript` column.
corp = pd.read_json("020_JV_with_metadata.jsonl", orient="records", lines=True)

def corpus_handler(s:str) -> str:
    """Normalise a piece of corpus transcript for fuzzy comparison.

    Steps, in order:

    1. remove every occurrence of the speaker tag ``"JV:"``;
    2. delete all characters listed in ``string.punctuation`` (ASCII only);
    3. collapse runs of whitespace to single spaces and trim the ends;
    4. case-fold the result.

    The speaker tag has to be removed *before* punctuation is stripped:
    stripping first deletes the colon, after which ``"JV:"`` can no longer
    match and the tag would survive as the token ``"jv"``.

    Args:
        s: Raw transcript text.

    Returns:
        The normalised text; an empty string if nothing is left.
    """
    from string import punctuation

    without_speaker = s.replace("JV:", "")
    # One translate() pass instead of one replace() per punctuation character.
    without_punctuation = without_speaker.translate(
        str.maketrans("", "", punctuation)
    )
    return " ".join(without_punctuation.split()).casefold()

def segment_handler(s:str) -> str:
    """For segment transcripts: delete the anchor markers, leave the rest as is.

    Every occurrence of ``<anchor_start>`` and then of ``<anchor_end>`` is
    removed; no whitespace or case normalisation is applied.
    """
    cleaned = s
    for marker in ("<anchor_start>", "<anchor_end>"):
        cleaned = cleaned.replace(marker, "")
    return cleaned

def find_optimal_subset(segment, full, step=1, limit = None):
    """Return the contiguous word span of ``full`` that best matches ``segment``.

    Both strings are split on whitespace. Every candidate span
    ``full_words[start:end]`` on the search grid is normalised with
    ``corpus_handler`` and scored against the ``segment_handler``-cleaned
    segment using ``fuzzywuzzy.fuzz.ratio``. The highest-scoring span wins;
    on ties the candidate visited last is kept.

    Args:
        segment: Segment transcript to locate (may contain anchor markers).
        full: Text to search in.
        step: Grid spacing, in words, for both the start and the end index.
            When ``step != 1`` the winning span is widened by ``step`` words
            on each side (clamped to the text), so that a following, finer
            pass can trim it back.
        limit: If falsy, every start is tried and ends begin at ``start``.
            Otherwise starts are restricted to the first ``2 * limit`` words
            and ends to the last ``2 * limit`` words, i.e. only the edges of
            ``full`` are trimmed.

    Returns:
        The words of the best span joined by single spaces. The words come
        from ``full`` unchanged (not normalised).
    """
    from fuzzywuzzy.fuzz import ratio

    full = full.split()
    n_words = len(full)
    # The cleaned segment does not depend on the candidate span: compute it once.
    target = segment_handler(" ".join(segment.split()))

    if limit:
        # Clamp both windows to the text so short inputs neither produce
        # negative end indices nor start positions past the last word.
        start_stop = min(2 * limit, n_words)
        first_end = max(n_words - 2 * limit, 0)
    else:
        start_stop = n_words
        first_end = None

    best = 0
    # Fallback (only reachable for empty ``full``): the whole text.
    best_start, best_end = 0, n_words
    for start in range(0, start_stop, step):
        end_from = start if first_end is None else first_end
        # ``n_words + 1`` so that a span may include the final word; range()
        # excludes its stop value.
        for end in range(end_from, n_words + 1, step):
            current = ratio(target, corpus_handler(" ".join(full[start:end])))
            if current >= best:
                best_start, best_end = start, end
                best = current

    if step != 1:
        # Pad a coarse hit by one grid cell on each side for the next pass.
        best_start = max(best_start - step, 0)
        best_end = min(best_end + step, n_words)
    return " ".join(full[best_start:best_end])

In [13]:
# Spot-check on a single segment, picked by position in `mer`.
i = 300
segments_file = mer["file"].iloc[i]
# First corpus transcript whose `path` equals the segment's `file`.
full_transcript = corp.loc[corp["path"] == segments_file, "transcript"].iloc[0]
segments_kaldi_transcript = mer["kaldi_transcript"].iloc[i]

The best course of action turned out to be a step-down strategy: instead of matching word by word from the start, first match on blocks of 100 words ("hecto-words"), and only look for word-level matches once the best candidate string is far shorter.

Specifically, with a 100 -> 1 step-down, the execution time drops from 5 h to 4 s, a factor of about 4500, i.e. a speed-up of more than 36 dB!

Since this was still not fast enough, I also implemented a three-step version (100 -> 10 -> 1), and corrected the algorithm so that after the first iteration we only trim the ends of the candidate string.

In [14]:
# Pass 1: search the whole transcript on a 100-word grid.
coarse = find_optimal_subset(segments_kaldi_transcript, full_transcript, step=100)
# Pass 2: search inside the coarse hit on a 10-word grid; limit=100 restricts
# candidate starts to the first 200 words and candidate ends to the last 200.
medium = find_optimal_subset(segments_kaldi_transcript,  coarse, step=10, limit=100)
# Pass 3: word-level search inside the medium hit; limit=10 restricts candidate
# starts to the first 20 words and candidate ends to the last 20.
fine = find_optimal_subset(segments_kaldi_transcript,  medium, step=1, limit=10)

In [15]:
fine

'otklonili te probleme? 364 zdravstvene ustanove postoje u planu mreže u Srbiji. 14 ih ima u Nišu. Ukoliko kupujete 36 olovke, to je po novom zakonu, biće jeftinije nego ako kupujete 1. Isto tako se pokazalo se, pokazalo se i par problema, o kojima ja sada govorim,'

In [16]:
segments_kaldi_transcript

'otklonili te probleme<anchor_end> tristo šezdeset četiri zdravstvene ustanove postoje u planu mreže u srbiji četrnaest ih ima u nišu ukoliko kupujete trideset šest olovke to je po novom zakonu biće jeftinije nego ako kupujete jedan isto tako se pokazalo se pokazalo se i par problema o kojima ja sada govorim'

# Running it at full scale

In [20]:



def match_kaldi(row):
    """Find the raw-transcript span matching the row's ``kaldi_transcript``.

    Looks up the first transcript in the global ``corp`` frame whose ``path``
    equals ``row["file"]`` and narrows it down with three successive
    ``find_optimal_subset`` passes (step 100, then step 10 / limit 100, then
    step 1 / limit 10).

    Args:
        row: Mapping-like record with ``"file"`` and ``"kaldi_transcript"``.

    Returns:
        The matched span of the raw transcript as a string.
    """
    same_file = corp.path == row["file"]
    candidate = corp[same_file].transcript.values[0]
    query = row["kaldi_transcript"]

    # (step, limit) per pass, coarse to fine; each pass searches the previous hit.
    for grid_step, edge_limit in ((100, None), (10, 100), (1, 10)):
        candidate = find_optimal_subset(query, candidate, step=grid_step, limit=edge_limit)
    return candidate

# Parallel replacement for the slow serial loop
# (`for i, row in mer.iterrows(): mer.loc[i, ...] = match_kaldi(row)`).

from concurrent.futures import ProcessPoolExecutor

with ProcessPoolExecutor(max_workers=32) as executor:
    # executor.map returns a lazy iterator without a length; materialise it so
    # that the column assignment below receives a sized, reusable sequence and
    # any worker exception surfaces here rather than inside pandas.
    results = list(executor.map(match_kaldi, [row for _, row in mer.iterrows()]))
# Results come back in row order, so a positional assignment lines up with `mer`.
mer["Raw_transcript__matched_on_kaldi"] = results

KeyboardInterrupt: 

In [None]:
# Write the segment table to disk as JSON Lines (one record per line).
# NOTE(review): a later cell writes to this same path again and replaces the file.
mer.to_json("025_segments_matched_with_raw.jsonl", orient="records", lines=True)

In [None]:
def match_asr(row):
    """Find the raw-transcript span matching the row's ``asr_transcription``.

    Looks up the first transcript in the global ``corp`` frame whose ``path``
    equals ``row["file"]`` and narrows it down with three successive
    ``find_optimal_subset`` passes (step 100, then step 10 / limit 100, then
    step 1 / limit 10).

    Args:
        row: Mapping-like record with ``"file"`` and ``"asr_transcription"``.

    Returns:
        The matched span of the raw transcript as a string.
    """
    same_file = corp.path == row["file"]
    candidate = corp[same_file].transcript.values[0]
    query = row["asr_transcription"]

    # (step, limit) per pass, coarse to fine; each pass searches the previous hit.
    for grid_step, edge_limit in ((100, None), (10, 100), (1, 10)):
        candidate = find_optimal_subset(query, candidate, step=grid_step, limit=edge_limit)
    return candidate


from concurrent.futures import ProcessPoolExecutor

with ProcessPoolExecutor(max_workers=32) as executor:
    # Map match_asr (not match_kaldi) so that the column below really holds the
    # ASR-based matches. executor.map returns a lazy iterator without a length;
    # materialise it so the column assignment receives a sized sequence.
    results = list(executor.map(match_asr, [row for _, row in mer.iterrows()]))
# Results come back in row order, so a positional assignment lines up with `mer`.
mer["Raw_transcript__matched_on_asr"] = results

In [None]:
# Write the segment table to disk as JSON Lines (one record per line).
# NOTE(review): this reuses the path written by an earlier cell and replaces that file.
mer.to_json("025_segments_matched_with_raw.jsonl", orient="records", lines=True)