## Configuration

In [None]:
%load_ext autoreload

import os
import sys

home = os.getenv("HOME")
use_gdrive = True

## Setup Directories

In [None]:
ds_to_use = ['HP1-FvM']

local_nlp_base_dir = f'{home}/shared'
local_data_base_dir = f'{local_nlp_base_dir}/NLP-Data'
local_data_dir = f'{local_data_base_dir}/audio'
extern_nlp_base_dir = None
extern_data_base_dir = None

runs_on_colab = (home == '/root')
print( f'runs on colab: {runs_on_colab}')

if not runs_on_colab:
    os.environ['http_proxy'] = 'http://192.168.8.50:3128'
    os.environ['https_proxy'] = 'http://192.168.8.50:3128'
else:
    # to get access to the datasets we use gdrive
    use_gdrive = True
    # install packages
    !pip install jiwer
    # create local directories
    !mkdir $local_nlp_base_dir
    !mkdir $local_data_base_dir

if use_gdrive:
    gdrive_base = '/content/gdrive'    
    extern_nlp_base_dir = f'{gdrive_base}/MyDrive'
    extern_data_base_dir = f'{extern_nlp_base_dir}/NLP-Data'
    extern_data_dir = f'{extern_data_base_dir}/audio'

    if not os.path.isdir(gdrive_base):
        from google.colab import drive
        drive.mount(gdrive_base)

if not os.path.isdir(local_data_dir):
    !mkdir $local_data_dir

if extern_nlp_base_dir:
    model_dir = f'{extern_nlp_base_dir}/NLP-Models/GermanWave2Vec'
else:
    model_dir = f'{local_nlp_base_dir}/NLP-Models/GermanWave2Vec'

git_views_dir = f'{local_nlp_base_dir}/gitviews'

if not os.path.isdir(git_views_dir):
    !mkdir $git_views_dir
    !cd $git_views_dir; git clone https://github.com/ElUnrast/GermanWave2Vec.git

runs on colab: True
mkdir: cannot create directory ‘/root/shared’: File exists
mkdir: cannot create directory ‘/root/shared/NLP-Data’: File exists


In [None]:
if runs_on_colab:
    git_view_path = f'{git_views_dir}/GermanWave2Vec'
    # !cd $git_view_path; git fetch --all; git reset --hard origin/main
    !cd $git_view_path; git fetch origin main; git reset --hard origin/main

script_path = f'{git_views_dir}/GermanWave2Vec/python'
sys.path.insert(0, script_path)

From https://github.com/ElUnrast/GermanWave2Vec
 * branch            main       -> FETCH_HEAD
HEAD is now at 218586c Erstellt mit Colaboratory


## Install packages and do Imports

In [None]:
import numpy as np
import pandas as pd
import sklearn
import jiwer
from jiwer import wer
from tqdm.notebook import tqdm_notebook
from sklearn.model_selection import train_test_split

In [None]:
%aimport SnippetDatasets
from SnippetDatasets import SnippetDatasets, calc_wer

## Initialize Helper Classes

In [None]:
# Helper object used by the following cells to list and load the datasets.
# It receives the Colab flag plus the local and the external audio directory.
my_datasets = SnippetDatasets(
    runs_on_colab, 
    local_audio_base_dir=local_data_dir, 
    extern_audio_base_dir=extern_data_dir
)

## Choose Dataset

In [None]:
# Every dataset id that is known either locally or externally.
all_ds_ids = set(my_datasets.local_datasets.keys()) | set(my_datasets.extern_datasets.keys())

import difflib
from functools import partial

import ipywidgets as widgets
from IPython.display import display, Audio

# One checkbox per dataset id in sorted order; ids listed in ds_to_use start
# out ticked.
ds_checkbox_items = []

for ds_id in sorted(all_ds_ids):
    checkbox = widgets.Checkbox(
        value=ds_id in ds_to_use,
        description=ds_id,
        disabled=False,
        indent=False,
    )
    ds_checkbox_items.append(checkbox)

def get_used_datasets(checkbox_items=ds_checkbox_items):
    """Return the descriptions of all ticked checkboxes.

    Parameters
    ----------
    checkbox_items : iterable
        Objects exposing ``value`` and ``description``; defaults to the
        checkboxes built for the dataset chooser.

    Returns
    -------
    list
        The ``description`` of every item whose ``value`` is truthy, in the
        order of ``checkbox_items``.
    """
    return [box.description for box in checkbox_items if box.value]

# Show all dataset checkboxes in one box; the current selection is read
# later through get_used_datasets().
print('Please choose Datasets to use')
ds_checkboxes_widget = widgets.Box(ds_checkbox_items)
display(ds_checkboxes_widget)

Please choose Datasets to use


Box(children=(Checkbox(value=True, description='HP1-FvM', indent=False), Checkbox(value=False, description='HP…

In [None]:
from IPython.display import (Audio, display, clear_output)
from ipywidgets import widgets
from functools import partial
from SnippetDatasets import calc_wer

def show_diff(text, n_text):
    """
    Render a character-level diff of two strings as an HTML snippet.

    Based on http://stackoverflow.com/a/788780

    Parameters
    ----------
    text : str
        Base string (the ``a`` side of the ``difflib.SequenceMatcher``).
    n_text : str
        String that is compared against ``text`` (the ``b`` side).

    Returns
    -------
    str
        A ``<p>`` element. Unchanged parts are copied from ``text``;
        insertions (red) and replacements (green) show the ``n_text`` side,
        deletions (blue) show the removed part of ``text``. Every changed
        span is prefixed with ``^``. All text fragments are HTML-escaped.

    Raises
    ------
    RuntimeError
        If the matcher reports an opcode other than ``equal``, ``insert``,
        ``delete`` or ``replace``.
    """
    # Local import: the fragments are embedded in markup, so characters such
    # as '<' or '&' must be escaped or they would corrupt the rendered HTML.
    from html import escape

    seqm = difflib.SequenceMatcher(None, text, n_text)
    output = ['<p style="font-size:60%;">']
    for opcode, a0, a1, b0, b1 in seqm.get_opcodes():
        if opcode == 'equal':
            output.append(escape(seqm.a[a0:a1]))
        elif opcode == 'insert':
            output.append("<font color=red>^" + escape(seqm.b[b0:b1]) + "</font>")
        elif opcode == 'delete':
            output.append("<font color=blue>^" + escape(seqm.a[a0:a1]) + "</font>")
        elif opcode == 'replace':
            # seqm.a[a0:a1] -> seqm.b[b0:b1]; only the new text is shown
            output.append("<font color=green>^" + escape(seqm.b[b0:b1]) + "</font>")
        else:
            raise RuntimeError("unexpected opcode")

    output.append('</p>')
    return ''.join(output)
    
class InvisibleAudio(Audio):
    """Audio output that is rendered inside a hidden container.

    The generated ``<audio`` tag gets an ``onended`` handler that removes the
    element from its parent node.
    """

    def _repr_html_(self):
        """Return the parent's HTML with the handler added, wrapped in a ``display:none`` div."""
        self_removing_tag = '<audio onended="this.parentNode.removeChild(this)"'
        markup = super()._repr_html_().replace('<audio', self_removing_tag)
        return '<div style="display:none">' + markup + '</div>'

def create_diff_row(audio_file, translated_text, original_text ):
    """Build one result row: a 'Play' button followed by the rendered diff.

    The diff markup comes from ``show_diff(original_text, translated_text)``.
    Clicking the button displays an autoplaying ``InvisibleAudio`` for
    ``audio_file``.

    Returns the ``widgets.HBox`` that holds the button and the diff.
    """
    def _play_snippet(_button):
        # The clicked button itself is not needed; only the file is played.
        display(InvisibleAudio(filename=audio_file, autoplay=True))

    diff_markup = show_diff(original_text, translated_text)
    diff_view = widgets.HTML(value=diff_markup)
    play_button = widgets.Button(description='Play')
    play_button.on_click(_play_snippet)

    return widgets.HBox([play_button, diff_view])

def create_diff_content(ds_id, ds, max_rows=50):
    """Build the list of diff rows for all wrongly translated entries.

    Parameters
    ----------
    ds_id : str
        Dataset id; used to look up the snippet directory.
    ds : pandas.DataFrame
        Must provide the columns ``OriginalText``, ``Datei`` and either
        ``Translated1`` or ``Translated0`` (``Translated1`` is preferred
        when present).
    max_rows : int, optional
        Upper bound for the number of rows that are rendered (default 50).

    Returns
    -------
    widgets.VBox
        One ``create_diff_row`` widget per rendered entry.
    """
    snipped_directory = my_datasets.get_snippet_directory(ds_id)
    total_count = len(ds)
    translation_row = 'Translated1' if 'Translated1' in ds.columns else 'Translated0'
    # Keep only the entries whose translation differs from the original text.
    mismatches = ds[ds['OriginalText'] != ds[translation_row]]
    wrong_count = len(mismatches)
    print(f'Use Snipped Directory: {snipped_directory} - {total_count}/{wrong_count} Wrong')
    rows = []
    row_count = min(max(max_rows, 0), wrong_count)

    for idx in tqdm_notebook(range(row_count)):
        entry = mismatches.iloc[idx]
        # Keyword arguments: create_diff_row declares translated_text before
        # original_text, so a positional call in "original, translated" order
        # would silently swap the two texts.
        rows.append(create_diff_row(
            audio_file=f'{snipped_directory}/{entry["Datei"]}',
            original_text=entry['OriginalText'],
            translated_text=entry[translation_row],
        ))

    return widgets.VBox(rows)

def create_validation_tab():
    """Create a tab widget with one diff page per selected dataset.

    For every id returned by ``get_used_datasets()`` the dataset is loaded
    via ``my_datasets.load_ds_content_translated_with_original``. The tab
    title shows the id together with ``100 * calc_wer(ds)``, and the page
    content comes from ``create_diff_content``.

    Returns the populated ``widgets.Tab``.
    """
    print('Validate')
    tab = widgets.Tab()
    pages = []  # (title, content widget) pairs in selection order

    for ds_id in get_used_datasets():
        ds = my_datasets.load_ds_content_translated_with_original(ds_id)
        wer = 100 * calc_wer(ds)
        title = f'{ds_id} - {wer:3.4f}'
        pages.append((title, create_diff_content(ds_id, ds)))

    tab.children = [content for _, content in pages]

    for position, (title, _) in enumerate(pages):
        tab.set_title(position, title)

    return tab

# Build the validation tabs for the currently ticked datasets and render them.
display(create_validation_tab())


Validate
Loading Dataset: HP1-FvM - content-translated-with_original.csv
Download and extract /content/gdrive/MyDrive/NLP-Data/audio/HP1-FvM.zip from gdrive
Pruning Dataset HP1-FvM with 14465 Entries
 - 14465 Entries left after Length Cut (min=31, max=4000)
 - 8055 Entries left after Action Cut
Dataset was truncated from 14465 to 8055 Entries. Saving Backup.
Use Snipped Directory: /root/shared/NLP-Data/audio/HP1-FvM - 8055/3869 Wrong


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




Tab(children=(VBox(children=(HBox(children=(Button(description='Play', style=ButtonStyle()), HTML(value='<p st…