# Aufbereitung des Transcripted-Train-Datensatzes
Dieses Notebook bündelt die Skripte zur Bereinigung der Trainingsdaten, dokumentiert sämtliche Filterlisten und stellt Hilfsfunktionen für wiederholbare Analysen bereit. Alle Schritte sind so beschrieben, dass sie ohne zusätzlichen Kontext nachvollziehbar bleiben.


## Notebook-Überblick
- Importiert Statistik- und Audio-Helfer zur Untersuchung der Rohdaten.
- Definiert nachvollziehbare Qualitätslisten für problematische Nutzer, Transkripte und Sonderfälle.
- Lädt die TSV-Dateien, berechnet Kennzahlen und exportiert bereinigte Varianten.


In [None]:
# === Distanzmetriken & Textnormalisierung ===
import werpy  # Normalisiert Text, berechnet aber kein CER
from jiwer import cer, wer  # Liefert CER/WER auf normalisiertem Text
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction
smooth = SmoothingFunction().method1

# === Allgemeine Bibliotheken ===
import pandas as pd
import os
import numpy as np
import json
import time
from tqdm import tqdm  # Fortschrittsbalken

# === Audio-Handling ===
import soundfile as sf
from IPython.display import Audio, display

# === Visualisierung ===
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

from collections import Counter

# === Clustering ===
from sklearn.cluster import DBSCAN, HDBSCAN
from sklearn.preprocessing import StandardScaler
# import hdbscan


In [None]:
# Sanity check of all metrics on one hand-made sentence pair
ref = 'ich bin gross und stark'  # reference (original sentence)
hyp = 'bin ich so gross und stark'  # hypothesis to evaluate

# WER and CER are Levenshtein based: (substitutions + insertions + deletions) / N
wer_score = werpy.wers(ref, hyp)
print(f'WER: {wer_score} (0 = best, 1 = same N but every word was replaced)')

cer_score = cer(ref, hyp)
print(f"CER: {cer_score} (0 = best, 1 = same N but every char was replaced)")

# BLEU background
# TP = n-grams that occur in both ref and hyp
# Precision: TP / (number of n-grams in hyp)  -> the quantity BLEU is built on
# Recall:    TP / (number of n-grams in ref)
# n-gram: sequence of n words (e.g. 'ich bin', 'bin gross', ...)
# BP = brevity penalty; without it a short hyp consisting only of words from
#      ref would reach a BLEU of 100%
#      N_hyp >  N_ref: BP = 1
#      N_hyp <= N_ref: BP = e^(1 - N_ref / N_hyp)
# BLEU = BP * ((1-gram precision)^w1 * (2-gram precision)^w2 * ...)
# The weights w say how much each n-gram order counts; ideally they sum to 1.
# The sentences here are "only" 6-12 words long, so the default BLEU with up
# to 4-grams is not useful; the 2-gram variant (0.5, 0.5, 0, 0) is used instead.
bleu_references = [ref.split(' ')]
bleu_candidate = hyp.split(' ')

bleu_score_1gram = sentence_bleu(
    bleu_references, bleu_candidate, weights=(1, 0, 0, 0), smoothing_function=smooth
)
print(f"bleu 1-gram: {bleu_score_1gram}")

bleu_score_2gram = sentence_bleu(
    bleu_references, bleu_candidate, weights=(0.5, 0.5, 0, 0), smoothing_function=smooth
)
print(f"bleu 2-gram: {bleu_score_2gram}")

bleu_score_3gram = sentence_bleu(
    bleu_references, bleu_candidate, weights=(0.33, 0.33, 0.33, 0), smoothing_function=smooth
)
print(f"bleu 3-gram: {bleu_score_3gram}")

bleu_score_4gram = sentence_bleu(
    bleu_references, bleu_candidate, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smooth
)
print(f"bleu 4-gram: {bleu_score_4gram}")



In [None]:
# --- Paths (everything is resolved relative to the current working directory) ---
BASE_PATH = os.getcwd()
INPUT_FILENAME = "transcripted_train_all.tsv"
_DATASET_DIR = os.path.join(BASE_PATH, "datasets", "STT4SG-350", "Data_300", "Data_300")
_TRANSCRIPTS_DIR = os.path.join(BASE_PATH, "transcripts")
INPUT_PATH = os.path.join(_DATASET_DIR, "train_all.tsv")
OUTPUT_PATH = os.path.join(_TRANSCRIPTS_DIR, "transcripts_tsv", "tsv_350", INPUT_FILENAME)
JSON_PATH = os.path.join(_TRANSCRIPTS_DIR, "transcripts_json", "json_350")
AUDIO_PATH = os.path.join(_DATASET_DIR, "clips__train_valid")

# --- Thresholds used to drop outliers and corrupt data ---
# "keep <" / "keep >" states which side of the threshold survives the filter.

# keep <; between 1 and 5 there are very good examples but also completely
# unusable ones (see the inspection cells further down)
WER_threshold = 0.94
# keep <; same remark as for WER
CER_threshold = 1.0
# keep >
BLEU1_threshold = 0.1
# keep >
BLEU2_threshold = 0.05
# keep > (transcript length)
transc_len_threshold_low = 11
# keep < (transcript length)
transc_len_threshold_high = 160
# keep > (transcript word count); some good sentences are affected, so the
# benefit is debatable, but at 3...5 there are also defective audios
transc_count_threshold_low = 2
# keep >; down to -37 there are exclusively incomplete audio clips, above that
# good samples with Swiss German word order but still a few bad clips
len_difference_threshold_low = -35
# keep <; below 31 good Swiss German word orders get lost
len_difference_threshold_high = 31
# keep >; spelled-out numbers and words (e.g. "prozent") in the sentence vs.
# digits/abbreviations (e.g. %) in the transcript cause strongly negative values
len_diff_norm_threshold_low = -0.46
# keep <; the reverse case: digits in the sentence, spelled-out numbers in the transcript
len_diff_norm_threshold_high = 0.65

# in bytes, keep >
audiosize_threshold = 38400
# in seconds, keep >; value read off the histogram, which shows a clear
# separation of the short clips
duration_threshold = 1.9
# keep >; NaN confidences are replaced by -1 further down, those belong to
# transcripts of length 0 and can be filtered out
confidence_threshold = 0.42


## Manuelle Qualitätsfilter
Die Listen in diesem Abschnitt bündeln Nutzerinnen und Nutzer mit dauerhaft schlechter Audioqualität, typische Fehltranskripte und weitere Sonderfälle. Dadurch lässt sich jeder Filterentscheid vor dem eigentlichen Export transparent belegen.


In [None]:
# Speakers for whom some, but not all, recordings are of bad quality.
# Caveat: each of them contributed roughly 1100 clips.
low_quality_users = [
    '1add488f-cc2b-41ad-b40d-ca96bfac5d26',
    '7b009b41-3f12-4757-944d-a12e4d33fba1',
    '08859fe6-2f51-4deb-ad9a-e4ad1aea56fa',
    '485dbe58-ff41-4008-a803-520e5244768a',
]


In [None]:
# Invalid clips tend to produce the same stock transcripts; rows whose
# normalized transcript equals one of these entries are removed.
# NOTE: every entry must end with a comma. Adjacent string literals are
# concatenated silently by Python, which previously merged 'äh äh äh' with the
# following entry and disabled both.
# "vallis" in the notes below is kept from the original annotations
# (presumably the Valais dialect region - TODO confirm).
common_transcripts = [
    'so',                                                   # 1x short clip, bad quality
    'okay tschüss',                                         # 1x short clip, bad quality
    'tschüss',                                              # 35x robotic sound from user 1add488f-cc2b-41ad-b40d-ca96bfac5d26
    'vielen dank',                                          # 34x bad audio quality leads to this stock transcript
    'vielen dank fürs zuschauen',                           # 12x bad audio quality, vallis, robotic sound
    'bis zum nächsten mal',                                 # 10x bad audio quality, vallis, robotic sound
    'vielen dank fürs zuschauen und bis zum nächsten mal',  # 9x bad audio quality, vallis
    'vielen dank für die aufmerksamkeit',                   # 6x bad audio quality, vallis
    'vielen dank für ihre aufmerksamkeit',                  # 6x bad audio quality, vallis
    'äh äh äh',                                             # 5x robotic sound
    'das wars für heute wir sehen uns beim nächsten mal',   # 4x bad audio quality, vallis
    'das wars für heute bis zum nächsten mal',              # 4x bad audio quality, vallis
    'untertitelung des zdf 2020',                           # 4x small audio size, bad quality
    'äh äh äh äh',                                          # 3x robotic sound
    'danke',                                                # bad audio
    'amen',                                                 # short clips, bad quality
    'ja',                                                   # noises, music
    'untertitelung br 2018',                                # robotic sound
    'das ist das wichtigste',                               # short clip, bad quality, robotic sound
    'ciao',                                                 # short clip, robotic sound
    'das wars für heute',                                   # bad audio quality
    'das wars und wir sehen uns in der nächsten folge',
    '',                                                     # empty transcript
]

# Characters / character sequences that may show up in transcripts of defective audios
special_signs = ['*', '#', '(', ')', '@', '"', '...']

In [None]:
# Placeholder: clips whose audio is clearly understandable, but whose transcript
# is a stock sentence that does not match the audio (a different sentence).
# Nothing is filtered based on this list yet.
wrong_transcripts: list = []

In [None]:
# The clips meant for these lists have a WER between 2 and 2.5. The Swiss German
# ones should be kept, the others not. The lists are only a selection, not
# complete, and are not used anywhere yet.

# Sentences that are spoken quite differently in Swiss German but are correct.
# They are probably what we want, yet a WER filter would remove them.
# User 61b3a994-0787-4f02-915e-58740d1cd016 is probably from Valais; this
# person changed some sentences considerably.
swissgerman: list = []

# Clips for the lists below could not be caught by anything other than WER,
# which also removes Swiss German clips. The lists can be extended when needed;
# their entries are then meant to be taken out of the DataFrame.

# speakers who repeat or correct the sentence within one audio clip
correctors: list = []

# speakers who dislike the sentence and explain why they do not read it
zu_kompliziert: list = []

# audio is clear but does not match the sentence
random_sentence: list = []

# long clips in bad quality
long_bad_quality: list = []

# everything that should be removed, as one flat list
remove = [*correctors, *zu_kompliziert, *random_sentence, *long_bad_quality]

In [None]:
def print_clips(df_sorted, start=0, n_clips=5):
    """Print metrics and metadata for a window of clips and embed an audio player for each.

    Rows whose ``WER`` equals 0 are skipped and do not count towards
    ``start`` / ``n_clips``.

    Args:
        df_sorted: DataFrame that is walked through in its current row order.
            It must provide every column read below (metrics, length features,
            speaker metadata, ``audiosize``, ``duration`` and ``clip_path``).
        start: Number of rows with ``WER != 0`` to skip before printing.
        n_clips: Maximum number of clips to print.
    """
    from itertools import islice

    # Negative bounds behave like 0, matching the previous counter-based loop.
    first = max(start, 0)
    last = max(start + n_clips, 0)

    imperfect_rows = (row for _, row in df_sorted.iterrows() if not row["WER"] == 0)

    for row in islice(imperfect_rows, first, last):
        # len_difference_norm is a ratio, so it is formatted with '%' (x100)
        # instead of appending a percent sign to the raw fraction.
        print(
            f"WER: {row['WER']:.3f} | CER {row['CER']:.3f} | BLEU 1-gram {row['BLEU_1gram']:.3f} |  BLEU 2-gram {row['BLEU_2gram']:.3f}  | "
            f"confidence {row['confidence']:.3f} | transcript_len: {row['transcript_len']} | difference in length: {row['len_difference_norm']:.2%} ({row['len_difference']} chars)"
        )
        print(f"sentence       {row['sentence']} | ({row['sentence_norm']})")
        print(f"transcript     {row['transcript']} | ({row['transcript_norm']})\n")
        print(f"age: {row['age']} | gender: {row['gender']} | sentence_source: {row['sentence_source']} | dialect_region: {row['dialect_region']} | canton: {row['canton']}")
        print(f"{row['audiosize']/(10**3):.3f} kB | {row['duration']:.2f} sec | clip path: {row['clip_path']}")

        clip_path = os.path.join(AUDIO_PATH, row['clip_path'])
        try:
            data, samplerate = sf.read(clip_path)
            display(Audio(data=data, rate=samplerate))
        except Exception as e:
            # Best effort on purpose: an unreadable clip must not abort the
            # inspection of the remaining rows.
            print(f"[Fehler beim Abspielen: {e}]")
        print('\n')


def get_file_size(clip_path):
    """Return the size in bytes of the clip stored under ``AUDIO_PATH``.

    The size is used as a hint for defective audio files.
    """
    absolute_path = os.path.join(AUDIO_PATH, clip_path)
    return os.path.getsize(absolute_path)


def is_in_string(target_string, certain_string_list):
    """Return True if at least one entry of ``certain_string_list`` is a substring of ``target_string``."""
    return any(fragment in target_string for fragment in certain_string_list)

def is_certain_string(target_string, certain_string_list):
    """Return True if ``target_string`` equals one of the entries of ``certain_string_list`` exactly."""
    return any(candidate == target_string for candidate in certain_string_list)


In [None]:
# Drop exact duplicate rows from the transcript TSV and write the result back
# to the same file; the row count is printed before and after.
df_temp = pd.read_csv(OUTPUT_PATH, sep="\t")
rows_before = len(df_temp)
print(rows_before)

df_temp = df_temp.drop_duplicates()
rows_after = len(df_temp)
print(rows_after)

df_temp.to_csv(OUTPUT_PATH, sep='\t', index=False)

In [None]:
# Combine the transcribed dataset (OUTPUT_PATH) with the speaker and sentence
# metadata of the original dataset (INPUT_PATH); the join key is 'clip_path'.
metadata_columns = ['clip_path', 'sentence_source', 'dialect_region',
                    'canton', 'age', 'gender']

df = pd.read_csv(OUTPUT_PATH, sep="\t")

# the original dataset calls the key column "path"
df_input = pd.read_csv(INPUT_PATH, sep="\t").rename(columns={"path": "clip_path"})

# left join: every transcribed row is kept, metadata is attached where available
df = df.merge(df_input[metadata_columns], on='clip_path', how='left')

df.head(2)

In [None]:
# Print the column overview of the merged DataFrame
df.info()
# Observation from the original run: 'transcript' has a few NaN values

In [None]:
# Replace missing values column by column:
#   transcript -> empty string
#   confidence -> -1 (sentinel; chosen because the real values cannot be negative)
nan_replacements = {'transcript': "", 'confidence': -1}
for column_name, replacement in nan_replacements.items():
    df[column_name] = df[column_name].fillna(replacement)
df.info()

In [None]:
# Summary statistics, rendered as the cell output
df.describe()

In [None]:
# Word error rate (WER) per sample. Sentence and transcript are normalized
# first (per the original note: punctuation removed, everything lower-cased).
# https://github.com/analyticsinmotion/werpy/blob/main/README.md
for text_column in ('sentence', 'transcript'):
    df[f'{text_column}_norm'] = werpy.normalize(df[text_column])

df['WER'] = werpy.wers(df['sentence_norm'], df['transcript_norm'])

wer_maximum = df['WER'].max()
print(f'WER Maximum: {wer_maximum}')
df.head(2)


In [None]:
# Character error rate (CER) per sample, computed on the normalized texts
def _cer_for_row(row):
    return cer(row['sentence_norm'], row['transcript_norm'])


df['CER'] = df.apply(_cer_for_row, axis=1)
cer_maximum = df['CER'].max()
print(f"CER Maximum: {cer_maximum}")

In [None]:
# BLEU per sample, unigrams only
unigram_weights = (1, 0, 0, 0)
df['BLEU_1gram'] = df.apply(
    lambda row: sentence_bleu(
        [row['sentence_norm'].split(' ')],
        row['transcript_norm'].split(' '),
        weights=unigram_weights,
        smoothing_function=smooth,
    ),
    axis=1,
)
bleu1_minimum = df['BLEU_1gram'].min()
print(f"BLEU 1-gram Minimum: {bleu1_minimum}")


In [None]:
# BLEU per sample, unigrams and bigrams weighted equally
bigram_weights = (0.5, 0.5, 0, 0)
df['BLEU_2gram'] = df.apply(
    lambda row: sentence_bleu(
        [row['sentence_norm'].split(' ')],
        row['transcript_norm'].split(' '),
        weights=bigram_weights,
        smoothing_function=smooth,
    ),
    axis=1,
)
bleu2_minimum = df['BLEU_2gram'].min()
print(f"BLEU 2-gram Minimum: {bleu2_minimum}")

In [None]:
# BLEU per sample, 1- to 3-grams weighted equally
trigram_weights = (0.33, 0.33, 0.33, 0)
df['BLEU_3gram'] = df.apply(
    lambda row: sentence_bleu(
        [row['sentence_norm'].split(' ')],
        row['transcript_norm'].split(' '),
        weights=trigram_weights,
        smoothing_function=smooth,
    ),
    axis=1,
)
bleu3_minimum = df['BLEU_3gram'].min()
print(f"BLEU 3-gram Minimum: {bleu3_minimum}")

In [None]:
# Length features on the normalized texts: characters, words and their differences
def _count_words(text):
    return len(str(text).split())


# character based
df['sentence_len'] = df['sentence_norm'].map(len)
df['transcript_len'] = df['transcript_norm'].map(len)
df['len_difference'] = df['transcript_len'] - df['sentence_len']
# difference relative to the sentence length (a ratio, not a percentage)
df['len_difference_norm'] = df['len_difference'] / df['sentence_len']

# word based
df['word_count_sent'] = df['sentence_norm'].map(_count_words)
df['word_count_transc'] = df['transcript_norm'].map(_count_words)
df['word_count_difference'] = df['word_count_transc'] - df['word_count_sent']

df.describe()

In [None]:
# File size per clip in bytes (a hint for defective audio files)
df['audiosize'] = df['clip_path'].map(get_file_size)
smallest_audiosize = df['audiosize'].min()
print(f"min audiosize: {smallest_audiosize} Bytes")

In [None]:
# Keep-mask for clips larger than the size threshold, followed by a look at
# the smallest clips that still pass it
audiosize_filter = df['audiosize'].gt(audiosize_threshold)
smallest_kept_first = df[audiosize_filter].sort_values('audiosize', ascending=True)
print_clips(smallest_kept_first, start=0, n_clips=5)

In [None]:
# Distribution overview in eight stacked panels: file size (zoomed and full),
# duration, character lengths, length differences and word counts.
plt.figure(figsize=(10, 15))

# panel 1: zoom on the smallest files (100-byte bins)
bins_audiosize = np.arange(0, 100000, 100)
plt.subplot(8, 1, 1, title='audio length in Bytes smallest files')
plt.hist(df['audiosize'], bins=bins_audiosize)
plt.grid()

# panel 2: file size over a wider range (1000-byte bins)
bins_audiosize = np.arange(0, 400000, 1000)
plt.subplot(8, 1, 2, title='audio length in Bytes')
plt.hist(df['audiosize'], bins=bins_audiosize)
plt.grid()

# panel 3: clip duration
bins_duration = np.arange(0, 14, 0.1)
plt.subplot(8, 1, 3, title='Audio-Clip Duration')
plt.hist(df['duration'], bins=bins_duration)
plt.grid()

# panel 4: character length of sentence vs. transcript
bins_len = np.arange(0, 120, 1)
plt.subplot(8, 1, 4, title=('sentence and transcript length'))
plt.hist(df['sentence_len'], bins=bins_len,
         label=f"sentence length (min:{df['sentence_len'].min()}, max:{df['sentence_len'].max()})")
plt.hist(df['transcript_len'], bins=bins_len, alpha=0.5,
         label=f"transcript length (min:{df['transcript_len'].min()}, max:{df['transcript_len'].max()})")
plt.legend()
plt.grid()

# panel 5: length difference relative to the sentence length
bins_diff = np.arange(-0.5, 0.5, 0.01)
plt.subplot(8, 1, 5,
            title=f"length difference in % to sentence  (min:{df['len_difference_norm'].min()}, max:{df['len_difference_norm'].max()})")
plt.hist(df['len_difference_norm'], bins=bins_diff, alpha=0.5, label='length difference in % to sentence')
plt.grid()

# panel 6: absolute length difference in characters
bins_diff = np.arange(-40, 40, 1)
plt.subplot(8, 1, 6,
            title=f"length difference between transcript and sentence  (min:{df['len_difference'].min()}, max:{df['len_difference'].max()})")
plt.hist(df['len_difference'], bins=bins_diff, alpha=0.5, label='length difference')
plt.grid()

# panel 7: word counts (title fixed; it used to repeat the character-length title)
bins_words = np.arange(0, 20, 1)
plt.subplot(8, 1, 7, title=('sentence and transcript word count'))
plt.hist(df['word_count_sent'], bins=bins_words,
         label=f"sentence word count (min:{df['word_count_sent'].min()}, max:{df['word_count_sent'].max()})")
plt.hist(df['word_count_transc'], bins=bins_words, alpha=0.5,
         label=f"transcript word count (min:{df['word_count_transc'].min()}, max:{df['word_count_transc'].max()})")
plt.legend()
plt.grid()

# panel 8: word count difference. This used to plot the character based
# 'len_difference' column although title and bins refer to word counts.
bins_words_diff = np.arange(-20, 20, 1)
plt.subplot(8, 1, 8,
            title=f"word count difference between transcript and sentence  (min:{df['word_count_difference'].min()}, max:{df['word_count_difference'].max()})")
plt.hist(df['word_count_difference'], bins=bins_words_diff, alpha=0.5, label='word count difference')
plt.grid()

plt.tight_layout()
plt.show()

In [None]:
# Keep-masks for the metric thresholds (True = sample passes the threshold)
WER_filter = df['WER'].lt(WER_threshold)
CER_filter = df['CER'].lt(CER_threshold)
BLEU1_filter = df['BLEU_1gram'].gt(BLEU1_threshold)
BLEU2_filter = df['BLEU_2gram'].gt(BLEU2_threshold)
confidence_filter = df['confidence'].gt(confidence_threshold)

In [None]:
# Histograms of WER/CER, the BLEU variants and the confidence score
plt.figure(figsize=(10, 6))

bins_er = np.arange(0, 2, 0.02)
bins_bleu = np.arange(0, 1, 0.01)
bins_confidence = np.arange(0, 1, 0.02)

# error rates, drawn on top of each other
plt.subplot(3, 1, 1, title='WER / CER')
for metric_name, opacity in (('WER', None), ('CER', 0.5)):
    plt.hist(df[metric_name], bins=bins_er, alpha=opacity, label=metric_name)
plt.grid()
plt.legend()

# BLEU with 1-, 2- and 3-gram weighting
plt.subplot(3, 1, 2, title='BLEU')
for order, opacity in ((1, None), (2, 0.5), (3, 0.3)):
    plt.hist(df[f'BLEU_{order}gram'], bins=bins_bleu, alpha=opacity, label=f'BLEU {order}-gram')
plt.grid()
plt.legend()

# confidence score
plt.subplot(3, 1, 3, title='confidence score')
plt.hist(df['confidence'], bins=bins_confidence)
plt.grid()

plt.tight_layout()
plt.show()

In [None]:
# Pair plot of WER / CER / confidence for clips above 10000 bytes, coloured by
# whether the sample passes the CER threshold. The helper columns are attached
# to df for plotting and removed again at the end of the cell.
helper_columns = {'CER_filter': CER_filter, 'confidence_filter': confidence_filter}
for helper_name, helper_mask in helper_columns.items():
    df[helper_name] = helper_mask

sns.pairplot(df[df['audiosize'] > 10000], vars=['WER', 'CER', 'confidence'], hue = 'CER_filter')
plt.legend(title=f'CER < {CER_threshold}')

df.drop(columns=list(helper_columns), inplace=True)

In [None]:
# Pair plot of WER / CER / BLEU 1-gram for clips above 10000 bytes, coloured by
# whether the sample passes the BLEU 2-gram threshold.
df['BLEU2_filter'] = BLEU2_filter
try:
    sns.pairplot(df[df['audiosize'] > 10000], vars=['WER', 'CER', 'BLEU_1gram'], hue = 'BLEU2_filter')
    # BLEU2_filter is True for BLEU_2gram > threshold, so the legend title
    # states that condition (it used to claim "<").
    plt.legend(title=f'BLEU_2gram > {BLEU2_threshold}')
finally:
    # always remove the helper column again, even if plotting fails
    df.drop(['BLEU2_filter'], axis=1, inplace=True)

In [None]:
# Analyse the WER distribution in detail
n_total = len(df)
n_wer_perfect = int((df['WER'] == 0).sum())
n_wer_marginal = int(((0 < df['WER']) & (df['WER'] < 0.25)).sum())

print(f'transcirpts with no differences (WER=0): {n_wer_perfect} / {n_total} = {n_wer_perfect/n_total*100:.2f} %')
# The percentage now uses the marginal count; it used to repeat the WER=0 share.
print(f'transcirpts with marginal differences (0<WER<0.25): {n_wer_marginal} / {n_total} = {n_wer_marginal/n_total*100:.2f} %')

# The same box plot three times: full range, zoomed to 0..5 and zoomed to 0..1
plt.figure(figsize=(10,4))
plt.suptitle('WER distribution')
for position, x_range in enumerate((None, (0, 5), (0, 1)), start=1):
    plt.subplot(3, 1, position)
    plt.boxplot(df['WER'], orientation='horizontal')
    if x_range is not None:
        plt.xlim(*x_range)
    plt.grid()

plt.tight_layout()

# Finding: WER above 4 can be cut easily, above 2 probably too, and maybe even above 1

In [None]:
# Analyse the CER distribution in detail
n_total = len(df)
n_cer_perfect = int((df['CER'] == 0).sum())
n_cer_marginal = int(((0 < df['CER']) & (df['CER'] < 0.25)).sum())

print(f'transcirpts with no differences (CER=0): {n_cer_perfect} / {n_total} = {n_cer_perfect/n_total*100:.2f} %')
# The percentage now uses the marginal count; it used to repeat the CER=0 share.
print(f'transcirpts with marginal differences (0<CER<0.25): {n_cer_marginal} / {n_total} = {n_cer_marginal/n_total*100:.2f} %')

# The same box plot three times: full range, zoomed to 0..2.5 and zoomed to 0..1
plt.figure(figsize=(10,4))
plt.suptitle('CER distribution')
for position, x_range in enumerate((None, (0, 2.5), (0, 1)), start=1):
    plt.subplot(3, 1, position)
    plt.boxplot(df['CER'], orientation='horizontal')
    if x_range is not None:
        plt.xlim(*x_range)
    plt.grid()

plt.tight_layout()


In [None]:
# Analyse how the CER is distributed within each category of several metadata
# columns: one grouped bar chart per column, bars = CER bins.
# Bin edges 0.0, 0.1, ..., 1.1 with left-closed intervals (right=False below).
# NOTE(review): CER values outside the edges get no bin from pd.cut and are
# therefore missing from the charts - confirm this is intended.
bins = np.arange(0, 1.2, 0.1)
labels = [f"{i:.1f}-{i+0.1:.1f}" for i in bins[:-1]]

# Columns to break the CER down by
columns = ['sentence_source', 'age', 'dialect_region', 'canton', 'gender', 'confidence']

# Create the subplots (3 rows, 2 columns)
fig, axes = plt.subplots(3, 2, figsize=(14, 12))
fig.suptitle('CER Analyse', fontsize=16)

# Work on a deep copy so that df itself is not modified
df_copy = df.copy(deep=True)

# Assign every row to a CER bin (only in the copy)
df_copy['bin'] = pd.cut(df_copy['CER'], bins=bins, labels=labels, right=False)

# Loop over the columns, one subplot each
for i, col in enumerate(columns):
    ax = axes[i // 2, i % 2]

    if col == 'confidence':
        # Turn the float column into four categories (only in the copy)
        float_bins = [0, 0.25, 0.5, 0.75, 1.0]
        float_labels = ['0–0.25', '0.25–0.5', '0.5–0.75', '0.75–1.0']
        df_copy[col] = pd.cut(
            df_copy[col].astype(float),
            bins=float_bins,
            labels=float_labels,
            right=False
        ).astype(str)
        # NOTE(review): whether missing categories are still NaN after
        # astype(str) (and this fillna has any effect) presumably depends on
        # the pandas version - verify the resulting legend label.
        df_copy[col] = df_copy[col].fillna('NaN')
    else:
        # Other columns: replace NaN by the string 'NaN' and cast to str (only in the copy)
        df_copy[col] = df_copy[col].fillna('NaN').astype(str)

    # Number of rows per category (based on the copy), used in the legend
    value_counts = df_copy[col].value_counts(dropna=False).to_dict()

    # Share of each CER bin within its own category, in percent
    # (each category column is divided by its own total)
    result = (
        df_copy.groupby(['bin', col], observed=False)
        .size()
        .unstack(fill_value=0)
        .apply(lambda x: x / x.sum() * 100, axis=0)
    )

    # Plot
    result.plot(kind='bar', stacked=False, width=0.8, ax=ax)

    # Append the absolute row count to every legend entry.
    # Note: this rebinds the global name `labels` (the bin labels defined at
    # the top of the cell), which is no longer needed at this point.
    handles, labels = ax.get_legend_handles_labels()
    new_labels = [f"{label} ({value_counts.get(label, 0)})" for label in labels]
    ax.legend(handles, new_labels, title=col)

    ax.set_title(f'CER for {col}')
    ax.set_xlabel('CER')
    ax.set_ylabel('percentage of own category [%]')
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()


In [None]:
# Analyse how the BLEU 2-gram score is distributed within each category of
# several metadata columns: one grouped bar chart per column, bars = BLEU bins.
# Bin edges 0.0, 0.1, ..., 1.1 with left-closed intervals. The edges used to
# stop at 0.9, which dropped every sample with BLEU >= 0.9 - including perfect
# transcripts with BLEU == 1.0 - from the charts and from the percentages.
bins = np.arange(0, 1.2, 0.1)
labels = [f"{i:.1f}-{i+0.1:.1f}" for i in bins[:-1]]

# Columns to break the score down by
columns = ['sentence_source', 'age', 'dialect_region', 'canton', 'gender', 'confidence']

# Create the subplots (3 rows, 2 columns)
fig, axes = plt.subplots(3, 2, figsize=(14, 12))
fig.suptitle('BLEU Analyse', fontsize=16)

# Work on a deep copy so that df itself is not modified
df_copy = df.copy(deep=True)

# Assign every row to a BLEU bin (only in the copy)
df_copy['bin'] = pd.cut(df_copy['BLEU_2gram'], bins=bins, labels=labels, right=False)

# Loop over the columns, one subplot each
for i, col in enumerate(columns):
    ax = axes[i // 2, i % 2]

    if col == 'confidence':
        # Turn the float column into four categories (only in the copy)
        float_bins = [0, 0.25, 0.5, 0.75, 1.0]
        float_labels = ['0–0.25', '0.25–0.5', '0.5–0.75', '0.75–1.0']
        df_copy[col] = pd.cut(
            df_copy[col].astype(float),
            bins=float_bins,
            labels=float_labels,
            right=False
        ).astype(str)
        df_copy[col] = df_copy[col].fillna('NaN')
    else:
        # Other columns: replace NaN by the string 'NaN' and cast to str (only in the copy)
        df_copy[col] = df_copy[col].fillna('NaN').astype(str)

    # Number of rows per category (based on the copy), used in the legend
    value_counts = df_copy[col].value_counts(dropna=False).to_dict()

    # Share of each BLEU bin within its own category, in percent
    # (each category column is divided by its own total)
    result = (
        df_copy.groupby(['bin', col], observed=False)
        .size()
        .unstack(fill_value=0)
        .apply(lambda x: x / x.sum() * 100, axis=0)
    )

    # Plot
    result.plot(kind='bar', stacked=False, width=0.8, ax=ax)

    # Append the absolute row count to every legend entry; a separate name is
    # used so the bin labels defined at the top of the cell are not overwritten
    handles, legend_labels = ax.get_legend_handles_labels()
    new_labels = [f"{label} ({value_counts.get(label, 0)})" for label in legend_labels]
    ax.legend(handles, new_labels, title=col)

    ax.set_title(f'BLEU 2-gram for {col}')
    ax.set_xlabel('BLEU 2-gram')
    ax.set_ylabel('percentage of own category [%]')
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()


In [None]:
# Flag transcripts that contain at least one of the special signs
# (True = transcript contains a special sign)
special_signs_filter = (df['transcript'].apply(is_in_string, certain_string_list=special_signs))

print(f'clips including special signs {special_signs}: {len(df[(special_signs_filter)])}')
print()

# Both conditions are combined into one mask. Indexing twice in a row applied
# the full-length mask to an already shortened frame, which pandas only
# resolves by re-aligning the mask and emitting a warning.
has_transcript = df['transcript_len'] > 0
print_clips(df[has_transcript & special_signs_filter].sort_values('CER', ascending=False), start=0, n_clips=5)

In [None]:
# Flag clips recorded by users known for several bad audio clips; the user id
# is matched as a substring of the clip path (True = clip of such a user)
low_quality_user_filter = (df['clip_path'].apply(is_in_string, certain_string_list=low_quality_users))

print(f'amount of audio clips from low quality users: {len(df[(low_quality_user_filter)])}')
print()

# Both conditions are combined into one mask. Indexing twice in a row applied
# the full-length mask to an already shortened frame, which pandas only
# resolves by re-aligning the mask and emitting a warning.
has_transcript = df['transcript_len'] > 0
print_clips(df[has_transcript & low_quality_user_filter].sort_values('BLEU_1gram', ascending=False), start=0, n_clips=5)

In [None]:
# Audio clip quality depending on the speaker's age
plt.figure(figsize=(15,4))

# Confidence classes as (upper edge, label); the intervals are right-closed and
# start at -2, so the -1 sentinel for missing confidences lands in "NA".
confidence_classes = [
    (0, "NA"),
    (confidence_threshold, f"bad <={confidence_threshold}"),
    (float('inf'), f"good >{confidence_threshold}"),
]
bins = [-2] + [upper_edge for upper_edge, _ in confidence_classes]
labels = [class_label for _, class_label in confidence_classes]
df['confidence_cat'] = pd.cut(df['confidence'], bins=bins, labels=labels, right=True)


# Build heatmap annotations that show the absolute counts next to the shares
def get_annotations(crosstab, tab_count):
    """Return an object array of cell texts ``"<share>\\n(<count>)"``.

    ``crosstab`` supplies the value printed with two decimals, ``tab_count``
    the absolute count; columns missing in ``tab_count`` are shown as 0.
    """
    n_rows, n_cols = crosstab.shape
    n_count_cols = tab_count.shape[1]
    cell_texts = np.empty((n_rows, n_cols), dtype=object)
    for r, c in np.ndindex(n_rows, n_cols):
        share = crosstab.iloc[r, c]
        absolute = tab_count.iloc[r, c] if c < n_count_cols else 0
        cell_texts[r, c] = f"{share:.2f}\n({int(absolute)})"
    return cell_texts

# Three heatmaps side by side, each a crosstab of age against one quality
# indicator: row-normalized shares as colour, absolute counts in brackets.
heatmap_panels = [
    # 1st panel: CER below the threshold
    (f'Percentage of CER < {CER_threshold}', (df['CER'] < CER_threshold)),
    # 2nd panel: confidence (three categories)
    (f'confidence', df['confidence_cat']),
    # 3rd panel: audio size below the threshold
    (f'Percentage of audiosize < {audiosize_threshold}', (df['audiosize'] < audiosize_threshold)),
]

for position, (panel_title, indicator) in enumerate(heatmap_panels, start=1):
    plt.subplot(1, 3, position, title=panel_title)
    tab = pd.crosstab(df['age'], indicator, normalize='index')
    tab_count = pd.crosstab(df['age'], indicator)
    annot = get_annotations(tab, tab_count)
    sns.heatmap(tab, cmap=sns.cubehelix_palette(as_cmap=True),
                annot=annot, fmt='', linewidths=0.5)

plt.tight_layout()
plt.show()

In [None]:
# Find stock transcripts: normalized transcripts that occur repeatedly among
# clips longer than the duration threshold but are not a known sentence.
# The minimum frequency is a single variable now, so the printed header
# matches the filter (it used to say "> 5" while the code filtered "> 2").
min_occurrences = 2

# frequency of every unique value in 'transcript_norm'
value_counts = df[df['duration']>duration_threshold]['transcript_norm'].value_counts()

# keep values that occur more than min_occurrences times and do not appear in 'sentence_norm'
known_sentences = df['sentence_norm'].unique()
frequent_long_unique_values = value_counts[
    (value_counts > min_occurrences) &
    (~value_counts.index.isin(known_sentences))
]

# output
print(f"Unique Werte in 'transcript_norm' (Häufigkeit > {min_occurrences}, duration > {duration_threshold}, nicht in 'sentence_norm'):")
print(frequent_long_unique_values)


In [None]:
# Flag rows whose normalized transcript is exactly one of the stock transcripts
# (True = stock transcript), report the counts and show examples
common_transcript_filter = df['transcript_norm'].apply(is_certain_string, certain_string_list=common_transcripts)
common_and_long_enough = (common_transcript_filter) & (df['duration']>duration_threshold)

n_common = len(df[(common_transcript_filter)])
n_common_long_enough = len(df[common_and_long_enough])
print(f'amount of transcripts having common sentences: {n_common},'
      f'(duration > {duration_threshold}: {n_common_long_enough})')
print()

lowest_cer_first = df[common_and_long_enough].sort_values('CER', ascending=True)
print_clips(lowest_cer_first, start=0, n_clips=5)

In [None]:
# Columns shown in the overview pair plot ('dialect_region' is used as hue)
pairplot_columns = [
    'WER',
    'CER',
    'sentence_len',
    'transcript_len',
    'len_difference',
    'audiosize',
    'duration',
    'confidence',
    'dialect_region',
]

In [None]:
# Pair plot over the selected columns for clips above the size threshold,
# coloured by dialect region
large_enough = df['audiosize'] > audiosize_threshold
sns.pairplot(df.loc[large_enough, pairplot_columns], hue='dialect_region')
plt.legend()

In [None]:
# Build the filtered dataset and report how many samples each criterion removes.
# All *_filter masks are keep-masks (True = sample passes), except
# common_transcript_filter, special_signs_filter and low_quality_user_filter,
# which flag samples to remove. The metric, audiosize and manual-list masks
# are defined in earlier cells.
trlen_low_filter     = (df['transcript_len'] >transc_len_threshold_low     )
trlen_high_filter    = (df['transcript_len'] <transc_len_threshold_high    )
len_diff_low_filter  = (df['len_difference'] >len_difference_threshold_low )
len_diff_high_filter = (df['len_difference'] <len_difference_threshold_high)
len_diff_norm_low_filter  = (df['len_difference_norm'] >len_diff_norm_threshold_low )
len_diff_norm_high_filter = (df['len_difference_norm'] <len_diff_norm_threshold_high)
trcount_low_filter   = (df['word_count_transc'] >transc_count_threshold_low)
duration_filter      = (df['duration']       >duration_threshold           )


# Filter the data based on the findings. low_quality_user_filter is
# deliberately NOT applied: with roughly 1100 clips per user, removing
# complete users is probably not a good idea.
# (duration_filter used to be listed twice; the duplicate had no effect.)
df_filter = df[ (WER_filter          )
              & (CER_filter          )
              & (BLEU1_filter        )
              & (BLEU2_filter        )
              & (confidence_filter   )
              & (trlen_low_filter    )
              & (trlen_high_filter   )
              & (len_diff_low_filter )
              & (len_diff_high_filter)
              & (len_diff_norm_low_filter )
              & (len_diff_norm_high_filter)
              & (trcount_low_filter  )
              & (audiosize_filter    )
              & (duration_filter     )
              & (~common_transcript_filter)
              & (~special_signs_filter)
              ]

filtered_clips = len(df_filter)
removed_clips = len(df) - filtered_clips

# Per-criterion report. The counts are per criterion and overlap, so they do
# not add up to removed_clips.
print(f'WER Threshold: {WER_threshold}, will remove {len(df[~WER_filter])} samples')
print(f'CER Threshold: {CER_threshold}, will remove {len(df[~CER_filter])} samples')
print(f'BLEU 1-gram Threshold: {BLEU1_threshold}, will remove {len(df[~BLEU1_filter])} samples')
print(f'BLEU 2-gram Threshold: {BLEU2_threshold}, will remove {len(df[~BLEU2_filter])} samples')
print(f'confidence Threshold: {confidence_threshold}, will remove {len(df[~confidence_filter])} samples')
print()
print(f'Transcript length Low Threshold: {transc_len_threshold_low}, will remove {len(df[~trlen_low_filter])} samples')
print(f'Transcript length High Threshold: {transc_len_threshold_high}, will remove {len(df[~trlen_high_filter])} samples')
print()
print(f'len difference Low Threshold: {len_difference_threshold_low}, will remove {len(df[~len_diff_low_filter])} samples')
print(f'len difference High Threshold: {len_difference_threshold_high}, will remove {len(df[~len_diff_high_filter])} samples')
print(f'len difference Norm Low Threshold: { len_diff_norm_threshold_low}, will remove { len(df[~len_diff_norm_low_filter])} samples')
print(f'len difference Norm High Threshold: {len_diff_norm_threshold_high}, will remove {len(df[~len_diff_norm_high_filter])} samples')
print()
print(f'Transcript word count Low Threshold: {transc_count_threshold_low}, will remove {len(df[~trcount_low_filter])} samples')
print()
print(f'Audiosize Threshold: {audiosize_threshold} Bytes, will remove {len(df[~audiosize_filter])} samples')
print(f'Duration Threshold: {duration_threshold} sec, will remove {len(df[~duration_filter])} samples')
print()
# This filter is not part of df_filter, so the message says so instead of
# claiming that the samples will be removed.
print(f'Low Quality users filter is NOT applied, it would remove {len(df[low_quality_user_filter])} samples')
print()
print(f'Common transcript filter will remove {len(df[common_transcript_filter])} samples')
print(f'Special signs filter will remove {len(df[special_signs_filter])} samples')


print(f'\n\nfiltered df count: {filtered_clips}, removed clips count: {removed_clips}')

In [None]:
# Inspect the kept samples with the highest WER
print(f'WER_threshold: {WER_threshold}\n***********************************************')
highest_wer_first = df_filter.sort_values(by='WER', ascending=False)
print_clips(highest_wer_first, start=0, n_clips=5)

In [None]:
# Inspect the kept samples with the lowest confidence
print(f'confidence_threshold: {confidence_threshold}\n***********************************************')
lowest_confidence_first = df_filter.sort_values(by='confidence', ascending=True)
print_clips(lowest_confidence_first, start=0, n_clips=5)

In [None]:
# Inspect the kept samples with the lowest BLEU 1-gram score
print(f'BLEU1_threshold: {BLEU1_threshold}\n***********************************************')
lowest_bleu_first = df_filter.sort_values(by='BLEU_1gram', ascending=True)
print_clips(lowest_bleu_first, start=0, n_clips=5)

In [None]:
# Inspect the kept samples with the shortest transcripts
print(f'transc_len_threshold_low: {transc_len_threshold_low}\n***********************************************')
shortest_transcript_first = df_filter.sort_values(by='transcript_len', ascending=True)
print_clips(shortest_transcript_first, start=0, n_clips=5)

In [None]:
# Inspect the kept samples with the longest transcripts
print(f'transc_len_threshold_high: {transc_len_threshold_high}\n***********************************************')
longest_transcript_first = df_filter.sort_values(by='transcript_len', ascending=False)
print_clips(longest_transcript_first, start=0, n_clips=5)

In [None]:
# Inspect the kept samples whose transcript is much shorter than the sentence (absolute characters)
print(f'len_difference_threshold_low: {len_difference_threshold_low}\n***********************************************')
most_negative_difference_first = df_filter.sort_values(by='len_difference', ascending=True)
print_clips(most_negative_difference_first, start=0, n_clips=5)

In [None]:
# Inspect the kept samples whose transcript is much longer than the sentence (absolute characters)
print(f'len_difference_threshold_high: {len_difference_threshold_high}\n***********************************************')
most_positive_difference_first = df_filter.sort_values(by='len_difference', ascending=False)
print_clips(most_positive_difference_first, start=0, n_clips=5)

In [None]:
# Inspect the kept samples whose transcript is much shorter than the sentence (relative to sentence length)
print(f'len_diff_norm_threshold_low: {len_diff_norm_threshold_low}\n***********************************************')
most_negative_ratio_first = df_filter.sort_values(by='len_difference_norm', ascending=True)
print_clips(most_negative_ratio_first, start=0, n_clips=5)

In [None]:
# Inspect the kept samples whose transcript is much longer than the sentence (relative to sentence length)
print(f'len_diff_norm_threshold_high: {len_diff_norm_threshold_high}\n***********************************************')
most_positive_ratio_first = df_filter.sort_values(by='len_difference_norm', ascending=False)
print_clips(most_positive_ratio_first, start=0, n_clips=5)

In [None]:
# Inspect the kept samples whose transcripts have the fewest words
print(f'transc_count_threshold_low (word count): {transc_count_threshold_low}\n***********************************************')
fewest_words_first = df_filter.sort_values(by='word_count_transc', ascending=True)
print_clips(fewest_words_first, start=0, n_clips=5)

In [None]:
# Write the filtered dataset as TSV into the current working directory.
# NOTE(review): the DataFrame index is written as an unnamed first column
# (the dedup cell uses index=False) - confirm whether downstream readers rely on it.
filtered_tsv_name = f'filtered_{INPUT_FILENAME}'
df_filter.to_csv(filtered_tsv_name, sep="\t")