In [1]:
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` through `IPython.core.display` is deprecated in IPython.
from IPython.display import display, HTML
# Inject a CSS rule so the notebook container spans the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))
import re, os, csv, pathlib
import pandas as pd
from statistics import mean, variance

# Full accent name -> three-letter code used in the report columns.
accent_short_forms = {
    "hindi": "HIN",
    "korean": "KOR",
    "vietnamese": "VTN",
    "arabic": "ARB",
    "chinese": "CHN",
    "spanish": "ESP",
}


def replace_with_short_forms(s):
    """Return ``s`` with every full accent name replaced by its short code.

    The replacements are plain substring substitutions, applied one after
    another in the insertion order of ``accent_short_forms``.
    """
    shortened = s
    for long_name in accent_short_forms:
        shortened = shortened.replace(long_name, accent_short_forms[long_name])
    return shortened

def last_name(pth):
    """Return the final component (file or directory name) of ``pth``."""
    pure_path = pathlib.PurePath(pth)
    return pure_path.name

def get_dirs(pth):
    """Return the names of the immediate sub-directories of ``pth``, sorted.

    ``os.scandir`` yields entries in arbitrary order, so the names are sorted
    to make every traversal built on this helper reproducible between runs.
    The scandir iterator is used as a context manager so its handle is
    released even if ``is_dir()`` raises part-way through.

    Parameters
    ----------
    pth : str or path-like
        Directory to list.

    Returns
    -------
    list of str
        Base names (not full paths) of the directories directly under ``pth``.

    Raises
    ------
    OSError
        Propagated from ``os.scandir`` when ``pth`` cannot be listed.
    """
    with os.scandir(pth) as entries:
        # DirEntry.name is already the base name, so no further path
        # stripping is needed.
        return sorted(entry.name for entry in entries if entry.is_dir())

def get_each_run(lne):
    """Parse the per-run numbers out of one stats line.

    The line is expected to look like ``"<label>: v1 v2 v3 -> <summary>"``;
    the space-separated values between the first ``": "`` and the following
    ``" -> "`` are returned as a list of floats.
    """
    per_run_field = re.findall(': (.+?) -> ', lne)[0]
    return [float(token) for token in per_run_field.split(' ')]

def get_selection_counts(s):
    """Extract the body of every ``Counter(...)`` in ``s`` with short accent codes.

    Each returned string is the text found between ``Counter(`` and the next
    ``)``, passed through ``replace_with_short_forms``.
    """
    counter_bodies = re.findall('Counter\\((.+?)\\)', s)
    return [replace_with_short_forms(body) for body in counter_bodies]

def get_test_file_from_stats_path(run_number, stats_file_opened):
    """Return the inference-log path of run ``run_number`` next to a stats file.

    ``stats_file_opened`` is any object exposing a ``name`` attribute holding
    the stats file path (for example an open file object).
    """
    # Drop the trailing nine characters of the path, i.e. the length of "stats.txt".
    run_root = stats_file_opened.name[:-9]
    log_suffix = "run_{}/output/test_infer_log.txt".format(run_number)
    return run_root + log_suffix

def WER_test_file(test_file):
    """Return the greedy WER reported in an inference log file.

    The file is scanned line by line for the first occurrence of the marker
    ``"==========>>>>>>Evaluation Greedy WER: "`` and the text following the
    marker on that line is converted to a float. Parsing from the marker
    (rather than from the first ``": "`` on the line) keeps the result correct
    when the log line carries a prefix that itself contains ``": "``.

    Parameters
    ----------
    test_file : str or path-like
        Path of the log file to scan.

    Returns
    -------
    float or str
        The WER value, or the empty string ``""`` when no line contains the
        marker (sentinel kept so existing callers see the same result).

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If the text after the marker is not a valid float.
    """
    marker = "==========>>>>>>Evaluation Greedy WER: "
    # The context manager closes the file on every exit path, including a
    # failed float conversion.
    with open(test_file, 'r') as txt_file:
        for line in txt_file:
            if marker in line:
                return float(line.split(marker, 1)[1].strip())
    return ""

def get_eta(func, eta):
    """Return the eta suffix appended to the function name in the report.

    Only the ``"FL1MI"`` function gets a suffix: ``"-n:"`` followed by the
    float parsed from ``eta`` after its first four characters (so
    ``"eta_1.0"`` gives ``"-n:1.0"``). Every other function yields ``""``.
    """
    if func == "FL1MI":
        eta_value = float(eta[4:])
        return "-n:" + str(eta_value)
    return ""

In [2]:
# Experiment selectors: each value picks one directory level of the
# TSS_output tree that is scanned to build the report.
budget = 100     # matches the "budget_<budget>" directory
target = 50      # matches the "target_<target>" directory
features = "39"  # name of the innermost directory that holds stats.txt

In [3]:
# sample_path = 'Error-Driven-ASR-Personalization/CMU_expts/accent/hindi/manifests/TSS_output/all/budget_100/target_50/FL1MI/eta_1.0/euclidean/39/stats.txt'
cols = ['accent', 'ground', 'function', 'similarity', 'duration', 'samples',
        'WER-r1', 'WER-r2', 'WER-r3', 'WER-mean', 'WER-var', 'samples-run_1', 'samples-run_2', 'samples-run_3']


def _summarise_stats_file(stats_file_path):
    """Build the measurement part of one report row from a ``stats.txt`` file.

    Reads the first four lines of the stats file (total selections, total
    durations, accented selections, accented durations), the selection-counter
    line at index 5, and the WER of runs 1-3 from their inference logs.

    Returns
    -------
    list
        Values ordered like ``cols[4:]``: duration, samples, the three WERs,
        their rounded mean and variance, then the per-run selection counters.

    Raises
    ------
    ValueError
        If one of the inference logs has no WER line.
    """
    with open(stats_file_path, 'r') as stats_file:
        lines = stats_file.readlines()
        # The helper derives the log path from the file object's ``name``.
        test_logs = [get_test_file_from_stats_path(i, stats_file) for i in range(1, 4)]

    total_selections, total_durations, accented_selections, accented_durations = map(get_each_run, lines[:4])
    sample_frac = mean([acc / tot for acc, tot in zip(accented_selections, total_selections)])
    sample_total = mean(total_selections)
    duration_frac = mean([acc / tot for acc, tot in zip(accented_durations, total_durations)])
    duration_total = mean(total_durations)
    df_duration = "{:.2f}/{:.2f}".format(duration_total * duration_frac, duration_total)
    df_samples = "{:.2f}/{:.2f}".format(sample_total * sample_frac, sample_total)
    df_selections = get_selection_counts(lines[5])

    wers = [WER_test_file(log) for log in test_logs]
    # WER_test_file returns "" when the log has no WER line; fail with the
    # offending path instead of an opaque TypeError from statistics.mean.
    missing = [log for log, wer in zip(test_logs, wers) if wer == ""]
    if missing:
        raise ValueError("no 'Evaluation Greedy WER' line found in: {}".format(", ".join(missing)))
    df_wer_mean = round(mean(wers), 2)
    df_wer_var = round(variance(wers), 4)
    return [df_duration, df_samples] + wers + [df_wer_mean, df_wer_var] + df_selections


# Accent directories are sorted so the traversal order is reproducible.
with os.scandir('./') as entries:
    accents = sorted(entry.name for entry in entries
                     if entry.is_dir() and entry.name != '.ipynb_checkpoints')

# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append is gone in pandas 2.x and copies the frame on every call.
records = []
cnt = 0  # number of (eta, similarity) combinations visited
for accent in accents:
    tss_root = './{}/manifests/TSS_output/'.format(accent)
    if not pathlib.Path(tss_root).is_dir():
        continue
    for pick_from in get_dirs(tss_root):
        target_dir = '{}{}/budget_{}/target_{}/'.format(tss_root, pick_from, budget, target)
        if not pathlib.Path(target_dir).is_dir():
            continue
        for function in get_dirs(target_dir):
            function_dir = '{}{}/'.format(target_dir, function)
            if function == "random":
                # The random baseline keeps stats.txt directly in its folder
                # and has no eta / similarity / features levels.
                stats_file_path = function_dir + 'stats.txt'
                if os.path.isfile(stats_file_path):
                    records.append(dict(zip(cols, [accent, pick_from, function, "NA"]
                                            + _summarise_stats_file(stats_file_path))))
                continue
            for eta in get_dirs(function_dir):
                for similarity in get_dirs('{}{}/'.format(function_dir, eta)):
                    cnt += 1
                    stats_file_path = '{}{}/{}/{}/stats.txt'.format(function_dir, eta, similarity, features)
                    if not os.path.isfile(stats_file_path):
                        continue
                    records.append(dict(zip(cols, [accent, pick_from, function + get_eta(function, eta), similarity]
                                            + _summarise_stats_file(stats_file_path))))

df = pd.DataFrame(records, columns=cols)

In [4]:
# Order the rows by accent, then similarity, ground set and function,
# renumber the index from zero, and render the resulting table.
df = df.sort_values(by=['accent', 'similarity', 'ground', 'function'], ascending=True).reset_index(drop=True)
display(df)

Unnamed: 0,accent,ground,function,similarity,duration,samples,WER-r1,WER-r2,WER-r3,WER-mean,WER-var,samples-run_1,samples-run_2,samples-run_3
0,arabic,all,random,,71.24/358.35,19.39/97.67,23.02,23.07,23.59,23.23,0.0996,"{'VTN': 22, 'CHN': 18, 'ESP': 18, 'HIN': 14, '...","{'ARB': 24, 'KOR': 20, 'HIN': 17, 'ESP': 16, '...","{'ARB': 20, 'HIN': 17, 'VTN': 16, 'CHN': 16, '..."
1,arabic,all,FL1MI-n:1.0,euclidean,262.06/357.29,61.00/83.00,22.39,22.6,22.47,22.49,0.0112,"{'ARB': 61, 'CHN': 7, 'KOR': 6, 'VTN': 4, 'HIN...","{'ARB': 61, 'CHN': 7, 'KOR': 6, 'VTN': 4, 'HIN...","{'ARB': 61, 'CHN': 7, 'KOR': 6, 'VTN': 4, 'HIN..."
2,arabic,all,FL2MI,euclidean,357.89/357.89,83.00/83.00,22.38,22.08,22.08,22.18,0.03,{'ARB': 83},{'ARB': 83},{'ARB': 83}
3,arabic,all,GCMI,euclidean,291.69/356.94,73.00/88.00,22.21,22.21,22.21,22.21,0.0,"{'ARB': 73, 'KOR': 14, 'CHN': 1}","{'ARB': 73, 'KOR': 14, 'CHN': 1}","{'ARB': 73, 'KOR': 14, 'CHN': 1}"
4,arabic,all,LogDMI,euclidean,359.61/359.61,88.00/88.00,21.72,21.74,21.59,21.68,0.0066,{'ARB': 88},{'ARB': 88},{'ARB': 88}
5,chinese,all,random,,53.65/358.35,14.27/97.67,27.87,27.87,27.7,27.81,0.0096,"{'VTN': 22, 'CHN': 18, 'ESP': 18, 'HIN': 14, '...","{'ARB': 24, 'KOR': 20, 'HIN': 17, 'ESP': 16, '...","{'ARB': 20, 'HIN': 17, 'VTN': 16, 'CHN': 16, '..."
6,chinese,all,FL1MI-n:1.0,euclidean,275.69/355.79,62.00/83.00,26.35,26.31,26.3,26.32,0.0007,"{'CHN': 62, 'KOR': 7, 'ESP': 5, 'VTN': 4, 'HIN...","{'CHN': 62, 'KOR': 7, 'ESP': 5, 'VTN': 4, 'HIN...","{'CHN': 62, 'KOR': 7, 'ESP': 5, 'VTN': 4, 'HIN..."
7,chinese,all,FL2MI,euclidean,358.63/358.63,81.00/81.00,26.31,26.41,26.33,26.35,0.0028,{'CHN': 81},{'CHN': 81},{'CHN': 81}
8,chinese,all,GCMI,euclidean,316.96/357.21,70.00/81.00,26.79,26.67,26.68,26.71,0.0044,"{'CHN': 70, 'ESP': 9, 'ARB': 2}","{'CHN': 70, 'ESP': 9, 'ARB': 2}","{'CHN': 70, 'ESP': 9, 'ARB': 2}"
9,chinese,all,LogDMI,euclidean,352.55/358.06,86.00/87.00,26.57,26.77,26.99,26.78,0.0441,"{'CHN': 86, 'KOR': 1}","{'CHN': 86, 'KOR': 1}","{'CHN': 86, 'KOR': 1}"


In [5]:
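# Reference sample (presumably the stats.txt layout parsed above, plus the WER line of test_infer_log.txt; confirm):
# total selection : 100 100 100 -> 100.00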
# total selection duration: 357.0149433106577 357.0149433106577 357.0149433106577 -> 357.01
# accented selection: 76 76 76 -> 76.00
# accented duration: 254.74947845804974 254.74947845804974 254.74947845804974 -> 254.75

# all selections: [Counter({'hindi': 76, 'korean': 8, 'spanish': 7, 'arabic': 3, 'chinese': 3, 'vietnamese': 3}), Counter({'hindi': 76, 'korean': 8, 'spanish': 7, 'arabic': 3, 'chinese': 3, 'vietnamese': 3}), Counter({'hindi': 76, 'korean': 8, 'spanish': 7, 'arabic': 3, 'chinese': 3, 'vietnamese': 3})]

#Evaluation Greedy WER: 16.19

# Write the report next to the notebook; the file name encodes the budget,
# target and features settings, and the row index is left out of the CSV.
df.to_csv(path_or_buf="report_{}_{}_{}.csv".format(budget, target, features), index=False)