In [8]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [1]:
import re, os, csv, pathlib, ast
import pandas as pd
from statistics import mean, variance

def replace_with_short_forms(s):
    s=s.replace('english','')
    s=s.replace('_female_','[F]')
    s=s.replace('_male_','[M]')
    return s

def shortened_speaker(s):
    accent,gender,_=s.split('_')
    return accent[0].upper()+accent[1:]+'['+gender[0].upper()+']'

def last_name(pth):
    return pathlib.PurePath(pth).name

def get_dirs(pth):
    return [last_name(f.name) for f in os.scandir(pth) if f.is_dir()]

def get_each_run(lne):
    return list(map(float, re.findall(': (.+?) -> ', lne)[0].split(' ')))

def get_selection_counts(s):
    return list(map(replace_with_short_forms, re.findall('Counter\\((.+?)\\)', s)))

def get_test(stats_file_path):
    return stats_file_path[:-9]+"run_1/output/test_infer_log.txt"

def get_test_file_from_stats_path(run_number, stats_file_opened):
    return stats_file_opened.name[:-9]+"run_{}/output/test_infer_log.txt".format(run_number)

def WER_test_file(test_file):
    txt_file = open(test_file, 'r')
    lines = txt_file.readlines()
    matched = ""
    for line in lines:
        if "==========>>>>>>Evaluation Greedy WER: " in line:
            txt_file.close()
            return float(line.rstrip().split(": ")[1])
    txt_file.close()
    return ""

def get_eta(func, eta):
    return "-n:"+str(float(eta[4:]))

In [6]:
# budget = 100
# target = 50
budget = 100
target = 10
features = '39'
# features = 'TRILL'
csv_name = "report_{}_{}_{}.csv".format(budget, target, features)

In [8]:
# sample_path = 'Error-Driven-ASR-Personalization/CMU_expts/speaker/hindi/manifests/TSS_output/all/budget_100/target_50/FL1MI/eta_1.0/euclidean/39/stats.txt'
# CMU_expts/speaker_without/ABA/manifests/TSS_output/all/budget_100/target_50/FL1MI/eta_1.0/euclidean/39/run_1/
# budget = 200
# target = 80

cols = ['speaker', 'ground', 'function', 'similarity', 'duration', 'samples', 
        'WER-r1', 'WER-r2', 'WER-r3', 'WER-mean', 'WER-stdev', 'speakers_run1', 'speakers_run2', 'speakers_run3']
df = pd.DataFrame(columns = cols)

speakers = [f.name for f in os.scandir('./') if f.is_dir() and 'english' in f.name]

# not random
for speaker in speakers:
    if not(pathlib.Path('./{}/manifests/TSS_output/'.format(speaker)).is_dir()):
        print("no results for speaker {}".format(speaker))
        continue
    if 'within' not in get_dirs('./{}/manifests/TSS_output/'.format(speaker)):
        print("no within results for {}".format(speaker))
        continue
    if not(os.path.isdir('./{}/manifests/TSS_output/within/budget_{}/target_{}/'.format(speaker, budget, target))):
        continue
    for function in get_dirs('./{}/manifests/TSS_output/within/budget_{}/target_{}/'.format(speaker, budget, target)):
        if 'deprecated' in function:
            continue
        stats_file_path='./{}/manifests/TSS_output/within/budget_{}/target_{}/{}/stats.txt'.format(speaker, budget, target, function)
        if not(os.path.isfile(stats_file_path)):
            # print(cnt,"99999999999")
            print(stats_file_path+" not found")
            continue
        if not(os.path.isfile(get_test(stats_file_path))):
            print(get_test(stats_file_path)+" not found")
            continue
        stats_file = open(stats_file_path, 'r')
        lines = stats_file.readlines()
        df_selections = get_selection_counts(lines[5])
        total_selections, total_durations, speakered_selections, speakered_durations = map(get_each_run, lines[:4])
        sample_frac = mean([x[0]/x[1] for x in zip(speakered_selections, total_selections)])
        sample_total = mean(total_selections)
        duration_frac = mean([x[0]/x[1] for x in zip(speakered_durations, total_durations)])
        duration_total = mean(total_durations)
        df_duration = "{:.2f}/{:.2f}".format(duration_total*duration_frac, duration_total)
        df_samples = "{:.2f}/{:.2f}".format(sample_total*sample_frac, sample_total)
        try:
            wers = [WER_test_file(get_test_file_from_stats_path(i, stats_file)) for i in range(1,4)]
            wers = [x for x in wers if type(x)==float or type(x)==int]
            df_wer_mean = round(mean(wers), 2)
            df_wer_stdev = round(variance(wers), 3)**0.5
        except:
            print("no WER's in file", get_test_file_from_stats_path(1, stats_file))
            wers = [0,0,0]
            df_wer_mean = 0
            df_wer_stdev = 999
        df = df.append(dict(zip(cols, [shortened_speaker(speaker), "within", function, "NA", df_duration, df_samples]+
                                       wers+[df_wer_mean, round(df_wer_stdev, 3)] + df_selections)), 
                       ignore_index=True)
        stats_file.close()

df = df.sort_values(by=['speaker', 'similarity', 'ground', 'function'], ascending=True, ignore_index=True)
display(df)

Unnamed: 0,speaker,ground,function,similarity,duration,samples,WER-r1,WER-r2,WER-r3,WER-mean,WER-stdev,speakers_run1,speakers_run2,speakers_run3
0,Assamese[F],within,random,,489.78/489.78,97.00/97.00,20.16,19.64,19.45,19.75,0.367,{'assamese[F]': 99},{'assamese[F]': 98},{'assamese[F]': 94}
1,Gujarati[F],within,random,,485.51/485.51,65.00/65.00,10.26,9.93,10.31,10.17,0.207,{'gujarati[F]': 61},{'gujarati[F]': 66},{'gujarati[F]': 68}
2,Hindi[M],within,random,,490.24/490.24,102.33/102.33,8.39,9.2,9.09,8.89,0.439,{'hindi[M]': 99},{'hindi[M]': 104},{'hindi[M]': 104}
3,Kannada[M],within,random,,487.74/487.74,91.33/91.33,14.58,13.88,14.13,14.2,0.355,{'kannada[M]': 90},{'kannada[M]': 94},{'kannada[M]': 90}
4,Malayalam[M],within,random,,490.80/490.80,91.00/91.00,15.72,16.12,15.45,15.76,0.338,{'malayalam[M]': 92},{'malayalam[M]': 91},{'malayalam[M]': 90}
5,Manipuri[F],within,random,,490.30/490.30,132.33/132.33,42.09,42.46,41.9,42.15,0.285,{'manipuri[F]': 129},{'manipuri[F]': 133},{'manipuri[F]': 135}
6,Rajasthani[M],within,random,,487.15/487.15,91.67/91.67,15.58,16.06,15.76,15.8,0.243,{'rajasthani[M]': 93},{'rajasthani[M]': 92},{'rajasthani[M]': 90}
7,Tamil[M],within,random,,488.66/488.66,96.33/96.33,11.76,11.93,12.07,11.92,0.155,{'tamil[M]': 91},{'tamil[M]': 99},{'tamil[M]': 99}


In [9]:
# total selection : 100 100 100 -> 100.00
# total selection duration: 357.0149433106577 357.0149433106577 357.0149433106577 -> 357.01
# speakered selection: 76 76 76 -> 76.00
# speakered duration: 254.74947845804974 254.74947845804974 254.74947845804974 -> 254.75

# all selections: [Counter({'hindi': 76, 'korean': 8, 'spanish': 7, 'arabic': 3, 'chinese': 3, 'vietnamese': 3}), Counter({'hindi': 76, 'korean': 8, 'spanish': 7, 'arabic': 3, 'chinese': 3, 'vietnamese': 3}), Counter({'hindi': 76, 'korean': 8, 'spanish': 7, 'arabic': 3, 'chinese': 3, 'vietnamese': 3})]

#Evaluation Greedy WER: 16.19

df2 = pd.read_csv(csv_name)
df2=df2.append(df)
df2 = df2.sort_values(by=['speaker', 'ground', 'similarity', 'function'], ascending=True, ignore_index=True)
display(df2)

Unnamed: 0,speaker,ground,function,similarity,duration,samples,WER-r1,WER-r2,WER-r3,WER-mean,WER-var,speakers_run1,speakers_run2,speakers_run3,WER-stdev
0,Assamese[F],all,FL1MI-n:1.0,euclidean,460.04/491.51,86.00/93.00,20.19,19.99,20.18,20.12,0.013,"{'assamese[F]': 86, 'manipuri[F]': 4, 'gujarat...","{'assamese[F]': 86, 'manipuri[F]': 4, 'gujarat...","{'assamese[F]': 86, 'manipuri[F]': 4, 'gujarat...",
1,Assamese[F],all,FL2MI-n:1.0,euclidean,487.83/487.83,80.00/80.00,20.28,20.23,20.28,20.26,0.001,{'assamese[F]': 80},{'assamese[F]': 80},{'assamese[F]': 80},
2,Assamese[F],all,GCMI-n:1.0,euclidean,490.36/490.36,81.00/81.00,20.5,20.38,20.31,20.4,0.009,{'assamese[F]': 81},{'assamese[F]': 81},{'assamese[F]': 81},
3,Assamese[F],all,LogDMI-n:1.0,euclidean,477.89/484.48,90.00/92.00,19.95,19.86,19.87,19.89,0.002,"{'assamese[F]': 90, 'manipuri[F]': 2}","{'assamese[F]': 90, 'manipuri[F]': 2}","{'assamese[F]': 90, 'manipuri[F]': 2}",
4,Assamese[F],all,random,,81.27/490.10,17.52/99.67,22.56,23.55,23.53,23.21,0.32,"{'assamese[F]': 19, 'tamil[M]': 19, 'manipuri[...","{'manipuri[F]': 22, 'assamese[F]': 17, 'tamil[...","{'tamil[M]': 22, 'manipuri[F]': 17, 'assamese[...",
5,Assamese[F],within,random,,489.78/489.78,97.00/97.00,20.16,19.64,19.45,19.75,,{'assamese[F]': 99},{'assamese[F]': 98},{'assamese[F]': 94},0.367
6,Gujarati[F],all,FL1MI-n:1.0,euclidean,426.61/491.36,51.00/65.00,10.22,10.15,10.16,10.18,0.001,"{'gujarati[F]': 51, 'assamese[F]': 12, 'kannad...","{'gujarati[F]': 51, 'assamese[F]': 12, 'kannad...","{'gujarati[F]': 51, 'assamese[F]': 12, 'kannad...",
7,Gujarati[F],all,FL2MI-n:1.0,euclidean,490.55/490.55,61.00/61.00,10.0,9.93,10.0,9.98,0.002,{'gujarati[F]': 61},{'gujarati[F]': 61},{'gujarati[F]': 61},
8,Gujarati[F],all,GCMI-n:1.0,euclidean,488.14/488.14,55.00/55.00,10.25,10.23,10.73,10.4,0.08,{'gujarati[F]': 55},{'gujarati[F]': 55},{'gujarati[F]': 55},
9,Gujarati[F],all,LogDMI-n:1.0,euclidean,487.59/487.59,65.00/65.00,9.83,9.85,9.82,9.83,0.0,{'gujarati[F]': 65},{'gujarati[F]': 65},{'gujarati[F]': 65},


In [14]:
df2.to_csv(csv_name, index=False)