In [3]:
# Notebook-wide imports and display setup.
import ast
import csv
import os
import pathlib
import re
from statistics import mean, variance

import pandas as pd
# `IPython.display` is the public location of these helpers; importing them
# from `IPython.core.display` is deprecated.
from IPython.display import HTML, display

# Inject CSS so the notebook's `.container` element spans the full width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [4]:
# Accent name -> three-letter tag used in the report.
accent_short_forms = {
    "hindi": "HIN",
    "korean": "KOR",
    "vietnamese": "VTN",
    "arabic": "ARB",
    "chinese": "CHN",
    "spanish": "ESP",
}

# Speaker id -> accent name.
accent_map = {
    "ABA": "arabic", "SKA": "arabic", "YBAA": "arabic", "ZHAA": "arabic",
    "BWC": "chinese", "LXC": "chinese", "NCC": "chinese", "TXHC": "chinese",
    "ASI": "hindi", "RRBI": "hindi", "SVBI": "hindi", "TNI": "hindi",
    "HJK": "korean", "HKK": "korean", "YDCK": "korean", "YKWK": "korean",
    "EBVS": "spanish", "ERMS": "spanish", "MBMPS": "spanish", "NJS": "spanish",
    "HQTV": "vietnamese", "PNV": "vietnamese", "THV": "vietnamese", "TLV": "vietnamese",
}

# Pipe-delimited table with one row per speaker: |id|gender|accent| followed by
# two numeric columns that this notebook does not read.
raw_string="""|ABA|M|Arabic|1129|150|\n|SKA|F|Arabic|974|150|\n|YBAA|M|Arabic|1130|149|\n|ZHAA|F|Arabic|1132|150|\n|BWC|M|Chinese|1130|150|\n|LXC|F|Chinese|1131|150|\n|NCC|F|Chinese|1131|150|\n|TXHC|M|Chinese|1132|150|\n|ASI|M|Hindi|1131|150|\n|RRBI|M|Hindi|1130|150|\n|SVBI|F|Hindi|1132|150|\n|TNI|F|Hindi|1131|150|\n|HJK|F|Korean|1131|150|\n|HKK|M|Korean|1131|150|\n|YDCK|F|Korean|1131|150|\n|YKWK|M|Korean|1131|150|\n|EBVS|M|Spanish|1007|150|\n|ERMS|M|Spanish|1132|150|\n|MBMPS|F|Spanish|1132|150|\n|NJS|F|Spanish|1131|150|\n|HQTV|M|Vietnamese|1132|150|\n|PNV|F|Vietnamese|1132|150|\n|THV|F|Vietnamese|1132|150|\n|TLV|M|Vietnamese|1132|150|"""
raw_strings = raw_string.split('\n')

# Speaker id -> gender letter.  Each row starts with '|', so splitting on '|'
# yields an empty first field: the id is field 1 and the gender is field 2.
gender_map = {
    fields[1]: fields[2]
    for fields in (row.split('|') for row in raw_strings)
}

# Speaker id -> three-letter accent tag.
composed_accent_map = {
    speaker_id: accent_short_forms.get(accent_map[speaker_id])
    for speaker_id in accent_map
}

def replace_with_short_forms(s):
    """Return `s` with each accent name in `accent_short_forms` replaced by its tag.

    Replacements are applied one after another, in the dictionary's order.
    """
    shortened = s
    for long_name in accent_short_forms:
        shortened = shortened.replace(long_name, accent_short_forms[long_name])
    return shortened

def group_speakers(s):
    """Nest per-speaker counts under each speaker's accent tag.

    Args:
        s: text of a Python dict literal mapping speaker id to a count.

    Returns:
        ``str()`` of a dict ``{accent_tag: {speaker_id: count}}``; accents
        appear in the order their first speaker is encountered.

    Raises:
        KeyError: if a speaker id is missing from `composed_accent_map`.
    """
    per_accent = {}
    for speaker_id, n_selected in ast.literal_eval(s).items():
        tag = composed_accent_map[speaker_id]
        per_accent.setdefault(tag, {})[speaker_id] = n_selected
    return str(per_accent)

def group_accents(s):
    """Total the per-speaker counts of each accent.

    Args:
        s: text of a Python dict literal mapping speaker id to a count.

    Returns:
        ``str()`` of a dict ``{accent_tag: summed_count}``; accents appear in
        the order their first speaker is encountered.

    Raises:
        KeyError: if a speaker id is missing from `composed_accent_map`.
    """
    totals = {}
    for speaker_id, n_selected in ast.literal_eval(s).items():
        tag = composed_accent_map[speaker_id]
        # Start every accent at 0 and add its speakers in encounter order.
        totals[tag] = totals.get(tag, 0) + n_selected
    return str(totals)

def last_name(pth):
    """Return the final component of the path `pth` as a string."""
    pure = pathlib.PurePath(pth)
    return pure.name

def get_dirs(pth):
    """Return the names of the immediate sub-directories of `pth`.

    The order is whatever `os.scandir` yields; it is not sorted.
    """
    names = []
    for entry in os.scandir(pth):
        if entry.is_dir():
            names.append(last_name(entry.name))
    return names

def get_each_run(lne):
    """Parse the space-separated numbers between the first ': ' and ' -> ' of `lne`.

    Returns a list of floats.  Raises IndexError when the line has no
    ': ... -> ' section and ValueError when a token is not a number.
    """
    first_section = re.findall(': (.+?) -> ', lne)[0]
    return [float(token) for token in first_section.split(' ')]

def get_selection_counts(s):
    """Summarise every ``Counter(...)`` dump found in `s`.

    Returns a list holding first the per-accent totals of each dump (via
    `group_accents`) and then the per-speaker groupings of each dump (via
    `group_speakers`), both in order of appearance.
    """
    counter_bodies = re.findall('Counter\\((.+?)\\)', s)
    accent_totals = [group_accents(body) for body in counter_bodies]
    speaker_groups = [group_speakers(body) for body in counter_bodies]
    return accent_totals + speaker_groups

def get_test_file_from_stats_path(run_number, stats_file_opened):
    """Build the path of a run's inference log from an opened stats file.

    The last nine characters of ``stats_file_opened.name`` (the length of
    "stats.txt") are dropped and the run-specific relative path is appended.
    """
    stats_dir = stats_file_opened.name[:-9]
    run_relative = "run_{}/output/test_infer_log.txt".format(run_number)
    return stats_dir + run_relative

def WER_test_file(test_file):
    """Return the greedy WER recorded in an inference log.

    The file is scanned line by line and the number following the first
    "==========>>>>>>Evaluation Greedy WER: " marker is parsed.

    Args:
        test_file: path of the log file to read.

    Returns:
        The WER as a float, or "" when no line contains the marker.  The empty
        string is kept as the "not found" value because the callers in this
        notebook depend on it making ``mean()`` fail for incomplete runs.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if the text after the marker is not a number.
    """
    marker = "==========>>>>>>Evaluation Greedy WER: "
    # `with` guarantees the handle is closed on every exit path, including
    # errors raised while reading or parsing.
    with open(test_file, 'r') as txt_file:
        for line in txt_file:
            if marker in line:
                # Parse what follows the marker itself so a prefix that happens
                # to contain ": " (e.g. a logger tag) does not shift the field.
                after_marker = line.split(marker, 1)[1]
                return float(after_marker.rstrip().split(": ")[0])
    return ""

def get_eta(func, eta):
    """Turn an eta directory name such as "eta_1.0" into the label suffix "-n:1.0".

    The first four characters of `eta` are skipped and the remainder is parsed
    as a float.  `func` is accepted but not used.
    """
    eta_value = float(eta[4:])
    return "-n:" + str(eta_value)

def clean_cond(fun):
    """Shorten the name of a conditional selection function.

    Names that do not contain 'conditional' are returned unchanged.  Otherwise
    the name must split on '_' into exactly three parts
    ``<prefix>_<measure>_<accent>``; the result is
    ``"<measure up to 'MI'>_cond_against_<accent tag>"``.

    Raises:
        ValueError: if a conditional name does not have exactly three parts.
        KeyError: if the accent is not a key of `accent_short_forms`.
    """
    if 'conditional' in fun:
        _prefix, measure, accent_name = fun.split('_')
        accent_tag = accent_short_forms[accent_name]
        measure_base = measure.split('MI')[0]
        return "{}_cond_against_{}".format(measure_base, accent_tag)
    return fun

In [11]:
# Experiment configuration: `budget`, `target` and `features` select which
# result directories are read, and together they name the CSV report.
budget = 200
target = 10        # alternative value: 50
features = 'w2v2'  # alternative values: "w2v2_layer-10", "w2v2_layer-5"

csv_name = "report_{}_{}_{}.csv".format(budget, target, features)

In [12]:
# Build the report frame `df`: one row per (speaker, selection function, eta,
# similarity) combination, followed by one row per speaker for the "random"
# baseline.
#
# Files that are read (relative to the notebook's working directory):
#   ./<speaker>/manifests/TSS_output/all/budget_<budget>/target_<target>/<function>/<eta>/<similarity>/<features>/stats.txt
#   ./<speaker>/manifests/TSS_output/all/budget_<budget>/target_<target>/random/stats.txt
# plus, next to each stats.txt, run_1 .. run_3 /output/test_infer_log.txt.
# Example from an earlier layout:
#   CMU_expts/speaker_without/ABA/manifests/TSS_output/all/budget_100/target_50/FL1MI/eta_1.0/euclidean/39/run_1/

cols = ['speaker', 'ground', 'function', 'similarity', 'duration', 'samples', 
        'WER-r1', 'WER-r2', 'WER-r3', 'WER-mean', 'WER-var', 'accents_run1', 'accents_run2', 'accents_run3', 'speakers_run1', 'speakers_run2', 'speakers_run3']

speakers = [f.name for f in os.scandir('./') if f.is_dir() and f.name != '.ipynb_checkpoints' and f.name != 'reserved_TSS_output' and f.name in accent_map]

# Only these speakers are included in the report.
_REPORTED_SPEAKERS = ['ABA', 'ASI', 'BWC', 'HJK', 'NJS', 'PNV', 'EBVS']


def _target_dir(speaker):
    """Return the results directory of `speaker` for the configured budget and target."""
    return './{}/manifests/TSS_output/all/budget_{}/target_{}/'.format(speaker, budget, target)


def _results_available(speaker, missing_target_msg):
    """Return True when `speaker` has an 'all' results tree for the configured budget/target.

    Prints a diagnostic and returns False at the first missing directory level;
    `missing_target_msg` is the text printed when the budget/target directory
    itself is absent.
    """
    tss_root = './{}/manifests/TSS_output/'.format(speaker)
    if not pathlib.Path(tss_root).is_dir():
        print("no results for speaker {}".format(speaker))
        return False
    if 'all' not in get_dirs(tss_root):
        print("no all results for {}".format(speaker))
        return False
    if not pathlib.Path(_target_dir(speaker)).is_dir():
        print(missing_target_msg)
        return False
    return True


def _selection_summary(lines):
    """Return (duration text, samples text, selection summaries) for one stats.txt.

    Reads `lines[:4]` as four per-run series (total selections, total
    durations, speaker selections, speaker durations) and `lines[5]` as the
    line holding the Counter dumps.
    NOTE(review): this line layout is taken from how the values are unpacked
    here -- confirm against whatever writes stats.txt.
    """
    df_selections = get_selection_counts(lines[5])
    total_selections, total_durations, speakered_selections, speakered_durations = map(get_each_run, lines[:4])
    sample_frac = mean([picked / total for picked, total in zip(speakered_selections, total_selections)])
    sample_total = mean(total_selections)
    duration_frac = mean([picked / total for picked, total in zip(speakered_durations, total_durations)])
    duration_total = mean(total_durations)
    df_duration = "{:.2f}/{:.2f}".format(duration_total*duration_frac, duration_total)
    df_samples = "{:.2f}/{:.2f}".format(sample_total*sample_frac, sample_total)
    return df_duration, df_samples, df_selections


def _run_wers(stats_file):
    """Return ([WER of run 1..3], mean rounded to 2 places, variance rounded to 3 places).

    Raises when a log is missing or holds no WER: `WER_test_file` then raises
    or returns "", which `mean()` rejects.
    """
    wers = [WER_test_file(get_test_file_from_stats_path(i, stats_file)) for i in range(1, 4)]
    return wers, round(mean(wers), 2), round(variance(wers), 3)


def _make_row(speaker, function_label, similarity, df_duration, df_samples,
              wers, df_wer_mean, df_wer_var, df_selections):
    """Assemble one report row as a dict keyed by `cols`."""
    speaker_new = speaker+"[{}/{}]".format(composed_accent_map[speaker], gender_map[speaker])
    # The 'WER-var' column receives the square root of the (already rounded)
    # variance, i.e. a standard deviation; the column name is kept so existing
    # reports stay comparable.
    values = ([speaker_new, "all", function_label, similarity, df_duration, df_samples]
              + wers + [df_wer_mean, round(df_wer_var**0.5, 3)] + df_selections)
    return dict(zip(cols, values))


# Rows are collected in a list and turned into a DataFrame once at the end:
# `DataFrame.append` no longer exists in pandas 2.x and re-copied the whole
# frame on every call.
rows = []

# not random
for speaker in speakers:
    if speaker not in _REPORTED_SPEAKERS:
        continue
    if not _results_available(speaker, "no results for budget {} and target {}".format(budget, target)):
        continue
    target_dir = _target_dir(speaker)
    for function in get_dirs(target_dir):
        # "random" is handled by the second pass below; "FL1MI" is excluded
        # from the report.
        if function == "random" or function == "FL1MI":
            continue
        function_dir = '{}{}/'.format(target_dir, function)
        for eta in get_dirs(function_dir):
            for similarity in get_dirs('{}{}/'.format(function_dir, eta)):
                stats_file_path = '{}{}/{}/{}/stats.txt'.format(function_dir, eta, similarity, features)
                try:
                    # Read everything up front so the handle is closed on every
                    # path, including the `continue` below.
                    with open(stats_file_path, 'r') as stats_file:
                        lines = stats_file.readlines()
                except OSError:
                    print(stats_file_path)
                    continue
                df_duration, df_samples, df_selections = _selection_summary(lines)
                try:
                    # Only `stats_file.name` is used here, which stays valid
                    # after the file has been closed.
                    wers, df_wer_mean, df_wer_var = _run_wers(stats_file)
                except Exception:
                    # Incomplete runs are reported by path and skipped.
                    print(stats_file_path)
                    continue
                rows.append(_make_row(speaker, clean_cond(function)+get_eta(function, eta), similarity,
                                      df_duration, df_samples, wers, df_wer_mean, df_wer_var, df_selections))

#random
# CMU_expts/speaker_without/ABA/manifests/TSS_output/all/budget_100/target_50/random/run_1/output
for speaker in speakers:
    if speaker not in _REPORTED_SPEAKERS:
        continue
    if not _results_available(speaker, "no random results for {} budget {} and target {}".format(speaker, budget, target)):
        continue
    target_dir = _target_dir(speaker)
    if "random" not in get_dirs(target_dir):
        continue
    stats_file_path = '{}{}/stats.txt'.format(target_dir, "random")
    try:
        with open(stats_file_path, 'r') as stats_file:
            lines = stats_file.readlines()
    except OSError:
        print(stats_file_path)
        continue
    df_duration, df_samples, df_selections = _selection_summary(lines)
    try:
        wers, df_wer_mean, df_wer_var = _run_wers(stats_file)
    except Exception:
        # Unlike the pass above, the baseline row is still emitted, with
        # placeholder values that mark it as missing.
        print("no WER's in file", get_test_file_from_stats_path(1, stats_file))
        wers = [0,0,0]
        df_wer_mean = 0
        df_wer_var = 999
    rows.append(_make_row(speaker, "random", "NA", df_duration, df_samples,
                          wers, df_wer_mean, df_wer_var, df_selections))

df = pd.DataFrame(rows, columns=cols)

In [13]:
# Order the rows for the report (renumbering the index) and render the table.
sort_keys = ['speaker', 'similarity', 'ground', 'function']
df = df.sort_values(by=sort_keys, ascending=True, ignore_index=True)
display(df)

Unnamed: 0,speaker,ground,function,similarity,duration,samples,WER-r1,WER-r2,WER-r3,WER-mean,WER-var,accents_run1,accents_run2,accents_run3,speakers_run1,speakers_run2,speakers_run3
0,ABA[ARB/M],all,random,,0.00/718.02,0.00/203.00,23.03,23.65,23.03,23.24,0.358,"{'ARB': 38, 'VTN': 33, 'ESP': 38, 'HIN': 36, '...","{'ARB': 33, 'CHN': 41, 'ESP': 31, 'KOR': 34, '...","{'ARB': 37, 'HIN': 34, 'KOR': 37, 'ESP': 37, '...","{'ARB': {'ZHAA': 16, 'YBAA': 12, 'SKA': 10}, '...","{'ARB': {'ZHAA': 17, 'YBAA': 8, 'SKA': 8}, 'CH...","{'ARB': {'YBAA': 14, 'SKA': 13, 'ZHAA': 10}, '..."
1,ABA[ARB/M],all,FL2MI-n:1.0,euclidean,222.49/714.72,61.00/189.00,23.98,23.98,23.68,23.88,0.173,"{'KOR': 61, 'ARB': 61, 'ESP': 34, 'VTN': 22, '...","{'KOR': 61, 'ARB': 61, 'ESP': 34, 'VTN': 22, '...","{'KOR': 61, 'ARB': 61, 'ESP': 34, 'VTN': 22, '...","{'KOR': {'YDCK': 45, 'HJK': 15, 'YKWK': 1}, 'A...","{'KOR': {'YDCK': 45, 'HJK': 15, 'YKWK': 1}, 'A...","{'KOR': {'YDCK': 45, 'HJK': 15, 'YKWK': 1}, 'A..."
2,ABA[ARB/M],all,GCMI-n:1.0,euclidean,410.35/717.36,111.00/189.00,22.04,22.19,22.15,22.13,0.077,"{'ARB': 111, 'ESP': 38, 'KOR': 20, 'VTN': 9, '...","{'ARB': 111, 'ESP': 38, 'KOR': 20, 'VTN': 9, '...","{'ARB': 111, 'ESP': 38, 'KOR': 20, 'VTN': 9, '...","{'ARB': {'SKA': 46, 'YBAA': 40, 'ZHAA': 25}, '...","{'ARB': {'SKA': 46, 'YBAA': 40, 'ZHAA': 25}, '...","{'ARB': {'SKA': 46, 'YBAA': 40, 'ZHAA': 25}, '..."
3,ABA[ARB/M],all,LogDMI-n:1.0,euclidean,271.37/718.82,76.00/189.00,22.95,23.03,23.06,23.01,0.055,"{'ARB': 76, 'ESP': 35, 'VTN': 24, 'KOR': 23, '...","{'ARB': 76, 'ESP': 35, 'VTN': 24, 'KOR': 23, '...","{'ARB': 76, 'ESP': 35, 'VTN': 24, 'KOR': 23, '...","{'ARB': {'YBAA': 42, 'SKA': 22, 'ZHAA': 12}, '...","{'ARB': {'YBAA': 42, 'SKA': 22, 'ZHAA': 12}, '...","{'ARB': {'YBAA': 42, 'SKA': 22, 'ZHAA': 12}, '..."
4,ASI[HIN/M],all,random,,0.00/717.63,0.00/198.33,14.19,13.15,14.73,14.02,0.803,"{'KOR': 43, 'ESP': 37, 'CHN': 34, 'HIN': 27, '...","{'KOR': 31, 'ARB': 36, 'CHN': 41, 'HIN': 30, '...","{'CHN': 35, 'ARB': 36, 'VTN': 35, 'ESP': 34, '...","{'KOR': {'YDCK': 16, 'HJK': 11, 'HKK': 10, 'YK...","{'KOR': {'YDCK': 15, 'HJK': 6, 'HKK': 5, 'YKWK...","{'CHN': {'TXHC': 13, 'NCC': 10, 'BWC': 7, 'LXC..."
5,ASI[HIN/M],all,FL2MI-n:1.0,euclidean,713.24/717.92,197.00/199.00,13.23,13.62,13.73,13.53,0.263,"{'HIN': 197, 'KOR': 2}","{'HIN': 197, 'KOR': 2}","{'HIN': 197, 'KOR': 2}","{'HIN': {'RRBI': 173, 'SVBI': 19, 'TNI': 5}, '...","{'HIN': {'RRBI': 173, 'SVBI': 19, 'TNI': 5}, '...","{'HIN': {'RRBI': 173, 'SVBI': 19, 'TNI': 5}, '..."
6,ASI[HIN/M],all,GCMI-n:1.0,euclidean,713.25/718.16,195.00/196.00,14.37,12.97,13.73,13.69,0.701,"{'HIN': 195, 'VTN': 1}","{'HIN': 195, 'VTN': 1}","{'HIN': 195, 'VTN': 1}","{'HIN': {'RRBI': 168, 'TNI': 17, 'SVBI': 10}, ...","{'HIN': {'RRBI': 168, 'TNI': 17, 'SVBI': 10}, ...","{'HIN': {'RRBI': 168, 'TNI': 17, 'SVBI': 10}, ..."
7,ASI[HIN/M],all,LogDMI-n:1.0,euclidean,605.96/719.89,199.00/242.00,13.19,12.83,13.69,13.24,0.432,"{'HIN': 199, 'ESP': 8, 'VTN': 12, 'KOR': 8, 'A...","{'HIN': 199, 'ESP': 8, 'VTN': 12, 'KOR': 8, 'A...","{'HIN': 199, 'ESP': 8, 'VTN': 12, 'KOR': 8, 'A...","{'HIN': {'RRBI': 101, 'TNI': 57, 'SVBI': 41}, ...","{'HIN': {'RRBI': 101, 'TNI': 57, 'SVBI': 41}, ...","{'HIN': {'RRBI': 101, 'TNI': 57, 'SVBI': 41}, ..."
8,BWC[CHN/M],all,random,,0.00/718.93,0.00/196.00,30.55,30.89,33.13,31.52,1.402,"{'KOR': 41, 'CHN': 34, 'ESP': 36, 'ARB': 28, '...","{'CHN': 41, 'KOR': 31, 'ARB': 35, 'HIN': 30, '...","{'CHN': 35, 'ARB': 35, 'VTN': 35, 'HIN': 25, '...","{'KOR': {'YDCK': 16, 'HJK': 10, 'HKK': 9, 'YKW...","{'CHN': {'LXC': 22, 'TXHC': 14, 'NCC': 5}, 'KO...","{'CHN': {'TXHC': 16, 'NCC': 10, 'LXC': 9}, 'AR..."
9,BWC[CHN/M],all,FL2MI-n:1.0,euclidean,283.17/713.97,74.00/181.00,29.76,30.36,33.13,31.08,1.798,"{'KOR': 77, 'CHN': 74, 'VTN': 30}","{'KOR': 77, 'CHN': 74, 'VTN': 30}","{'KOR': 77, 'CHN': 74, 'VTN': 30}","{'KOR': {'YKWK': 50, 'HKK': 27}, 'CHN': {'TXHC...","{'KOR': {'YKWK': 50, 'HKK': 27}, 'CHN': {'TXHC...","{'KOR': {'YKWK': 50, 'HKK': 27}, 'CHN': {'TXHC..."


In [10]:
# Write the report without the index column, then echo the file name.
df.to_csv(path_or_buf=csv_name, index=False)
print(csv_name)

report_500_10_w2v2.csv


In [9]:
# speakers = [f.name for f in os.scandir('../accent/') if f.is_dir() and f.name != '.ipynb_checkpoints' and f.name != 'reserved_TSS_output']
# print(speakers)
# for speaker in speakers:
#     for budget in get_dirs('../accent/{}/manifests/TSS_output/all/'.format(speaker)):
#         print(budget)
#         for target in get_dirs('../accent/{}/manifests/TSS_output/all/{}/'.format(speaker, budget)):
#             print(target)