In [27]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
import re, os, csv, pathlib
import pandas as pd
from statistics import mean, variance
accents = ['african', 'indian', 'hongkong', 'philippines', 
           'england', 'scotland', 'ireland', 'australia', 
           'canada', 'us', 
           'bermuda', 'southatlandtic', 'wales', 'malaysia']
accent_short_forms = {'hongkong':'hk', "african":"afr", "philippines":"phil", "indian":"ind",  
                      "england":"eng", 'scotland':'sco', 'ireland':'ire',
                      "us":"us", "canada":"can", "australia":"aus",
                      'bermuda':'ber', 'southatlandtic':'satl', 'wales':'wal', 'malaysia':'mal'
                     }

def replace_with_short_forms(s):
    for key, value in accent_short_forms.items():
        s = s.replace(key, value)
    return s

def last_name(pth):
    return pathlib.PurePath(pth).name

def get_dirs(pth):
    return [last_name(f.name) for f in os.scandir(pth) if f.is_dir()]

def get_each_run(lne):
    return list(map(float, re.findall(': (.+?) -> ', lne)[0].split(' ')))

def get_selection_counts(s):
    return list(map(replace_with_short_forms, re.findall('Counter\\((.+?)\\)', s)))

def get_test_file_from_stats_path(run_number, stats_file_opened):
    return stats_file_opened.name[:-9]+"run_{}/output/test_infer_log.txt".format(run_number)

def WER_test_file(test_file):
    txt_file = open(test_file, 'r')
    lines = txt_file.readlines()
    matched = ""
    for line in lines:
        if "==========>>>>>>Evaluation Greedy WER: " in line:
            txt_file.close()
            return float(line.rstrip().split(": ")[1])
    txt_file.close()
    return ""

def get_eta(func, eta):
    if func != "FL1MI":
        return ""
    else:
        return "-n:"+str(float(eta[4:]))


In [28]:
budget = 200
target = 20
features = 'wv10_100'
csv_name = "with_{}_{}_{}.csv".format(budget, target, features)

In [29]:
cols = ['accent', 'ground', 'function', 'feature', 'duration', 'samples', 
        'WER-r1', 'WER-r2', 'WER-r3', 'WER-mean', 'WER-var', 'accents_run1', 'accents_run2', 'accents_run3']
df = pd.DataFrame(columns = cols)

accents = ['african', 'indian', 'hongkong', 'philippines', 
           'england', 'scotland', 'ireland', 'australia', 
           'canada', 'us', 
           'bermuda', 'southatlandtic', 'wales', 'malaysia']

for accent in accents:
    if 'within' not in get_dirs('./{}/manifests/TSS_output/'.format(accent)):
        print("no within results for {}".format(accent))
        continue
    if not(pathlib.Path('./{}/manifests/TSS_output/within/budget_{}/target_{}/'.format(accent, budget, target)).is_dir()):
        continue
    for function in get_dirs('./{}/manifests/TSS_output/within/budget_{}/target_{}/'.format(accent, budget, target)):
        stats_file_path='./{}/manifests/TSS_output/within/budget_{}/target_{}/{}/stats.txt'.format(accent, budget, target, function)
        try:
            stats_file = open(stats_file_path, 'r')
        except:
            continue
        lines = stats_file.readlines()
        df_selections = get_selection_counts(lines[5])
        total_selections, total_durations, accented_selections, accented_durations = map(get_each_run, lines[:4])
        sample_frac = mean([x[0]/x[1] for x in zip(accented_selections, total_selections)])
        sample_total = mean(total_selections)
        duration_frac = mean([x[0]/x[1] for x in zip(accented_durations, total_durations)])
        duration_total = mean(total_durations)
        df_duration = "{:.2f}/{:.2f}".format(duration_total*duration_frac, duration_total)
        df_samples = "{:.2f}/{:.2f}".format(sample_total*sample_frac, sample_total)
        try:
            wers = [WER_test_file(get_test_file_from_stats_path(i, stats_file)) for i in range(1,4)]
            wer_floats = [x for x in wers if type(x)==float or type(x)==int]
            df_wer_mean = round(mean(wer_floats), 2)
            df_wer_var = round(variance(wer_floats), 3)
        except:
            print("no WER's in file", get_test_file_from_stats_path(1, stats_file))
#             continue
            wers = [0,0,0]
            df_wer_mean = 0
            df_wer_var = 999
        df = df.append(dict(zip(cols, [accent, "within", function, 
                        "NA", df_duration, df_samples]+ wers+[df_wer_mean, round(df_wer_var**0.5, 3)] + df_selections)), 
                       ignore_index=True)
        stats_file.close()

df = df.sort_values(by=['accent', 'feature', 'ground', 'function'], ascending=True, ignore_index=True)
display(df)

no WER's in file ./england/manifests/TSS_output/within/budget_200/target_20/self/run_1/output/test_infer_log.txt
no WER's in file ./ireland/manifests/TSS_output/within/budget_200/target_20/self/run_1/output/test_infer_log.txt
no WER's in file ./us/manifests/TSS_output/within/budget_200/target_20/self/run_1/output/test_infer_log.txt


Unnamed: 0,accent,ground,function,feature,duration,samples,WER-r1,WER-r2,WER-r3,WER-mean,WER-var,accents_run1,accents_run2,accents_run3
0,african,within,self,,1077.35/1077.35,218.67/218.67,21.69,23.28,22.41,22.46,0.796,{'afr': 220},{'afr': 217},{'afr': 219}
1,australia,within,self,,1077.07/1077.07,219.33/219.33,25.52,27.01,25.76,26.1,0.8,{'aus': 216},{'aus': 232},{'aus': 210}
2,canada,within,self,,1077.01/1077.01,210.33/210.33,16.69,16.73,16.86,16.76,0.089,{'can': 211},{'can': 210},{'can': 210}
3,england,within,self,,1076.67/1076.67,225.33/225.33,0.0,0.0,0.0,0.0,31.607,{'eng': 219},{'eng': 228},{'eng': 229}
4,hongkong,within,self,,1076.20/1076.20,236.67/236.67,29.24,28.33,28.39,28.65,0.509,{'hk': 239},{'hk': 233},{'hk': 238}
5,indian,within,self,,1075.37/1075.37,213.33/213.33,38.32,38.14,38.19,38.22,0.095,{'ind': 215},{'ind': 203},{'ind': 222}
6,ireland,within,self,,1076.40/1076.40,227.33/227.33,0.0,0.0,0.0,0.0,31.607,{'ire': 229},{'ire': 229},{'ire': 224}
7,philippines,within,self,,1078.76/1078.76,210.00/210.00,28.45,29.38,28.77,28.87,0.472,{'phil': 211},{'phil': 211},{'phil': 208}
8,scotland,within,self,,1077.70/1077.70,198.67/198.67,45.0,45.74,45.52,45.42,0.379,{'sco': 205},{'sco': 190},{'sco': 201}
9,us,within,self,,1078.19/1078.19,229.67/229.67,0.0,0.0,0.0,0.0,31.607,{'us': 233},{'us': 231},{'us': 225}


In [30]:
# total selection : 100 100 100 -> 100.00
# total selection duration: 357.0149433106577 357.0149433106577 357.0149433106577 -> 357.01
# accented selection: 76 76 76 -> 76.00
# accented duration: 254.74947845804974 254.74947845804974 254.74947845804974 -> 254.75

# all selections: [Counter({'hindi': 76, 'korean': 8, 'spanish': 7, 'arabic': 3, 'chinese': 3, 'vietnamese': 3}), Counter({'hindi': 76, 'korean': 8, 'spanish': 7, 'arabic': 3, 'chinese': 3, 'vietnamese': 3}), Counter({'hindi': 76, 'korean': 8, 'spanish': 7, 'arabic': 3, 'chinese': 3, 'vietnamese': 3})]

#Evaluation Greedy WER: 16.19
df2 = pd.read_csv(csv_name)
print(csv_name)
df2=df2.append(df)
df2 = df2.sort_values(by=['accent', 'ground', 'feature', 'function'], ascending=True, ignore_index=True)
# df2.assign(accent = df['accent'].split('[')[1].split('/')[0] + df['two']**2).sort_values('accent')
display(df2)


with_200_20_wv10_100.csv


Unnamed: 0,accent,ground,function,feature,duration,samples,WER-r1,WER-r2,WER-r3,WER-mean,WER-var,accents-run_1,accents-run_2,accents-run_3,accents_run1,accents_run2,accents_run3
0,african,all,FL2MI,wv10_100,960.90/1077.61,178.00/204.00,22.5,22.6,22.54,22.55,0.003,"{'afr': 178, 'eng': 9, 'us': 7, 'aus': 7, 'ind...","{'afr': 178, 'eng': 9, 'us': 7, 'aus': 7, 'ind...","{'afr': 178, 'eng': 9, 'us': 7, 'aus': 7, 'ind...",,,
1,african,all,GCMI,wv10_100,948.66/1076.64,175.00/199.00,23.34,22.0,22.43,22.59,0.468,"{'afr': 175, 'aus': 9, 'us': 8, 'eng': 6, 'ind...","{'afr': 175, 'aus': 9, 'us': 8, 'eng': 6, 'ind...","{'afr': 175, 'aus': 9, 'us': 8, 'eng': 6, 'ind...",,,
2,african,all,LogDMI,wv10_100,624.74/1077.02,123.00/224.00,22.64,22.23,22.74,22.54,0.073,"{'afr': 123, 'eng': 36, 'us': 31, 'aus': 15, '...","{'afr': 123, 'eng': 36, 'us': 31, 'aus': 15, '...","{'afr': 123, 'eng': 36, 'us': 31, 'aus': 15, '...",,,
3,african,all,random,,42.57/1077.05,7.69/228.00,23.07,23.67,22.93,23.22,0.155,"{'us': 91, 'eng': 47, 'ind': 27, 'can': 14, 'a...","{'us': 94, 'eng': 37, 'ind': 31, 'aus': 12, 's...","{'us': 94, 'ind': 34, 'eng': 31, 'can': 20, 's...",,,
4,african,pretrain,,,,,-,-,-,23.75,,,,,,,
5,african,within,self,,1077.35/1077.35,218.67/218.67,21.69,23.28,22.41,22.46,0.796,,,,{'afr': 220},{'afr': 217},{'afr': 219}
6,australia,all,FL2MI,wv10_100,24.30/1078.55,6.00/239.00,28.99,28.99,28.99,28.99,0.0,"{'us': 178, 'eng': 33, 'can': 16, 'aus': 6, 'a...","{'us': 178, 'eng': 33, 'can': 16, 'aus': 6, 'a...","{'us': 178, 'eng': 33, 'can': 16, 'aus': 6, 'a...",,,
7,australia,all,GCMI,wv10_100,134.80/1077.38,27.00/225.00,26.99,26.75,26.71,26.82,0.023,"{'us': 107, 'eng': 61, 'aus': 27, 'can': 13, '...","{'us': 107, 'eng': 61, 'aus': 27, 'can': 13, '...","{'us': 107, 'eng': 61, 'aus': 27, 'can': 13, '...",,,
8,australia,all,LogDMI,wv10_100,121.30/1077.17,25.00/233.00,28.54,28.11,27.57,28.07,0.236,"{'us': 85, 'eng': 71, 'aus': 25, 'afr': 17, 'i...","{'us': 85, 'eng': 71, 'aus': 25, 'afr': 17, 'i...","{'us': 85, 'eng': 71, 'aus': 25, 'afr': 17, 'i...",,,
9,australia,all,random,,57.57/1077.05,12.01/228.00,28.52,27.87,27.68,28.02,0.194,"{'us': 91, 'eng': 47, 'ind': 27, 'can': 14, 'a...","{'us': 94, 'eng': 37, 'ind': 31, 'aus': 12, 's...","{'us': 94, 'ind': 34, 'eng': 31, 'can': 20, 's...",,,


In [31]:

df2.to_csv(csv_name, index=False)
# input_file = csv.DictReader(open(csv_name))