In [1]:
# Use the public IPython.display module; importing `display` from
# IPython.core.display is deprecated in recent IPython releases.
from IPython.display import display, HTML

# Inject CSS so the notebook's .container element uses the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import re, os, csv, pathlib, ast, os.path, itertools, json, time
import pandas as pd
from statistics import mean, variance
from collections import Counter
from os import path
from datetime import datetime, timedelta

# Show every row when a DataFrame is displayed (None disables row truncation).
pd.set_option('display.max_rows', None)

def accent_from_path(json_path):
    """Return the short-form label of the first path component containing 'english'.

    The path is split on '/', the first component that contains the substring
    'english' is selected, and it is abbreviated with replace_with_short_forms.
    Raises IndexError when no component contains 'english'.
    """
    english_parts = [part for part in json_path.split('/') if 'english' in part]
    return replace_with_short_forms(english_parts[0])

def WER_test_file(test_file):
    """Extract the greedy WER value from an inference log.

    Scans ``test_file`` line by line for the marker
    "==========>>>>>>Evaluation Greedy WER: " and returns the number that
    follows the first occurrence as a float.

    Returns None (after printing a "WER not found" message) when the file
    cannot be read, the marker is absent, or the value does not parse.
    """
    marker = "==========>>>>>>Evaluation Greedy WER: "
    wer = None
    try:
        # The context manager closes the file on every exit path.
        with open(test_file, 'r') as txt_file:
            for line in txt_file:
                if marker in line:
                    # Anchor on the marker so an earlier ": " in the line (e.g. a
                    # log prefix) cannot shift the parsed field; keep only the
                    # text up to any following ": ".
                    value_text = line.split(marker, 1)[1].split(": ")[0]
                    wer = float(value_text.strip())
                    break
    except Exception:
        # Unreadable file or unparsable value: report it below and return None.
        wer = None
    if wer is None:
        print("WER not found in {}".format(test_file))
    return wer

        
        
def CER_test_file(test_file):
    """Extract the greedy CER value from an inference log.

    Scans ``test_file`` line by line for the marker
    "==========>>>>>>Evaluation Greedy CER: " and returns the number that
    follows the first occurrence as a float.

    Returns None (after printing a "CER not found" message) when the file
    cannot be read, the marker is absent, or the value does not parse.
    """
    marker = "==========>>>>>>Evaluation Greedy CER: "
    cer = None
    try:
        # The context manager closes the file on every exit path.
        with open(test_file, 'r') as txt_file:
            for line in txt_file:
                if marker in line:
                    # Anchor on the marker so an earlier ": " in the line (e.g. a
                    # log prefix) cannot shift the parsed field; keep only the
                    # text up to any following ": ".
                    value_text = line.split(marker, 1)[1].split(": ")[0]
                    cer = float(value_text.strip())
                    break
    except Exception:
        # Unreadable file or unparsable value: report it below and return None.
        cer = None
    if cer is None:
        print("CER not found in {}".format(test_file))
    return cer
        

def accent_distribution(json_path):
    """Count samples per accent in a JSON-lines manifest.

    Each non-blank line of ``json_path`` is parsed as JSON; its
    'audio_filepath' is mapped through accent_from_path and truncated to the
    first three characters (e.g. 'ass', 'kan').

    Returns a dict {accent_prefix: count} ordered from most to least common.
    """
    # The context manager closes the manifest; blank lines (such as a trailing
    # newline at end of file) are skipped rather than crashing json.loads.
    with open(json_path, 'r') as manifest:
        accents = [accent_from_path(json.loads(sample)['audio_filepath'])[:3]
                   for sample in manifest if sample.strip()]
    return dict(Counter(accents).most_common())

def time_fraction(json_parent, speaker, n_runs=3):
    """Average, over runs, the target-speaker duration and the total duration.

    Reads ``{json_parent}/run_{i}/train.json`` for i = 1..n_runs. Each manifest
    is JSON-lines with 'duration' and 'audio_filepath' fields. A sample counts
    towards ``speaker`` when ``speaker`` is exactly one of the '/'-separated
    components of its 'audio_filepath'.

    Args:
        json_parent: directory containing the run_<i> sub-directories.
        speaker: path component identifying the target speaker.
        n_runs: number of runs to average over (must be positive; default 3).

    Returns:
        "<speaker duration>/<total duration>", each averaged over the runs and
        formatted with one decimal place.
    """
    total_duration, domain_duration = 0, 0
    for i in range(1, n_runs + 1):
        json_path = "{}/run_{}/train.json".format(json_parent, i)
        # Parse every line once and close the file promptly.
        with open(json_path, 'r') as manifest:
            samples = [json.loads(line) for line in manifest if line.strip()]
        total_duration += sum([sample['duration'] for sample in samples])
        domain_duration += sum([sample['duration'] for sample in samples
                                if speaker in sample['audio_filepath'].split('/')])
    total_duration /= n_runs
    domain_duration /= n_runs
    return "{:.1f}/{:.1f}".format(domain_duration, total_duration)

def sample_fraction(json_parent, speaker, n_runs=3):
    """Average, over runs, the target-speaker sample count and the total count.

    Reads ``{json_parent}/run_{i}/train.json`` for i = 1..n_runs. Each manifest
    is JSON-lines with an 'audio_filepath' field. A sample counts towards
    ``speaker`` when ``speaker`` is exactly one of the '/'-separated components
    of its 'audio_filepath'.

    Args:
        json_parent: directory containing the run_<i> sub-directories.
        speaker: path component identifying the target speaker.
        n_runs: number of runs to average over (must be positive; default 3).

    Returns:
        "<speaker samples>/<total samples>", each averaged over the runs and
        formatted with one decimal place.
    """
    total, domain_counts = 0, 0
    for i in range(1, n_runs + 1):
        json_path = "{}/run_{}/train.json".format(json_parent, i)
        # Read the manifest exactly once: a second readlines() on the same
        # handle returns an empty list, which made the speaker count always 0.
        with open(json_path, 'r') as manifest:
            samples = [json.loads(line) for line in manifest if line.strip()]
        total += len(samples)
        domain_counts += sum(1 for sample in samples
                             if speaker in sample['audio_filepath'].split('/'))
    total /= n_runs
    domain_counts /= n_runs
    return "{:.1f}/{:.1f}".format(domain_counts, total)

def replace_with_short_forms(s):
    """Abbreviate a speaker label.

    Applies, in order: 'english' -> '', '_female_' -> '[F]', '_male_' -> '[M]'.
    For example 'assamese_female_english' becomes 'assamese[F]'.
    """
    substitutions = (('english', ''), ('_female_', '[F]'), ('_male_', '[M]'))
    for long_form, short_form in substitutions:
        s = s.replace(long_form, short_form)
    return s


def last_name(pth):
    """Return the final component of ``pth`` (as given by PurePath.name)."""
    pure_path = pathlib.PurePath(pth)
    return pure_path.name

def get_dirs(pth):
    """List the names of the immediate sub-directories of ``pth``.

    Entries come from os.scandir, so the order is whatever the OS yields.
    """
    dir_names = []
    for entry in os.scandir(pth):
        if entry.is_dir():
            dir_names.append(last_name(entry.name))
    return dir_names

def get_each_run(lne):
    """Parse the space-separated numbers between the first ': ' and ' -> ' in ``lne``.

    Returns them as a list of floats. Raises IndexError when the pattern does
    not occur and ValueError when a token is not a number.
    """
    first_match = re.findall(': (.+?) -> ', lne)[0]
    return [float(token) for token in first_match.split(' ')]

def get_selection_counts(s):
    """Extract the body of every 'Counter(...)' occurrence in ``s``.

    Each captured body is abbreviated with replace_with_short_forms; the
    results are returned as a list in order of appearance.
    """
    counter_bodies = re.findall('Counter\\((.+?)\\)', s)
    return [replace_with_short_forms(body) for body in counter_bodies]


def get_test(stats_file_path):
    """Map a stats-file path to the run_1 test output path next to it.

    The last 9 characters of ``stats_file_path`` are dropped (the length of
    'stats.txt') and 'run_1/output/test_out.txt' is appended.
    """
    parent_prefix = stats_file_path[:-9]
    return parent_prefix + "run_1/output/test_out.txt"

def get_eta(func, eta):
    """Format the numeric part of an eta tag as '-n:<float>'.

    The first four characters of ``eta`` are dropped and the remainder is
    parsed as a float. ``func`` is accepted but not used.
    """
    eta_value = float(eta[4:])
    return "-n:{}".format(eta_value)

In [3]:
# Quick sanity check: run accent_from_path on an example manifest path.
pth='assamese_female_english/all/budget_2000/target_20/FL2MI/39/train.json'
accent_from_path(pth)

'assamese[F]'

In [5]:



# Example experiment paths, kept for reference:
# sample_path = 'Error-Driven-ASR-Personalization/CMU_expts/speaker/hindi/manifests/TSS_output/all/budget_100/target_50/FL1MI/eta_1.0/euclidean/39/stats.txt'
# assamese_female_english/all/budget_2000/target_20/FL2MI/39/budget_200/error_model/run_1

# Report configuration. These values are path components of the experiment
# directories walked below:
#   {speaker}/{pick_from}/budget_{b1}/target_{t}/{fun}/{ft}/budget_{b2}/{method}/run_{i}
ft, b1, b2, t = '39', '3500', 150, 20
csv_name = "unfmted_rpt_{}_{}_{}_{}.csv".format(b1, t, ft, b2)

cols = ['speaker', 'ground', 'stage1', 'ft', 'stage2', 'duration',
        'WER-r1', 'WER-r2', 'WER-r3', 'WER-mean', 'WER-stdev', 'distn_1', 'distn_2', 'distn_3']

# Full speaker list (currently restricted to two speakers below):
# speakers = ['assamese_female_english', 'gujarati_female_english', 'manipuri_female_english', 'hindi_male_english',
#             'rajasthani_male_english', 'tamil_male_english', 'kannada_male_english', 'malayalam_male_english']
speakers = ["assamese_female_english", "kannada_male_english"]
cnt = 0

# Tracks whether the one-off '~pretrain' row has been emitted for a speaker.
speaker_pre_added = {speaker: False for speaker in speakers}

# Report rows are collected as dicts and turned into a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0 and row-by-row growth is
# quadratic anyway.
rows = []

print(speakers)
for speaker in speakers:

    # --- Baseline: random selection, reported only when all three runs exist ---
    pick_from = 'all'
    curr_dir = '{}/all/budget_{}/random'.format(speaker, b2)
    random_present = True
    for i in range(1, 4):
        if not pathlib.Path('{}/run_{}'.format(curr_dir, i)).is_dir():
            print("run_{} missing {}/run_{}".format(i, curr_dir, i))
            random_present = False
    if random_present:
        selection_stats = [accent_distribution('{}/run_{}/train.json'.format(curr_dir, i)) for i in range(1, 4)]

        df_duration = time_fraction(curr_dir, speaker)
        # Computed for the optional 'samples' column, which is not part of `cols` right now.
        df_samples = sample_fraction(curr_dir, speaker)
        try:
            wers = [WER_test_file('{}/run_{}/test_infer_log.txt'.format(curr_dir, i)) for i in range(1, 4)]
            df_wer_mean = round(mean(wers), 2)
            df_wer_stdev = round(variance(wers)**0.5, 3)
        except Exception:
            # WER_test_file returns None for a missing value, which makes
            # mean()/variance() raise; fall back to sentinel values so the
            # row is still reported.
            print("WER extraction failed ", curr_dir)
            wers = [0, 0, 0]
            df_wer_mean = 0
            df_wer_stdev = 999
        rows.append(dict(zip(cols, [replace_with_short_forms(speaker), pick_from, 'rand {}'.format(b2), '-', '-', df_duration]
                             + wers + [df_wer_mean, df_wer_stdev] + selection_stats)))

    # --- Two-stage selection: one row per (function, stage-2 method) ---
    for fun, method in itertools.product(['FL2MI'], ['uniform', 'div_tf_idf_2gram_euclidean/LogDet', 'div_tf_idf_2gram_euclidean', 'div_tf_idf_3gram_euclidean', 'random']):
        curr_dir = '{}/{}/budget_{}/target_{}/{}/{}/budget_{}/{}'.format(speaker, pick_from, b1, t,
                                                                       fun, ft, b2, method)
        if not pathlib.Path(curr_dir).is_dir():
            print('expt_dr {}\n is not found'.format(curr_dir))
            continue
        for i in range(1, 4):
            if not pathlib.Path('{}/run_{}'.format(curr_dir, i)).is_dir():
                print("l28 run_{} missing {}/run_{}".format(i, curr_dir, i))
        selection_stats = [accent_distribution('{}/run_{}/train.json'.format(curr_dir, i)) for i in range(1, 4)]

        df_duration = time_fraction(curr_dir, speaker)
        df_samples = sample_fraction(curr_dir, speaker)
        try:
            wers = [WER_test_file('{}/run_{}/test_infer_log.txt'.format(curr_dir, i)) for i in range(1, 4)]
            df_wer_mean = round(mean(wers), 2)
            df_wer_stdev = round(variance(wers)**0.5, 3)
        except Exception:
            # Incomplete experiment: skip its row, but say so instead of
            # dropping it silently.
            print("WER extraction failed ", curr_dir)
            continue

        # Shorten method names for the 'stage2' column.
        method = {'error_model': 'error', 'true_wer': 'wer_skyline', 'random': 'rand'}.get(method, method)
        rows.append(dict(zip(cols, [replace_with_short_forms(speaker), pick_from, "{} {}".format(fun, b1),
                                    ft, "{} {}".format(method, b2), df_duration]
                             + wers + [df_wer_mean, df_wer_stdev] + selection_stats)))

        # Pretrained-model WER: added once per speaker, after the first
        # two-stage row that was reported successfully.
        if not speaker_pre_added[speaker]:
            speaker_pre_added[speaker] = True
            df_wer_mean = WER_test_file('{}/TSS_output/pre_test_infer_log.txt'.format(speaker))
            df_wer_stdev = 0
            rows.append(dict(zip(cols, [replace_with_short_forms(speaker), 'all', '~pretrain', '-', '-', '-', '-', '-', '-']
                                 + [df_wer_mean, df_wer_stdev] + ['-', '-', '-'])))

df = pd.DataFrame(rows, columns=cols)

# Preferred order of stage-2 methods within a group; labels not listed here
# map to NaN and therefore sort last.
custom_dict = {f'wer_skyline {b2}': 0, f'error {b2}': 1, f'top {b2}': 2, f'div_tf_idf {b2}': 3, f'w2v2_avg {b2}': 4,
               f'rand {b2}': 5, }
df = df.sort_values(by=['stage2'], key=lambda x: x.map(custom_dict))
# NOTE(review): this second sort is expected to keep the stage-2 order from the
# line above among rows that tie on all four keys -- confirm if pandas changes.
df = df.sort_values(by=['speaker', 'stage1', 'ft', 'ground'], ascending=[True, True, True, True], ignore_index=True)

display(df)

['assamese_female_english', 'kannada_male_english']
run_1 missing assamese_female_english/all/budget_150/random/run_1
run_2 missing assamese_female_english/all/budget_150/random/run_2
run_3 missing assamese_female_english/all/budget_150/random/run_3
expt_dr assamese_female_english/all/budget_3500/target_20/FL2MI/39/budget_150/div_tf_idf_2gram_euclidean/LogDet
 is not found
expt_dr assamese_female_english/all/budget_3500/target_20/FL2MI/39/budget_150/div_tf_idf_2gram_euclidean
 is not found
expt_dr assamese_female_english/all/budget_3500/target_20/FL2MI/39/budget_150/div_tf_idf_3gram_euclidean
 is not found
expt_dr assamese_female_english/all/budget_3500/target_20/FL2MI/39/budget_150/random
 is not found
run_1 missing kannada_male_english/all/budget_150/random/run_1
run_2 missing kannada_male_english/all/budget_150/random/run_2
run_3 missing kannada_male_english/all/budget_150/random/run_3
expt_dr kannada_male_english/all/budget_3500/target_20/FL2MI/39/budget_150/div_tf_idf_2gram_euclid

Unnamed: 0,speaker,ground,stage1,ft,stage2,duration,WER-r1,WER-r2,WER-r3,WER-mean,WER-stdev,distn_1,distn_2,distn_3
0,assamese[F],all,FL2MI 3500,39,uniform 150,739.0/739.0,19.81,19.63,19.62,19.69,0.107,{'ass': 180},{'ass': 180},{'ass': 180}
1,assamese[F],all,~pretrain,-,-,-,-,-,-,27.05,0.0,-,-,-
2,kannada[M],all,FL2MI 3500,39,uniform 150,683.8/739.3,13.66,13.39,13.68,13.58,0.162,"{'kan': 172, 'mal': 14}","{'kan': 172, 'mal': 14}","{'kan': 172, 'mal': 14}"
3,kannada[M],all,~pretrain,-,-,-,-,-,-,18.74,0.0,-,-,-


In [6]:
# Write the report currently held in `df` to the CSV named by `csv_name`
# (row index omitted), echoing the file name first.
print(csv_name)
df.to_csv(csv_name, index=False)

unfmted_rpt_3500_20_39_150.csv


In [18]:



# Example experiment paths, kept for reference:
# sample_path = 'Error-Driven-ASR-Personalization/CMU_expts/speaker/hindi/manifests/TSS_output/all/budget_100/target_50/FL1MI/eta_1.0/euclidean/39/stats.txt'
# assamese_female_english/all/budget_2000/target_20/FL2MI/39/budget_200/error_model/run_1

# Report configuration. These values are path components of the experiment
# directories walked below:
#   {speaker}/{pick_from}/budget_{b1}/target_{t}/{fun}/{ft}/budget_{b2}/{method}/run_{i}
ft, b1, b2, t = '39', 2000, 250, 20
csv_name = "unfmted_rpt_{}_{}_{}_{}.csv".format(b1, t, ft, b2)

cols = ['speaker', 'ground', 'stage1', 'ft', 'stage2', 'duration',
        'WER-r1', 'WER-r2', 'WER-r3', 'WER-mean', 'WER-stdev', 'distn_1', 'distn_2', 'distn_3']

speakers = ['assamese_female_english', 'gujarati_female_english', 'manipuri_female_english', 'hindi_male_english',
            'rajasthani_male_english', 'tamil_male_english', 'kannada_male_english', 'malayalam_male_english']
cnt = 0

# Tracks whether the one-off '~pretrain' row has been emitted for a speaker.
speaker_pre_added = {speaker: False for speaker in speakers}

# Report rows are collected as dicts and turned into a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0 and row-by-row growth is
# quadratic anyway.
rows = []

print(speakers)
for speaker in speakers:

    # --- Baseline: random selection, reported only when all three runs exist ---
    pick_from = 'all'
    curr_dir = '{}/all/budget_{}/random'.format(speaker, b2)
    random_present = True
    for i in range(1, 4):
        if not pathlib.Path('{}/run_{}'.format(curr_dir, i)).is_dir():
            print("run_{} missing {}/run_{}".format(i, curr_dir, i))
            random_present = False
    if random_present:
        selection_stats = [accent_distribution('{}/run_{}/train.json'.format(curr_dir, i)) for i in range(1, 4)]

        df_duration = time_fraction(curr_dir, speaker)
        # Computed for the optional 'samples' column, which is not part of `cols` right now.
        df_samples = sample_fraction(curr_dir, speaker)
        try:
            wers = [WER_test_file('{}/run_{}/test_infer_log.txt'.format(curr_dir, i)) for i in range(1, 4)]
            df_wer_mean = round(mean(wers), 2)
            df_wer_stdev = round(variance(wers)**0.5, 3)
        except Exception:
            # WER_test_file returns None for a missing value, which makes
            # mean()/variance() raise; fall back to sentinel values so the
            # row is still reported.
            print("WER extraction failed ", curr_dir)
            wers = [0, 0, 0]
            df_wer_mean = 0
            df_wer_stdev = 999
        rows.append(dict(zip(cols, [replace_with_short_forms(speaker), pick_from, 'rand {}'.format(b2), '-', '-', df_duration]
                             + wers + [df_wer_mean, df_wer_stdev] + selection_stats)))

    # --- Two-stage selection: one row per (function, stage-2 method) ---
    for fun, method in itertools.product(['FL2MI'], ['random', 'top', 'error_model', 'div_tf_idf', 'w2v2_avg', 'true_wer', 'div_tf_idf_3gram_cosine']):
        curr_dir = '{}/{}/budget_{}/target_{}/{}/{}/budget_{}/{}'.format(speaker, pick_from, b1, t,
                                                                       fun, ft, b2, method)
        if not pathlib.Path(curr_dir).is_dir():
            print('expt_dr {}\n is not found'.format(curr_dir))
            continue
        for i in range(1, 4):
            if not pathlib.Path('{}/run_{}'.format(curr_dir, i)).is_dir():
                print("l28 run_{} missing {}/run_{}".format(i, curr_dir, i))
        selection_stats = [accent_distribution('{}/run_{}/train.json'.format(curr_dir, i)) for i in range(1, 4)]

        df_duration = time_fraction(curr_dir, speaker)
        df_samples = sample_fraction(curr_dir, speaker)
        try:
            wers = [WER_test_file('{}/run_{}/test_infer_log.txt'.format(curr_dir, i)) for i in range(1, 4)]
            df_wer_mean = round(mean(wers), 2)
            df_wer_stdev = round(variance(wers)**0.5, 3)
        except Exception:
            # Incomplete experiment: skip its row, but say so instead of
            # dropping it silently.
            print("WER extraction failed ", curr_dir)
            continue

        # Shorten method names for the 'stage2' column.
        method = {'error_model': 'error', 'true_wer': 'wer_skyline', 'random': 'rand'}.get(method, method)
        rows.append(dict(zip(cols, [replace_with_short_forms(speaker), pick_from, "{} {}".format(fun, b1),
                                    ft, "{} {}".format(method, b2), df_duration]
                             + wers + [df_wer_mean, df_wer_stdev] + selection_stats)))

        # Pretrained-model WER: added once per speaker, after the first
        # two-stage row that was reported successfully.
        if not speaker_pre_added[speaker]:
            speaker_pre_added[speaker] = True
            df_wer_mean = WER_test_file('{}/TSS_output/pre_test_infer_log.txt'.format(speaker))
            df_wer_stdev = 0
            rows.append(dict(zip(cols, [replace_with_short_forms(speaker), 'all', '~pretrain', '-', '-', '-', '-', '-', '-']
                                 + [df_wer_mean, df_wer_stdev] + ['-', '-', '-'])))

df = pd.DataFrame(rows, columns=cols)

# Preferred order of stage-2 methods within a group; labels not listed here
# map to NaN and therefore sort last.
custom_dict = {f'wer_skyline {b2}': 0, f'error {b2}': 1, f'top {b2}': 2, f'div_tf_idf {b2}': 3, f'w2v2_avg {b2}': 4,
               f'rand {b2}': 5, }
df = df.sort_values(by=['stage2'], key=lambda x: x.map(custom_dict))
# NOTE(review): this second sort is expected to keep the stage-2 order from the
# line above among rows that tie on all four keys -- confirm if pandas changes.
df = df.sort_values(by=['speaker', 'stage1', 'ft', 'ground'], ascending=[True, True, True, True], ignore_index=True)

display(df)

['assamese_female_english', 'gujarati_female_english', 'manipuri_female_english', 'hindi_male_english', 'rajasthani_male_english', 'tamil_male_english', 'kannada_male_english', 'malayalam_male_english']


Unnamed: 0,speaker,ground,stage1,ft,stage2,duration,WER-r1,WER-r2,WER-r3,WER-mean,WER-stdev,distn_1,distn_2,distn_3
0,assamese[F],all,FL2MI 2000,39,wer_skyline 250,1235.1/1235.1,17.65,17.66,17.46,17.59,0.113,{'ass': 234},{'ass': 234},{'ass': 234}
1,assamese[F],all,FL2MI 2000,39,error 250,1232.5/1232.5,17.76,17.8,18.64,18.07,0.497,{'ass': 235},{'ass': 219},{'ass': 229}
2,assamese[F],all,FL2MI 2000,39,top 250,1228.0/1228.0,18.51,18.45,18.79,18.58,0.181,{'ass': 208},{'ass': 208},{'ass': 208}
3,assamese[F],all,FL2MI 2000,39,div_tf_idf 250,1235.8/1235.8,18.77,18.78,19.01,18.85,0.136,{'ass': 221},{'ass': 221},{'ass': 221}
4,assamese[F],all,FL2MI 2000,39,w2v2_avg 250,1230.3/1230.3,18.13,18.01,18.08,18.07,0.06,{'ass': 212},{'ass': 212},{'ass': 212}
5,assamese[F],all,FL2MI 2000,39,rand 250,1227.1/1227.1,18.3,18.91,18.46,18.56,0.316,{'ass': 227},{'ass': 223},{'ass': 228}
6,assamese[F],all,FL2MI 2000,39,div_tf_idf_3gram_cosine 250,1231.7/1231.7,18.87,19.01,18.52,18.8,0.252,{'ass': 220},{'ass': 220},{'ass': 220}
7,assamese[F],all,rand 250,-,-,201.6/1227.0,22.04,21.21,21.53,21.59,0.419,"{'man': 58, 'ass': 38, 'tam': 34, 'raj': 27, '...","{'ass': 48, 'tam': 42, 'man': 42, 'raj': 27, '...","{'man': 53, 'tam': 39, 'ass': 30, 'guj': 29, '..."
8,assamese[F],all,~pretrain,-,-,-,-,-,-,27.05,0.0,-,-,-
9,gujarati[F],all,FL2MI 2000,39,wer_skyline 250,1232.7/1236.4,8.74,8.79,8.62,8.72,0.087,"{'guj': 164, 'kan': 1}","{'guj': 164, 'kan': 1}","{'guj': 164, 'kan': 1}"


In [21]:
# Write the current `df` to `csv_name` without the row index.
# NOTE(review): this cell's execution count (21) is later than the CER cell
# below (20), so when it was last run `df`/`csv_name` may have been the CER
# report rather than the WER report above -- confirm with Restart & Run All.
df.to_csv(csv_name, index=False)

## One-off check of CERs

In [20]:
# Report configuration. These values are path components of the experiment
# directories walked below:
#   {speaker}/{pick_from}/budget_{b1}/target_{t}/{fun}/{ft}/budget_{b2}/{method}/run_{i}
ft, b1, b2, t = '39', 2000, 250, 20
csv_name = "unfmted_rpt_cer_{}_{}_{}_{}.csv".format(b1, t, ft, b2)

cols = ['speaker', 'ground', 'stage1', 'ft', 'stage2', 'duration',
        'CER-r1', 'CER-r2', 'CER-r3', 'CER-mean', 'CER-stdev', 'distn_1', 'distn_2', 'distn_3']

speakers = ['assamese_female_english', 'gujarati_female_english', 'manipuri_female_english', 'hindi_male_english',
            'rajasthani_male_english', 'tamil_male_english', 'kannada_male_english', 'malayalam_male_english']
cnt = 0

# Tracks whether the one-off '~pretrain' row has been emitted for a speaker.
speaker_pre_added = {speaker: False for speaker in speakers}

# Report rows are collected as dicts and turned into a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0 and row-by-row growth is
# quadratic anyway.
rows = []

print(speakers)
for speaker in speakers:

    # --- Baseline: random selection, reported only when all three runs exist ---
    pick_from = 'all'
    curr_dir = '{}/all/budget_{}/random'.format(speaker, b2)
    random_present = True
    for i in range(1, 4):
        if not pathlib.Path('{}/run_{}'.format(curr_dir, i)).is_dir():
            print("run_{} missing {}/run_{}".format(i, curr_dir, i))
            random_present = False
    if random_present:
        selection_stats = [accent_distribution('{}/run_{}/train.json'.format(curr_dir, i)) for i in range(1, 4)]

        df_duration = time_fraction(curr_dir, speaker)
        # Computed for the optional 'samples' column, which is not part of `cols` right now.
        df_samples = sample_fraction(curr_dir, speaker)
        try:
            cers = [CER_test_file('{}/run_{}/test_infer_log.txt'.format(curr_dir, i)) for i in range(1, 4)]
            df_cer_mean = round(mean(cers), 2)
            df_cer_stdev = round(variance(cers)**0.5, 3)
        except Exception:
            # CER_test_file returns None for a missing value, which makes
            # mean()/variance() raise; fall back to sentinel values so the
            # row is still reported.
            print("CER extraction failed ", curr_dir)
            cers = [0, 0, 0]
            df_cer_mean = 0
            df_cer_stdev = 999
        rows.append(dict(zip(cols, [replace_with_short_forms(speaker), pick_from, 'rand {}'.format(b2), '-', '-', df_duration]
                             + cers + [df_cer_mean, df_cer_stdev] + selection_stats)))

    # --- Two-stage selection: one row per (function, stage-2 method) ---
    for fun, method in itertools.product(['FL2MI'], ['random', 'top', 'error_model', 'div_tf_idf_3gram_cosine', 'w2v2_avg', 'true_wer']):
        curr_dir = '{}/{}/budget_{}/target_{}/{}/{}/budget_{}/{}'.format(speaker, pick_from, b1, t,
                                                                       fun, ft, b2, method)
        if not pathlib.Path(curr_dir).is_dir():
            print('expt_dr {}\n is not found'.format(curr_dir))
            continue
        for i in range(1, 4):
            if not pathlib.Path('{}/run_{}'.format(curr_dir, i)).is_dir():
                print("l28 run_{} missing {}/run_{}".format(i, curr_dir, i))
        selection_stats = [accent_distribution('{}/run_{}/train.json'.format(curr_dir, i)) for i in range(1, 4)]

        df_duration = time_fraction(curr_dir, speaker)
        df_samples = sample_fraction(curr_dir, speaker)
        try:
            cers = [CER_test_file('{}/run_{}/test_infer_log.txt'.format(curr_dir, i)) for i in range(1, 4)]
            df_cer_mean = round(mean(cers), 2)
            df_cer_stdev = round(variance(cers)**0.5, 3)
        except Exception:
            # Incomplete experiment: skip its row, but say so instead of
            # dropping it silently.
            print("CER extraction failed ", curr_dir)
            continue

        # Shorten method names for the 'stage2' column.
        method = {'error_model': 'error', 'true_wer': 'wer_skyline', 'random': 'rand'}.get(method, method)
        rows.append(dict(zip(cols, [replace_with_short_forms(speaker), pick_from, "{} {}".format(fun, b1),
                                    ft, "{} {}".format(method, b2), df_duration]
                             + cers + [df_cer_mean, df_cer_stdev] + selection_stats)))

        # Pretrained-model CER: added once per speaker, after the first
        # two-stage row that was reported successfully.
        if not speaker_pre_added[speaker]:
            speaker_pre_added[speaker] = True
            df_cer_mean = CER_test_file('{}/TSS_output/pre_test_infer_log.txt'.format(speaker))
            df_cer_stdev = 0
            rows.append(dict(zip(cols, [replace_with_short_forms(speaker), 'all', '~pretrain', '-', '-', '-', '-', '-', '-']
                                 + [df_cer_mean, df_cer_stdev] + ['-', '-', '-'])))

df = pd.DataFrame(rows, columns=cols)

# Preferred order of stage-2 methods; labels not listed here map to NaN and
# therefore sort last. It only breaks ties left by the sort on the line after.
custom_dict = {f'wer_skyline {b2}': 0, f'error {b2}': 1, f'top {b2}': 2, f'div_tf_idf {b2}': 3, f'w2v2_avg {b2}': 4,
               f'rand {b2}': 5, }
df = df.sort_values(by=['stage2'], key=lambda x: x.map(custom_dict))
# Within each speaker, rank rows from lowest to highest mean CER.
df = df.sort_values(by=['speaker', 'CER-mean', 'CER-stdev', 'stage1', 'ft', 'ground'], ascending=[True, True, True, True, True, True], ignore_index=True)

display(df)

['assamese_female_english', 'gujarati_female_english', 'manipuri_female_english', 'hindi_male_english', 'rajasthani_male_english', 'tamil_male_english', 'kannada_male_english', 'malayalam_male_english']


Unnamed: 0,speaker,ground,stage1,ft,stage2,duration,CER-r1,CER-r2,CER-r3,CER-mean,CER-stdev,distn_1,distn_2,distn_3
0,assamese[F],all,FL2MI 2000,39,wer_skyline 250,1235.1/1235.1,6.63,6.63,6.55,6.6,0.046,{'ass': 234},{'ass': 234},{'ass': 234}
1,assamese[F],all,FL2MI 2000,39,error 250,1232.5/1232.5,6.53,6.62,7.13,6.76,0.324,{'ass': 235},{'ass': 219},{'ass': 229}
2,assamese[F],all,FL2MI 2000,39,w2v2_avg 250,1230.3/1230.3,6.82,6.89,6.76,6.82,0.065,{'ass': 212},{'ass': 212},{'ass': 212}
3,assamese[F],all,FL2MI 2000,39,rand 250,1227.1/1227.1,6.84,7.24,6.72,6.93,0.272,{'ass': 227},{'ass': 223},{'ass': 228}
4,assamese[F],all,FL2MI 2000,39,top 250,1228.0/1228.0,7.06,6.99,7.19,7.08,0.101,{'ass': 208},{'ass': 208},{'ass': 208}
5,assamese[F],all,FL2MI 2000,39,div_tf_idf_3gram_cosine 250,1231.7/1231.7,7.2,7.26,6.98,7.15,0.147,{'ass': 220},{'ass': 220},{'ass': 220}
6,assamese[F],all,rand 250,-,-,201.6/1227.0,8.52,7.97,8.28,8.26,0.276,"{'man': 58, 'ass': 38, 'tam': 34, 'raj': 27, '...","{'ass': 48, 'tam': 42, 'man': 42, 'raj': 27, '...","{'man': 53, 'tam': 39, 'ass': 30, 'guj': 29, '..."
7,assamese[F],all,~pretrain,-,-,-,-,-,-,10.36,0.0,-,-,-
8,gujarati[F],all,FL2MI 2000,39,wer_skyline 250,1232.7/1236.4,2.84,2.9,2.82,2.85,0.042,"{'guj': 164, 'kan': 1}","{'guj': 164, 'kan': 1}","{'guj': 164, 'kan': 1}"
9,gujarati[F],all,FL2MI 2000,39,top 250,1224.4/1224.4,2.99,2.96,3.01,2.99,0.025,{'guj': 140},{'guj': 140},{'guj': 140}
