In [1]:
from IPython.core.display import display, HTML
import os
from pathlib import Path
from cmath import inf
import re, os, csv, pathlib, ast, os.path
import pandas as pd
from statistics import mean, variance
import json
from collections import Counter
import numpy as np

# Inject CSS so the notebook's `.container` element spans the full page width.
display(HTML("<style>.container { width:100% !important; }</style>"))
# Remove pandas' row-display limit so DataFrames are rendered without truncation.
pd.set_option('display.max_rows', None)


  from IPython.core.display import display, HTML


In [2]:
def accent_from_path(json_path):
    """Return the short accent label taken from a '/'-separated path.

    The first path component containing the substring 'english' is selected and
    passed through ``replace_with_short_forms``. Raises ``IndexError`` when no
    component contains 'english'.
    """
    english_segments = [segment for segment in json_path.split('/') if 'english' in segment]
    return replace_with_short_forms(english_segments[0])

def replace_with_short_forms(s):
    """Abbreviate a speaker directory name.

    Removes every 'english', then rewrites '_female_' as '[F]' and '_male_' as
    '[M]' (applied in that order), e.g. 'hindi_male_english' -> 'hindi[M]'.
    """
    substitutions = (
        ("english", ""),
        ("_female_", "[F]"),
        ("_male_", "[M]"),
    )
    for long_form, short_form in substitutions:
        s = s.replace(long_form, short_form)
    return s


# def shortened_speaker(s):
#     speaker, gender, _ = s.split("_")
#     return speaker[0].upper() + speaker[1:] + "[" + gender[0].upper() + "]"


def last_name(pth):
    """Return the final component of ``pth`` (file or directory name)."""
    pure_path = pathlib.PurePath(pth)
    return pure_path.name


def get_dirs(pth):
    """List the names of the immediate sub-directories of ``pth``.

    Order follows ``os.scandir`` and is therefore not guaranteed.
    """
    dir_names = []
    for entry in os.scandir(pth):
        if entry.is_dir():
            dir_names.append(last_name(entry.name))
    return dir_names


def get_each_run(lne):
    """Parse the space-separated numbers that follow the first ': ' in a line.

    Everything after the first ': ' (up to the end of the line) is split on
    single spaces and each token is converted with ``float``.
    Raises ``IndexError`` if the line has no ': ' followed by text, and
    ``ValueError`` if a token is not numeric.
    """
    payload = re.findall(": (.+)", lne)[0]
    return [float(token) for token in payload.split(" ")]


def get_selection_counts(s):
    """Return the shortened text of every ``Counter(...)`` found in ``s``.

    Each non-greedy ``Counter(...)`` body is passed through
    ``replace_with_short_forms``; one string is returned per occurrence.
    """
    counter_bodies = re.findall("Counter\\((.+?)\\)", s)
    return [replace_with_short_forms(body) for body in counter_bodies]


def get_test_file_from_stats_path(run_number, stats_file_opened):
    """Build the path of a run's test log from an open stats file object.

    The last nine characters of ``stats_file_opened.name`` (the length of
    'stats.txt') are dropped and 'run_<run_number>/test_infer_log.txt' is appended.
    """
    experiment_prefix = stats_file_opened.name[:-9]
    return experiment_prefix + f"run_{run_number}/test_infer_log.txt"


# def get_test(stats_file_path):
#     return stats_file_path[:-9] + "run_1/output/test_out.txt"


def WER_test_file(test_file):
    """Read the greedy-decoding WER from a test inference log.

    The file is scanned for the first line containing the marker
    '==========>>>>>>Evaluation Greedy WER: ' and the text after the first
    ': ' on that line is parsed as a float.

    Args:
        test_file: Path of the log file to read.

    Returns:
        The WER as a float; ``inf`` when the file cannot be opened/decoded or
        the value on the marker line cannot be parsed; ``None`` when the file is
        readable but contains no marker line.
    """
    marker = "==========>>>>>>Evaluation Greedy WER: "
    try:
        # The context manager closes the file on every exit path, including the
        # "marker not found" path, and never touches an unbound handle when
        # open() itself fails.
        with open(test_file, "r") as txt_file:
            for line in txt_file:
                if marker in line:
                    return float(line.rstrip().split(": ")[1])
    except (OSError, ValueError, IndexError) as err:
        # OSError: missing/unreadable file. ValueError: non-numeric value or a
        # decoding error. IndexError: marker line with nothing after ': '.
        print("Could not read WER from {}: {!r}".format(test_file, err))
        return inf
    return None


def get_eta(func, eta):
    """Format an eta directory name as '-n:<value>'.

    The first four characters of ``eta`` are skipped and the remainder is
    converted to ``float``. ``func`` is accepted but not used.
    """
    numeric_part = float(eta[4:])
    return "-n:{}".format(numeric_part)


def accent_distribution(json_path):
    """Count samples per accent in a JSON-lines manifest.

    Each line of ``json_path`` is parsed as JSON; its 'audio_filepath' is mapped
    to an accent label with ``accent_from_path`` and truncated to the first three
    characters.

    Returns:
        A dict mapping the three-letter accent prefix to its sample count,
        ordered from most to least common.
    """
    # Use a context manager so the manifest handle is closed (it was leaked before).
    with open(json_path, 'r') as manifest:
        paths = [
            accent_from_path(json.loads(sample)['audio_filepath'])[:3]
            for sample in manifest
        ]
    counts = Counter(paths)
    return dict(counts.most_common())

def time_fraction(json_parent, speaker, num_runs=3):
    """Average per-run duration contributed by ``speaker`` versus all samples.

    Reads ``<json_parent>/run_<i>/train.json`` for i = 1..num_runs. Each file is
    a JSON-lines manifest whose records carry 'duration' and 'audio_filepath'.
    A record belongs to ``speaker`` when ``speaker`` equals one of the
    '/'-separated components of its 'audio_filepath'.

    Args:
        json_parent: Experiment directory containing the ``run_<i>`` folders.
        speaker: Directory name identifying the target speaker.
        num_runs: Number of runs to average over (default 3, the original
            hard-coded value).

    Returns:
        '<speaker duration>/<total duration>', both averaged over the runs and
        formatted with one decimal.

    Raises:
        ValueError: if ``num_runs`` is smaller than 1.
        OSError: if a manifest cannot be opened.
    """
    if num_runs < 1:
        raise ValueError("num_runs must be at least 1")
    total_duration, domain_duration = 0, 0
    for run in range(1, num_runs + 1):
        json_path = "{}/run_{}/train.json".format(json_parent, run)
        # Parse every line once and close the file promptly.
        with open(json_path, 'r') as manifest:
            samples = [json.loads(line) for line in manifest]
        total_duration += sum([sample['duration'] for sample in samples])
        domain_duration += sum([
            sample['duration']
            for sample in samples
            if speaker in sample['audio_filepath'].split('/')
        ])
    total_duration /= num_runs
    domain_duration /= num_runs
    return "{:.1f}/{:.1f}".format(domain_duration, total_duration)

def sample_fraction(json_parent, speaker, num_runs=3):
    """Average per-run sample count contributed by ``speaker`` versus all samples.

    Reads ``<json_parent>/run_<i>/train.json`` for i = 1..num_runs. Each file is
    a JSON-lines manifest; a record belongs to ``speaker`` when ``speaker``
    equals one of the '/'-separated components of its 'audio_filepath'.

    Args:
        json_parent: Experiment directory containing the ``run_<i>`` folders.
        speaker: Directory name identifying the target speaker.
        num_runs: Number of runs to average over (default 3, the original
            hard-coded value).

    Returns:
        '<speaker samples>/<total samples>', both averaged over the runs and
        formatted with one decimal.

    Raises:
        ValueError: if ``num_runs`` is smaller than 1.
        OSError: if a manifest cannot be opened.
    """
    if num_runs < 1:
        raise ValueError("num_runs must be at least 1")
    total, domain_counts = 0, 0
    for run in range(1, num_runs + 1):
        json_path = "{}/run_{}/train.json".format(json_parent, run)
        # Parse every line once and close the file promptly.
        with open(json_path, 'r') as manifest:
            samples = [json.loads(line) for line in manifest]
        total += len(samples)
        domain_counts += sum(
            1 for sample in samples
            if speaker in sample['audio_filepath'].split('/')
        )
    total /= num_runs
    domain_counts /= num_runs
    return "{:.1f}/{:.1f}".format(domain_counts, total)



In [3]:
def get_target(path):
    """Strip the '/target_<digits>/' component from a path and return its value.

    Args:
        path: A '/'-separated experiment path.

    Returns:
        ``(new_path, target)`` where ``new_path`` is ``path`` with every
        '/target_<digits>/' replaced by ``os.path.sep`` and ``target`` is the
        digit string of the first occurrence. When ``path`` is not a string or
        contains no such component, ``(path, "-")`` is returned unchanged.
    """
    search_pattern = r"/target_(\d*)/"
    # Explicit checks replace the former bare `except`, which also swallowed
    # unrelated errors (including KeyboardInterrupt).
    if not isinstance(path, str):
        return path, "-"
    match = re.search(search_pattern, path)
    if match is None:
        return path, "-"
    # A callable replacement inserts the separator literally; passing it as a
    # template string fails when the separator is a backslash.
    new_path = re.sub(search_pattern, lambda _match: os.path.sep, path)
    return new_path, match.group(1)

In [4]:
# sample_path = 'Error-Driven-ASR-Personalization/CMU_expts/speaker/hindi/manifests/TSS_output/all/budget_100/target_50/FL1MI/eta_1.0/euclidean/39/stats.txt'
# CMU_expts/speaker_without/ABA/manifests/TSS_output/all/budget_100/target_50/FL1MI/eta_1.0/euclidean/39/run_1/
# budget = 100


# csv_name = "mod_report_{}_{}.csv".format(budget, target)

# df = pd.DataFrame(columns=cols)

# Speaker directories (relative to the working directory) to scan for experiments.
speakers = [
    "assamese_female_english",
    "manipuri_female_english",
    "kannada_male_english",
    "rajasthani_male_english",
    "hindi_male_english",
    "malayalam_male_english",
    "tamil_male_english",
    "gujarati_female_english",
]

# One dict per successfully parsed experiment; turned into a DataFrame later.
expt_results = []
os_sep = os.path.sep
# Function names that may appear either alone or as '<fxn>_etaScale_<value>'.
submod_fxns = ("FL1MI", "FL2MI", "GCMI", "LogDMI")

for speaker in speakers:
    shortened_speaker = replace_with_short_forms(speaker)
    print("Extracting experiments from {}".format(shortened_speaker))
    base_dir = os.path.join(".", f"{speaker}", "all")
    # An experiment directory is the grandparent of any test_infer_log.txt
    # (i.e. the parent of its run_<i> folder); the set removes the duplicates
    # produced by multiple runs.
    all_paths = list(set([f"{os.path.sep}".join(str(p).split(os.path.sep)[:-2]) for p in Path(base_dir).rglob("*/test_infer_log.txt")]))
    for path in all_paths:
        dct = {}
        orig_path = str(path)
        # `path` is consumed component by component: each step reads the leading
        # component and then strips it.
        path = str(path).replace(f"{speaker}/all/", "")
        dct["speaker"] = shortened_speaker

        # Optional leading "<method>/<other_accents>/" (or
        # "dim_phoneme_gains/tau_<v>/<other_accents>/") before "budget_<b1>/".
        if path.startswith("budget_"):
            dct["method"] = "-"
        else:
            if path.startswith("dim_phoneme_gains"):
                dct["method"] = "phone_decay-"
                path = path.replace(f"{path.split(os.path.sep)[0]}/", "")
                tau_str = path.split(os.path.sep)[0]
                tau_val = tau_str.split('_')[-1]
                dct["method"] += tau_val
                path = path.replace(f"{path.split(os.path.sep)[0]}/", "")
            else:
                dct["method"] = path.split(os.path.sep)[0]
                path = path.replace(f"{dct['method']}/", "")
            dct["other_accents"] = replace_with_short_forms(path.split(os.path.sep)[0])
            path = path.replace(f"{path.split(os.path.sep)[0]}/", "")
        assert(path.startswith("budget_"))
        dct["budget_b1"] = path.split(os.path.sep)[0].replace("budget_", "")
        path = path.replace(f"budget_{dct['budget_b1']}/", "")
        # Random-selection baselines directly under the budget are skipped here.
        if path.startswith("random"):
            continue
        dct["target"] = path.split(os.path.sep)[0].replace("target_", "")
        path = path.replace(f"target_{dct['target']}/", "")
        fxn = path.split(os.path.sep)[0]
        if fxn in submod_fxns:
            dct["fxn"] = fxn
            dct["etaScale"] = "default(1.0)"

        else:
            # Expected form: '<fxn>_etaScale_<value>'.
            ls = fxn.split("_")
            assert(len(ls) == 3)
            assert(ls[1] == "etaScale")
            assert(ls[0] in submod_fxns)
            dct["fxn"] = ls[0]
            dct["etaScale"] = ls[2]
        path = path.replace(f"{fxn}/", "")

        if path.startswith("39/"):
            dct["accent_features"] = "39"
            path = path.replace(f"{dct['accent_features']}/", "")
        else:
            assert(path.startswith("accent_"))
            dct["accent_features"] = path.split(os.path.sep)[0].replace("accent_", "")
            path = path.replace(f"accent_{dct['accent_features']}/", "")

        if path.startswith("budget_"):
            # Two-stage layout: a second budget followed by the stage-2 method name.
            dct["budget_b2"] = path.split(os.path.sep)[0].replace("budget_", "")
            path = path.replace(f"budget_{dct['budget_b2']}/", "")
            dct["method"] = path.split(os.path.sep)[0] + "(stage2)"
        else:
            assert(path.startswith("content_"))
            dct["content_features"] = path.split(os.path.sep)[0].replace("content_", "")
            path = path.replace(f"content_{dct['content_features']}/", "")
            if path.startswith("phoneme_"):
                path = path.replace(f"{path.split(os.path.sep)[0]}/", "")

            kernels = path.split(os.path.sep)[0].replace("kernel_", "")
            kernels_ls = "; ".join(kernels.split("_"))
            dct["kernels"] = kernels_ls
            path = path.replace(f"kernel_{kernels}/", "")
            assert(path.startswith("accent_"))
            dct["accent_sim"] = path.split(os.path.sep)[0].replace("accent_", "")
            path = path.replace(f"accent_{dct['accent_sim']}/", "")

            assert(path.startswith("content_"))
            dct["content_sim"] = path.split(os.path.sep)[0].replace("content_", "")
            path = path.replace(f"content_{dct['content_sim']}/", "")

        try:
            dct["time_fraction"] = time_fraction(orig_path, speaker)
            dct["sample_fraction"] = sample_fraction(orig_path, speaker)

            dct["speakers"] = accent_distribution(os.path.join(orig_path, "run_1/train.json"))

            wer_list = []
            for run in range(1,4):
                test_file_path = os.path.join(orig_path, f"run_{run}", "test_infer_log.txt")
                wer_list.append(WER_test_file(test_file_path))

            # Local names deliberately differ from `mean`/`variance`: rebinding
            # the global `mean` (imported from `statistics`) would break every
            # later cell that calls `mean([...])`.
            wer_mean = np.nanmean(wer_list)
            wer_var = np.nanvar(wer_list)
            dct["WER-mean"] = round(wer_mean, 2)
            dct["WER-stdev"] = round(wer_var**0.5, 2)
            for run in range(1, 4):
                dct[f"WER-r{run}"] = wer_list[run - 1]

            expt_results.append(dct)

        except Exception as exc:
            # Best effort: report the experiment that could not be summarised
            # (missing manifest, missing WER, ...) together with the reason, and
            # carry on with the next one.
            print(orig_path, dct["method"], dct["budget_b1"], repr(exc))

    print("End of {}".format(shortened_speaker))


Extracting experiments from assamese[F]
assamese_female_english/all/dim_phoneme_gains/tau_1.0/assamese_female_english=1/budget_150/target_20/GCMI_etaScale_1.0/accent_39/content_tf_idf_2gram/kernel_g=accent_gq=accent_qq=accent/accent_cosine/content_cosine phone_decay-1.0 150
assamese_female_english/all/mixed_query_set/assamese_female_english=0.5-hindi_male_english=0.5/budget_150/target_30/FL2MI_etaScale_1.0/accent_39/content_tf_idf_2gram/kernel_g=accent_gq=accent_qq=accent/accent_cosine/content_cosine mixed_query_set 150
assamese_female_english/all/mixed_query_set/assamese_female_english=0.5-hindi_male_english=0.5/budget_150/target_30/FL1MI_etaScale_1.0/accent_39/content_tf_idf_2gram/kernel_g=accent_gq=accent_qq=accent/accent_cosine/content_cosine mixed_query_set 150
End of assamese[F]
Extracting experiments from manipuri[F]
manipuri_female_english/all/budget_3500/target_20/FL2MI/39/budget_150/uniform_0.075 uniform_0.075(stage2) 3500
manipuri_female_english/all/dim_phoneme_gains/tau_1.0

In [5]:
# Column order of the report: experiment identity, feature/similarity settings,
# selection statistics, per-run and aggregate WER, accent distribution.
cols = (
    ["speaker", "other_accents", "budget_b1", "target", "fxn", "method", "budget_b2", "etaScale"]
    + ["accent_features", "content_features", "accent_sim", "content_sim", "kernels"]
    + ["time_fraction", "sample_fraction"]
    + ["WER-r1", "WER-r2", "WER-r3", "WER-mean", "WER-stdev"]
    + ["speakers"]
)

# Keys missing from an experiment's dict show up as NaN in the frame.
df = pd.DataFrame(data=expt_results, columns=cols)

In [6]:
# Render the results frame as the cell output.
df

Unnamed: 0,speaker,other_accents,budget_b1,target,fxn,method,budget_b2,etaScale,accent_features,content_features,...,content_sim,kernels,time_fraction,sample_fraction,WER-r1,WER-r2,WER-r3,WER-mean,WER-stdev,speakers
0,assamese[F],assamese[F]=0.5-hindi[M]=0.5,150,20,LogDMI,mixed_query_set,,1.0,39,tf_idf_2gram,...,cosine,g=accent; gq=accent; qq=accent,330.1/735.8,75.0/171.0,20.07,20.87,20.15,20.36,0.36,"{'ass': 75, 'hin': 68, 'tam': 11, 'raj': 10, '..."
1,assamese[F],assamese[F]=0.5-hindi[M]=0.5,750,50,LogDMI,mixed_query_set,,1.0,39,tf_idf_2gram,...,cosine,g=accent; gq=accent; qq=accent,1437.9/3687.9,323.0/825.0,17.67,17.85,17.87,17.8,0.09,"{'hin': 373, 'ass': 323, 'tam': 48, 'raj': 26,..."
2,assamese[F],,3500,20,FL2MI,div_tf_idf(stage2),250.0,default(1.0),39,,...,,,1231.4/1231.4,233.0/233.0,19.51,19.53,19.35,19.46,0.08,{'ass': 233}
3,assamese[F],assamese[F]=1,150,20,FL2MI,phone_decay-0.1,,1.0,39,tf_idf_2gram,...,cosine,g=accent; gq=accent; qq=accent,199.3/737.5,31.0/87.0,22.86,22.74,22.81,22.8,0.05,"{'tam': 33, 'ass': 31, 'guj': 14, 'kan': 3, 'r..."
4,assamese[F],assamese[F]=1,150,20,FL2MI,phone_decay-0.4,,1.0,39,tf_idf_2gram,...,cosine,g=accent; gq=accent; qq=accent,610.7/732.0,100.0/112.0,19.51,19.55,19.42,19.49,0.05,"{'ass': 100, 'tam': 11, 'raj': 1}"
5,assamese[F],,3500,20,FL2MI,div_tf_idf_3gram_cosine(stage2),250.0,default(1.0),39,,...,,,1234.5/1234.5,230.0/230.0,19.02,19.04,19.06,19.04,0.02,{'ass': 230}
6,assamese[F],assamese[F]=1,150,20,GCMI,phone_decay-1.0,,1.0,39,tf_idf_2gram,...,cosine,g=accent; gq=accent; qq=accent,693.2/734.3,114.0/118.0,19.86,20.24,19.84,19.98,0.18,"{'ass': 114, 'tam': 4}"
7,assamese[F],,2000,20,FL2MI,random(stage2),200.0,default(1.0),39,,...,,,982.7/982.7,177.3/177.3,18.11,18.1,18.11,18.11,0.0,{'ass': 178}
8,assamese[F],,150,20,FL1MI,-,,1.0,39_3rep,tf_idf_2gram,...,cosine,g=accent; gq=accent; qq=accent,705.3/734.6,127.0/134.0,19.14,19.8,19.21,19.38,0.3,"{'ass': 127, 'man': 5, 'guj': 1, 'tam': 1}"
9,assamese[F],,2000,20,FL2MI,div_tf_idf_3gram_cosine(stage2),250.0,default(1.0),39,,...,,,1231.7/1231.7,220.0/220.0,18.87,19.01,18.52,18.8,0.21,{'ass': 220}


In [8]:
# Keep the mixed-query-set experiments run with a first-stage budget of "750"
# (budgets are stored as strings), ordered for side-by-side comparison.
mask = df["method"].eq("mixed_query_set") & df["budget_b1"].eq("750")
filtered_df = df.loc[mask].sort_values(by=["other_accents", "fxn", "speaker"])
filtered_df

Unnamed: 0,speaker,other_accents,budget_b1,target,fxn,method,budget_b2,etaScale,accent_features,content_features,...,content_sim,kernels,time_fraction,sample_fraction,WER-r1,WER-r2,WER-r3,WER-mean,WER-stdev,speakers
50,assamese[F],assamese[F]=0.5-hindi[M]=0.5,750,50,FL1MI,mixed_query_set,,1.0,39,tf_idf_2gram,...,cosine,g=accent; gq=accent; qq=accent,2819.4/3689.2,550.0/728.0,16.38,17.86,18.34,17.53,0.83,"{'ass': 550, 'hin': 158, 'mal': 6, 'raj': 5, '..."
209,hindi[M],assamese[F]=0.5-hindi[M]=0.5,750,50,FL1MI,mixed_query_set,,1.0,39,tf_idf_2gram,...,cosine,g=accent; gq=accent; qq=accent,770.4/3689.2,158.0/728.0,9.85,9.77,9.74,9.79,0.05,"{'ass': 550, 'hin': 158, 'mal': 6, 'raj': 5, '..."
13,assamese[F],assamese[F]=0.5-hindi[M]=0.5,750,50,FL2MI,mixed_query_set,,1.0,39,tf_idf_2gram,...,cosine,g=accent; gq=accent; qq=accent,1090.4/3688.5,183.0/712.0,18.35,18.4,19.2,18.65,0.39,"{'hin': 528, 'ass': 183, 'guj': 1}"
219,hindi[M],assamese[F]=0.5-hindi[M]=0.5,750,50,FL2MI,mixed_query_set,,1.0,39,tf_idf_2gram,...,cosine,g=accent; gq=accent; qq=accent,2595.3/3688.5,528.0/712.0,8.04,8.08,8.0,8.04,0.03,"{'hin': 528, 'ass': 183, 'guj': 1}"
41,assamese[F],assamese[F]=0.5-hindi[M]=0.5,750,50,GCMI,mixed_query_set,,1.0,39,tf_idf_2gram,...,cosine,g=accent; gq=accent; qq=accent,38.2/3688.0,9.0/883.0,21.35,21.35,21.13,21.28,0.1,"{'man': 476, 'kan': 166, 'tam': 136, 'guj': 32..."
182,hindi[M],assamese[F]=0.5-hindi[M]=0.5,750,50,GCMI,mixed_query_set,,1.0,39,tf_idf_2gram,...,cosine,g=accent; gq=accent; qq=accent,67.9/3688.0,30.0/883.0,9.4,9.4,9.49,9.43,0.04,"{'man': 476, 'kan': 166, 'tam': 136, 'guj': 32..."
1,assamese[F],assamese[F]=0.5-hindi[M]=0.5,750,50,LogDMI,mixed_query_set,,1.0,39,tf_idf_2gram,...,cosine,g=accent; gq=accent; qq=accent,1437.9/3687.9,323.0/825.0,17.67,17.85,17.87,17.8,0.09,"{'hin': 373, 'ass': 323, 'tam': 48, 'raj': 26,..."
185,hindi[M],assamese[F]=0.5-hindi[M]=0.5,750,50,LogDMI,mixed_query_set,,1.0,39,tf_idf_2gram,...,cosine,g=accent; gq=accent; qq=accent,1816.7/3687.9,373.0/825.0,9.18,9.03,9.17,9.13,0.07,"{'hin': 373, 'ass': 323, 'tam': 48, 'raj': 26,..."
173,rajasthani[M],rajasthani[M]=0.5-tamil[M]=0.5,750,50,FL1MI,mixed_query_set,,1.0,39,tf_idf_2gram,...,cosine,g=accent; gq=accent; qq=accent,950.2/3686.2,157.0/685.0,15.16,15.17,15.11,15.15,0.03,"{'ass': 341, 'tam': 166, 'raj': 157, 'man': 9,..."
275,tamil[M],rajasthani[M]=0.5-tamil[M]=0.5,750,50,FL1MI,mixed_query_set,,1.0,39,tf_idf_2gram,...,cosine,g=accent; gq=accent; qq=accent,955.1/3686.2,166.0/685.0,12.05,12.04,12.05,12.05,0.0,"{'ass': 341, 'tam': 166, 'raj': 157, 'man': 9,..."


In [10]:
# Write the current `filtered_df` to CSV, omitting the index column.
filtered_df.to_csv("budget_750_mixed_query_set.csv", index=False)

In [15]:
# Select the experiments whose method label starts with "phone_decay".
# Alternative selection kept for reference:
# mask = (((df["method"].str.endswith("random(stage2)")) | (df["method"].str.startswith("uniform"))) & (df["budget_b1"] == "3500") & (df["budget_b2"] == "150"))
mask = df["method"].str.startswith("phone_decay")
filtered_df = df.loc[mask]

In [16]:
# Order the selection by accent mix, then function, then speaker, and show it.
filtered_df = filtered_df.sort_values(by=["other_accents", "fxn", "speaker"])
filtered_df

Unnamed: 0,speaker,other_accents,budget_b1,target,fxn,method,budget_b2,etaScale,accent_features,content_features,...,content_sim,kernels,time_fraction,sample_fraction,WER-r1,WER-r2,WER-r3,WER-mean,WER-stdev,speakers
5,assamese[F],assamese[F]=1,150,20,FL2MI,phone_decay-1.0,,1.0,39,tf_idf_2gram,...,cosine,g=accent; gq=accent; qq=accent,679.5/734.2,111.0/116.0,20.05,20.33,19.88,20.09,0.19,"{'ass': 111, 'tam': 4, 'guj': 1}"
8,assamese[F],assamese[F]=1,150,20,FL2MI,phone_decay-0.2,,1.0,39,tf_idf_2gram,...,cosine,g=accent; gq=accent; qq=accent,465.3/733.3,76.0/103.0,20.52,20.61,20.12,20.42,0.21,"{'ass': 76, 'tam': 20, 'guj': 5, 'raj': 1, 'ka..."
16,assamese[F],assamese[F]=1,150,20,FL2MI,phone_decay-0.1,,1.0,39,tf_idf_2gram,...,cosine,g=accent; gq=accent; qq=accent,199.3/737.5,31.0/87.0,22.86,22.74,22.81,22.8,0.05,"{'tam': 33, 'ass': 31, 'guj': 14, 'kan': 3, 'r..."
31,assamese[F],assamese[F]=1,150,20,FL2MI,phone_decay-0.3,,1.0,39,tf_idf_2gram,...,cosine,g=accent; gq=accent; qq=accent,553.9/733.9,88.0/106.0,19.66,19.83,20.28,19.92,0.26,"{'ass': 88, 'tam': 14, 'guj': 3, 'kan': 1}"
38,assamese[F],assamese[F]=1,150,20,FL2MI,phone_decay-0.4,,1.0,39,tf_idf_2gram,...,cosine,g=accent; gq=accent; qq=accent,610.7/732.0,100.0/112.0,19.51,19.55,19.42,19.49,0.05,"{'ass': 100, 'tam': 11, 'raj': 1}"
45,assamese[F],assamese[F]=1,150,20,FL2MI,phone_decay-0.5,,1.0,39,tf_idf_2gram,...,cosine,g=accent; gq=accent; qq=accent,640.0/734.3,103.0/113.0,20.18,19.66,19.56,19.8,0.27,"{'ass': 103, 'tam': 9, 'kan': 1}"
22,assamese[F],assamese[F]=1,150,20,GCMI,phone_decay-1.0,,1.0,39,tf_idf_2gram,...,cosine,g=accent; gq=accent; qq=accent,693.2/734.3,114.0/118.0,19.86,20.24,19.84,19.98,0.18,"{'ass': 114, 'tam': 4}"
24,assamese[F],assamese[F]=1,150,20,GCMI,phone_decay-0.2,,1.0,39,tf_idf_2gram,...,cosine,g=accent; gq=accent; qq=accent,478.2/733.6,79.0/105.0,21.01,21.92,21.14,21.36,0.4,"{'ass': 79, 'tam': 18, 'guj': 6, 'kan': 1, 'ma..."
289,gujarati[F],gujarati[F]=1,150,20,FL2MI,phone_decay-0.2,,1.0,39,tf_idf_2gram,...,cosine,g=accent; gq=accent; qq=accent,562.7/728.5,54.0/71.0,10.46,10.5,10.51,10.49,0.02,"{'guj': 54, 'tam': 12, 'kan': 2, 'ass': 2, 'ra..."
318,gujarati[F],gujarati[F]=1,150,20,FL2MI,phone_decay-1.0,,1.0,39,tf_idf_2gram,...,cosine,g=accent; gq=accent; qq=accent,706.7/736.4,80.0/83.0,9.89,10.08,9.93,9.97,0.08,"{'guj': 80, 'tam': 3}"


In [14]:
# Write the current `filtered_df` to CSV, omitting the index column.
# NOTE(review): this cell's saved execution count (14) is lower than that of the
# filter cells placed above it (15, 16), so the file on disk may have been
# produced from a different `filtered_df` — re-run in order to confirm.
filtered_df.to_csv("report-21st-oct2022.csv", index=False)

In [None]:
import matplotlib.pyplot as plt

# One x position per row of `df`; ticks are labelled with the row's etaScale.
x = list(range(1, len(df["etaScale"]) + 1))

plt.figure(figsize=(20, 10))
# Mean WER per row with its standard deviation drawn as red error bars.
plt.errorbar(x, df["WER-mean"], yerr=df["WER-stdev"], marker="o", ecolor="red")
plt.xticks(x, df["etaScale"])
# plt.yscale('log')
plt.show()


In [None]:
import matplotlib.pyplot as plt

# One x position per row of `df`; ticks are labelled with the row's etaScale.
x = list(range(1, len(df["etaScale"]) + 1))

plt.figure(figsize=(20, 10))
# Plain line plot of the mean WER per row.
plt.plot(x, df["WER-mean"])
plt.xticks(x, df["etaScale"])
# plt.yscale('log')
plt.show()


In [None]:
# total selection : 100 100 100 -> 100.00
# total selection duration: 357.0149433106577 357.0149433106577 357.0149433106577 -> 357.01
# speakered selection: 76 76 76 -> 76.00
# speakered duration: 254.74947845804974 254.74947845804974 254.74947845804974 -> 254.75

# all selections: [Counter({'hindi': 76, 'korean': 8, 'spanish': 7, 'arabic': 3, 'chinese': 3, 'vietnamese': 3}), Counter({'hindi': 76, 'korean': 8, 'spanish': 7, 'arabic': 3, 'chinese': 3, 'vietnamese': 3}), Counter({'hindi': 76, 'korean': 8, 'spanish': 7, 'arabic': 3, 'chinese': 3, 'vietnamese': 3})]

# Evaluation Greedy WER: 16.19

# Write the whole `df` to `csv_name`, omitting the index column.
# NOTE(review): in the visible notebook `csv_name` is only assigned in cells
# further down, so on a fresh kernel run top-to-bottom this line would raise
# NameError — confirm the intended cell order.
df.to_csv(csv_name, index=False)


In [None]:
# sample_path = 'Error-Driven-ASR-Personalization/CMU_expts/speaker/hindi/manifests/TSS_output/all/budget_100/target_50/FL1MI/eta_1.0/euclidean/39/stats.txt'
# CMU_expts/speaker_without/ABA/manifests/TSS_output/all/budget_100/target_50/FL1MI/eta_1.0/euclidean/39/run_1/
# budget = 100


# Experiment slice covered by this report (earlier runs used target = 50).
budget, target = 150, 20
ngram = 2
# Previously swept settings, kept for reference: base_eta = "423.28";
# etaScales 0.1-1.0 in steps of 0.1 and 2.0-10.0 in steps of 1.0; features = 'TRILL'.

csv_name = f"mod_report_{budget}_{target}.csv"

# Report columns. The accent/content feature and similarity columns
# (accent_features, content_features, accent_similarity, content_similarity)
# are currently left out.
cols = [
    "speaker", "function", "base_eta", "etaScale", "target",
    "duration", "samples",
    "WER-r1", "WER-r2", "WER-r3", "WER-mean", "WER-stdev",
    "speakers",
]
df = pd.DataFrame(columns=cols)

# Speakers included in this report. Switched off: manipuri_female_english,
# rajasthani_male_english, hindi_male_english, malayalam_male_english,
# tamil_male_english, gujarati_female_english.
speakers = ["assamese_female_english", "kannada_male_english"]


# Walk ./<speaker>/all/budget_<budget>/target_<target>/<function>/<accent_features>/
# <content_features>/<accent_similarity>/<content_similarity>/ and, for every leaf
# holding a stats.txt, append one summary row (selection statistics + WERs) to `df`.
for speaker in speakers:
    # Skip speakers that have no experiments for the configured budget/target.
    if not (pathlib.Path(f"./{speaker}/all/budget_{budget}/").is_dir()):
        continue
    pick_from = "all"  # not read anywhere in this loop
    if not (pathlib.Path(f"./{speaker}/all/budget_{budget}/target_{target}/").is_dir()):
        continue
    for function in get_dirs(f"./{speaker}/all/budget_{budget}/target_{target}/"):
        # NOTE(review): `split_function` is not defined in the visible part of
        # the notebook — confirm it exists, otherwise this raises NameError.
        (func, base_eta, etaScale) = split_function(function)
        for accent_features in get_dirs(
            f"./{speaker}/all/budget_{budget}/target_{target}/{function}/"
        ):
            for content_features in get_dirs(
                f"./{speaker}/all/budget_{budget}/target_{target}/{function}/{accent_features}"
            ):
                for accent_similarity in get_dirs(
                    f"./{speaker}/all/budget_{budget}/target_{target}/{function}/{accent_features}/{content_features}/"
                ):
                    for content_similarity in get_dirs(
                        f"./{speaker}/all/budget_{budget}/target_{target}/{function}/{accent_features}/{content_features}/{accent_similarity}"
                    ):
                        stats_file_path = f"./{speaker}/all/budget_{budget}/target_{target}/{function}/{accent_features}/{content_features}/{accent_similarity}/{content_similarity}/stats.txt"
                        if not (os.path.isfile(stats_file_path)):
                            continue
                        stats_file = open(stats_file_path, "r")
                        lines = stats_file.readlines()
                        # print("lines length ", len(lines), lines)
                        # The first four lines each yield a list of floats via
                        # get_each_run; they are unpacked, in order, as total
                        # selections, total durations, speaker selections and
                        # speaker durations.
                        (
                            total_selections,
                            total_durations,
                            speakered_selections,
                            speakered_durations,
                        ) = map(get_each_run, lines[:4])
                        # print(total_selections, total_durations, speakered_selections, speakered_durations)
                        # Mean of the element-wise speaker/total ratios.
                        sample_frac = mean(
                            [
                                x[0] / x[1]
                                for x in zip(speakered_selections, total_selections)
                            ]
                        )
                        sample_total = mean(total_selections)
                        duration_frac = mean(
                            [
                                x[0] / x[1]
                                for x in zip(speakered_durations, total_durations)
                            ]
                        )
                        duration_total = mean(total_durations)
                        # Rendered as "<speaker part>/<total>" with two decimals.
                        df_duration = "{:.2f}/{:.2f}".format(
                            duration_total * duration_frac, duration_total
                        )
                        df_samples = "{:.2f}/{:.2f}".format(
                            sample_total * sample_frac, sample_total
                        )
                        # The fifth line holds the Counter(...) selection summaries.
                        df_selections = get_selection_counts(lines[4])
                        try:
                            # WER of run_1..run_3, read from each run's test log.
                            wers = [
                                WER_test_file(
                                    get_test_file_from_stats_path(i, stats_file)
                                )
                                for i in range(1, 4)
                            ]
                            df_wer_mean = round(mean(wers), 2)
                            # The variance is rounded to 3 decimals before the
                            # square root is taken.
                            df_wer_stdev = round(variance(wers), 3) ** 0.5
                            # zip() stops at the shorter sequence, so values
                            # beyond the last name in `cols` are dropped.
                            # NOTE(review): DataFrame.append was removed in
                            # pandas 2.0 — confirm the pandas version in use.
                            df = df.append(
                                dict(
                                    zip(
                                        cols,
                                        [
                                            speaker,
                                            func,
                                            base_eta,
                                            etaScale,
                                            target,
                                            # accent_features,
                                            # content_features,
                                            # accent_similarity,
                                            # content_similarity,
                                            df_duration,
                                            df_samples,
                                        ]
                                        + wers
                                        + [df_wer_mean, df_wer_stdev]
                                        + df_selections,
                                    )
                                ),
                                ignore_index=True,
                            )
                        except:
                            # Bare except: any failure in the try block (WER
                            # parsing, statistics, or the append itself) is
                            # reported with this same message.
                            #                     continue
                            print(
                                "no WER's in file",
                                get_test_file_from_stats_path(1, stats_file),
                            )
                            # These fallbacks are not written to `df`: the append
                            # that would use them is commented out below.
                            wers = [0, 0, 0]
                            df_wer_mean = 0
                            df_wer_stdev = 0
                        # df = df.append(
                        #     dict(
                        #         zip(
                        #             cols,
                        #             [
                        #                 speaker,
                        #                 func,
                        #                 base_eta,
                        #                 etaScale,
                        #                 target,
                        #                 # accent_features,
                        #                 # content_features,
                        #                 # accent_similarity,
                        #                 # content_similarity,
                        #                 df_duration,
                        #                 df_samples,
                        #             ]
                        #             + wers
                        #             + [df_wer_mean, df_wer_stdev]
                        #             + df_selections,
                        #         )
                        #     ),
                        #     ignore_index=True,
                        # )
                        stats_file.close()
# Order the report by speaker, function, base eta and eta scale (ascending),
# renumber the rows, and show it. The accent/content feature columns are not
# part of the sort key.
df = df.sort_values(
    by=["speaker", "function", "base_eta", "etaScale"],
    ascending=True,
    ignore_index=True,
)
display(df)

In [None]:
def get_unique_percent(json_file):
    """Return the fraction of distinct lines in ``json_file``.

    Lines are compared as raw text (including their line endings).

    Args:
        json_file: Path of the file to inspect.

    Returns:
        ``distinct lines / total lines`` as a float in (0, 1], or ``nan`` when
        the file has no lines (the ratio is undefined; previously this raised
        ``ZeroDivisionError``).
    """
    with open(json_file) as file:
        lines = file.readlines()
    if not lines:
        return float("nan")
    return len(set(lines)) / len(lines)


# sample_path = 'Error-Driven-ASR-Personalization/CMU_expts/speaker/hindi/manifests/TSS_output/all/budget_100/target_50/FL1MI/eta_1.0/euclidean/39/stats.txt'
# CMU_expts/speaker_without/ABA/manifests/TSS_output/all/budget_100/target_50/FL1MI/eta_1.0/euclidean/39/run_1/
# budget = 100

## This cell is for the duplication report results


# Experiment slice covered by the duplication report (earlier runs used target = 50).
budget, target = 150, 20
ngram = 2
# Previously swept settings, kept for reference: base_eta = "423.28";
# etaScales 0.1-1.0 in steps of 0.1 and 2.0-10.0 in steps of 1.0; features = 'TRILL'.

csv_name = f"mod_report_{budget}_{target}.csv"

# Report columns. The accent/content feature, similarity and kernel columns
# (accent_features, content_features, accent_similarity, content_similarity,
# g_kernel, gq_kernel, qq_kernel) are currently left out.
cols = [
    "speaker", "function", "etaScale", "target",
    "duration", "samples",
    "WER-r1", "WER-r2", "WER-r3", "WER-mean", "WER-stdev",
    "speakers",
    "unique_percent",
]
df = pd.DataFrame(columns=cols)

# All eight speakers are included in this report.
speakers = [
    f"{accent}_{gender}_english"
    for accent, gender in (
        ("assamese", "female"),
        ("manipuri", "female"),
        ("kannada", "male"),
        ("rajasthani", "male"),
        ("hindi", "male"),
        ("malayalam", "male"),
        ("tamil", "male"),
        ("gujarati", "female"),
    )
]

for speaker in speakers:
    if not (pathlib.Path(f"./{speaker}/all/budget_{budget}/").is_dir()):
        continue
    pick_from = "all"
    if not (pathlib.Path(f"./{speaker}/all/budget_{budget}/target_{target}/").is_dir()):
        continue
    for function in get_dirs(f"./{speaker}/all/budget_{budget}/target_{target}/"):
        if(len(function.split('_')) != 3): continue
        # print(function, function.split('_'), len(function.split('_')))
        (func, etaScale) = (function.split('_')[0], function.split('_')[2])
        for accent_features in get_dirs(
            f"./{speaker}/all/budget_{budget}/target_{target}/{function}/"
        ):
            if not accent_features.endswith("_3rep"):
                continue
            for content_features in get_dirs(
                f"./{speaker}/all/budget_{budget}/target_{target}/{function}/{accent_features}"
            ):
                for kernel_type in get_dirs(
                    f"./{speaker}/all/budget_{budget}/target_{target}/{function}/{accent_features}/{content_features}/"
                ):
                    for accent_similarity in get_dirs(
                        f"./{speaker}/all/budget_{budget}/target_{target}/{function}/{accent_features}/{content_features}/{kernel_type}"
                    ):
                        for content_similarity in get_dirs(
                            f"./{speaker}/all/budget_{budget}/target_{target}/{function}/{accent_features}/{content_features}/{kernel_type}/{accent_similarity}"
                        ):
                            json_file = f"./{speaker}/all/budget_{budget}/target_{target}/{function}/{accent_features}/{content_features}/{kernel_type}/{accent_similarity}/{content_similarity}/train.json"
                            unique_percent = get_unique_percent(json_file)
                            stats_file_path = f"./{speaker}/all/budget_{budget}/target_{target}/{function}/{accent_features}/{content_features}/{kernel_type}/{accent_similarity}/{content_similarity}/stats.txt"
                            if not (os.path.isfile(stats_file_path)):
                                continue
                            stats_file = open(stats_file_path, "r")
                            lines = stats_file.readlines()
                            # print("lines length ", len(lines), lines)
                            (
                                total_selections,
                                total_durations,
                                speakered_selections,
                                speakered_durations,
                            ) = map(get_each_run, lines[:4])
                            # print(total_selections, total_durations, speakered_selections, speakered_durations)
                            sample_frac = mean(
                                [
                                    x[0] / x[1]
                                    for x in zip(speakered_selections, total_selections)
                                ]
                            )
                            sample_total = mean(total_selections)
                            duration_frac = mean(
                                [
                                    x[0] / x[1]
                                    for x in zip(speakered_durations, total_durations)
                                ]
                            )
                            duration_total = mean(total_durations)
                            df_duration = "{:.2f}/{:.2f}".format(
                                duration_total * duration_frac, duration_total
                            )
                            df_samples = "{:.2f}/{:.2f}".format(
                                sample_total * sample_frac, sample_total
                            )
                            df_selections = get_selection_counts(lines[4])

                            wers = []
                            for i in range(1, 4):
                                try:
                                    wers.append(WER_test_file(get_test_file_from_stats_path(i, stats_file)))
                                except:
                                    print(
                                        "no WER's in file",
                                        get_test_file_from_stats_path(1, stats_file),
                                    )
                            
                            df_wer_mean = round(inf if len(wers) == 0  else mean(wers), 2)
                            df_wer_stdev = round(inf if len(wers) <= 1 else variance(wers), 3) ** 0.5
                            while(len(wers)<3): wers.append(0)
                            # print(wers, speaker, func)
                            
                            df = df.append(
                                dict(
                                    zip(
                                        cols,
                                        [
                                            speaker,
                                            func,
                                            etaScale,
                                            target,
                                            # accent_features,
                                            # content_features,
                                            # accent_similarity,
                                            # content_similarity,
                                            df_duration,
                                            df_samples,
                                        ]
                                        + wers
                                        + [df_wer_mean, df_wer_stdev]
                                        + df_selections
                                        + [unique_percent],
                                    )
                                ),
                                ignore_index=True,
                            )
                                # wers = [0, 0, 0]
                                # df_wer_mean = 0
                                # df_wer_stdev = 0
                            # df = df.append(
                            #     dict(
                            #         zip(
                            #             cols,
                            #             [
                            #                 speaker,
                            #                 func,
                            #                 base_eta,
                            #                 etaScale,
                            #                 target,
                            #                 # accent_features,
                            #                 # content_features,
                            #                 # accent_similarity,
                            #                 # content_similarity,
                            #                 df_duration,
                            #                 df_samples,
                            #             ]
                            #             + wers
                            #             + [df_wer_mean, df_wer_stdev]
                            #             + df_selections,
                            #         )
                            #     ),
                            #     ignore_index=True,
                            # )
                            stats_file.close()
# Order the report rows by speaker, then selection function, then eta scale,
# renumbering the index, and render the resulting table.
df = df.sort_values(
    ["speaker", "function", "etaScale"], ascending=True, ignore_index=True
)
display(df)


In [None]:
# Save the report; the DataFrame index is written as the first CSV column.
df.to_csv("mod_rep_csv_150_20.csv", index=True)

In [None]:
def get_unique_percent(json_file):
    """Return the fraction of distinct lines in a text file.

    Despite the name, the result is a ratio in [0, 1], not a percentage.

    Args:
        json_file: Path of the file to inspect; each line counts as one entry.

    Returns:
        ``distinct_lines / total_lines`` as a float, or ``nan`` when the file
        has no lines (the ratio is undefined there).

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(json_file) as file:
        # Compare line contents without the terminator so that a final line
        # lacking a trailing newline still matches identical earlier lines.
        lines = [line.rstrip("\n") for line in file]
    if not lines:
        return float("nan")
    return len(set(lines)) / len(lines)


# sample_path = 'Error-Driven-ASR-Personalization/CMU_expts/speaker/hindi/manifests/TSS_output/all/budget_100/target_50/FL1MI/eta_1.0/euclidean/39/stats.txt'
# CMU_expts/speaker_without/ABA/manifests/TSS_output/all/budget_100/target_50/FL1MI/eta_1.0/euclidean/39/run_1/
# budget = 100

## This cell is for the duplication report results


# Which budget_<budget>/target_<target> result directories the report scans.
budget, target = 150, 20
ngram = 2  # not referenced by the report code visible in this notebook section
csv_name = "mod_report_{}_{}.csv".format(budget, target)

# Report columns in output order: experiment identity, duration/sample
# summaries, the three per-run WERs with their aggregates, selection info.
cols = (
    ["speaker", "function", "etaScale", "target"]
    + ["duration", "samples"]
    + ["WER-r1", "WER-r2", "WER-r3", "WER-mean", "WER-stdev"]
    + ["speakers", "unique_percent"]
)
df = pd.DataFrame(columns=cols)

# Speaker directories (relative to the working directory) to include.
speakers = [
    "assamese_female_english", "manipuri_female_english",
    "kannada_male_english", "rajasthani_male_english",
    "hindi_male_english", "malayalam_male_english",
    "tamil_male_english", "gujarati_female_english",
]

# Build one report row for every experiment directory that holds a stats.txt.
# Layout walked below, relative to the working directory:
#   ./<speaker>/all/budget_<budget>/target_<target>/<function>/<accent_features>/
#       <content_features>/<kernel_type>/<accent_similarity>/<content_similarity>/
pick_from = "all"
# Rows are collected in a list and converted once after the walk:
# DataFrame.append was removed in pandas 2.0 and was quadratic when used per row.
report_rows = []

for speaker in speakers:
    target_dir = f"./{speaker}/{pick_from}/budget_{budget}/target_{target}"
    # is_dir() is also False when the budget directory itself is missing,
    # so this single check covers both levels.
    if not pathlib.Path(target_dir).is_dir():
        continue
    for function in get_dirs(target_dir):
        # Only directory names made of exactly three "_"-separated parts are
        # reported; the first part is the function, the third the eta scale.
        name_parts = function.split("_")
        if len(name_parts) != 3:
            continue
        func, etaScale = name_parts[0], name_parts[2]
        function_dir = f"{target_dir}/{function}"
        for accent_features in get_dirs(function_dir):
            accent_dir = f"{function_dir}/{accent_features}"
            for content_features in get_dirs(accent_dir):
                content_dir = f"{accent_dir}/{content_features}"
                for kernel_type in get_dirs(content_dir):
                    kernel_dir = f"{content_dir}/{kernel_type}"
                    for accent_similarity in get_dirs(kernel_dir):
                        similarity_dir = f"{kernel_dir}/{accent_similarity}"
                        for content_similarity in get_dirs(similarity_dir):
                            run_dir = f"{similarity_dir}/{content_similarity}"
                            stats_file_path = f"{run_dir}/stats.txt"
                            if not os.path.isfile(stats_file_path):
                                continue
                            # Read train.json only for directories that are
                            # actually reported (i.e. that have a stats.txt).
                            json_file = f"{run_dir}/train.json"
                            unique_percent = get_unique_percent(json_file)
                            # The handle is closed on exit even if parsing
                            # fails; its .name stays readable afterwards and is
                            # what get_test_file_from_stats_path() consumes.
                            with open(stats_file_path, "r") as stats_file:
                                lines = stats_file.readlines()
                            # First four lines: one list of numbers each.
                            (
                                total_selections,
                                total_durations,
                                speakered_selections,
                                speakered_durations,
                            ) = map(get_each_run, lines[:4])
                            sample_frac = mean(
                                [
                                    picked / total
                                    for picked, total in zip(
                                        speakered_selections, total_selections
                                    )
                                ]
                            )
                            sample_total = mean(total_selections)
                            duration_frac = mean(
                                [
                                    picked / total
                                    for picked, total in zip(
                                        speakered_durations, total_durations
                                    )
                                ]
                            )
                            duration_total = mean(total_durations)
                            # Shown as "<mean total * mean fraction>/<mean total>".
                            df_duration = "{:.2f}/{:.2f}".format(
                                duration_total * duration_frac, duration_total
                            )
                            df_samples = "{:.2f}/{:.2f}".format(
                                sample_total * sample_frac, sample_total
                            )
                            df_selections = get_selection_counts(lines[4])

                            # WER of runs 1..3; a run whose log cannot be used
                            # is reported (with its own path) and left out.
                            wers = []
                            for i in range(1, 4):
                                try:
                                    wers.append(
                                        WER_test_file(
                                            get_test_file_from_stats_path(i, stats_file)
                                        )
                                    )
                                except Exception:
                                    print(
                                        "no WER's in file",
                                        get_test_file_from_stats_path(i, stats_file),
                                    )

                            # inf marks aggregates that cannot be computed.
                            df_wer_mean = round(mean(wers), 2) if wers else inf
                            # Round the standard deviation itself; rounding the
                            # variance first loses precision before the sqrt.
                            df_wer_stdev = (
                                inf
                                if len(wers) <= 1
                                else round(variance(wers) ** 0.5, 3)
                            )
                            # Missing runs show up as 0 in the per-run columns
                            # (padding happens after the aggregates are taken).
                            wers.extend([0] * (3 - len(wers)))

                            # Explicit keys keep every value in its own column
                            # no matter how many Counter(...) matches were found.
                            report_rows.append(
                                {
                                    "speaker": speaker,
                                    "function": func,
                                    "etaScale": etaScale,
                                    "target": target,
                                    "duration": df_duration,
                                    "samples": df_samples,
                                    "WER-r1": wers[0],
                                    "WER-r2": wers[1],
                                    "WER-r3": wers[2],
                                    "WER-mean": df_wer_mean,
                                    "WER-stdev": df_wer_stdev,
                                    "speakers": df_selections[0] if df_selections else None,
                                    "unique_percent": unique_percent,
                                }
                            )

if report_rows:
    new_rows = pd.DataFrame(report_rows, columns=cols)
    # Keep append semantics: rows already present in df stay in front.
    df = new_rows if df.empty else pd.concat([df, new_rows], ignore_index=True)
# Order the report rows by speaker, then selection function, then eta scale,
# renumbering the index, and render the resulting table.
df = df.sort_values(
    ["speaker", "function", "etaScale"], ascending=True, ignore_index=True
)
display(df)


In [None]:
# Save the report without the DataFrame index column.
df.to_csv(path_or_buf="mix_query_set.csv", index=False)

In [None]:
def get_unique_percent(json_file):
    """Return the fraction of distinct lines in a text file.

    Despite the name, the result is a ratio in [0, 1], not a percentage.

    Args:
        json_file: Path of the file to inspect; each line counts as one entry.

    Returns:
        ``distinct_lines / total_lines`` as a float, or ``nan`` when the file
        has no lines (the ratio is undefined there).

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(json_file) as file:
        # Compare line contents without the terminator so that a final line
        # lacking a trailing newline still matches identical earlier lines.
        lines = [line.rstrip("\n") for line in file]
    if not lines:
        return float("nan")
    return len(set(lines)) / len(lines)


# sample_path = 'Error-Driven-ASR-Personalization/CMU_expts/speaker/hindi/manifests/TSS_output/all/budget_100/target_50/FL1MI/eta_1.0/euclidean/39/stats.txt'
# CMU_expts/speaker_without/ABA/manifests/TSS_output/all/budget_100/target_50/FL1MI/eta_1.0/euclidean/39/run_1/
# budget = 100

## This cell is for the duplication report results


# Which budget_<budget>/target_<target> result directories the report scans.
budget, target = 150, 20
ngram = 2  # not referenced by the report code visible in this notebook section
csv_name = "mod_report_{}_{}.csv".format(budget, target)

# Report columns in output order: experiment identity, duration/sample
# summaries, the three per-run WERs with their aggregates, selection info.
cols = (
    ["speaker", "function", "etaScale", "target"]
    + ["duration", "samples"]
    + ["WER-r1", "WER-r2", "WER-r3", "WER-mean", "WER-stdev"]
    + ["speakers", "unique_percent"]
)
df = pd.DataFrame(columns=cols)

# Speaker directories (relative to the working directory) to include.
speakers = [
    "assamese_female_english", "manipuri_female_english",
    "kannada_male_english", "rajasthani_male_english",
    "hindi_male_english", "malayalam_male_english",
    "tamil_male_english", "gujarati_female_english",
]

# Build one report row for every experiment directory that holds a stats.txt,
# restricted to function directories whose eta scale is "1.0".
# Layout walked below, relative to the working directory:
#   ./<speaker>/all/budget_<budget>/target_<target>/<function>/<accent_features>/
#       <content_features>/<kernel_type>/<accent_similarity>/<content_similarity>/
pick_from = "all"
# Rows are collected in a list and converted once after the walk:
# DataFrame.append was removed in pandas 2.0 and was quadratic when used per row.
report_rows = []

for speaker in speakers:
    target_dir = f"./{speaker}/{pick_from}/budget_{budget}/target_{target}"
    # is_dir() is also False when the budget directory itself is missing,
    # so this single check covers both levels.
    if not pathlib.Path(target_dir).is_dir():
        continue
    for function in get_dirs(target_dir):
        # Only directory names made of exactly three "_"-separated parts are
        # considered; the first part is the function, the third the eta scale.
        name_parts = function.split("_")
        if len(name_parts) != 3:
            continue
        func, etaScale = name_parts[0], name_parts[2]
        if etaScale != "1.0":
            continue
        function_dir = f"{target_dir}/{function}"
        for accent_features in get_dirs(function_dir):
            accent_dir = f"{function_dir}/{accent_features}"
            for content_features in get_dirs(accent_dir):
                content_dir = f"{accent_dir}/{content_features}"
                for kernel_type in get_dirs(content_dir):
                    kernel_dir = f"{content_dir}/{kernel_type}"
                    for accent_similarity in get_dirs(kernel_dir):
                        similarity_dir = f"{kernel_dir}/{accent_similarity}"
                        for content_similarity in get_dirs(similarity_dir):
                            run_dir = f"{similarity_dir}/{content_similarity}"
                            stats_file_path = f"{run_dir}/stats.txt"
                            if not os.path.isfile(stats_file_path):
                                continue
                            # Read train.json only for directories that are
                            # actually reported (i.e. that have a stats.txt).
                            json_file = f"{run_dir}/train.json"
                            unique_percent = get_unique_percent(json_file)
                            # The handle is closed on exit even if parsing
                            # fails; its .name stays readable afterwards and is
                            # what get_test_file_from_stats_path() consumes.
                            with open(stats_file_path, "r") as stats_file:
                                lines = stats_file.readlines()
                            # First four lines: one list of numbers each.
                            (
                                total_selections,
                                total_durations,
                                speakered_selections,
                                speakered_durations,
                            ) = map(get_each_run, lines[:4])
                            sample_frac = mean(
                                [
                                    picked / total
                                    for picked, total in zip(
                                        speakered_selections, total_selections
                                    )
                                ]
                            )
                            sample_total = mean(total_selections)
                            duration_frac = mean(
                                [
                                    picked / total
                                    for picked, total in zip(
                                        speakered_durations, total_durations
                                    )
                                ]
                            )
                            duration_total = mean(total_durations)
                            # Shown as "<mean total * mean fraction>/<mean total>".
                            df_duration = "{:.2f}/{:.2f}".format(
                                duration_total * duration_frac, duration_total
                            )
                            df_samples = "{:.2f}/{:.2f}".format(
                                sample_total * sample_frac, sample_total
                            )
                            df_selections = get_selection_counts(lines[4])

                            # WER of runs 1..3; a run whose log cannot be used
                            # is reported (with its own path) and left out.
                            wers = []
                            for i in range(1, 4):
                                try:
                                    wers.append(
                                        WER_test_file(
                                            get_test_file_from_stats_path(i, stats_file)
                                        )
                                    )
                                except Exception:
                                    print(
                                        "no WER's in file",
                                        get_test_file_from_stats_path(i, stats_file),
                                    )

                            # inf marks aggregates that cannot be computed.
                            df_wer_mean = round(mean(wers), 2) if wers else inf
                            # Round the standard deviation itself; rounding the
                            # variance first loses precision before the sqrt.
                            df_wer_stdev = (
                                inf
                                if len(wers) <= 1
                                else round(variance(wers) ** 0.5, 3)
                            )
                            # Missing runs show up as 0 in the per-run columns
                            # (padding happens after the aggregates are taken).
                            wers.extend([0] * (3 - len(wers)))

                            # Explicit keys keep every value in its own column
                            # no matter how many Counter(...) matches were found.
                            report_rows.append(
                                {
                                    "speaker": speaker,
                                    "function": func,
                                    "etaScale": etaScale,
                                    "target": target,
                                    "duration": df_duration,
                                    "samples": df_samples,
                                    "WER-r1": wers[0],
                                    "WER-r2": wers[1],
                                    "WER-r3": wers[2],
                                    "WER-mean": df_wer_mean,
                                    "WER-stdev": df_wer_stdev,
                                    "speakers": df_selections[0] if df_selections else None,
                                    "unique_percent": unique_percent,
                                }
                            )

if report_rows:
    new_rows = pd.DataFrame(report_rows, columns=cols)
    # Keep append semantics: rows already present in df stay in front.
    df = new_rows if df.empty else pd.concat([df, new_rows], ignore_index=True)
# Order the report rows by speaker, then selection function, then eta scale,
# renumbering the index, and render the resulting table.
df = df.sort_values(
    ["speaker", "function", "etaScale"], ascending=True, ignore_index=True
)
display(df)


In [None]:
# Show the path separator of the current platform (os.sep is the same value
# that os.path.sep exposes).
print(os.sep)

In [None]:

#     if not (pathlib.Path(f"./{speaker}/all/budget_{budget}/").is_dir()):
#         continue
#     pick_from = "all"
#     if not (pathlib.Path(f"./{speaker}/all/budget_{budget}/target_{target}/").is_dir()):
#         continue
#     for function in get_dirs(f"./{speaker}/all/budget_{budget}/target_{target}/"):
#         (func, base_eta, etaScale) = split_function(function)
#         for accent_features in get_dirs(
#             f"./{speaker}/all/budget_{budget}/target_{target}/{function}/"
#         ):
#             for content_features in get_dirs(
#                 f"./{speaker}/all/budget_{budget}/target_{target}/{function}/{accent_features}"
#             ):
#                 for accent_similarity in get_dirs(
#                     f"./{speaker}/all/budget_{budget}/target_{target}/{function}/{accent_features}/{content_features}/"
#                 ):
#                     for content_similarity in get_dirs(
#                         f"./{speaker}/all/budget_{budget}/target_{target}/{function}/{accent_features}/{content_features}/{accent_similarity}"
#                     ):
#                         stats_file_path = f"./{speaker}/all/budget_{budget}/target_{target}/{function}/{accent_features}/{content_features}/{accent_similarity}/{content_similarity}/stats.txt"
#                         if not (os.path.isfile(stats_file_path)):
#                             continue
#                         stats_file = open(stats_file_path, "r")
#                         lines = stats_file.readlines()
#                         # print("lines length ", len(lines), lines)
#                         (
#                             total_selections,
#                             total_durations,
#                             speakered_selections,
#                             speakered_durations,
#                         ) = map(get_each_run, lines[:4])
#                         # print(total_selections, total_durations, speakered_selections, speakered_durations)
#                         sample_frac = mean(
#                             [
#                                 x[0] / x[1]
#                                 for x in zip(speakered_selections, total_selections)
#                             ]
#                         )
#                         sample_total = mean(total_selections)
#                         duration_frac = mean(
#                             [
#                                 x[0] / x[1]
#                                 for x in zip(speakered_durations, total_durations)
#                             ]
#                         )
#                         duration_total = mean(total_durations)
#                         df_duration = "{:.2f}/{:.2f}".format(
#                             duration_total * duration_frac, duration_total
#                         )
#                         df_samples = "{:.2f}/{:.2f}".format(
#                             sample_total * sample_frac, sample_total
#                         )
#                         df_selections = get_selection_counts(lines[4])
#                         try:
#                             wers = [
#                                 WER_test_file(
#                                     get_test_file_from_stats_path(i, stats_file)
#                                 )
#                                 for i in range(1, 4)
#                             ]
#                             df_wer_mean = round(mean(wers), 2)
#                             df_wer_stdev = round(variance(wers), 3) ** 0.5
#                             df = df.append(
#                                 dict(
#                                     zip(
#                                         cols,
#                                         [
#                                             speaker,
#                                             func,
#                                             base_eta,
#                                             etaScale,
#                                             target,
#                                             # accent_features,
#                                             # content_features,
#                                             # accent_similarity,
#                                             # content_similarity,
#                                             df_duration,
#                                             df_samples,
#                                         ]
#                                         + wers
#                                         + [df_wer_mean, df_wer_stdev]
#                                         + df_selections,
#                                     )
#                                 ),
#                                 ignore_index=True,
#                             )
#                         except:
#                             #                     continue
#                             print(
#                                 "no WER's in file",
#                                 get_test_file_from_stats_path(1, stats_file),
#                             )
#                             wers = [0, 0, 0]
#                             df_wer_mean = 0
#                             df_wer_stdev = 0
#                         # df = df.append(
#                         #     dict(
#                         #         zip(
#                         #             cols,
#                         #             [
#                         #                 speaker,
#                         #                 func,
#                         #                 base_eta,
#                         #                 etaScale,
#                         #                 target,
#                         #                 # accent_features,
#                         #                 # content_features,
#                         #                 # accent_similarity,
#                         #                 # content_similarity,
#                         #                 df_duration,
#                         #                 df_samples,
#                         #             ]
#                         #             + wers
#                         #             + [df_wer_mean, df_wer_stdev]
#                         #             + df_selections,
#                         #         )
#                         #     ),
#                         #     ignore_index=True,
#                         # )
#                         stats_file.close()
# df = df.sort_values(
#     by=[
#         "speaker",
#         # "accent_features",
#         # "content_features",
#         "function",
#         "base_eta",
#         "etaScale",
#     ],
#     ascending=True,
#     ignore_index=True,
# )
# display(df)


In [None]:
# Speaker directories whose selection manifests are inspected.
speakers = [
    "assamese_female_english", "manipuri_female_english",
    "kannada_male_english", "rajasthani_male_english",
    "hindi_male_english", "malayalam_male_english",
    "tamil_male_english", "gujarati_female_english",
]

# Print "<speaker> <number of lines in ./<speaker>/selection.json>" per speaker.
for speaker in speakers:
    selection_json = f"./{speaker}/selection.json"
    # The name is printed before the file is opened, so it is visible even if
    # opening the manifest fails.
    print(speaker, end=" ")
    with open(selection_json) as manifest:
        line_count = sum(1 for _ in manifest)
    print(line_count)
    