In [None]:
import math
import os
from cmath import inf
from collections import Counter
from glob import glob

import numpy as np
import pandas as pd
import regex as re
from utils.dataset import Dataset, get_accent, get_path, update_config
from utils.utils import read_lines

In [None]:
def WER_test_file(test_file):
    """Read the greedy-decoding WER reported in an inference log.

    The file is scanned for the first line containing
    ``==========>>>>>>Evaluation Greedy WER: `` and the text after the first
    ``": "`` on that line is parsed as a float.

    Args:
        test_file: Path of the log file to scan.

    Returns:
        The WER as a float. ``inf`` is returned (after printing the path and a
        short reason) when the file cannot be read, the value cannot be
        parsed, or no WER line is present, so callers always get a float.
    """
    marker = "==========>>>>>>Evaluation Greedy WER: "
    try:
        # The context manager closes the handle on every exit path,
        # including the early return and any parsing error.
        with open(test_file, "r") as txt_file:
            for line in txt_file:
                if marker in line:
                    return float(line.rstrip().split(": ")[1])
    except (OSError, ValueError, IndexError) as err:
        print(test_file)
        print(f"could not read WER: {err!r}")
        return inf

    # The file was readable but never reported a WER.
    print(test_file)
    print("no 'Evaluation Greedy WER' line found")
    return inf


In [None]:
def CER_test_file(test_file):
    """Read the greedy-decoding CER reported in an inference log.

    The file is scanned for the first line containing
    ``==========>>>>>>Evaluation Greedy CER: `` and the text after the first
    ``": "`` on that line is parsed as a float.

    Args:
        test_file: Path of the log file to scan.

    Returns:
        The CER as a float. ``inf`` is returned (after printing the path and a
        short reason) when the file cannot be read, the value cannot be
        parsed, or no CER line is present, so callers always get a float.
    """
    marker = "==========>>>>>>Evaluation Greedy CER: "
    try:
        # The context manager closes the handle on every exit path,
        # including the early return and any parsing error.
        with open(test_file, "r") as txt_file:
            for line in txt_file:
                if marker in line:
                    return float(line.rstrip().split(": ")[1])
    except (OSError, ValueError, IndexError) as err:
        print(test_file)
        print(f"could not read CER: {err!r}")
        return inf

    # The file was readable but never reported a CER.
    print(test_file)
    print("no 'Evaluation Greedy CER' line found")
    return inf

In [None]:
def get_counts(path, dataset_name):
    """Tally the accents found in a manifest, keyed by 3-character prefix.

    Args:
        path: Manifest path handed to ``read_lines``.
        dataset_name: Dataset identifier forwarded to ``get_accent``.

    Returns:
        The ``str()`` form of a ``Counter`` mapping the first three characters
        of each entry's accent to the number of entries carrying it.
    """
    accent_tally = Counter()
    for entry in read_lines(path):
        accent_tally[get_accent(entry, dataset_name)[:3]] += 1
    return str(accent_tally)

In [None]:
def get_accent_speech(path, accent, dataset_name):
    """Report how much of a manifest's audio belongs to one accent.

    Args:
        path: Manifest path handed to ``read_lines``; every entry must expose
            a ``"duration"`` item.
        accent: Accent label compared against ``get_accent(entry, dataset_name)``.
        dataset_name: Dataset identifier forwarded to ``get_accent``.

    Returns:
        A pair of strings: ``"<accent duration>/<total duration>"`` (both
        rounded to 2 decimals) and the accent's share of the total formatted
        as ``"<percentage>%"`` (rounded to 2 decimals).
    """
    entries = read_lines(path)

    every_duration = [entry["duration"] for entry in entries]
    matching_durations = [
        entry["duration"]
        for entry in entries
        if get_accent(entry, dataset_name) == accent
    ]

    total_duration = round(np.sum(every_duration), 2)
    accented_duration = round(np.sum(matching_durations), 2)

    ratio_text = f"{accented_duration}/{total_duration}"
    percent_text = f"{round(100 * accented_duration/total_duration, 2)}%"
    return ratio_text, percent_text

In [None]:
def collect_global_random(config):
    """Collect per-run metrics for every ``global_random`` experiment.

    Matches ``<FULL_DATASET_PATH>/<accent>/results/budget_<N>/global_random``
    and, for runs 1-3 of each match, reads ``run_<k>/test_infer_log.txt`` and
    ``run_<k>/train.json``.

    Args:
        config: Mapping providing ``"FULL_DATASET_PATH"`` (the root that is
            globbed) and ``"dataset"`` (forwarded to the accent helpers).

    Returns:
        A list with one dict per matched directory holding ``accent``,
        ``budget``, ``method`` and, for each run ``k``, ``WER-k``, ``CER-k``,
        ``duration-k``, ``percent-k`` and ``counter-k``.

    Raises:
        AssertionError: If a matched path does not follow the expected layout.
        Errors raised while reading a run's ``train.json`` propagate.
    """
    root = config["FULL_DATASET_PATH"]
    pattern = os.path.join(root, "*", "results", "budget_*", "global_random")

    expt_results = []
    for setting in glob(pattern):
        # Parse by path component. Stripping each leading component with
        # str.replace would also delete the same text wherever it reappears
        # later in the path.
        parts = os.path.relpath(setting, root).split(os.path.sep)
        accent, results_dir, budget_dir, method = parts[:4]
        assert results_dir == "results"
        assert method == "global_random"

        result = {
            "accent": accent,
            "budget": budget_dir.replace("budget_", ""),
            "method": method,
        }

        for run in range(1, 4):
            run_dir = os.path.join(setting, f"run_{run}")
            test_path = os.path.join(run_dir, "test_infer_log.txt")
            result[f"WER-{run}"] = WER_test_file(test_path)
            result[f"CER-{run}"] = CER_test_file(test_path)
            json_path = os.path.join(run_dir, "train.json")
            result[f"duration-{run}"], result[f"percent-{run}"] = get_accent_speech(
                json_path, result["accent"], dataset_name=config["dataset"]
            )
            result[f"counter-{run}"] = get_counts(json_path, dataset_name=config["dataset"])

        expt_results.append(result)
    return expt_results

In [None]:
def collect_global_entropy(config):
    """Collect per-run metrics for every ``global_entropy`` experiment.

    Matches
    ``<FULL_DATASET_PATH>/<accent>/results/budget_<N>/global_entropy/agg_<X>``
    and, for runs 1-3 of each match, reads ``run_<k>/test_infer_log.txt`` and
    ``run_<k>/train.json``.

    Args:
        config: Mapping providing ``"FULL_DATASET_PATH"`` (the root that is
            globbed) and ``"dataset"`` (forwarded to the accent helpers).

    Returns:
        A list with one dict per matched directory holding ``accent``,
        ``budget``, ``method``, one entry per ``<name>_<value>`` directory
        below the method, and whatever per-run ``WER-k``, ``CER-k``,
        ``duration-k``, ``percent-k``, ``counter-k`` values could be read.

    Raises:
        AssertionError: If a matched path does not follow the expected layout.
    """
    root = config["FULL_DATASET_PATH"]
    pattern = os.path.join(root, "*", "results", "budget_*", "global_entropy", "agg_*")

    expt_results = []
    for setting in glob(pattern):
        # Parse by path component. Stripping each leading component with
        # str.replace would also delete the same text wherever it reappears
        # later in the path (an accent "en" would mangle "global_entropy").
        parts = os.path.relpath(setting, root).split(os.path.sep)
        accent, results_dir, budget_dir, method, *hyper_dirs = parts
        assert results_dir == "results"
        assert method == "global_entropy"

        result = {
            "accent": accent,
            "budget": budget_dir.replace("budget_", ""),
            "method": method,
        }

        for component in hyper_dirs:
            # "<name>_<value>": keep the text between the first two underscores.
            fields = component.split("_")
            result[fields[0]] = fields[1]

        for run in range(1, 4):
            # Best effort: a run that cannot be read keeps whatever was
            # collected so far and the remaining runs are still processed.
            try:
                run_dir = os.path.join(setting, f"run_{run}")
                test_path = os.path.join(run_dir, "test_infer_log.txt")
                result[f"WER-{run}"] = WER_test_file(test_path)
                result[f"CER-{run}"] = CER_test_file(test_path)
                json_path = os.path.join(run_dir, "train.json")
                result[f"duration-{run}"], result[f"percent-{run}"] = get_accent_speech(
                    json_path, result["accent"], dataset_name=config["dataset"]
                )
                result[f"counter-{run}"] = get_counts(json_path, dataset_name=config["dataset"])
            except Exception as err:
                print(f"skipping run_{run} of {setting}: {err!r}")
                continue

        expt_results.append(result)
    return expt_results

In [None]:
def collect_global_SM(config):
    """Collect per-run metrics for every ``global-SM`` experiment.

    Matches ``<FULL_DATASET_PATH>/<accent>/results/budget_<N>/global-SM/
    fxn_*/feature_*/sim_*/lambdaVal_*`` and, for runs 1-3 of each match,
    reads ``run_<k>/test_infer_log.txt`` and ``run_<k>/train.json``.

    Args:
        config: Mapping providing ``"FULL_DATASET_PATH"`` (the root that is
            globbed) and ``"dataset"`` (forwarded to the accent helpers).

    Returns:
        A list with one dict per matched directory holding ``accent``,
        ``budget``, ``method``, one entry per ``<name>_<value>`` directory
        below the method, and, for each run ``k``, ``WER-k``, ``CER-k``,
        ``duration-k``, ``percent-k`` and ``counter-k``.

    Raises:
        AssertionError: If a matched path does not follow the expected layout.
        Errors raised while reading a run's ``train.json`` propagate.
    """
    root = config["FULL_DATASET_PATH"]
    pattern = os.path.join(
        root, "*", "results", "budget_*", "global-SM",
        "fxn_*", "feature_*", "sim_*", "lambdaVal_*",
    )

    expt_results = []
    for setting in glob(pattern):
        # Parse by path component. Stripping each leading component with
        # str.replace would also delete the same text wherever it reappears
        # later in the path.
        parts = os.path.relpath(setting, root).split(os.path.sep)
        accent, results_dir, budget_dir, method, *hyper_dirs = parts
        assert results_dir == "results"
        assert method == "global-SM"

        result = {
            "accent": accent,
            "budget": budget_dir.replace("budget_", ""),
            "method": method,
        }

        for component in hyper_dirs:
            # "<name>_<value>": keep the text between the first two underscores.
            fields = component.split("_")
            result[fields[0]] = fields[1]

        for run in range(1, 4):
            run_dir = os.path.join(setting, f"run_{run}")
            test_path = os.path.join(run_dir, "test_infer_log.txt")
            result[f"WER-{run}"] = WER_test_file(test_path)
            result[f"CER-{run}"] = CER_test_file(test_path)
            json_path = os.path.join(run_dir, "train.json")
            result[f"duration-{run}"], result[f"percent-{run}"] = get_accent_speech(
                json_path, result["accent"], dataset_name=config["dataset"]
            )
            result[f"counter-{run}"] = get_counts(json_path, dataset_name=config["dataset"])

        expt_results.append(result)
    return expt_results

In [None]:
def collect_TSS_mixed_results(config):
    """Collect per-run metrics for every mixed-query ``global-TSS`` experiment.

    Matches ``<FULL_DATASET_PATH>/<accent>/results/budget_<N>/mixed/<query>/
    global-TSS/target_*/fxn_*/feature_*/sim_*/eta_*`` and, for runs 1-3 of
    each match, reads ``run_<k>/test_infer_log.txt`` and ``run_<k>/train.json``.

    Args:
        config: Mapping providing ``"FULL_DATASET_PATH"`` (the root that is
            globbed) and ``"dataset"`` (forwarded to the accent helpers).

    Returns:
        A list with one dict per matched directory holding ``accent``,
        ``budget``, ``query_accent``, ``method``, one entry per
        ``<name>_<value>`` directory below the method, and, for each run
        ``k``, ``WER-k``, ``CER-k``, ``duration-k``, ``percent-k`` and
        ``counter-k``.

    Raises:
        AssertionError: If a matched path does not follow the expected layout.
        Errors raised while reading a run's ``train.json`` propagate.
    """
    root = config["FULL_DATASET_PATH"]
    pattern = os.path.join(
        root, "*", "results", "budget_*", "mixed", "*", "global-TSS",
        "target_*", "fxn_*", "feature_*", "sim_*", "eta_*",
    )

    expt_results = []
    for setting in glob(pattern):
        # Parse by path component. Stripping each leading component with
        # str.replace would also delete the same text wherever it reappears
        # later in the path (e.g. a query directory named like the accent).
        parts = os.path.relpath(setting, root).split(os.path.sep)
        accent, results_dir, budget_dir, mixed_dir, query_accent, method, *hyper_dirs = parts

        result = {"accent": accent}
        assert results_dir == "results"
        result["budget"] = budget_dir.replace("budget_", "")
        assert mixed_dir == "mixed"
        result["query_accent"] = query_accent
        assert method == "global-TSS"
        result["method"] = method

        for component in hyper_dirs:
            # "<name>_<value>": keep the text between the first two underscores.
            fields = component.split("_")
            result[fields[0]] = fields[1]

        for run in range(1, 4):
            run_dir = os.path.join(setting, f"run_{run}")
            test_path = os.path.join(run_dir, "test_infer_log.txt")
            result[f"WER-{run}"] = WER_test_file(test_path)
            result[f"CER-{run}"] = CER_test_file(test_path)
            json_path = os.path.join(run_dir, "train.json")
            result[f"duration-{run}"], result[f"percent-{run}"] = get_accent_speech(
                json_path, result["accent"], dataset_name=config["dataset"]
            )
            result[f"counter-{run}"] = get_counts(json_path, dataset_name=config["dataset"])

        expt_results.append(result)
    return expt_results

In [None]:
def collect_two_stage_TSS_error_results(config):
    """Collect per-run metrics for two-stage (TSS then error-model) experiments.

    Matches ``<FULL_DATASET_PATH>/<accent>/results/budget_<N2>/mixed/<query>/
    budget_<N1>/global-TSS/target_*/fxn_*/feature_*/sim_*/eta_*/budget_*/
    error_model/trainBudget_*/mode_*/pseudoTrans_*`` and, for runs 1-3 of
    each match, reads ``run_<k>/test_infer_log.txt`` and ``run_<k>/train.json``.

    Args:
        config: Mapping providing ``"FULL_DATASET_PATH"`` (the root that is
            globbed) and ``"dataset"`` (forwarded to the accent helpers).

    Returns:
        A list with one dict per matched directory holding ``accent``,
        ``budget(Stage2)``, ``query_accent``, ``budget(Stage1)``, ``method``,
        one entry per ``<name>_<value>`` directory between the method and
        ``error_model``, ``method(Stage2)``, one ``<name>(Stage2)`` entry per
        ``<name>_<value>`` directory after ``error_model``, and whatever
        per-run values could be read.

    Raises:
        AssertionError: If a matched path does not follow the expected layout.
    """
    root = config["FULL_DATASET_PATH"]
    pattern = os.path.join(
        root, "*", "results", "budget_*", "mixed", "*", "budget_*", "global-TSS",
        "target_*", "fxn_*", "feature_*", "sim_*", "eta_*", "budget_*",
        "error_model", "trainBudget_*", "mode_*", "pseudoTrans_*",
    )

    expt_results = []
    for setting in glob(pattern):
        # Parse by path component. The path holds up to three "budget_<N>"
        # directories; stripping the first one with str.replace would also
        # delete any later directory carrying the same value.
        parts = os.path.relpath(setting, root).split(os.path.sep)
        (accent, results_dir, stage2_budget_dir, mixed_dir,
         query_accent, stage1_budget_dir, method, *remaining) = parts

        result = {"accent": accent}
        assert results_dir == "results"
        result["budget(Stage2)"] = stage2_budget_dir.replace("budget_", "")
        assert mixed_dir == "mixed"
        result["query_accent"] = query_accent
        result["budget(Stage1)"] = stage1_budget_dir.split("_")[1]
        assert method == "global-TSS"
        result["method"] = method

        # Everything before "error_model" describes stage 1.
        if "error_model" in remaining:
            boundary = remaining.index("error_model")
        else:
            boundary = len(remaining)

        for component in remaining[:boundary]:
            # "<name>_<value>": keep the text between the first two underscores.
            fields = component.split("_")
            result[fields[0]] = fields[1]

        result["method(Stage2)"] = remaining[boundary] if boundary < len(remaining) else ""

        # Everything after "error_model" describes stage 2.
        for component in remaining[boundary + 1:]:
            if component == "error_model":
                break
            fields = component.split("_")
            result[fields[0] + "(Stage2)"] = fields[1]

        for run in range(1, 4):
            # Best effort: a run that cannot be read keeps whatever was
            # collected so far and the remaining runs are still processed.
            try:
                run_dir = os.path.join(setting, f"run_{run}")
                test_path = os.path.join(run_dir, "test_infer_log.txt")
                result[f"WER-{run}"] = WER_test_file(test_path)
                result[f"CER-{run}"] = CER_test_file(test_path)
                json_path = os.path.join(run_dir, "train.json")
                result[f"duration-{run}"], result[f"percent-{run}"] = get_accent_speech(
                    json_path, result["accent"], dataset_name=config["dataset"]
                )
                result[f"counter-{run}"] = get_counts(json_path, dataset_name=config["dataset"])
            except Exception as err:
                print(f"skipping run_{run} of {setting}: {err!r}")
                continue

        expt_results.append(result)
    return expt_results

In [None]:
def collect_pretrain_results(config):
    """Collect the metrics logged under each accent's ``quartznet_outputs``.

    Matches ``<FULL_DATASET_PATH>/<accent>/quartznet_outputs`` and reads
    ``test_infer_log.txt`` from each match.

    Args:
        config: Mapping providing ``"FULL_DATASET_PATH"`` (the root that is
            globbed).

    Returns:
        A list with one dict per matched directory holding ``accent``,
        ``method`` (always ``"pretrain"``), ``WER`` and ``CER``. Directories
        whose log readers raise are reported and left out.

    Raises:
        AssertionError: If a matched path does not follow the expected layout.
    """
    root = config["FULL_DATASET_PATH"]

    expt_results = []
    for setting in glob(os.path.join(root, "*", "quartznet_outputs")):
        # Parse by path component instead of stripping text with a
        # hard-coded "/" separator.
        parts = os.path.relpath(setting, root).split(os.path.sep)
        accent, outputs_dir = parts[:2]
        assert outputs_dir == "quartznet_outputs"

        result = {"accent": accent, "method": "pretrain"}
        try:
            test_path = os.path.join(setting, "test_infer_log.txt")
            result["WER"] = WER_test_file(test_path)
            result["CER"] = CER_test_file(test_path)
        except Exception as err:
            # Best effort: drop this directory rather than abort the scan.
            print(f"skipping {setting}: {err!r}")
            continue

        expt_results.append(result)
    return expt_results

# config = {}

# config["dataset"] = "L2"
# config["server"] = "SWARA"
# config = update_config(config)
# collect_pretrain_results(config)

In [None]:
def collect_mixed_selections(config, query_accent, query_set):
    """Collect selection statistics for one mixed-accent query.

    Matches ``<FULL_DATASET_PATH>/mixed/<query_accent>/results/budget_<N>/
    global-TSS/target_*/fxn_*/feature_*/sim_*/eta_*`` and emits one record
    per (matched directory, accent in ``query_set``) pair. For runs 1-3 it
    reads ``run_<k>/train.json`` and ``run_<k>/test_infer_log.txt``.

    Args:
        config: Mapping providing ``"FULL_DATASET_PATH"`` (the root that is
            globbed) and ``"dataset"`` (forwarded to the accent helpers).
        query_accent: Name of the query directory below ``mixed``.
        query_set: Iterable of accent labels to report on for every match.

    Returns:
        A list of dicts holding ``accent``, ``query_accent``, ``budget``,
        ``method``, one entry per ``<name>_<value>`` directory below the
        method, and whatever per-run ``duration-k``, ``percent-k``,
        ``counter-k``, ``WER-k``, ``CER-k`` values could be read.

    Raises:
        AssertionError: If a matched path does not follow the expected layout.
    """
    root = config["FULL_DATASET_PATH"]
    pattern = os.path.join(
        root, "mixed", f"{query_accent}", "results", "budget_*", "global-TSS",
        "target_*", "fxn_*", "feature_*", "sim_*", "eta_*",
    )

    expt_results = []
    for setting in glob(pattern):
        # Parse by path component. Stripping each leading component with
        # str.replace would also delete the same text wherever it reappears
        # later in the path (a query named "es" would mangle "results").
        parts = os.path.relpath(setting, root).split(os.path.sep)
        _mixed_dir, query_dir, results_dir, budget_dir, method, *hyper_dirs = parts
        assert results_dir == "results"
        assert method == "global-TSS"

        # The path-derived fields are identical for every accent of the query.
        parsed = {
            "query_accent": query_dir,
            "budget": budget_dir.replace("budget_", ""),
            "method": method,
        }
        for component in hyper_dirs:
            # "<name>_<value>": keep the text between the first two underscores.
            fields = component.split("_")
            parsed[fields[0]] = fields[1]

        for accent in query_set:
            result = {"accent": accent}
            result.update(parsed)

            for run in range(1, 4):
                # Best effort: a run that cannot be read keeps whatever was
                # collected so far and the remaining runs are still processed.
                try:
                    run_dir = os.path.join(setting, f"run_{run}")
                    json_path = os.path.join(run_dir, "train.json")
                    result[f"duration-{run}"], result[f"percent-{run}"] = get_accent_speech(
                        json_path, result["accent"], dataset_name=config["dataset"]
                    )
                    result[f"counter-{run}"] = get_counts(json_path, dataset_name=config["dataset"])
                    test_path = os.path.join(run_dir, "test_infer_log.txt")
                    result[f"WER-{run}"] = WER_test_file(test_path)
                    result[f"CER-{run}"] = CER_test_file(test_path)
                except Exception as err:
                    print(f"skipping run_{run} of {setting}: {err!r}")
                    continue

            expt_results.append(result)
    return expt_results

In [None]:
def main_selection_results(config, output_dir="../../Results/TSS/L2"):
    """Write one selection-statistics CSV per mixed-accent query.

    Records are gathered with ``collect_mixed_selections`` for the
    ``arabic-spanish::1-1`` and ``chinese-vietnamese::1-1`` queries, the
    ``sim`` and ``method`` fields are shortened, and each query's rows are
    pivoted so every accent gets its own group of columns.

    Args:
        config: Configuration mapping forwarded to ``collect_mixed_selections``.
        output_dir: Directory receiving ``arabic-spanish-1-1-selections.csv``
            and ``chinese-vietnamese-1-1-selections.csv``. It must already
            exist. Defaults to the location the notebook has always used.

    Returns:
        None. The results are written to disk.
    """
    queries = (
        ("arabic-spanish::1-1", ["arabic", "spanish"], "arabic-spanish-1-1-selections.csv"),
        ("chinese-vietnamese::1-1", ["chinese", "vietnamese"], "chinese-vietnamese-1-1-selections.csv"),
    )

    selection_results = []
    for query_accent, query_set, _ in queries:
        selection_results.extend(collect_mixed_selections(config, query_accent, query_set))

    for result in selection_results:
        # Best effort: a record lacking the expected string fields is left as is.
        try:
            if "sim" in result:
                result["sim"] = result["sim"][:3]
            if "method(Stage2)" in result:
                result["method"] = result["method"] + (
                    "-" + result["method(Stage2)"] if result["method(Stage2)"] else ""
                )
            result["method"] = result["method"].replace("global", "")
            result["method"] = result["method"].strip("-_ ")
        except (KeyError, TypeError, AttributeError):
            continue

    temp_df = pd.DataFrame(
        selection_results,
        columns=["budget", "query_accent", "method", "fxn", "feature", "sim",
                 "counter-1", "duration-1", "percent-1", "accent"],
    )
    temp_df = temp_df.sort_values(["query_accent", "sim", "fxn", "accent"], ignore_index=True)

    def format_results(query_accent, data):
        """Pivot one query's rows so each accent becomes a column group."""
        masked_data = data.loc[data["query_accent"] == query_accent]
        cols = list(masked_data.columns)
        specific_columns = [
            _ for _ in cols
            if _.startswith("WER") or _.startswith("CER") or _.startswith("duration")
            or _.startswith("percent") or _ == "accent"
        ]
        # Keep the frame's own column order; a set difference would shuffle
        # the index columns from one interpreter run to the next.
        common_columns = [_ for _ in cols if _ not in specific_columns]
        return masked_data.pivot(
            index=common_columns, columns="accent", values=specific_columns
        ).reset_index()

    for query_accent, _, file_name in queries:
        format_results(query_accent, temp_df).to_csv(os.path.join(output_dir, file_name), index=False)


In [None]:
# def format_selection_results(df):

#     def collect_formatted_results(query_set, query_composn, data):
#         query_composn = [str(_) for _ in query_composn]
#         query_accent = "-".join(query_set) + "::" + "-".join(query_composn)
#         mask = ((data["query_accent"] == query_accent) | ((data["query_accent"].isnull()) & (data["accent"].isin(query_set))))
#         masked_data = data.loc[mask]
#         cols = list(masked_data.columns)
#         specific_columns = [_ for _ in cols if _.startswith("WER") or _.startswith("CER") or _.startswith("duration") or _.startswith("percent") or _ == "accent"]
#         common_columns = list(set(cols) - set(specific_columns))
#         # print(common_columns)
#         masked_data = masked_data.pivot(index=common_columns, columns='accent', values=specific_columns).reset_index()
#         return masked_data
    
#     temp_df = df[["accent", "budget", "query_accent", "fxn", "sim","counter-1", "duration-1", "percent-1"]]
#     temp_df = temp_df.sort_values(["query_accent", "sim", "fxn", "accent"], ignore_index=True)
#     collect_formatted_results(["arabic", "spanish"], [1, 1], temp_df).to_csv("../../Results/TSS/L2/arabic-spanish-1-1-selection.csv", index=False)
#     collect_formatted_results(["chinese", "vietnamese"], [1, 1], temp_df).to_csv("../../Results/TSS/L2/chinese-vietnamese-1-1-selection.csv", index=False)

In [None]:
def main_finetune_results(config):
    """Gather every fine-tuning experiment into one sorted DataFrame.

    Records from the random, SM, mixed-TSS, two-stage TSS and entropy
    collectors are concatenated. Each record then gets ``WER-mean``,
    ``WER-std``, ``CER-mean`` and ``CER-std`` (over runs 1-3, rounded to 3
    decimals) when its run values can be aggregated, ``sim`` is cut to three
    characters, and ``method`` is shortened (stage-2 method appended,
    ``"global"`` removed, surrounding ``-``/``_``/spaces stripped).

    Args:
        config: Configuration mapping forwarded to every collector.

    Returns:
        A DataFrame with one row per experiment, sorted by ``accent`` then
        ``method``; an empty DataFrame when no experiment was found.
    """
    collectors = (
        collect_global_random,
        collect_global_SM,
        collect_TSS_mixed_results,
        collect_two_stage_TSS_error_results,
        collect_global_entropy,
    )
    L2_results = []
    for collector in collectors:
        L2_results.extend(collector(config))

    for result in L2_results:
        # Aggregate each metric on its own: a record with a missing or
        # non-numeric run simply gets no mean/std for that metric.
        for metric in ("WER", "CER"):
            try:
                runs = [result[f"{metric}-{run}"] for run in range(1, 4)]
                result[f"{metric}-mean"] = round(np.nanmean(runs), 3)
                result[f"{metric}-std"] = round(np.nanstd(runs), 3)
            except (KeyError, TypeError, ValueError):
                pass

        # Label clean-up is independent of the metrics, so it must not be
        # skipped when the aggregation above failed.
        if "sim" in result:
            result["sim"] = result["sim"][:3]
        if "method" in result:
            method = result["method"]
            if result.get("method(Stage2)"):
                method = method + "-" + result["method(Stage2)"]
            result["method"] = method.replace("global", "").strip("-_ ")

    df = pd.DataFrame(L2_results)
    if df.empty:
        # Nothing to sort; sort_values would fail on the missing columns.
        return df
    df = df.sort_values(["accent", "method"], ignore_index=True)

    return df

In [None]:
def format_finetune_results(df, output_dir="../../Results/TSS/L2"):
    """Write one pivoted fine-tuning CSV per mixed-accent query set.

    For the ``arabic``/``spanish`` and ``chinese``/``vietnamese`` 1-1 queries,
    rows are selected whose ``query_accent`` equals the query name, or whose
    ``query_accent`` is null while ``accent`` belongs to the query set. The
    selection is pivoted so every accent gets its own group of columns.

    Args:
        df: Results frame providing at least ``accent`` and ``query_accent``.
        output_dir: Directory receiving ``arabic-spanish-1-1.csv`` and
            ``chinese-vietnamese-1-1.csv``. It must already exist. Defaults
            to the location the notebook has always used.

    Returns:
        None. The results are written to disk.
    """
    def collect_formatted_results(query_set, query_composn, data):
        """Select one query's rows and pivot them by accent."""
        query_composn = [str(_) for _ in query_composn]
        query_accent = "-".join(query_set) + "::" + "-".join(query_composn)
        mask = (
            (data["query_accent"] == query_accent)
            | ((data["query_accent"].isnull()) & (data["accent"].isin(query_set)))
        )
        masked_data = data.loc[mask]
        cols = list(masked_data.columns)
        specific_columns = [
            _ for _ in cols
            if _.startswith("WER") or _.startswith("CER") or _.startswith("duration")
            or _.startswith("percent") or _ == "accent"
        ]
        # Keep the frame's own column order; a set difference would shuffle
        # the index columns from one interpreter run to the next.
        common_columns = [_ for _ in cols if _ not in specific_columns]
        return masked_data.pivot(
            index=common_columns, columns="accent", values=specific_columns
        ).reset_index()

    collect_formatted_results(["arabic", "spanish"], [1, 1], df).to_csv(
        os.path.join(output_dir, "arabic-spanish-1-1.csv"), index=False
    )
    collect_formatted_results(["chinese", "vietnamese"], [1, 1], df).to_csv(
        os.path.join(output_dir, "chinese-vietnamese-1-1.csv"), index=False
    )

In [None]:
# Notebook-wide run settings. update_config presumably fills in derived
# entries such as "FULL_DATASET_PATH" — confirm in utils.dataset.
config = update_config({"dataset": "L2", "server": "SWARA"})
# print(config)

In [None]:
# Build the combined fine-tuning results table (one row per experiment,
# sorted by accent and method) and display it as the cell output.
df = main_finetune_results(config)
df

In [None]:
# Write the per-query pivoted fine-tuning CSVs derived from df.
format_finetune_results(df)

In [None]:
# Collect the mixed-query selection statistics and write them out as CSVs.
main_selection_results(config)