In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import os
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.manifold import TSNE
from pprint import pprint
from utils.dataset import all_datasets, update_config, BUDGET_TO_DURATION, get_accent
from utils.utils import load_features, read_lines, build_kernel, build_SMI_OBJ, maximise_SMI


In [None]:
def smooth(lst, smoothing_factor=0.1):
    """Exponentially smooth a sequence of numbers.

    The first output equals the first input. Every later output is
    ``smoothing_factor * previous_output + (1 - smoothing_factor) * current_input``,
    so ``smoothing_factor`` is the weight given to the running (already
    smoothed) value: 0 returns the input unchanged, values close to 1 smooth
    heavily.

    Args:
        lst: Sequence of numeric values (anything supporting ``len``,
            indexing and slicing).
        smoothing_factor: Weight of the previous smoothed value.

    Returns:
        A list with the same length as ``lst``; an empty list when ``lst``
        is empty.
    """
    # Guard the empty case explicitly: indexing lst[0] would raise IndexError.
    # len() is used instead of truthiness so array-like inputs also work.
    if len(lst) == 0:
        return []

    ans = [lst[0]]
    for val in lst[1:]:
        # ans[-1] is the running smoothed value produced by the previous step.
        ans.append(smoothing_factor * ans[-1] + (1 - smoothing_factor) * val)
    return ans

In [None]:
def derivative(lst):
    """Return the first differences of ``lst`` as a NumPy array.

    Element ``i`` of the result is ``lst[i + 1] - lst[i]``, so the result has
    one element fewer than the input.
    """
    following = np.array(lst[1:])
    preceding = np.array(lst[:-1])
    return following - preceding
    

In [None]:
def plot_gains(gains, accents):
    """Scatter-plot the raw gains next to their smoothed version.

    ``accents`` is only checked for having the same length as ``gains``; it
    is not drawn. The first 20 first-differences of the smoothed gains are
    printed before the figure is shown.
    """
    assert len(gains) == len(accents)

    smoothed = smooth(gains)

    # Step-to-step change of the smoothed curve; printed only, not plotted.
    slope = derivative(smoothed)
    print(slope[:20])

    positions = list(range(len(gains)))
    series_to_draw = (
        (gains, "original-gains"),
        (smoothed, "smooth-gains"),
    )
    for values, legend_label in series_to_draw:
        plt.scatter(x=positions, y=values, s=1, label=legend_label)
    plt.legend()
    plt.show()

#     features = []
#     labels = []
#     for ind, accent in enumerate(config["all_accents"]):
#         selection_JSON_PATH = os.path.join(
#             config["FULL_DATASET_PATH"], accent, "selection.json"
#         )
#         curr_features = load_features(
#             read_lines(selection_JSON_PATH),
#             config["dataset"],
#             config["FULL_DATASET_PATH"],
#             config["feature_type"],
#         )
#         features.append(curr_features)
#         labels.extend([accent for _ in range(len(curr_features))])

#     features = np.concatenate(features, axis=0)
#     labels = labels

#     feature_cols = [f"dim_{_}" for _ in range(features.shape[1])]
#     print(feature_cols)
#     print(features.shape)
#     # df = pd.DataFrame(features, columns=feature_cols)
#     # df["label"] = labels
#     # perp = 40
#     # iters = 1000
#     # tsne = TSNE(n_components=2, verbose=1, perplexity=perp, n_iter=iters)
#     # tsne_results = tsne.fit_transform(df[feature_cols].values)
#     # df["tsne-2d-one"] = tsne_results[:, 0]
#     # df["tsne-2d-two"] = tsne_results[:, 1]

#     # fig = plt.figure(figsize=(6, 5))
#     # _ax = fig.add_subplot(1, 1, 1)

#     # # palette = np.array()

#     # g = sns.scatterplot(
#     #     x="tsne-2d-one",
#     #     y="tsne-2d-two",
#     #     hue="label",
#     #     palette=sns.color_palette("bright", len(config["all_accents"])),
#     #     data=df,
#     #     # data=df.iloc[list(range(len(X_ground)))],
#     #     legend="full",
#     #     alpha=0.6,
#     #     ax=_ax,
#     # )

#     # g.legend(
#     #     loc="upper left",
#     #     bbox_to_anchor=(1.05, 1),
#     #     prop={"size": 7},
#     #     markerscale=1.1,
#     # )

#     # fig.tight_layout()
#     # plt.show()


In [None]:
def sample_global_TSS(config):
    """Run SMI-based selection over the pooled accents and plot the gains.

    Returns immediately when ``config["sample"]`` is falsy. Otherwise:

    1. The ground set is the concatenation of every accent's
       ``selection.json`` lines (each strided by ``config["downsample_ratio"]``).
    2. The query set is the first ``config["target"]`` lines of the target
       accent's ``seed.json``.
    3. Ground/ground, query/ground and query/query kernels are built with
       ``config["sim"]`` and handed to ``build_SMI_OBJ``.
    4. ``maximise_SMI`` is called with ``config["budget"]``; element 0 of each
       returned entry is used as an index into the ground list and element 1
       as its gain, which is passed to ``plot_gains``.

    Args:
        config: Mapping providing the keys read above plus ``"dataset"``,
            ``"FULL_DATASET_PATH"``, ``"all_accents"``, ``"feature"``,
            ``"target_directory_path"``, ``"target_accent"``, ``"fxn"`` and
            ``"eta"``.

    Returns:
        None. Progress is printed and a figure is shown.
    """
    if not config["sample"]:
        return

    def featurise(lines):
        # Ground and query lines go through the loader with identical settings.
        return load_features(
            lines, config["dataset"], config["FULL_DATASET_PATH"], config["feature"]
        )

    def accent_counts(lines):
        return Counter([get_accent(line, config["dataset"]) for line in lines])

    print("sampling global TSS, target = {}".format(config["target_accent"]))

    # Ground set: every accent's selection file, optionally downsampled.
    ground_list = []
    for accent in config["all_accents"]:
        selection_path = os.path.join(
            config["FULL_DATASET_PATH"], accent, "selection.json"
        )
        accent_lines = read_lines(selection_path)
        ground_list.extend(accent_lines[:: config["downsample_ratio"]])
    ground_features = featurise(ground_list)

    # Query set: leading lines of the target accent's seed file.
    seed_path = os.path.join(
        config["FULL_DATASET_PATH"],
        config["target_directory_path"],
        config["target_accent"],
        "seed.json",
    )
    query_list = read_lines(seed_path)[: config["target"]]
    query_features = featurise(query_list)

    print("ground_list: ", accent_counts(ground_list))
    print("query_list: ", accent_counts(query_list))

    print("Building kernels")
    gg_kernel = build_kernel(ground_features, ground_features, config["sim"])
    print("ground_ground_kernel.shape: ", gg_kernel.shape)
    qg_kernel = build_kernel(query_features, ground_features, config["sim"])
    print("query_ground_kernel.shape: ", qg_kernel.shape)
    qq_kernel = build_kernel(query_features, query_features, config["sim"])
    print("query_query_kernel.shape: ", qq_kernel.shape)

    print("Building SMI objects")
    smi_objective = build_SMI_OBJ(
        ground_ground=gg_kernel,
        query_ground=qg_kernel,
        query_query=qq_kernel,
        fxn=config["fxn"],
        eta=config["eta"],
    )

    print("Maximising SMI objective")
    selection = maximise_SMI(smi_objective, budget=config["budget"])

    # Each entry is indexed as (ground index, gain).
    chosen_lines = [ground_list[entry[0]] for entry in selection]
    chosen_accents = [get_accent(line, config["dataset"]) for line in chosen_lines]
    chosen_gains = [entry[1] for entry in selection]
    plot_gains(chosen_gains, chosen_accents)

In [None]:
# Experiment settings for one selection run. The base values are passed through
# update_config() before use; sample_global_TSS() also reads "all_accents" and
# "FULL_DATASET_PATH", which are not set here -- presumably update_config()
# adds them (TODO confirm).
config = update_config(
    dict(
        dataset="INDIC",
        server="SWARA",  # not read by sample_global_TSS; presumably used by update_config
        sample=True,  # when falsy, sample_global_TSS returns without doing anything
        target_accent="assamese-hindi::1-1",
        feature="MFCC",  # feature type handed to load_features
        target_directory_path="mixed",  # sub-directory that holds the target accent's seed.json
        target=20,  # number of leading seed.json lines used as the query set
        sim="euclidean",  # similarity argument for build_kernel
        fxn="GCMI",  # forwarded to build_SMI_OBJ
        eta=1.0,  # forwarded to build_SMI_OBJ
        budget=20000,  # forwarded to maximise_SMI
        downsample_ratio=1,  # stride over each accent's selection.json lines (1 keeps all)
    )
)
sample_global_TSS(config)

In [None]:
# Scratch list holding the integers 1 through 5.
ls = list(range(1, 6))