In [8]:
import json
import glob
import pandas as pd
import warnings
from collections import Counter
from typing import List

# Never truncate long cell contents when a DataFrame is displayed.
pd.options.display.max_colwidth = None

# Suppress FutureWarning messages for the rest of the session.
warnings.simplefilter("ignore", FutureWarning)

# Load the repository file listing used by the hyperparameter-tuning analysis.
with open("../data/repos_hyperparameter_tuning.json", encoding="utf-8") as src:
    repo_files = json.load(src)


def df_to_latex(df: pd.DataFrame) -> None:
    """Print *df* rendered as a LaTeX table, leaving out the index column."""
    latex_table = df.to_latex(index=False)
    print(latex_table)

def _load_json(path: str):
    """Parse and return the UTF-8 encoded JSON document stored at *path*."""
    with open(path, "r", encoding="utf-8") as handle:
        return json.load(handle)


def _collect_usages(library_name: str, known_classes, project_dir: str) -> dict:
    """Group the recorded usages of known classes of *library_name* by class name.

    Every file matched by the glob pattern *project_dir* is parsed exactly
    once.  The returned dict maps a class name to the list of usage records
    found for it; keys appear in first-seen order.
    """
    usages = {}
    # glob returns paths in arbitrary order; sorting keeps the first-seen
    # order of classes (the tie-breaker between equal counts) reproducible.
    for project in sorted(glob.glob(project_dir)):
        project_data = _load_json(project)
        for file_data in project_data.values():
            if library_name not in file_data:
                continue
            for usage_key, usage in file_data[library_name].items():
                # Only keys starting with an upper-case letter are treated as
                # class usages; slicing keeps an empty key from raising.
                if not usage_key[:1].isupper():
                    continue
                # The class name is the part of the key before the first "_".
                class_name = usage_key.split("_")[0]
                if class_name in known_classes:
                    usages.setdefault(class_name, []).append(usage)
    return usages


def _is_init_without_params(library_name: str, usage: dict) -> bool:
    """Return True when *usage* counts as an initialisation without params.

    A usage without a "params" entry always counts as "with params".  For
    torch, a "params" entry only counts as "without params" when its "value"
    is the string "default"; for every other library its presence suffices.
    """
    if "params" not in usage:
        return False
    if library_name == "torch":
        return usage["params"]["value"] == "default"
    return True


def _count_options(usage: dict) -> int:
    """Count the entries of *usage* other than "variable", "params" and "class"."""
    return sum(1 for key in usage if key not in ("variable", "params", "class"))


def get_algorithms(library_name: str, library_dir: str, project_dir: str) -> pd.DataFrame:
    """Summarise the five most frequently used classes of one library.

    Args:
        library_name: Key under which the library's usages are stored in the
            per-project statistics (e.g. "sklearn", "tensorflow", "torch").
        library_dir: Path of a JSON file holding a list of class descriptions;
            each entry is read for its "name" and "params" fields.
        project_dir: Glob pattern selecting the per-project statistics files.

    Returns:
        A DataFrame with at most five rows, ordered by descending "Count",
        with the columns "Class", "Count", "Init With Params",
        "Init Without Params", "#HP" (number of entries in the class's
        "params") and "AvgOptionsUsed" (mean number of options per usage,
        rounded to two decimals).  When no usage of a known class is found,
        an empty DataFrame with the same columns is returned.
    """
    column_dtypes = {
        "Class": "object",
        "Count": "int64",
        "Init With Params": "int64",
        "Init Without Params": "int64",
        "#HP": "int64",
        "AvgOptionsUsed": "float64",
    }
    columns = list(column_dtypes)

    # Index the library description by class name; on duplicate names the
    # first entry wins and each usage is still counted only once.
    spec_by_name = {}
    for spec in _load_json(library_dir):
        spec_by_name.setdefault(spec["name"], spec)

    usages = _collect_usages(library_name, spec_by_name, project_dir)
    if not usages:
        # Nothing to rank: hand back a typed, empty frame so callers can still
        # concatenate it with the results for other libraries.
        return pd.DataFrame(
            {name: pd.Series(dtype=dtype) for name, dtype in column_dtypes.items()}
        )

    df_classes = pd.DataFrame(
        {
            "Class": list(usages),
            "Count": [len(records) for records in usages.values()],
        }
    )
    # A stable sort keeps classes with equal counts in first-seen order.
    df_classes = df_classes.sort_values(by="Count", ascending=False, kind="stable")

    init_with_params = []
    init_without_params = []
    hyperparameter_counts = []
    avg_options_used = []

    for class_name in df_classes["Class"]:
        records = usages[class_name]
        without_params = sum(
            1 for usage in records if _is_init_without_params(library_name, usage)
        )
        option_counts = [_count_options(usage) for usage in records]

        init_without_params.append(without_params)
        init_with_params.append(len(records) - without_params)
        hyperparameter_counts.append(len(spec_by_name[class_name]["params"]))
        # records is never empty here: a class is only listed once a usage exists.
        avg_options_used.append(round(sum(option_counts) / len(option_counts), 2))

    df_classes["Init With Params"] = init_with_params
    df_classes["Init Without Params"] = init_without_params
    df_classes["#HP"] = hyperparameter_counts
    df_classes["AvgOptionsUsed"] = avg_options_used

    return df_classes.head(5)[columns]

# All three libraries are summarised from the same per-project statistics files.
STATISTICS_GLOB = "../data/statistics/*"

df_sklearn = get_algorithms(
    library_name="sklearn",
    library_dir="../modules/sklearn_estimators.json",
    project_dir=STATISTICS_GLOB,
)
df_tf = get_algorithms(
    library_name="tensorflow",
    library_dir="../modules/tensorflow_optimizer.json",
    project_dir=STATISTICS_GLOB,
)
df_pytorch = get_algorithms(
    library_name="torch",
    library_dir="../modules/torch_optimizer.json",
    project_dir=STATISTICS_GLOB,
)

# Stack the per-library summaries and print them as one LaTeX table.
df_all = pd.concat([df_sklearn, df_tf, df_pytorch])
df_to_latex(df=df_all)

\begin{tabular}{lrrrrr}
\toprule
                 Class &  Count &  Init With Params &  Init Without Params &  \#HP &  AvgOptionsUsed \\
\midrule
                KMeans &    134 &               134 &                    0 &    9 &            2.28 \\
    LogisticRegression &    124 &                94 &                   30 &   15 &            2.40 \\
      LinearRegression &     85 &                23 &                   62 &    5 &            0.36 \\
                   SVC &     65 &                50 &                   15 &   15 &            1.48 \\
RandomForestClassifier &     58 &                46 &                   12 &   18 &            2.34 \\
                  Adam &    265 &               236 &                   29 &   14 &            1.29 \\
                   SGD &     28 &                25 &                    3 &   13 &            1.79 \\
               RMSprop &     13 &                 6 &                    7 &   14 &            0.77 \\
                Adamax &      