In [12]:
import glob
import json
import os
import warnings
from collections import Counter
from typing import List, Dict

import pandas as pd

warnings.simplefilter(action='ignore', category=FutureWarning)

In [13]:
# Never truncate the content of wide columns when a frame is displayed.
pd.options.display.max_colwidth = None

def df_to_latex(df: pd.DataFrame) -> None:
    """Print ``df`` as a LaTeX table, leaving out the index column."""
    latex_table = df.to_latex(index=False)
    print(latex_table)

def get_module(name, data):
    """Return the first entry of ``data`` whose ``"name"`` value equals ``name``.

    ``next`` is called without a default, so ``StopIteration`` propagates
    when no entry matches.
    """
    matches = (entry for entry in data if name == entry["name"])
    return next(matches)

In [14]:
def count_all_parameters(library_name: str, library_dir: str, files: str) -> pd.DataFrame:
    """Count available, set, default and customized parameters of one library.

    Every project file matched by ``files`` is scanned for usages recorded
    under ``library_name``. Usages whose class is listed in the library
    description at ``library_dir`` contribute to the counters; all other
    usages are ignored.

    Args:
        library_name: Key under which a project file stores the usages of
            the library (e.g. ``"sklearn"``).
        library_dir: Path to the JSON file describing the library (a file,
            despite the parameter name). It holds a list of entries, each
            with a ``"name"`` and a ``"params"`` mapping from parameter name
            to default value.
        files: Glob pattern selecting the project statistics JSON files.

    Returns:
        A one-row DataFrame with the columns ``Library``, ``Available``,
        ``Set``, ``Default`` and ``Custom``.
    """
    with open(library_dir, "r", encoding="utf-8") as library_file:
        library_data = json.load(library_file)

    # Index the library entries once instead of scanning the list for every
    # usage. ``setdefault`` keeps the first entry when a name occurs twice.
    modules_by_name = {}
    for module in library_data:
        modules_by_name.setdefault(module["name"], module)

    total_params_set = 0
    total_params_available = 0
    default_params = 0
    customized_params = 0

    for project in glob.glob(files):
        with open(project, "r", encoding="utf-8") as project_file:
            project_data = json.load(project_file)

        for file_data in project_data.values():
            if library_name not in file_data:
                continue

            for key, data in file_data[library_name].items():
                # Only keys starting with an uppercase letter are treated as
                # class usages; slicing keeps an empty key from raising.
                if not key[:1].isupper():
                    continue

                # The class name is the key without its last "_"-separated
                # part (presumably a usage suffix -- TODO confirm).
                class_name = key.rsplit("_", 1)[0]
                if class_name not in modules_by_name:
                    continue

                library_module_params = modules_by_name[class_name]["params"]
                total_params_available += len(library_module_params)

                for name, value in data.items():
                    # These two entries are bookkeeping, not parameters.
                    if name in ("variable", "params"):
                        continue

                    total_params_set += 1
                    # A parameter counts as default only if the library knows
                    # it and the set value equals the documented default
                    # (compared as text, with single quotes stripped).
                    is_default = (
                        name in library_module_params
                        and str(library_module_params[name]).replace("'", "") == value["value"]
                    )
                    if is_default:
                        default_params += 1
                    else:
                        customized_params += 1

    assert total_params_set == default_params + customized_params

    return pd.DataFrame(
        {
            "Library": [library_name],
            "Available": [total_params_available],
            "Set": [total_params_set],
            "Default": [default_params],
            "Custom": [customized_params],
        }
    )

# Count params of all methods: one row per library over every statistics file
df_sklearn = count_all_parameters(
    library_name="sklearn",
    library_dir="../modules/sklearn_estimators.json",
    files="../data/statistics/*",
)
df_tf = count_all_parameters(
    library_name="tensorflow",
    library_dir="../modules/tensorflow_optimizer.json",
    files="../data/statistics/*",
)
df_pytorch = count_all_parameters(
    library_name="torch",
    library_dir="../modules/torch_optimizer.json",
    files="../data/statistics/*",
)

# Stack the per-library rows into one table and print it as LaTeX.
df_all = pd.concat([df_sklearn, df_tf, df_pytorch])
df_to_latex(df_all)

\begin{tabular}{lrrrr}
\toprule
   Library &  Available &  Set &  Default &  Custom \\
\midrule
   sklearn &      11573 & 1992 &      360 &    1632 \\
tensorflow &       4446 &  415 &       41 &     374 \\
     torch &      25644 & 6756 &      322 &    6434 \\
\bottomrule
\end{tabular}



In [15]:
def count_ml_method_params(
    library_name: str,
    library_dir: str,
    files: List[str],
    statistics_dir: str = "../data/statistics/",
) -> pd.DataFrame:
    """Count parameters of one library over an explicit list of project files.

    Each name in ``files`` is opened inside ``statistics_dir`` and scanned
    for usages recorded under ``library_name``. Usages whose class is listed
    in the library description at ``library_dir`` contribute to the counters;
    all other usages are ignored.

    Args:
        library_name: Key under which a project file stores the usages of
            the library (e.g. ``"sklearn"``).
        library_dir: Path to the JSON file describing the library (a file,
            despite the parameter name). It holds a list of entries, each
            with a ``"name"`` and a ``"params"`` mapping from parameter name
            to default value.
        files: File names of the project statistics JSON files to scan. Any
            iterable of names works; the function only iterates over it.
        statistics_dir: Directory the names in ``files`` are resolved
            against. Defaults to the previously hard-coded location.

    Returns:
        A one-row DataFrame with the columns ``Library``, ``Available``,
        ``Set``, ``Default`` and ``Custom``.
    """
    with open(library_dir, "r", encoding="utf-8") as library_file:
        library_data = json.load(library_file)

    # Index the library entries once instead of scanning the list for every
    # usage. ``setdefault`` keeps the first entry when a name occurs twice.
    modules_by_name = {}
    for module in library_data:
        modules_by_name.setdefault(module["name"], module)

    total_params_set = 0
    total_params_available = 0
    default_params = 0
    customized_params = 0

    for project in files:
        project_path = os.path.join(statistics_dir, project)
        with open(project_path, "r", encoding="utf-8") as project_file:
            project_data = json.load(project_file)

        for file_data in project_data.values():
            if library_name not in file_data:
                continue

            for key, data in file_data[library_name].items():
                # Only keys starting with an uppercase letter are treated as
                # class usages; slicing keeps an empty key from raising.
                if not key[:1].isupper():
                    continue

                # The class name is the key without its last "_"-separated
                # part (presumably a usage suffix -- TODO confirm).
                class_name = key.rsplit("_", 1)[0]
                if class_name not in modules_by_name:
                    continue

                library_module_params = modules_by_name[class_name]["params"]
                total_params_available += len(library_module_params)

                for name, value in data.items():
                    # These two entries are bookkeeping, not parameters.
                    if name in ("variable", "params"):
                        continue

                    total_params_set += 1
                    # A parameter counts as default only if the library knows
                    # it and the set value equals the documented default
                    # (compared as text, with single quotes stripped).
                    is_default = (
                        name in library_module_params
                        and str(library_module_params[name]).replace("'", "") == value["value"]
                    )
                    if is_default:
                        default_params += 1
                    else:
                        customized_params += 1

    assert total_params_set == default_params + customized_params

    return pd.DataFrame(
        {
            "Library": [library_name],
            "Available": [total_params_available],
            "Set": [total_params_set],
            "Default": [default_params],
            "Custom": [customized_params],
        }
    )


# Count params of ML methods: only the project files listed in
# repos_hyperparameter_tuning.json are scanned.
with open("../data/repos_hyperparameter_tuning.json", encoding="utf-8") as src:
    repos_files = json.load(src)

df_sklearn = count_ml_method_params(
    library_name="sklearn",
    library_dir="../modules/sklearn_estimators.json",
    files=repos_files,
)
df_tf = count_ml_method_params(
    library_name="tensorflow",
    library_dir="../modules/tensorflow_optimizer.json",
    files=repos_files,
)
df_pytorch = count_ml_method_params(
    library_name="torch",
    library_dir="../modules/torch_optimizer.json",
    files=repos_files,
)

# Stack the per-library rows into one table and print it as LaTeX.
df_all = pd.concat([df_sklearn, df_tf, df_pytorch])
df_to_latex(df=df_all)

\begin{tabular}{lrrrr}
\toprule
   Library &  Available &  Set &  Default &  Custom \\
\midrule
   sklearn &       3558 &  622 &      120 &     502 \\
tensorflow &       1547 &  157 &       20 &     137 \\
     torch &       6801 & 1753 &       72 &    1681 \\
\bottomrule
\end{tabular}

