In [19]:
import pandas as pd
import json
import glob
from collections import Counter
from typing import List, Dict

In [20]:
# Remove pandas' column-width limit so long cell values are shown in full
# instead of being truncated with an ellipsis.
pd.set_option('display.max_colwidth', None)

def df_to_latex(df: pd.DataFrame) -> None:
    """Print ``df`` rendered as a LaTeX table, leaving out the index column.

    Args:
        df: Frame to render.

    Returns:
        None. The LaTeX source is written to standard output.
    """
    latex_table = df.to_latex(index=False)
    print(latex_table)

In [21]:
def get_library_classes(library_name: str, library_dir: str, project_dir: str) -> pd.DataFrame:
    """Summarise how the analysed projects use the classes of one library.

    Each project statistics file is a JSON object shaped like
    ``{file: {library: {usage_key: {option: ...}}}}``. A usage key whose first
    character is upper case is treated as a class usage, and the class name is
    the part of the key before the first underscore. Usages that contain a
    ``"base_class_0"`` entry are skipped, both for the usage count and for the
    option statistics, so that all columns describe the same set of usages.

    Args:
        library_name: Key under which a file's usages of the library are stored.
        library_dir: Path to the JSON file with the library API data, a list of
            objects that each have a ``"name"`` and a ``"params"`` entry.
        project_dir: Glob pattern matching the project statistics JSON files.

    Returns:
        A DataFrame sorted by descending ``Count`` with the columns

        * ``Class``: class name,
        * ``Count``: number of counted usages,
        * ``#HP``: number of params the API data lists for the class; 0 when
          the class is missing from the API data (a notice is printed),
        * ``AvgOptionsUsed``: mean number of options per usage, rounded to two
          decimals, ignoring the ``"variable"`` and ``"params"`` entries,
        * ``Most Used HP``: the most frequent option name(s), comma separated
          on ties; a ``"params"`` entry is reported as ``"default"`` and
          ``"None"`` is used when no option was recorded.

        If no class usage is found, an empty DataFrame with these columns is
        returned.
    """
    with open(library_dir, "r", encoding="utf-8") as library_file:
        library_data = json.load(library_file)

    # Index the API data by name; the first entry wins, like a first-match search.
    api_by_name = {}
    for api_entry in library_data:
        api_by_name.setdefault(api_entry["name"], api_entry)

    # Read every project file exactly once and group the class usages by name.
    usages_by_class = {}
    for project in glob.glob(project_dir):
        with open(project, "r", encoding="utf-8") as project_file:
            project_data = json.load(project_file)

        for file, file_data in project_data.items():
            if library_name not in file_data:
                continue
            for usage_key, data in file_data[library_name].items():
                # Slicing instead of indexing keeps an empty key from raising.
                if not usage_key[:1].isupper():
                    continue
                if "base_class_0" in data:
                    continue
                class_name = usage_key.split("_")[0]
                usages_by_class.setdefault(class_name, []).append((project, file, data))

    if not usages_by_class:
        # Without any usage there is no "Count" column to sort by.
        return pd.DataFrame(columns=["Class", "Count", "#HP", "AvgOptionsUsed", "Most Used HP"])

    class_counts = Counter({name: len(usages) for name, usages in usages_by_class.items()})
    df_classes = pd.DataFrame.from_dict(class_counts, orient="index").reset_index()
    df_classes = df_classes.rename(columns={'index': 'Class', 0: 'Count'})
    df_classes = df_classes.sort_values(by=['Count'], ascending=False)
    classes = df_classes["Class"].to_list()

    # Number of options the API offers per class. Every class must contribute
    # exactly one value, otherwise the column no longer lines up with the rows.
    api_option_counts = []
    for ml_class in classes:
        api_entry = api_by_name.get(ml_class)
        if api_entry is None:
            print("Could not find: ", ml_class)
            api_option_counts.append(0)
        else:
            api_option_counts.append(len(api_entry["params"]))

    df_classes["#HP"] = api_option_counts

    # Average number of options used per class and the most used option(s).
    avg_class_options = []
    most_used_class_option = []

    for ml_class in classes:
        options_per_usage = []
        option_names = []

        for project, file, data in usages_by_class[ml_class]:
            # Diagnostic output for usages configured only through **kwargs.
            if len(data) == 1 and "**kwargs" in data:
                print(project)
                print(file)
                print(data)

            explicit_options = 0
            for param in data:
                if param == "variable":
                    continue
                if param == "params":
                    option_names.append("default")
                else:
                    explicit_options += 1
                    option_names.append(param)

            options_per_usage.append(explicit_options)

        avg_class_options.append(round(sum(options_per_usage) / len(options_per_usage), 2))

        option_counts = Counter(option_names)
        if option_counts:
            top_count = option_counts.most_common(1)[0][1]
            top_options = [name for name, count in option_counts.most_common() if count == top_count]
            most_used_class_option.append(", ".join(top_options))
        else:
            most_used_class_option.append("None")

    df_classes["AvgOptionsUsed"] = avg_class_options
    df_classes["Most Used HP"] = most_used_class_option

    return df_classes


#df_sklearn_classes = get_library_classes("sklearn", "modules/sklearn_default_values.json" , "statistics/*")
#df_sklearn_classes = df_sklearn_classes[:10]
#df_tf_classes = get_library_classes("tensorflow", "modules/tensorflow_default_values.json" , "statistics/*")
#df_tf_classes = df_tf_classes[:10]
#df_pytorch_classes = get_library_classes("torch", "modules/torch_default_values.json" , "statistics/*")
#df_pytorch_classes = df_pytorch_classes[:10]
#df_all_classes = pd.concat([df_sklearn_classes, df_tf_classes, df_pytorch_classes])

#df_sklearn = get_library_classes("sklearn", "modules/sklearn_ml_algorithms.json" , "statistics/*")
#df_sklearn = df_sklearn[:30]


#df_to_latex(df=df_sklearn)

In [22]:
def get_library_methods(library_name: str, library_dir: str, project_dir: str) -> pd.DataFrame:
    """Summarise how the analysed projects call the methods of one library.

    Each project statistics file is a JSON object shaped like
    ``{file: {library: {usage_key: {arg: ...}}}}``. A usage key whose first
    character is lower case is treated as a method call; the method name is the
    key with its last underscore-separated segment removed.

    Args:
        library_name: Key under which a file's usages of the library are stored.
        library_dir: Path to the JSON file with the library API data, a list of
            objects that each have a ``"name"`` and a ``"params"`` entry.
        project_dir: Glob pattern matching the project statistics JSON files.

    Returns:
        A DataFrame sorted by descending ``Count`` with the columns

        * ``Method``: method name,
        * ``Count``: number of calls found,
        * ``#Args``: number of params the API data lists for the method; 0 when
          the method is missing from the API data (a notice is printed),
        * ``AvgArgsUsed``: mean number of arguments per call, rounded to two
          decimals, ignoring the ``"variable"`` and ``"params"`` entries,
        * ``Most Used Args``: the most frequent argument name(s), comma
          separated on ties; a ``"params"`` entry is reported as ``"default"``
          and ``"None"`` is used when no argument was recorded.

        If no method call is found, an empty DataFrame with these columns is
        returned.
    """
    with open(library_dir, "r", encoding="utf-8") as library_file:
        library_data = json.load(library_file)

    # Index the API data by name; the first entry wins, like a first-match search.
    api_by_name = {}
    for api_entry in library_data:
        api_by_name.setdefault(api_entry["name"], api_entry)

    # Read every project file exactly once and group the calls by method name.
    calls_by_method = {}
    for project in glob.glob(project_dir):
        with open(project, "r", encoding="utf-8") as project_file:
            project_data = json.load(project_file)

        for file_data in project_data.values():
            if library_name not in file_data:
                continue
            for usage_key, call_args in file_data[library_name].items():
                # Slicing instead of indexing keeps an empty key from raising.
                if not usage_key[:1].islower():
                    continue
                method_name = "_".join(usage_key.split("_")[:-1])
                calls_by_method.setdefault(method_name, []).append(call_args)

    if not calls_by_method:
        # Without any call there is no "Count" column to sort by.
        return pd.DataFrame(columns=["Method", "Count", "#Args", "AvgArgsUsed", "Most Used Args"])

    method_counts = Counter({name: len(calls) for name, calls in calls_by_method.items()})
    df_methods = pd.DataFrame.from_dict(method_counts, orient="index").reset_index()
    df_methods = df_methods.rename(columns={'index': 'Method', 0: 'Count'})
    df_methods = df_methods.sort_values(by=['Count'], ascending=False)
    methods = df_methods["Method"].to_list()

    # Number of args that can be set according to the API data.
    api_arg_counts = []
    for method in methods:
        api_entry = api_by_name.get(method)
        if api_entry is None:
            print("Could not find: ", method, library_name)
            api_arg_counts.append(0)
        else:
            api_arg_counts.append(len(api_entry["params"]))

    df_methods["#Args"] = api_arg_counts

    # Average number of args used per method and the most used arg(s).
    avg_method_args = []
    most_used_method_args = []

    for method in methods:
        args_per_call = []
        arg_names = []

        for call_args in calls_by_method[method]:
            explicit_args = 0
            for arg in call_args:
                if arg == "variable":
                    continue
                if arg == "params":
                    arg_names.append("default")
                else:
                    explicit_args += 1
                    arg_names.append(arg)

            args_per_call.append(explicit_args)

        avg_method_args.append(round(sum(args_per_call) / len(args_per_call), 2))

        arg_counts = Counter(arg_names)
        if arg_counts:
            top_count = arg_counts.most_common(1)[0][1]
            top_args = [name for name, count in arg_counts.most_common() if count == top_count]
            most_used_method_args.append(", ".join(top_args))
        else:
            most_used_method_args.append("None")

    df_methods["AvgArgsUsed"] = avg_method_args
    df_methods["Most Used Args"] = most_used_method_args

    return df_methods

#df_sklearn_methods = get_library_methods("sklearn", "modules/sklearn_default_values.json", "statistics/*")
#df_sklearn_methods = df_sklearn_methods[:10]
#df_tf_methods = get_library_methods("tensorflow", "modules/tensorflow_default_values.json", "statistics/*")
#df_tf_methods = df_tf_methods[:10]
#df_torch_methods = get_library_methods("torch", "modules/torch_default_values.json", "statistics/*")
#df_torch_methods = df_torch_methods[:10]

#df_all_methods = pd.concat([df_sklearn_methods, df_tf_methods, df_torch_methods])

#df_to_latex(df=df_all_methods)


In [23]:
def compare_api_and_project_classes(library_names: List, library_data_dirs: List, project_dir: str) -> pd.DataFrame:
    """Compare how many classes each library offers with how many the projects use.

    A name counts as a class when its first character is upper case. In the
    project statistics the class name is the part of the usage key before the
    first underscore, and usages containing a ``"base_class_0"`` entry are
    skipped.

    Args:
        library_names: Library keys to look up in the project statistics.
        library_data_dirs: Paths to the API data JSON files (lists of objects
            with a ``"name"`` entry), in the same order as ``library_names``.
        project_dir: Glob pattern matching the project statistics JSON files.

    Returns:
        A DataFrame with one row per library and the columns ``ML Library``,
        ``API Class Count`` and ``Project Class Count`` (number of distinct
        class names used across all projects).
    """
    api_class_count = []
    for library_data_dir in library_data_dirs:
        with open(library_data_dir, "r", encoding="utf-8") as library_file:
            library_data = json.load(library_file)

        api_class_count.append(
            sum(1 for module in library_data if module["name"][:1].isupper())
        )

    # One set of distinct class names per requested library. Sets give
    # constant-time de-duplication, and a single pass over the project files
    # serves all libraries at once.
    classes_seen = {library_name: set() for library_name in library_names}

    for project in glob.glob(project_dir):
        with open(project, "r", encoding="utf-8") as project_file:
            project_data = json.load(project_file)

        for file_data in project_data.values():
            for library, module_data in file_data.items():
                seen = classes_seen.get(library)
                if seen is None:
                    # Library that was not asked for.
                    continue
                for key, value in module_data.items():
                    if key[:1].isupper() and "base_class_0" not in value:
                        seen.add(key.split("_")[0])

    project_class_count = [len(classes_seen[library_name]) for library_name in library_names]

    return pd.DataFrame(
        {
            "ML Library": list(library_names),
            "API Class Count": api_class_count,
            "Project Class Count": project_class_count,
        }
    )

#library_data_dirs = ["modules/sklearn_default_values.json", "modules/tensorflow_default_values.json",  "modules/torch_default_values.json"]
#library_names = ["sklearn", "tensorflow", "torch"]
#df_class_count = compare_api_and_project_classes(library_names, library_data_dirs, "statistics/*")
#df_class_count.head()
#df_class_count.to_latex(index=False)


In [24]:
def get_hyperparameter_count(library_name: str, library_dir: str, project_dir: str) -> pd.DataFrame:
    """Count how many options set on library classes equal the API default.

    For every class usage of ``library_name`` in the project statistics (usage
    key starting with an upper-case character, no ``"base_class_0"`` entry),
    each option other than ``"variable"`` and ``"params"`` is put into exactly
    one bucket:

    * default: the option exists in the API data and its recorded ``"value"``
      equals the API value,
    * customized: the option exists in the API data with a different value,
    * not matched: the option, or the whole class, is missing from the API data.

    Args:
        library_name: Key under which a file's usages of the library are stored.
        library_dir: Path to the JSON file with the library API data, a list of
            objects that each have a ``"name"`` and a ``"params"`` entry.
        project_dir: Glob pattern matching the project statistics JSON files.

    Returns:
        A one-row DataFrame with the columns ``Library``, ``Total Count``,
        ``Default Count``, ``Customized Count`` and ``Not API Match Count``.
        The total always equals the sum of the three buckets.
    """
    total_count = 0
    default_count = 0
    customized_count = 0
    not_match_api_count = 0

    with open(library_dir, "r", encoding="utf-8") as library_file:
        library_data = json.load(library_file)

    # Index the API data once instead of searching the list for every option;
    # the first entry per name wins, like a first-match search.
    api_by_name = {}
    for api_entry in library_data:
        api_by_name.setdefault(api_entry["name"], api_entry)

    for project in glob.glob(project_dir):
        with open(project, "r", encoding="utf-8") as project_file:
            project_data = json.load(project_file)

        for file_data in project_data.values():
            if library_name not in file_data:
                continue
            for module_name, module_value in file_data[library_name].items():
                # Slicing instead of indexing keeps an empty key from raising.
                if not module_name[:1].isupper():
                    continue
                if "base_class_0" in module_value:
                    continue

                class_name = module_name.split("_")[0]
                api_module = api_by_name.get(class_name)
                # A class unknown to the API data has no known options, so all
                # of its options land in the "not matched" bucket.
                api_params = api_module["params"] if api_module is not None else {}

                for param, param_value in module_value.items():
                    if param in ("variable", "params"):
                        continue

                    total_count += 1
                    if param not in api_params:
                        not_match_api_count += 1
                    elif api_params[param] == param_value["value"]:
                        default_count += 1
                    else:
                        customized_count += 1

    data = {
        "Library": [library_name],
        "Total Count": [total_count],
        "Default Count": [default_count],
        "Customized Count": [customized_count],
        "Not API Match Count": [not_match_api_count]
    }

    return pd.DataFrame(data)


#df_sklearn = get_hyperparameter_count("sklearn", "modules/sklearn_default_values.json", "statistics/*")
#df_tf = get_hyperparameter_count("tensorflow", "modules/tensorflow_default_values.json", "statistics/*")
#df_torch = get_hyperparameter_count("torch", "modules/torch_default_values.json", "statistics/*")

#df_all = pd.concat([df_sklearn, df_tf, df_torch])
#df_all.head()
#df_all.to_latex(index=False)

In [25]:
def get_ml_algorithms(library_name: str, library_dir: str, project_dir: str) -> pd.DataFrame:
    """Summarise how the projects use the classes listed in the API data file.

    Each project statistics file is a JSON object shaped like
    ``{file: {library: {usage_key: {option: ...}}}}``. A usage key whose first
    character is upper case is treated as a class usage, and the class name is
    the part of the key before the first underscore. Only classes whose name
    appears in the API data are reported. Usages that contain a
    ``"base_class_0"`` entry are skipped, both for the usage count and for the
    option statistics, so that all columns describe the same set of usages.
    Each usage is counted once even if the API data lists a name repeatedly.

    Args:
        library_name: Key under which a file's usages of the library are stored.
        library_dir: Path to the JSON file with the API data, a list of objects
            that each have a ``"name"`` and a ``"params"`` entry.
        project_dir: Glob pattern matching the project statistics JSON files.

    Returns:
        A DataFrame sorted by descending ``Count`` with the columns

        * ``Class``: class name,
        * ``Count``: number of counted usages,
        * ``#HP``: number of params the API data lists for the class,
        * ``AvgOptionsUsed``: mean number of options per usage, rounded to two
          decimals, ignoring the ``"variable"`` and ``"params"`` entries,
        * ``Most Used HP``: the most frequent option name(s), comma separated
          on ties; a ``"params"`` entry is reported as ``"default"`` and
          ``"None"`` is used when no option was recorded.

        If no matching class usage is found, an empty DataFrame with these
        columns is returned.
    """
    with open(library_dir, "r", encoding="utf-8") as library_file:
        library_data = json.load(library_file)

    # Index the API data by name; the first entry wins, like a first-match search.
    api_by_name = {}
    for api_entry in library_data:
        api_by_name.setdefault(api_entry["name"], api_entry)

    # Read every project file exactly once and group the usages of known
    # classes by class name.
    usages_by_class = {}
    for project in glob.glob(project_dir):
        with open(project, "r", encoding="utf-8") as project_file:
            project_data = json.load(project_file)

        for file, file_data in project_data.items():
            if library_name not in file_data:
                continue
            for usage_key, data in file_data[library_name].items():
                # Slicing instead of indexing keeps an empty key from raising.
                if not usage_key[:1].isupper():
                    continue
                if "base_class_0" in data:
                    continue
                class_name = usage_key.split("_")[0]
                if class_name not in api_by_name:
                    continue
                usages_by_class.setdefault(class_name, []).append((project, file, data))

    if not usages_by_class:
        # Without any usage there is no "Count" column to sort by.
        return pd.DataFrame(columns=["Class", "Count", "#HP", "AvgOptionsUsed", "Most Used HP"])

    class_counts = Counter({name: len(usages) for name, usages in usages_by_class.items()})
    df_classes = pd.DataFrame.from_dict(class_counts, orient="index").reset_index()
    df_classes = df_classes.rename(columns={'index': 'Class', 0: 'Count'})
    df_classes = df_classes.sort_values(by=['Count'], ascending=False)
    classes = df_classes["Class"].to_list()

    # Every reported class is known to the API data, so a direct lookup is safe.
    df_classes["#HP"] = [len(api_by_name[ml_class]["params"]) for ml_class in classes]

    # Average number of options used per class and the most used option(s).
    avg_class_options = []
    most_used_class_option = []

    for ml_class in classes:
        options_per_usage = []
        option_names = []

        for project, file, data in usages_by_class[ml_class]:
            # Diagnostic output for usages configured only through **kwargs.
            if len(data) == 1 and "**kwargs" in data:
                print(project)
                print(file)
                print(data)

            explicit_options = 0
            for param in data:
                if param == "variable":
                    continue
                if param == "params":
                    option_names.append("default")
                else:
                    explicit_options += 1
                    option_names.append(param)

            options_per_usage.append(explicit_options)

        avg_class_options.append(round(sum(options_per_usage) / len(options_per_usage), 2))

        option_counts = Counter(option_names)
        if option_counts:
            top_count = option_counts.most_common(1)[0][1]
            top_options = [name for name, count in option_counts.most_common() if count == top_count]
            most_used_class_option.append(", ".join(top_options))
        else:
            most_used_class_option.append("None")

    df_classes["AvgOptionsUsed"] = avg_class_options
    df_classes["Most Used HP"] = most_used_class_option

    return df_classes



# Rank the sklearn classes listed in the ML-algorithm file by usage, keep the
# first 30 rows of the ranking and print them as a LaTeX table.
df_sklearn = get_ml_algorithms(
    library_name="sklearn",
    library_dir="modules/sklearn_ml_algorithms.json",
    project_dir="statistics/*",
)[:30]

df_to_latex(df_sklearn)

\begin{tabular}{lrrrl}
\toprule
                        Class &  Count &  \#HP &  AvgOptionsUsed &                                                                 Most Used HP \\
\midrule
           LogisticRegression &     40 &   15 &            1.12 &                                                                      default \\
                       KMeans &     30 &    9 &            2.30 &                                                                   n\_clusters \\
             LinearRegression &     13 &    5 &            0.08 &                                                                      default \\
                          PCA &     12 &    9 &            1.00 &                                                                 n\_components \\
                          SVC &      7 &   15 &            2.00 &                                                                       kernel \\
             NearestNeighbors &      7 &    8 &            1.29 &               