In [33]:
import glob
import json
import warnings
from collections import Counter
from typing import Dict, List, Optional, Tuple

import pandas as pd

# Prefix that the helper functions below put in front of every statistics file name.
statistic_dir = "../data/statistics/"

# Hide FutureWarning messages for the remainder of the notebook.
warnings.simplefilter("ignore", FutureWarning)

# `repos` is passed to the helpers below as the collection of statistics files to scan.
with open("../data/repos_hyperparameter_tuning.json", "r", encoding="utf-8") as src:
    repos = json.load(src)

print("Number repos: ", len(repos))


Number repos:  514


In [34]:
def df_to_latex(df: pd.DataFrame) -> None:
    """Print ``df`` rendered by ``DataFrame.to_latex`` with the index column omitted."""
    latex_table = df.to_latex(index=False)
    print(latex_table)

def get_module(name, data):
    """Return the first entry of ``data`` whose ``"name"`` field equals ``name``.

    ``data`` is iterated in order; ``None`` is returned when no entry matches.
    """
    for entry in data:
        if name == entry["name"]:
            return entry
    return None

In [35]:
def get_classes(
    library_name: str,
    library_dir: str,
    files: List,
    stats_dir: Optional[str] = None,
) -> List[Tuple[str, int]]:
    """Count how often each known class of a library appears in the statistics files.

    Each statistics file is a JSON object that is traversed as
    ``source file -> library -> key``. Only keys that start with an upper-case
    letter are considered; the text after the last underscore is dropped to
    obtain the class name, and the name is counted when the library file lists it.

    Args:
        library_name: Library key looked up inside every source-file entry.
        library_dir: Path of the JSON file listing the library's classes; only
            the ``"name"`` field of each entry is read.
        files: Statistics file names; each is appended to the statistics
            directory to form the path that is opened.
        stats_dir: Directory prefix for ``files``. It is concatenated as-is, so
            it has to end with a path separator. Defaults to the notebook-level
            ``statistic_dir``.

    Returns:
        ``(class_name, count)`` pairs as produced by ``Counter.most_common()``,
        i.e. ordered from most to least frequent.
    """
    base_dir = statistic_dir if stats_dir is None else stats_dir

    with open(library_dir, "r", encoding="utf-8") as library_file:
        # A set keeps the membership test in the inner loop constant-time.
        known_classes = {entry["name"] for entry in json.load(library_file)}

    usage = Counter()

    for project in files:
        with open(base_dir + project, "r", encoding="utf-8") as project_file:
            project_data = json.load(project_file)

        for file_data in project_data.values():
            for key in file_data.get(library_name, {}):
                # Skip empty keys (indexing them would raise) and anything that
                # does not start with an upper-case letter.
                if not key or not key[0].isupper():
                    continue

                # Dropping the part after the last "_" covers both "SVC_3" -> "SVC"
                # and "My_Class_3" -> "My_Class"; a key without "_" is kept whole.
                class_name = key.rsplit("_", 1)[0]
                if class_name in known_classes:
                    usage[class_name] += 1

    return usage.most_common()

In [36]:
def get_params(library_name, files, classes, stats_dir: Optional[str] = None) -> List:
    """Find the three most frequently set parameters for each requested class.

    All statistics files are read once; parameter names are collected per class
    while traversing ``source file -> library -> key -> parameter``. The entries
    named ``"variable"`` and ``"params"`` are skipped, and for the
    ``"tensorflow"`` library the name ``"lr"`` is counted as ``"learning_rate"``.

    Args:
        library_name: Library key looked up inside every source-file entry.
        files: Statistics file names; each is appended to the statistics
            directory to form the path that is opened.
        classes: Class names to collect parameters for.
        stats_dir: Directory prefix for ``files``. It is concatenated as-is, so
            it has to end with a path separator. Defaults to the notebook-level
            ``statistic_dir``.

    Returns:
        One list per entry of ``classes`` (same order), each holding up to three
        ``(parameter_name, count)`` pairs from ``Counter.most_common(3)``.
    """
    base_dir = statistic_dir if stats_dir is None else stats_dir
    classes = list(classes)
    if not classes:
        # Nothing to look for, so no statistics file needs to be opened.
        return []

    names_by_class = {class_name: [] for class_name in classes}

    for project in files:
        with open(base_dir + project, "r", encoding="utf-8") as project_file:
            project_data = json.load(project_file)

        for file_data in project_data.values():
            for key, params in file_data.get(library_name, {}).items():
                # Skip empty keys (indexing them would raise) and anything that
                # does not start with an upper-case letter.
                if not key or not key[0].isupper():
                    continue

                # The class name is the key without its last "_"-separated part.
                collected = names_by_class.get(key.rsplit("_", 1)[0])
                if collected is None:
                    continue

                for name in params:
                    if name in ("variable", "params"):
                        continue
                    if name == "lr" and library_name == "tensorflow":
                        name = "learning_rate"
                    collected.append(name)

    return [Counter(names_by_class[class_name]).most_common(3) for class_name in classes]

In [37]:
def create_dataframe(top_classes, top_classes_count, top_params, top_params_count):
    """Assemble the four parallel sequences into one table.

    The columns are added in this order: "Algorithm", "Algorithm Count",
    "Top Hyperparameters", "Top Hyperparameter Count".
    """
    column_values = (
        ("Algorithm", top_classes),
        ("Algorithm Count", top_classes_count),
        ("Top Hyperparameters", top_params),
        ("Top Hyperparameter Count", top_params_count),
    )

    table = pd.DataFrame()
    for header, values in column_values:
        table[header] = values

    return table


In [38]:
def get_all_data(library_name: str, library_data: str, files: List[str], top_n: int = 5) -> pd.DataFrame:
    """Build the summary table of the most used classes and their top parameters.

    Args:
        library_name: Library key forwarded to ``get_classes`` and ``get_params``.
        library_data: Path of the library description file forwarded to
            ``get_classes``.
        files: Statistics files to scan. These are the files actually used;
            the function no longer reads the notebook-level ``repos``.
        top_n: Number of most frequent classes to keep (default 5).

    Returns:
        The frame built by ``create_dataframe`` from the class names, class
        counts, parameter names and parameter counts.
    """
    # Most frequent classes first; keep only the leading `top_n`.
    ranked_classes = get_classes(library_name, library_data, files)[:top_n]
    top_classes = [name for name, _ in ranked_classes]
    top_classes_count = [count for _, count in ranked_classes]

    # One ranking of (parameter, count) pairs per kept class.
    param_rankings = get_params(library_name, files, top_classes)
    top_params = [[name for name, _ in ranking] for ranking in param_rankings]
    top_params_count = [[count for _, count in ranking] for ranking in param_rankings]

    return create_dataframe(top_classes, top_classes_count, top_params, top_params_count)

# One summary table per library, all computed from the same statistics files.
df_sklearn = get_all_data(
    library_name="sklearn",
    library_data="../data/library_data/sklearn_estimators.json",
    files=repos,
)
df_tensorflow = get_all_data(
    library_name="tensorflow",
    library_data="../data/library_data/tensorflow_optimizer.json",
    files=repos,
)
df_pytorch = get_all_data(
    library_name="torch",
    library_data="../data/library_data/torch_optimizer.json",
    files=repos,
)

# Stack the three tables on top of each other (row labels are not renumbered).
df_all = pd.concat([df_sklearn, df_tensorflow, df_pytorch])

# To export the combined table: print(df_all.to_latex(index=False))

In [39]:
def get_method_data(df: pd.DataFrame):
    """Convert a summary table into a list of per-class dictionaries.

    The first four columns of ``df`` are read by position: class name, class
    count, list of parameter names, list of parameter counts. Any further
    columns are ignored.

    Args:
        df: Table with at least four columns in the order described above.

    Returns:
        A list with one ``{"name", "count", "params"}`` dict per row, where
        ``"params"`` is a list of ``{"name", "count"}`` dicts built by pairing
        the parameter names with their counts.
    """
    method_data = []

    # itertuples yields plain tuples, so positional access never goes through
    # Series.__getitem__ with integer keys on a string-labelled row (that
    # positional fallback is deprecated since pandas 2.1).
    for row in df.itertuples(index=False, name=None):
        name, count, param_names, param_counts = row[:4]
        params = [
            {"name": param_name, "count": param_count}
            for param_name, param_count in zip(param_names, param_counts)
        ]
        method_data.append({"name": name, "count": count, "params": params})

    return method_data


# Per-library record lists derived from the summary tables built earlier.
sklearn_method_data = get_method_data(df=df_sklearn)
tensorflow_method_data = get_method_data(df=df_tensorflow)
torch_method_data = get_method_data(df=df_pytorch)



In [40]:
def prepare_library_data(library_name, library_dir, data, files, stats_dir: Optional[str] = None):
    """Collect every recorded value for the requested class/parameter pairs.

    All statistics files are read once while traversing
    ``source file -> library -> key -> parameter``. Entries named
    ``"variable"`` and ``"params"`` are skipped. For the ``"tensorflow"``
    library the name ``"lr"`` is treated as ``"learning_rate"``, so its values
    are attributed to ``"learning_rate"`` only and not to every requested
    parameter.

    Args:
        library_name: Library key looked up inside every source-file entry.
        library_dir: Not read by this function; kept so existing calls keep
            working.
        data: Items with a ``"name"`` (class name) and a ``"params"`` list
            whose entries carry a ``"name"`` (parameter name).
        files: Statistics file names; each is appended to the statistics
            directory to form the path that is opened.
        stats_dir: Directory prefix for ``files``. It is concatenated as-is, so
            it has to end with a path separator. Defaults to the notebook-level
            ``statistic_dir``.

    Returns:
        One ``{"name", "param_data"}`` dict per item of ``data`` (same order);
        ``"param_data"`` lists ``{"name", "data"}`` dicts where ``"data"`` holds
        the values found for that parameter in file order.
    """
    base_dir = statistic_dir if stats_dir is None else stats_dir
    items = list(data)

    # One bucket per requested (class, parameter) pair.
    buckets = {}
    for item in items:
        for param in item["params"]:
            buckets.setdefault((item["name"], param["name"]), [])

    # With nothing requested there is no reason to open any statistics file.
    if buckets:
        for project in files:
            with open(base_dir + project, "r", encoding="utf-8") as project_file:
                project_data = json.load(project_file)

            for file_data in project_data.values():
                for key, params in file_data.get(library_name, {}).items():
                    # Skip empty keys (indexing them would raise) and anything
                    # that does not start with an upper-case letter.
                    if not key or not key[0].isupper():
                        continue

                    # The class name is the key without its last "_"-separated part.
                    module_name = key.rsplit("_", 1)[0]

                    for name, value in params.items():
                        if name in ("variable", "params"):
                            continue
                        if name == "lr" and library_name == "tensorflow":
                            name = "learning_rate"

                        bucket = buckets.get((module_name, name))
                        if bucket is not None:
                            bucket.append(value)

    return [
        {
            "name": item["name"],
            "param_data": [
                # Copy so that repeated (class, parameter) requests do not share a list.
                {"name": param["name"], "data": list(buckets[(item["name"], param["name"])])}
                for param in item["params"]
            ],
        }
        for item in items
    ]

# Gather the recorded values for each library's top classes and parameters.
sklearn_data = prepare_library_data(
    library_name="sklearn",
    library_dir="../data/library_data/sklearn_estimators.json",
    data=sklearn_method_data,
    files=repos,
)
tensorflow_data = prepare_library_data(
    library_name="tensorflow",
    library_dir="../data/library_data/tensorflow_optimizer.json",
    data=tensorflow_method_data,
    files=repos,
)
torch_data = prepare_library_data(
    library_name="torch",
    library_dir="../data/library_data/torch_optimizer.json",
    data=torch_method_data,
    files=repos,
)



In [41]:
def check_possible_values(param_name, options, possible_values) -> bool:
    """Tell whether most of ``possible_values`` match the option stored for ``param_name``.

    The first element of every entry in ``possible_values`` is compared with
    ``options[param_name]`` after removing single and double quote characters
    from the latter. The result is ``True`` when "default" is the most common
    outcome; on a tie the outcome seen first wins (``Counter.most_common``
    ordering). An empty or missing ``possible_values`` yields ``False``.
    """
    if not possible_values:
        return False

    # "customied" (sic) is only an internal tally label and never leaves this function.
    labels = [
        "default"
        if candidate[0] == options[param_name].replace('"', "").replace("'", "")
        else "customied"
        for candidate in possible_values
    ]

    majority_label, _ = Counter(labels).most_common(1)[0]
    return majority_label == "default"


In [42]:


def get_param_type(library_name, library_dir, data) -> Tuple[List[List], List[List]]:
    """Determine the dominant value type and default/customized status per parameter.

    For every observation of a parameter its ``"type"`` is tallied and its
    ``"value"`` is compared with the option stored for that parameter in the
    library file:

    * if the stored option is falsy, the observation counts as "default" only
      when the value equals it;
    * otherwise the value is compared with the option stripped of quote
      characters, and on a mismatch ``check_possible_values`` decides based on
      the observation's ``"possible_values"``.

    Args:
        library_name: Not used by this function; kept so existing calls keep
            working.
        library_dir: Path of the JSON file describing the library's classes;
            each entry is looked up by ``"name"`` and its ``"params"`` mapping
            is read.
        data: Items with a ``"name"`` (class name) and a ``"param_data"`` list
            whose entries carry ``"name"`` and ``"data"`` (the observations).

    Returns:
        ``(type_data, default_data)``: two lists with one inner list per item
        of ``data``, holding the most common type and the most common
        "default"/"customized" label for each parameter. A parameter without
        observations gets ``None`` in both lists.

    Raises:
        ValueError: If a class with parameters is not listed in the library file.
    """
    with open(library_dir, "r", encoding="utf-8") as library_file:
        library_data = json.load(library_file)

    type_data = []
    default_data = []

    for item in data:
        class_name = item["name"]
        param_entries = item["param_data"]

        options = None
        if param_entries:
            # The library entry depends only on the class, so look it up once
            # per class instead of once per parameter.
            module_data = get_module(class_name, library_data)
            if module_data is None:
                raise ValueError(f"class {class_name!r} is not described in {library_dir}")
            options = module_data["params"]

        class_type_data = []
        class_default_data = []

        for param in param_entries:
            param_name = param["name"]
            observations = param["data"]

            if not observations:
                # No observations means there is no majority to report.
                class_type_data.append(None)
                class_default_data.append(None)
                continue

            documented_default = options[param_name]
            types = []
            default = []

            for observation in observations:
                types.append(observation["type"])
                value = observation["value"]

                if not documented_default:
                    is_default = value == documented_default
                elif value == documented_default.replace('"', "").replace("'", ""):
                    is_default = True
                else:
                    is_default = check_possible_values(
                        param_name, options, observation["possible_values"]
                    )

                default.append("default" if is_default else "customized")

            class_type_data.append(Counter(types).most_common(1)[0][0])
            class_default_data.append(Counter(default).most_common(1)[0][0])

        type_data.append(class_type_data)
        default_data.append(class_default_data)

    return type_data, default_data


# Dominant value type and default/customized label for each library's top parameters.
sklearn_type_data, sklearn_default_data = get_param_type(
    library_name="sklearn",
    library_dir="../data/library_data/sklearn_estimators.json",
    data=sklearn_data,
)
tensorflow_type_data, tensorflow_default_data = get_param_type(
    library_name="tensorflow",
    library_dir="../data/library_data/tensorflow_optimizer.json",
    data=tensorflow_data,
)
torch_type_data, torch_default_data = get_param_type(
    library_name="torch",
    library_dir="../data/library_data/torch_optimizer.json",
    data=torch_data,
)

# Attach the results as two extra columns to each per-library table.
df_sklearn["Type"] = sklearn_type_data
df_sklearn["Default"] = sklearn_default_data

df_tensorflow["Type"] = tensorflow_type_data
df_tensorflow["Default"] = tensorflow_default_data

df_pytorch["Type"] = torch_type_data
df_pytorch["Default"] = torch_default_data

# A single table can be exported the same way, e.g. df_to_latex(df_tensorflow).

# Combined table over all three libraries, printed as LaTeX without the index.
df_all_mew = pd.concat([df_sklearn, df_tensorflow, df_pytorch])
df_to_latex(df_all_mew)

\begin{tabular}{lrllll}
\toprule
                 Algorithm &  Algorithm Count &                                Top Hyperparameters & Top Hyperparameter Count &                              Type &                              Default \\
\midrule
        LogisticRegression &               33 &                          [C, solver, random\_state] &             [15, 13, 12] &              [variable, str, int] & [customized, customized, customized] \\
                       SVC &               28 &                                 [gamma, kernel, C] &                [9, 6, 4] &             [int, str, Subscript] & [customized, customized, customized] \\
                    KMeans &               22 &                   [n\_clusters, random\_state, init] &              [22, 13, 7] &              [variable, int, str] & [customized, customized, customized] \\
GradientBoostingClassifier &               20 &        [n\_estimators, learning\_rate, random\_state] &             [19, 17, 15] &         