In [78]:
import json
import pandas as pd
import glob
from collections import Counter
from typing import List, Dict
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)
statistic_dir = "../data/statistics/"

with open("../data/repos_hyperparameter_tuning.json", "r", encoding="utf-8") as src:
    repos = json.load(src)
    print("Number repos: ", len(repos))


Number repos:  514


In [79]:
def df_to_latex(df: pd.DataFrame) -> None:
    print(df.to_latex(index=False))

def get_module(name, data):
    try:
        return next(filter(lambda x: name == x["name"], data))
    except StopIteration:
        return None

In [80]:
def get_classes(library_name: str, library_dir: str, files: List) -> Dict:

    with open(library_dir, "r", encoding="utf-8") as library_file:
        library_data = json.load(library_file)
        class_names = [x["name"] for x in library_data]

    # Get Most used Class
    classes = []

    for project in files:
        with open(statistic_dir + project, "r", encoding="utf-8") as project_file:
            project_data = json.load(project_file)

            for file in project_data.keys():
                file_data = project_data[file]
                for library in file_data.keys():
                    if library == library_name:
                        module_data = file_data[library]
                        for key, _ in module_data.items():
                            if key[0].isupper():
                                class_name_parts = key.split("_")
                                if len(class_name_parts) > 2:
                                    class_name = "_".join(class_name_parts[:-1])
                                else:
                                    class_name = class_name_parts[0]

                                if class_name in class_names:
                                    classes.append(class_name)

    return Counter(classes).most_common()

In [81]:
def get_params(library_name, files, classes) -> List:
    param_data = []
    value_data = []
    
    for class_name in classes:
        params_set = []

        for project in list(files):
            with open(statistic_dir + project, "r", encoding="utf-8") as project_file:
                project_data = json.load(project_file)

                for file in project_data.keys():
                    file_data = project_data[file]
                    for library in file_data.keys():
                        if library == library_name:
                            module_data = file_data[library]
                            for key, data in module_data.items():
                                if key[0].isupper():
                                    module_name_parts = key.split("_")
                                    if len(module_name_parts) > 2:
                                        module_name = "_".join(module_name_parts[:-1])
                                    else:
                                        module_name = module_name_parts[0]

                                    if class_name == module_name:
                                        for name, _ in data.items():
                                            if name in ("variable", "params"):
                                                continue
                                            else:
                                                if name == "lr" and library_name == "tensorflow":
                                                    params_set.append("learning_rate")
                                                else:
                                                    params_set.append(name)
                                                        
                                            
        param_counter = Counter(params_set).most_common(3)
        param_data.append(param_counter)
        
    return param_data

In [82]:
def create_dataframe(top_classes, top_classes_count, top_params, top_params_count):

    df = pd.DataFrame()
    df["Algorithm"] = top_classes
    df["Algorithm Count"] = top_classes_count
    df["Top Hyperparameters"] = top_params
    df["Top Hyperparameter Count"] = top_params_count

    return df


In [83]:
def get_all_data(library_name: str, library_data: str, files: List[str]) -> pd.DataFrame:
    # get all estimator classes
    classes = get_classes(library_name, library_data, repos)
    # get top ten estimator classes
    top_classes = [x[0] for x in classes][:5]
    # get count for the top ten estimator classes
    top_classes_count = [x[1] for x in classes][:5]

    # get top three hyperparameter for each class
    params = get_params(library_name, repos, top_classes)

    top_params = []  
    top_params_count = []  
    for item in params:
        _params = [x[0] for x in item]
        _count = [x[1] for x in item]
        top_params.append(_params)
        top_params_count.append(_count)

    return create_dataframe(top_classes, top_classes_count, top_params, top_params_count)

df_sklearn = get_all_data("sklearn", "../data/library_data/sklearn_estimators.json", repos)
df_tensorflow = get_all_data("tensorflow", "../data/library_data/tensorflow_optimizer.json", repos)
df_pytorch = get_all_data("torch", "../data/library_data/torch_optimizer.json", repos)
df_all = pd.concat([df_sklearn, df_tensorflow, df_pytorch])

#print(df_all.to_latex(index=False))

In [84]:
def get_method_data(df: pd.DataFrame):

    method_data = []

    for _, row in df.iterrows():
        params = []
        name = row[0]
        count = row[1]
        for (x, y) in zip(row[2], row[3]):
            params.append({"name": x, "count": y})

        
        method_data.append({"name": name, "count": count, "params": params})
    
    return method_data


sklearn_method_data = get_method_data(df_sklearn)
tensorflow_method_data = get_method_data(df_tensorflow)
torch_method_data = get_method_data(df_pytorch)



In [85]:
def prepare_library_data(library_name, library_dir, data, files):

    tmp_data = []


    with open(library_dir, "r", encoding="utf-8") as library_file:
        library_data = json.load(library_file)
        class_names = [x["name"] for x in library_data]

    for item in data:
        class_name = item["name"]
        param_data = []
        for param in item["params"]:
            param_name = param["name"]
            param_values = []
            
            
            for project in list(files):
                with open(statistic_dir + project, "r", encoding="utf-8") as project_file:
                    project_data = json.load(project_file)

                    for file in project_data.keys():
                        file_data = project_data[file]
                        for library in file_data.keys():
                            if library == library_name:
                                module_data = file_data[library]
                                for key, data in module_data.items():
                                    if key[0].isupper():
                                        module_name_parts = key.split("_")
                                        if len(module_name_parts) > 2:
                                            module_name = "_".join(module_name_parts[:-1])
                                        else:
                                            module_name = module_name_parts[0]

                                        if class_name == module_name:
                                            for name, data in data.items():
                                                if name in ("variable", "params"):
                                                    continue
                                                else:
                                                    if name == "lr" and library_name == "tensorflow":
                                                        #print("Class, Param, Data: ", class_name, name, data)
                                                        param_values.append(data)
                                                    else:
                                                        if name == param_name:
                                                            param_values.append(data)
                                                            #print("Class, Param, Data: ", class_name, name, data)
            param_data.append({"name": param_name, "data": param_values})
        tmp_data.append({"name": class_name, "param_data": param_data})

    return tmp_data

sklearn_data = prepare_library_data("sklearn", "../data/library_data/sklearn_estimators.json", sklearn_method_data, repos)
tensorflow_data = prepare_library_data("tensorflow", "../data/library_data/tensorflow_optimizer.json", tensorflow_method_data, repos)
torch_data = prepare_library_data("torch", "../data/library_data/torch_optimizer.json", torch_method_data, repos)



In [98]:
def assign_category(type_name: str) -> str:

    if type_name.lower() in ("str", "joinedstr"):
        return "String"

    if type_name.lower() in ("int", "float", "complex", "number","binop"):
        return "Numeric"

    if type_name.lower() in ("list", "tuple", "listcomp", "generatorexp"):
        return "Sequence"

    if type_name.lower() in ("dict", "dictcomp", "kwargs"):
        return "Mapping"
    
    if type_name.lower() in ("set", "setcomp"):
        return "Set"

    if type_name.lower() in ("lambda", "subscript"):
        return "Call"

    if type_name.lower() in ("compare", "ifexp"):
        return "Operation"

    if type_name.lower() == "none":
        return "None Type"

    if type_name.lower() in ("method argument", "starred", "variable", "attribute", "yield"):
        return "Variable"
    
    if type_name.lower() in ("bool", "boolop", "unaryop"):
        return "Bool"

    return type_name



def assign_top_category(type_name: str) -> str:
    if type_name.lower() in ("str", "joinedstr", "int", "float", "complex", "number","binop", "bool", "boolop", "unaryop", "dict", "dictcomp", "kwargs", "list", "tuple", "listcomp", "generatorexp", "set", "setcomp", "none"):
        return "Constant"
    else:
        return "Variabel"
    



def get_param_type(library_name, library_dir, data) -> List[List]:

    with open(library_dir, "r", encoding="utf-8") as library_file:
        library_data = json.load(library_file)

    type_data = []
    default_data = []

    constant_data = []
    variabel_data = []

    for item in data:
        class_name = item["name"]
        class_type_data = []
        class_default_data = []
        class_constant_data = []
        class_variabel_data = []
        for param in item["param_data"]:
            constant = 0
            variabel = 0
            types = []
            default = []
            param_name = param["name"]
            modulue_data = get_module(class_name, library_data)
            options = modulue_data["params"]


            for x in param["data"]:               
                
                # Check types of parameters
                if x["type"].lower() in ("method argument", "starred", "variable", "attribute", "yield"):
                    possible_values = x["possible_values"]
                    if possible_values:
                        for possible_value in possible_values:
                            val = possible_value[0]
                            val_type = possible_value[1]
                            if val.isnumeric():
                                types.append("number")
                            elif val.lower() in ("true", "false"):
                                types.append("bool")
                            else:
                                types.append(val_type)
                    else:
                        types.append(x["type"])
                else:
                    types.append(x["type"])
                    
                if x["type"].lower() in ("str", "joinedstr", "int", "float", "complex", "number","binop", "bool", "boolop", "unaryop", "dict", "dictcomp", "kwargs", "list", "tuple", "listcomp", "generatorexp", "set", "setcomp", "none"):
                    constant += 1
                else:
                    variabel += 1
            


            counter_types = Counter(types)
            #print("Class Name: ", class_name)
            #print("Param: ", param_name)
            #print("Types:", counter_types.most_common())
            most_common_type = counter_types.most_common(1)[0][0]
            #print("Most Common Type:", assign_category(most_common_type))
            class_type_data.append(assign_category(most_common_type))
            class_constant_data.append(constant)
            class_variabel_data.append(variabel)

        constant_data.append(class_constant_data)
        variabel_data.append(class_variabel_data)
        type_data.append(class_type_data)

    return type_data, constant_data, variabel_data


sklearn_type_data, sklearn_constant_data, sklearn_variabel_data = get_param_type("sklearn", "../data/library_data/sklearn_estimators.json", sklearn_data)
tensorflow_type_data, tf_constant_data, tf_variabel_data= get_param_type("tensorflow", "../data/library_data/tensorflow_optimizer.json", tensorflow_data)
torch_type_data, torch_constant_data, torch_variabel_data = get_param_type("torch", "../data/library_data/torch_optimizer.json", torch_data)

df_sklearn["Type"] = sklearn_type_data
df_sklearn["Constant"] = sklearn_constant_data
df_sklearn["Variabel"] = sklearn_variabel_data
df_tensorflow["Type"] = tensorflow_type_data
df_tensorflow["Constant"] = tf_constant_data
df_tensorflow["Variabel"] = tf_variabel_data
df_pytorch["Type"] = torch_type_data
df_pytorch["Constant"] = torch_constant_data
df_pytorch["Variabel"] = torch_variabel_data


#df_to_latex(df_pytorch)

df_all_mew = pd.concat([df_sklearn, df_tensorflow, df_pytorch])

total_count = df_all_mew["Top Hyperparameter Count"].tolist()
total_const = df_all_mew["Constant"].tolist()
total_var = df_all_mew["Variabel"].tolist()
print(total_count)

tmp_total = []
tmp_const = []
tmp_var = []
for item in total_var:
    for x in item:
        tmp_var.append(x)

for item in total_const:
    for x in item:
        tmp_const.append(x)

for item in total_count:
    for x in item:
        tmp_total.append(x)

print("Total: ", sum(tmp_total))
print("Const: ", sum(tmp_const))
print("Var: ", sum(tmp_var))

print(df_all_mew.to_latex(index=False))

[[15, 13, 12], [9, 6, 4], [22, 13, 7], [19, 17, 15], [12, 8, 7], [405, 223, 19], [81, 12, 8], [42, 2], [30, 7, 1], [29, 29, 12], [601, 188, 181], [219, 151, 101], [44, 25, 24], [18, 13, 11], [11, 7, 3]]
Total:  2676
Const:  951
Var:  1753
\begin{tabular}{lrlllll}
\toprule
                 Algorithm &  Algorithm Count &                                Top Hyperparameters & Top Hyperparameter Count &                           Type &       Constant &      Variabel \\
\midrule
        LogisticRegression &               33 &                          [C, solver, random\_state] &             [15, 13, 12] &    [Variable, String, Numeric] &     [3, 13, 8] &    [12, 0, 4] \\
                       SVC &               28 &                                 [gamma, kernel, C] &                [9, 6, 4] &        [Numeric, String, Call] &      [6, 4, 1] &     [3, 2, 3] \\
                    KMeans &               22 &                   [n\_clusters, random\_state, init] &              [22, 13, 7] &   