In [174]:
import glob
import json
import os
import warnings
from collections import Counter
from typing import List, Dict

import pandas as pd

# Suppress every FutureWarning for the rest of the session so that cell
# outputs are not cluttered by deprecation notices.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Parse the repository listing once; `repos` is reused by the cells below.
with open("../data/repos_hyperparameter_tuning.json", "r", encoding="utf-8") as repos_file:
    repos = json.load(repos_file)

# Report how many entries were loaded (the file handle is already closed here).
print("Number repos: ", len(repos))


Number repos:  514


In [175]:
def df_to_latex(df: pd.DataFrame) -> None:
    """Print ``df`` to stdout as a LaTeX table, leaving out the index column.

    Args:
        df: The table to render.

    Returns:
        None. The rendered LaTeX source is written to stdout.
    """
    latex_source = df.to_latex(index=False)
    print(latex_source)

def get_module(name, data):
    """Return the first entry of ``data`` whose ``"name"`` value equals ``name``.

    Args:
        name: The value to look for.
        data: An iterable of mappings, each providing a ``"name"`` key.

    Returns:
        The first matching entry, or ``None`` when no entry matches.
    """
    for entry in data:
        if name == entry["name"]:
            return entry
    return None

In [176]:
def get_classes(
    library_name: str,
    library_dir: str,
    files: List,
    statistics_dir: str = "../data/statistics/",
) -> List:
    """Count how often each known class of a library occurs in the project statistics.

    Args:
        library_name: Library key to look up inside every per-file entry of a
            statistics file (e.g. ``"sklearn"``).
        library_dir: Path of the JSON file listing the library's classes; it
            must hold a list of objects that each have a ``"name"`` field.
        files: Statistics file names, resolved relative to ``statistics_dir``.
        statistics_dir: Directory holding the statistics files. Defaults to
            the location that was previously hard-coded.

    Returns:
        A list of ``(class_name, count)`` tuples as produced by
        ``Counter.most_common()``: most frequent first, ties in order of first
        occurrence. Only classes named in the library file are counted.

    Raises:
        OSError: If the library file or one of the statistics files cannot be
            opened.
    """
    with open(library_dir, "r", encoding="utf-8") as library_file:
        # A set makes the per-key membership test below O(1).
        known_classes = {entry["name"] for entry in json.load(library_file)}

    usage = Counter()

    for project in files:
        with open(os.path.join(statistics_dir, project), "r", encoding="utf-8") as project_file:
            project_data = json.load(project_file)

        for file_data in project_data.values():
            if library_name not in file_data:
                continue

            for key in file_data[library_name]:
                # Only keys starting with an upper-case letter are treated as
                # classes; slicing keeps an empty key from raising IndexError.
                if not key[:1].isupper():
                    continue

                # With more than two "_"-separated parts only the last part is
                # dropped; otherwise everything after the first "_" is dropped.
                # NOTE(review): the trailing part is presumably a usage
                # suffix - confirm against the statistics generator.
                name_parts = key.split("_")
                if len(name_parts) > 2:
                    class_name = "_".join(name_parts[:-1])
                else:
                    class_name = name_parts[0]

                if class_name in known_classes:
                    usage[class_name] += 1

    return usage.most_common()

In [177]:
def get_params(
    library_name,
    files,
    classes,
    statistics_dir: str = "../data/statistics/",
    top_n: int = 3,
) -> List:
    """Collect the most frequently set parameters for each of the given classes.

    Every statistics file is parsed once and its parameter names are tallied
    per class, instead of re-reading all files for every class.

    Args:
        library_name: Library key to look up inside every per-file entry of a
            statistics file (e.g. ``"tensorflow"``).
        files: Statistics file names, resolved relative to ``statistics_dir``.
        classes: Class names to report on; the result follows this order.
        statistics_dir: Directory holding the statistics files. Defaults to
            the location that was previously hard-coded.
        top_n: Number of parameters to keep per class (default 3).

    Returns:
        One list per entry of ``classes``, each holding up to ``top_n``
        ``(parameter_name, count)`` tuples, most frequent first. A class that
        never occurs yields an empty list.

    Raises:
        OSError: If one of the statistics files cannot be opened.
    """
    # Materialise both inputs: `classes` is walked twice below, and a
    # generator passed as `files` must not be silently exhausted.
    classes = list(classes)
    files = list(files)

    if not classes:
        # Nothing to report on, so no file needs to be opened.
        return []

    usage = {class_name: Counter() for class_name in classes}

    for project in files:
        with open(os.path.join(statistics_dir, project), "r", encoding="utf-8") as project_file:
            project_data = json.load(project_file)

        for file_data in project_data.values():
            if library_name not in file_data:
                continue

            for key, data in file_data[library_name].items():
                # Only keys starting with an upper-case letter are treated as
                # classes; slicing keeps an empty key from raising IndexError.
                if not key[:1].isupper():
                    continue

                # With more than two "_"-separated parts only the last part is
                # dropped; otherwise everything after the first "_" is dropped.
                name_parts = key.split("_")
                if len(name_parts) > 2:
                    module_name = "_".join(name_parts[:-1])
                else:
                    module_name = name_parts[0]

                counter = usage.get(module_name)
                if counter is None:
                    continue

                for name in data:
                    # These two entries are skipped rather than counted as
                    # parameters.
                    if name in ("variable", "params"):
                        continue
                    # Fold the "lr" spelling into "learning_rate" for
                    # tensorflow so both are counted as one parameter.
                    if name == "lr" and library_name == "tensorflow":
                        name = "learning_rate"
                    counter[name] += 1

    return [usage[class_name].most_common(top_n) for class_name in classes]

In [178]:
def create_dataframe(top_classes, top_classes_count, top_params, top_params_count):
    """Assemble the four parallel sequences into one summary table.

    Args:
        top_classes: Values for the ``"Algorithm"`` column.
        top_classes_count: Values for the ``"Algorithm Count"`` column.
        top_params: Values for the ``"Top Hyperparameters"`` column.
        top_params_count: Values for the ``"Top Hyperparameter Count"`` column.

    Returns:
        A ``pd.DataFrame`` with the four columns in the order listed above.
    """
    column_values = {
        "Algorithm": top_classes,
        "Algorithm Count": top_classes_count,
        "Top Hyperparameters": top_params,
        "Top Hyperparameter Count": top_params_count,
    }

    table = pd.DataFrame()
    # Columns are attached one at a time, in declaration order.
    for label, values in column_values.items():
        table[label] = values

    return table


In [179]:
def get_all_data(
    library_name: str,
    library_data: str,
    files: List[str],
    top_n: int = 5,
) -> pd.DataFrame:
    """Build the summary table of the most used classes and their top parameters.

    Args:
        library_name: Library key forwarded to ``get_classes`` and
            ``get_params`` (e.g. ``"sklearn"``).
        library_data: Path of the JSON file listing the library's classes.
        files: Statistics file names to analyse. This argument is now the
            one actually used; the notebook-global ``repos`` is no longer
            read implicitly.
        top_n: Number of most used classes to keep (default 5).

    Returns:
        The frame produced by ``create_dataframe``: one row per kept class
        with its usage count, its most frequent parameter names and their
        counts.
    """
    # The list is walked by both helpers, so a generator would otherwise be
    # exhausted after the first call.
    files = list(files)

    # Usage ranking of all known classes, most frequent first.
    classes = get_classes(library_name, library_data, files)

    # Keep only the `top_n` most used classes and split names from counts.
    leading = classes[:top_n]
    top_classes = [name for name, _ in leading]
    top_classes_count = [count for _, count in leading]

    # Most frequent parameters for each kept class, in the same order.
    params = get_params(library_name, files, top_classes)

    top_params = [[name for name, _ in ranking] for ranking in params]
    top_params_count = [[count for _, count in ranking] for ranking in params]

    return create_dataframe(top_classes, top_classes_count, top_params, top_params_count)

# One summary table per library, all built from the same `repos` listing.
df_sklearn = get_all_data(
    "sklearn", "../modules/sklearn_estimators.json", repos
)
df_tensorflow = get_all_data(
    "tensorflow", "../modules/tensorflow_optimizer.json", repos
)
df_pytorch = get_all_data(
    "torch", "../modules/torch_optimizer.json", repos
)

# Stack the three tables; each part keeps its own row labels.
df_all = pd.concat([df_sklearn, df_tensorflow, df_pytorch])

# Emit the combined table as LaTeX (index column omitted).
df_to_latex(df_all)

\begin{tabular}{lrll}
\toprule
                 Algorithm &  Algorithm Count &                         Top Hyperparameters & Top Hyperparameter Count \\
\midrule
        LogisticRegression &               33 &                   [C, solver, random\_state] &             [15, 13, 12] \\
                       SVC &               28 &                          [gamma, kernel, C] &                [9, 6, 4] \\
                    KMeans &               22 &            [n\_clusters, random\_state, init] &              [22, 13, 7] \\
GradientBoostingClassifier &               20 & [n\_estimators, learning\_rate, random\_state] &             [19, 17, 15] \\
                 LinearSVC &               17 &                     [C, dual, class\_weight] &               [12, 8, 7] \\
                      Adam &               91 &         [learning\_rate, epsilon, clipvalue] &              [81, 12, 8] \\
                       SGD &                8 &                   [learning\_rate, momentum] &    

In [180]:
def get_method_data(df: pd.DataFrame):
    """Convert a summary table into a list of plain dictionaries.

    Columns are read by position, not by label: the first four columns are
    taken as name, count, parameter names and parameter counts. Rows are
    read as plain tuples so the lookup no longer depends on the deprecated
    positional fallback of ``Series.__getitem__`` on a label-indexed row.

    Args:
        df: Table whose first four columns hold, in order, the method name,
            its count, a sequence of parameter names and a sequence of the
            matching parameter counts.

    Returns:
        One dict per row of the form
        ``{"name": ..., "count": ..., "params": [{"name": ..., "count": ...}, ...]}``.
        An empty table yields an empty list.

    Raises:
        ValueError: If ``df`` has fewer than four columns (and at least one row).
    """
    method_data = []

    # index=False / name=None yield bare tuples of the column values.
    for record in df.itertuples(index=False, name=None):
        name, count, param_names, param_counts = record[:4]

        # Pair every parameter name with its count; zip stops at the shorter
        # of the two sequences.
        params = [
            {"name": param_name, "count": param_count}
            for param_name, param_count in zip(param_names, param_counts)
        ]

        method_data.append({"name": name, "count": count, "params": params})

    return method_data


# Convert the scikit-learn summary table into a list of per-algorithm dicts.
sklearn_method_data = get_method_data(df_sklearn)



