In [16]:
import glob
import json
import os

import pandas as pd

In [17]:
# Load the collection of repositories under study.
with open("repos.json", "r", encoding="utf-8") as src:
    repos = json.load(src)
    print("Number repos: ", len(repos))


# Keep only the statistics files whose file name is listed in `repos`.
repo_files = set()

for x in glob.glob("../data/statistics/*"):
    # os.path.basename understands the separator(s) of the running platform.
    # Splitting on "\\" only worked for Windows-style paths; with "/" separators
    # the whole path was compared against `repos` and nothing ever matched.
    file_name = os.path.basename(x)
    if file_name in repos:
        repo_files.add(x)

print("Number files: ", len(repo_files))

Number repos:  982
Number files:  982


In [18]:
def df_to_latex(df: pd.DataFrame) -> None:
    """Print `df` rendered as a LaTeX table, leaving out the index column."""
    latex_table = df.to_latex(index=False)
    print(latex_table)

def get_module(name, data):
    """Return the first entry of `data` whose "name" value equals `name`.

    Returns None when no entry matches.
    """
    for entry in data:
        if name == entry["name"]:
            return entry
    return None

def _safe_share(part: int, whole: int) -> float:
    """Return part / whole rounded to three decimals, or 0.0 when whole is zero."""
    return round(part / whole, 3) if whole else 0.0


def count_parameters(library_name: str, library_dir: str, files: set) -> pd.DataFrame:
    """Count how many parameters of a library's classes are set in the projects.

    Args:
        library_name: Key under which the usages of the library are stored in
            each per-file entry of a project statistics file (e.g. "sklearn").
        library_dir: Path to a JSON file containing a sequence of module
            descriptions, each with a "name" and a "params" mapping from
            parameter name to the value that is compared as its default.
        files: Paths of the project statistics JSON files to scan.

    Returns:
        A one-row DataFrame with the columns "Library", "Available", "Set",
        "Set_%", "Default", "Default_%", "Custom" and "Custom_%". The "_%"
        columns are fractions rounded to three decimals; they are 0.0 when
        their denominator is zero (e.g. the library is not used at all).
    """
    with open(library_dir, "r", encoding="utf-8") as library_file:
        library_data = json.load(library_file)

    # Index the module descriptions once instead of scanning the list for every
    # usage; setdefault keeps the first description when a name occurs twice.
    modules_by_name = {}
    for module in library_data:
        modules_by_name.setdefault(module["name"], module)

    total_params_set = 0
    total_params_available = 0
    default_params = 0
    customized_params = 0

    for project in files:
        with open(project, "r", encoding="utf-8") as project_file:
            project_data = json.load(project_file)

        for file_data in project_data.values():
            if library_name not in file_data:
                continue

            for key, data in file_data[library_name].items():
                # Only keys that start with an upper-case letter are considered;
                # slicing keeps an empty key from raising IndexError.
                if not key[:1].isupper():
                    continue

                # Everything before the last underscore is the class name
                # (the whole key when it has no underscore).
                class_name = key.rsplit("_", 1)[0]

                library_module = modules_by_name.get(class_name)
                if not library_module:
                    continue

                library_module_params = library_module["params"]
                total_params_available += len(library_module_params)

                for name, value in data.items():
                    # "variable" and "params" entries are not parameters.
                    if name in ("variable", "params"):
                        continue

                    total_params_set += 1
                    # A parameter counts as default only if the library knows it
                    # and the set value equals the stringified library value
                    # (with single quotes removed).
                    is_default = (
                        name in library_module_params
                        and str(library_module_params[name]).replace("'", "") == value["value"]
                    )
                    if is_default:
                        default_params += 1
                    else:
                        customized_params += 1

    assert total_params_set == default_params + customized_params

    return pd.DataFrame(
        {
            "Library": [library_name],
            "Available": [total_params_available],
            "Set": [total_params_set],
            "Set_%": [_safe_share(total_params_set, total_params_available)],
            "Default": [default_params],
            "Default_%": [_safe_share(default_params, total_params_set)],
            "Custom": [customized_params],
            "Custom_%": [_safe_share(customized_params, total_params_set)],
        }
    )


# One summary row per library, each computed against that library's
# default-value file.
df_sklearn = count_parameters(
    library_name="sklearn",
    library_dir="../modules/sklearn_default_values.json",
    files=repo_files,
)
df_tf = count_parameters(
    library_name="tensorflow",
    library_dir="../modules/tensorflow_default_values.json",
    files=repo_files,
)
df_pytorch = count_parameters(
    library_name="torch",
    library_dir="../modules/torch_default_values.json",
    files=repo_files,
)

# Stack the three rows and print them as one LaTeX table.
df_all = pd.concat([df_sklearn, df_tf, df_pytorch])

df_to_latex(df_all)


\begin{tabular}{lrrrrrrr}
\toprule
   Library &  Available &    Set &  Set\_\% &  Default &  Default\_\% &  Custom &  Custom\_\% \\
\midrule
   sklearn &       5599 &   1223 &  0.218 &      184 &      0.150 &    1039 &     0.850 \\
tensorflow &      68347 &  17233 &  0.252 &      675 &      0.039 &   16558 &     0.961 \\
     torch &     324045 & 126789 &  0.391 &    10684 &      0.084 &  116105 &     0.916 \\
\bottomrule
\end{tabular}



  print(df.to_latex(index=False))


In [19]:
from collections import Counter

def get_parameters(library_name: str, library_dir: str, files: set) -> list:
    """Count how often each known parameter of a library's classes is set.

    Args:
        library_name: Key under which the usages of the library are stored in
            each per-file entry of a project statistics file (e.g. "sklearn").
        library_dir: Path to a JSON file containing a sequence of module
            descriptions, each with a "name" and a "params" collection of
            parameter names.
        files: Paths of the project statistics JSON files to scan.

    Returns:
        A list of ("<class name>::<parameter name>", count) tuples as produced
        by `Counter.most_common()`, i.e. ordered from most to least frequent.
        Parameters that the library description does not list are ignored.
    """
    with open(library_dir, "r", encoding="utf-8") as library_file:
        library_data = json.load(library_file)

    # Index the module descriptions once instead of scanning the list for every
    # usage; setdefault keeps the first description when a name occurs twice.
    modules_by_name = {}
    for module in library_data:
        modules_by_name.setdefault(module["name"], module)

    param_counter = Counter()

    # Equal counts are reported in first-seen order, so walk the files in a
    # fixed order; iterating the set directly made ties vary between runs.
    for project in sorted(files):
        with open(project, "r", encoding="utf-8") as project_file:
            project_data = json.load(project_file)

        for file_data in project_data.values():
            if library_name not in file_data:
                continue

            for key, data in file_data[library_name].items():
                # Only keys that start with an upper-case letter are considered;
                # slicing keeps an empty key from raising IndexError.
                if not key[:1].isupper():
                    continue

                # Everything before the last underscore is the class name
                # (the whole key when it has no underscore).
                class_name = key.rsplit("_", 1)[0]

                library_module = modules_by_name.get(class_name)
                if not library_module:
                    continue

                library_module_params = library_module["params"]

                for name in data:
                    # "variable" and "params" entries are not parameters.
                    if name in ("variable", "params"):
                        continue
                    if name in library_module_params:
                        param_counter[class_name + "::" + name] += 1

    return param_counter.most_common()
   
# Frequency of every known sklearn parameter across the collected repositories.
params = get_parameters(
    library_name="sklearn",
    library_dir="../modules/sklearn_estimators.json",
    files=repo_files,
)