In [1]:
import pandas as pd
import json
import glob
from collections import Counter
from typing import List, Dict

In [2]:
# Do not truncate long cell contents when frames are rendered.
pd.set_option(
    "display.max_colwidth",
    None,
)

def df_to_latex(df: pd.DataFrame) -> None:
    """Print ``df`` rendered as LaTeX, without the index column."""
    latex_source = df.to_latex(index=False)
    print(latex_source)

In [3]:
def get_library_classes(library_name: str, library_dir: str, project_dir: str) -> pd.DataFrame:
    """Summarise how the classes of one library are used across all projects.

    Each statistics file matched by ``project_dir`` is read as JSON of the form
    ``{file: {library: {entry_key: data}}}``. An entry whose key starts with an
    upper-case letter is treated as a class usage; the class name is the part
    of the key before the first underscore.

    Args:
        library_name: Library key to look up inside every file entry.
        library_dir: Path of the JSON file listing the library API; items are
            read through their ``"name"``, ``"full_name"`` and ``"params"`` keys.
        project_dir: Glob pattern selecting the statistics JSON files.

    Returns:
        A frame sorted by descending ``Count`` with the columns ``Class``,
        ``Count``, ``Category``, ``#HP``, ``AvgOptionsUsed`` and
        ``Most Used HP``. When no class usage is found the frame is empty but
        still carries these columns.
    """
    with open(library_dir, "r", encoding="utf-8") as library_file:
        library_data = json.load(library_file)

    # A single pass over the statistics files gathers everything needed per
    # class, so every file is read and parsed exactly once.
    options_used_per_call: Dict[str, List[int]] = {}
    option_names: Dict[str, List[str]] = {}

    for project in glob.glob(project_dir):
        with open(project, "r", encoding="utf-8") as project_file:
            project_data = json.load(project_file)

        for file_data in project_data.values():
            if library_name not in file_data:
                continue

            for key, data in file_data[library_name].items():
                # key[:1] keeps an empty key from raising IndexError.
                if not key[:1].isupper():
                    continue

                class_name = key.split("_")[0]
                names = option_names.setdefault(class_name, [])
                options_used = 0

                for param in data:
                    if param == "variable":
                        continue
                    if param == "params":
                        # A "params" entry is reported under the label "default".
                        names.append("default")
                        continue
                    names.append(param)
                    # "class" is recorded as a name but not counted as an option.
                    if param != "class":
                        options_used += 1

                options_used_per_call.setdefault(class_name, []).append(options_used)

    # Building the frame with explicit columns keeps the sort (and the column
    # assignments below) working even when nothing was found.
    df_classes = pd.DataFrame(
        [(name, len(calls)) for name, calls in options_used_per_call.items()],
        columns=["Class", "Count"],
    )
    df_classes = df_classes.sort_values(by=["Count"], ascending=False)
    classes = df_classes["Class"].to_list()

    # Number of API options and category, taken from the library description
    categories = []
    hp_counts = []

    for ml_class in classes:
        api_entry = next((item for item in library_data if item["name"] == ml_class), None)
        if api_entry is None:
            print("Could not find: ", ml_class, library_name)
            hp_counts.append(0)
            categories.append("Unknown")
            continue

        hp_counts.append(len(api_entry["params"]))
        parts = api_entry["full_name"].split(".")
        # Second path component is the category, unless the name is too short.
        categories.append(parts[0] if len(parts) <= 2 else parts[1])

    df_classes["Category"] = categories
    df_classes["#HP"] = hp_counts

    # Average number of options per usage and the most frequently set option(s)
    avg_class_options = []
    most_used_class_option = []

    for ml_class in classes:
        calls = options_used_per_call[ml_class]
        avg_class_options.append(round(sum(calls) / len(calls), 2))

        ranked = Counter(option_names[ml_class]).most_common()
        if ranked:
            top_count = ranked[0][1]
            # All options tied for first place are reported, comma separated.
            most_used_class_option.append(
                ", ".join(name for name, count in ranked if count == top_count)
            )
        else:
            most_used_class_option.append("None")

    df_classes["AvgOptionsUsed"] = avg_class_options
    df_classes["Most Used HP"] = most_used_class_option

    return df_classes


# Ten most used classes of each library, stacked into a single table.
df_sklearn_classes = get_library_classes(
    library_name="sklearn",
    library_dir="modules/sklearn_default_values.json",
    project_dir="data/statistics/*",
).head(10)

df_tf_classes = get_library_classes(
    library_name="tensorflow",
    library_dir="modules/tensorflow_default_values.json",
    project_dir="data/statistics/*",
).head(10)

df_pytorch_classes = get_library_classes(
    library_name="torch",
    library_dir="modules/torch_default_values.json",
    project_dir="data/statistics/*",
).head(10)

df_all_classes = pd.concat(
    [df_sklearn_classes, df_tf_classes, df_pytorch_classes]
)

df_to_latex(df=df_all_classes)


\begin{tabular}{lrlrrl}
\toprule
             Class &  Count &      Category &  \#HP &  AvgOptionsUsed &        Most Used HP \\
\midrule
    StandardScaler &    115 & preprocessing &    3 &            0.10 &             default \\
            KMeans &     93 &       cluster &    9 &            1.83 &          n\_clusters \\
               PCA &     68 & decomposition &    9 &            1.03 &        n\_components \\
LogisticRegression &     57 &  linear\_model &   15 &            2.14 &              solver \\
   GaussianMixture &     52 &       mixture &   14 &            2.25 &        n\_components \\
      MinMaxScaler &     51 & preprocessing &    3 &            0.20 &             default \\
  LinearRegression &     44 &  linear\_model &    5 &            0.14 &             default \\
  NearestNeighbors &     44 &     neighbors &    8 &            1.95 &         n\_neighbors \\
     OneHotEncoder &     40 & preprocessing &    7 &            0.82 &              sparse \\
           

In [4]:
def get_library_methods(library_name: str, library_dir: str, project_dir: str) -> pd.DataFrame:
    """Summarise how the functions/methods of one library are called across all projects.

    Each statistics file matched by ``project_dir`` is read as JSON of the form
    ``{file: {library: {entry_key: data}}}``. An entry whose key starts with a
    lower-case letter is treated as a method call; the method name is the key
    with its last underscore-separated component removed.

    Args:
        library_name: Library key to look up inside every file entry.
        library_dir: Path of the JSON file listing the library API; items are
            read through their ``"name"``, ``"full_name"`` and ``"params"`` keys.
        project_dir: Glob pattern selecting the statistics JSON files.

    Returns:
        A frame sorted by descending ``Count`` with the columns ``Method``,
        ``Count``, ``Category``, ``#Args``, ``AvgArgsUsed`` and
        ``Most Used Args``. When no method call is found the frame is empty but
        still carries these columns.
    """
    with open(library_dir, "r", encoding="utf-8") as library_file:
        library_data = json.load(library_file)

    # A single pass over the statistics files gathers everything needed per
    # method, so every file is read and parsed exactly once.
    args_used_per_call: Dict[str, List[int]] = {}
    arg_names: Dict[str, List[str]] = {}

    for project in glob.glob(project_dir):
        with open(project, "r", encoding="utf-8") as project_file:
            project_data = json.load(project_file)

        for file_data in project_data.values():
            if library_name not in file_data:
                continue

            for key, data in file_data[library_name].items():
                # key[:1] keeps an empty key from raising IndexError.
                if not key[:1].islower():
                    continue

                # Drop the trailing suffix: "train_test_split_3" -> "train_test_split"
                method_name = "_".join(key.split("_")[:-1])
                names = arg_names.setdefault(method_name, [])
                args_used = 0

                for arg in data:
                    if arg == "variable":
                        continue
                    if arg == "params":
                        # A "params" entry is reported under the label "default".
                        names.append("default")
                        continue
                    names.append(arg)
                    # "class" is recorded as a name but not counted as an argument.
                    if arg != "class":
                        args_used += 1

                args_used_per_call.setdefault(method_name, []).append(args_used)

    # Building the frame with explicit columns keeps the sort (and the column
    # assignments below) working even when nothing was found.
    df_methods = pd.DataFrame(
        [(name, len(calls)) for name, calls in args_used_per_call.items()],
        columns=["Method", "Count"],
    )
    df_methods = df_methods.sort_values(by=["Count"], ascending=False)
    methods = df_methods["Method"].to_list()

    # Number of arguments and category, taken from the library description
    method_options = []
    categories = []

    for method in methods:
        api_entry = next((item for item in library_data if item["name"] == method), None)
        if api_entry is None:
            print("Could not find: ", method, library_name)
            method_options.append(0)
            categories.append("Unknown")
            continue

        method_options.append(len(api_entry["params"]))
        parts = api_entry["full_name"].split(".")
        # Second path component is the category, unless the name is too short.
        categories.append(parts[0] if len(parts) <= 2 else parts[1])

    df_methods["Category"] = categories
    df_methods["#Args"] = method_options

    # Average number of arguments per call and the most frequently passed one(s)
    avg_method_args = []
    most_used_method_args = []

    for method in methods:
        calls = args_used_per_call[method]
        avg_method_args.append(round(sum(calls) / len(calls), 2))

        ranked = Counter(arg_names[method]).most_common()
        if ranked:
            top_count = ranked[0][1]
            # All arguments tied for first place are reported, comma separated.
            most_used_method_args.append(
                ", ".join(name for name, count in ranked if count == top_count)
            )
        else:
            most_used_method_args.append("None")

    df_methods["AvgArgsUsed"] = avg_method_args
    df_methods["Most Used Args"] = most_used_method_args

    return df_methods

# Ten most called methods of each library, stacked into a single table.
df_sklearn_methods = get_library_methods(
    library_name="sklearn",
    library_dir="modules/sklearn_default_values.json",
    project_dir="data/statistics/*",
).head(10)

df_tf_methods = get_library_methods(
    library_name="tensorflow",
    library_dir="modules/tensorflow_default_values.json",
    project_dir="data/statistics/*",
).head(10)

df_torch_methods = get_library_methods(
    library_name="torch",
    library_dir="modules/torch_default_values.json",
    project_dir="data/statistics/*",
).head(10)

df_all_methods = pd.concat(
    [df_sklearn_methods, df_tf_methods, df_torch_methods]
)

df_to_latex(df=df_all_methods)


\begin{tabular}{lrlrrl}
\toprule
                 Method &  Count &        Category &  \#Args &  AvgArgsUsed &  Most Used Args \\
\midrule
       train\_test\_split &    215 & model\_selection &      6 &         3.66 &       *arrays\_0 \\
               f1\_score &    175 &         metrics &      7 &         2.84 &  y\_true, y\_pred \\
         accuracy\_score &    162 &         metrics &      4 &         2.01 &  y\_true, y\_pred \\
          roc\_auc\_score &    121 &         metrics &      7 &         2.13 & y\_true, y\_score \\
        check\_is\_fitted &     83 &           utils &      4 &         1.39 &       estimator \\
            check\_array &     75 &           utils &     13 &         2.21 &           array \\
              normalize &     75 &   preprocessing &      5 &         1.61 &               X \\
average\_precision\_score &     74 &         metrics &      5 &         2.16 & y\_true, y\_score \\
                shuffle &     68 &           utils &      3 &         1.

In [31]:
def get_algorithms(library_name: str, library_dir: str, project_dir: str) -> pd.DataFrame:
    """Summarise the usage of the classes listed in one API catalogue file.

    Each statistics file matched by ``project_dir`` is read as JSON of the form
    ``{file: {library: {entry_key: data}}}``. An entry whose key starts with an
    upper-case letter is treated as a class usage; the class name is the part
    of the key before the first underscore. Only classes whose name occurs in
    the catalogue loaded from ``library_dir`` are kept.

    Args:
        library_name: Library key to look up inside every file entry.
        library_dir: Path of the JSON catalogue; items are read through their
            ``"name"``, ``"full_name"`` and ``"params"`` keys.
        project_dir: Glob pattern selecting the statistics JSON files.

    Returns:
        A frame sorted by descending ``Count`` with the columns ``Class``,
        ``Count``, ``Category``, ``#HP``, ``Init With Params``,
        ``Init Without Params``, ``AvgOptionsUsed`` and ``Most Used HP``.
        A usage whose data holds a ``"params"`` key counts as "without
        params", every other usage as "with params". When nothing matches,
        the frame is empty but still carries these columns.
    """
    with open(library_dir, "r", encoding="utf-8") as library_file:
        library_data = json.load(library_file)

    # First catalogue item per name. Looking names up here (instead of
    # appending once per matching item) keeps duplicated catalogue entries from
    # inflating the usage count.
    api_by_name: Dict[str, dict] = {}
    for item in library_data:
        api_by_name.setdefault(item["name"], item)

    # A single pass over the statistics files gathers everything needed per
    # class, so every file is read and parsed exactly once.
    usage: Dict[str, dict] = {}

    for project in glob.glob(project_dir):
        with open(project, "r", encoding="utf-8") as project_file:
            project_data = json.load(project_file)

        for file_data in project_data.values():
            if library_name not in file_data:
                continue

            for key, data in file_data[library_name].items():
                # key[:1] keeps an empty key from raising IndexError.
                if not key[:1].isupper():
                    continue

                class_name = key.split("_")[0]
                if class_name not in api_by_name:
                    continue

                record = usage.setdefault(
                    class_name,
                    {"options_used": [], "names": [], "with": 0, "without": 0},
                )

                if "params" in data:
                    record["without"] += 1
                else:
                    record["with"] += 1

                options_used = 0
                for param in data:
                    if param == "variable":
                        continue
                    if param == "params":
                        # A "params" entry is reported under the label "default".
                        record["names"].append("default")
                        continue
                    record["names"].append(param)
                    # "class" is recorded as a name but not counted as an option.
                    if param != "class":
                        options_used += 1

                record["options_used"].append(options_used)

    # Building the frame with explicit columns keeps the sort (and the column
    # assignments below) working even when nothing was found.
    df_classes = pd.DataFrame(
        [(name, len(record["options_used"])) for name, record in usage.items()],
        columns=["Class", "Count"],
    )
    df_classes = df_classes.sort_values(by=["Count"], ascending=False)
    classes = df_classes["Class"].to_list()

    categories = []
    class_options = []
    init_with_params = []
    init_without_params = []
    avg_class_options = []
    most_used_class_option = []

    for ml_class in classes:
        # Every kept class is in the catalogue, so all lists stay aligned.
        api_entry = api_by_name[ml_class]
        record = usage[ml_class]

        class_options.append(len(api_entry["params"]))
        parts = api_entry["full_name"].split(".")
        # Second path component is the category; fall back to the only
        # component when the full name contains no dot.
        categories.append(parts[1] if len(parts) > 1 else parts[0])

        init_with_params.append(record["with"])
        init_without_params.append(record["without"])

        calls = record["options_used"]
        avg_class_options.append(round(sum(calls) / len(calls), 2))

        top = Counter(record["names"]).most_common(1)
        most_used_class_option.append(top[0][0] if top else "None")

    df_classes["Category"] = categories
    df_classes["#HP"] = class_options
    df_classes["Init With Params"] = init_with_params
    df_classes["Init Without Params"] = init_without_params
    df_classes["AvgOptionsUsed"] = avg_class_options
    df_classes["Most Used HP"] = most_used_class_option

    return df_classes

# 30 most used sklearn ML algorithms, reduced to the columns shown in the table.
df_sklearn_ml_algo = (
    get_algorithms(
        library_name="sklearn",
        library_dir="modules/sklearn_ml_algorithms.json",
        project_dir="data/statistics/*",
    )
    .head(30)
    .loc[:, ["Class", "Count", "Init With Params", "Init Without Params", "#HP", "AvgOptionsUsed"]]
    .sort_values(by="Count", ascending=False)
)

df_to_latex(df=df_sklearn_ml_algo)

\begin{tabular}{lrrrrr}
\toprule
                    Class &  Count &  Init With Params &  Init Without Params &  \#HP &  AvgOptionsUsed \\
\midrule
                   KMeans &     93 &                86 &                    7 &    9 &            1.83 \\
                      PCA &     68 &                64 &                    4 &    9 &            1.03 \\
       LogisticRegression &     57 &                47 &                   10 &   15 &            2.14 \\
          GaussianMixture &     52 &                52 &                    0 &   14 &            2.25 \\
         LinearRegression &     44 &                 5 &                   39 &    5 &            0.14 \\
         NearestNeighbors &     44 &                44 &                    0 &    8 &            1.95 \\
  AgglomerativeClustering &     38 &                26 &                   12 &    8 &            2.21 \\
     KNeighborsClassifier &     29 &                18 &                   11 &    8 &            0.93 \\
   

In [10]:
def get_value_types(df: pd.DataFrame, library_name: str, project_dir: str) -> pd.DataFrame:
    """Add the most common value type of each class's most used option to ``df``.

    For every row, the option looked at is the first comma-separated entry of
    the ``Most Used HP`` column. Each statistics file matched by
    ``project_dir`` is read as JSON of the form
    ``{file: {library: {entry_key: data}}}``; entries whose key starts with an
    upper-case letter are matched against the row's ``Class`` through the part
    of the key before the first underscore.

    Args:
        df: Frame with at least the columns ``Class`` and ``Most Used HP``.
            It is modified in place (a column is added) and also returned.
        library_name: Library key to look up inside every file entry.
        project_dir: Glob pattern selecting the statistics JSON files.

    Returns:
        ``df`` with the extra column ``Common Value Types`` holding the most
        frequent type per row, or ``"-"`` when the option was never observed.
    """
    # (class, option) pair wanted for each row, in row order
    targets = [
        (row["Class"], row["Most Used HP"].split(",")[0])
        for _, row in df.iterrows()
    ]
    type_counts = {target: Counter() for target in targets}

    # A single pass over the statistics files serves all rows; every usage is
    # visited exactly once, so each observed value contributes exactly once.
    for project in glob.glob(project_dir):
        with open(project, "r", encoding="utf-8") as project_file:
            project_data = json.load(project_file)

        for file_data in project_data.values():
            if library_name not in file_data:
                continue

            for module_name, data in file_data[library_name].items():
                # module_name[:1] keeps an empty key from raising IndexError.
                if not module_name[:1].isupper():
                    continue

                class_name = module_name.split("_")[0]
                for key, value in data.items():
                    counts = type_counts.get((class_name, key))
                    # Explicit None check: an empty Counter is falsy.
                    if counts is None:
                        continue

                    possible_values = value["possible_values"]
                    if possible_values:
                        # Second element of each possible value is used as its
                        # type (presumably a (value, type) pair; verify against
                        # the statistics producer).
                        counts.update(x[1] for x in possible_values)
                    else:
                        counts[value["type"]] += 1

    value_type = []
    for target in targets:
        top_types = type_counts[target].most_common(1)
        value_type.append(top_types[0][0] if top_types else "-")

    df["Common Value Types"] = value_type

    return df

# NOTE(review): get_value_types reads the "Most Used HP" column of the frame it
# works on; confirm df_sklearn_ml_algo still carries that column at this point
# when the notebook is run top to bottom.
df = get_value_types(
    df=df_sklearn_ml_algo,
    library_name="sklearn",
    project_dir="data/statistics/*",
)
df_to_latex(df=df)

\begin{tabular}{lrlrrll}
\toprule
                    Class &  Count &       Category &  \#HP &  AvgOptionsUsed & Most Used HP & Common Value Types \\
\midrule
                   KMeans &     93 &        cluster &    9 &            1.83 &   n\_clusters &               Call \\
                      PCA &     68 &  decomposition &    9 &            1.03 & n\_components &           Constant \\
       LogisticRegression &     57 &   linear\_model &   15 &            2.14 &       solver &           Constant \\
          GaussianMixture &     52 &        mixture &   14 &            2.25 & n\_components &               Call \\
         LinearRegression &     44 &   linear\_model &    5 &            0.14 &      default &                  - \\
         NearestNeighbors &     44 &      neighbors &    8 &            1.95 &  n\_neighbors &           Constant \\
  AgglomerativeClustering &     38 &        cluster &    8 &            2.21 &   n\_clusters &           Constant \\
     KNeighborsClassi

In [7]:
# Usage of the sklearn classes listed in the experimental-settings catalogue.
df_sklearn_exp_settings = (
    get_algorithms(
        library_name="sklearn",
        library_dir="modules/sklearn_experimental_settings.json",
        project_dir="data/statistics/*",
    )
    .loc[:, ["Class", "Category", "Count", "#HP", "AvgOptionsUsed", "Most Used HP"]]
    .sort_values(by="Count", ascending=False)
)

df_to_latex(df=df_sklearn_exp_settings)

\begin{tabular}{llrrrl}
\toprule
                 Class &             Category &  Count &  \#HP &  AvgOptionsUsed & Most Used HP \\
\midrule
        StandardScaler &        preprocessing &    115 &    3 &            0.10 &      default \\
          MinMaxScaler &        preprocessing &     51 &    3 &            0.20 &      default \\
                  TSNE &             manifold &     40 &   16 &            2.23 & n\_components \\
         OneHotEncoder &        preprocessing &     40 &    7 &            0.82 &       sparse \\
          LabelEncoder &        preprocessing &     35 &    0 &            0.00 &      default \\
       CountVectorizer &   feature\_extraction &     21 &   17 &            5.29 &    tokenizer \\
       TfidfVectorizer &   feature\_extraction &     21 &   21 &            2.67 &  ngram\_range \\
        LabelBinarizer &        preprocessing &     10 &    3 &            0.10 &      default \\
   FunctionTransformer &        preprocessing &      7 &    8 &        

In [3]:
def count_init_type_of_algorithms(
    library_dir: str, project_dir: str, library_name: str = "sklearn"
) -> pd.DataFrame:
    """Count how often catalogued classes are initialised with and without parameters.

    Each statistics file matched by ``project_dir`` is read as JSON of the form
    ``{file: {library: {entry_key: data}}}``. An entry whose key starts with an
    upper-case letter is treated as a class usage; the class name is the part
    of the key before the first underscore. Only classes whose name occurs in
    the catalogue loaded from ``library_dir`` are kept.

    Args:
        library_dir: Path of the JSON catalogue; items are read through their
            ``"name"`` key.
        project_dir: Glob pattern selecting the statistics JSON files.
        library_name: Library key to look up inside every file entry.
            Defaults to ``"sklearn"``.

    Returns:
        A frame sorted by descending ``Count`` with the columns ``Class``,
        ``Count``, ``Init With Params`` and ``Init Without Params``. A usage
        whose data holds a ``"params"`` key counts as "without params", every
        other usage as "with params". When nothing matches, the frame is empty
        but still carries these columns.
    """
    with open(library_dir, "r", encoding="utf-8") as library_file:
        library_data = json.load(library_file)

    # A set lookup (instead of appending once per matching catalogue item)
    # keeps duplicated catalogue entries from inflating the usage count.
    known_classes = {item["name"] for item in library_data}

    # class name -> [with_params, without_params]; a single pass over the
    # statistics files reads and parses every file exactly once.
    init_counts: Dict[str, List[int]] = {}

    for project in glob.glob(project_dir):
        with open(project, "r", encoding="utf-8") as project_file:
            project_data = json.load(project_file)

        for file_data in project_data.values():
            if library_name not in file_data:
                continue

            for key, data in file_data[library_name].items():
                # key[:1] keeps an empty key from raising IndexError.
                if not key[:1].isupper():
                    continue

                class_name = key.split("_")[0]
                if class_name not in known_classes:
                    continue

                counts = init_counts.setdefault(class_name, [0, 0])
                if "params" in data:
                    counts[1] += 1
                else:
                    counts[0] += 1

    # Building the frame with explicit columns keeps the sort (and the column
    # assignments below) working even when nothing was found.
    df_classes = pd.DataFrame(
        [(name, with_p + without_p) for name, (with_p, without_p) in init_counts.items()],
        columns=["Class", "Count"],
    )
    df_classes = df_classes.sort_values(by=["Count"], ascending=False)
    classes = df_classes["Class"].to_list()

    df_classes["Init With Params"] = [init_counts[name][0] for name in classes]
    df_classes["Init Without Params"] = [init_counts[name][1] for name in classes]

    return df_classes

# 30 most used sklearn estimators, split by initialisation style.
# Note: this rebinds df_sklearn_ml_algo.
df_sklearn_ml_algo = (
    count_init_type_of_algorithms(
        library_dir="modules/sklearn_estimators.json",
        project_dir="data/statistics/*",
    )
    .head(30)
    .sort_values(by="Count", ascending=False)
)

df_to_latex(df=df_sklearn_ml_algo)

\begin{tabular}{lrrr}
\toprule
                  Class &  Count &  Init With Params &  Init Without Params \\
\midrule
         StandardScaler &    115 &                 7 &                  108 \\
                 KMeans &     93 &                86 &                    7 \\
                    PCA &     68 &                64 &                    4 \\
     LogisticRegression &     57 &                47 &                   10 \\
        GaussianMixture &     52 &                52 &                    0 \\
           MinMaxScaler &     51 &                10 &                   41 \\
       LinearRegression &     44 &                 5 &                   39 \\
       NearestNeighbors &     44 &                44 &                    0 \\
          OneHotEncoder &     40 &                27 &                   13 \\
                   TSNE &     40 &                38 &                    2 \\
AgglomerativeClustering &     38 &                26 &                   12 \\
           L