In [1]:
import glob
import json
from collections import Counter
import pandas as pd

# Path of the JSON file that is parsed into `sklearn_data` further down.
SKLEARN_DATA = "modules/sklearn_default_values.json"
# Glob pattern; every matching path is opened and parsed as one JSON statistics file.
STATISTICS_DATA_DIR = "statistics/*"

In [2]:
# Pandas configuration: remove the column-width limit used when rendering frames
pd.options.display.max_colwidth = None

Load scikit-learn API data

In [3]:
# Read the API description file and decode its JSON content into `sklearn_data`.
with open(SKLEARN_DATA, mode="r", encoding="utf-8") as api_file:
    sklearn_data = json.loads(api_file.read())

Compute the number of used ML classes and methods

In [4]:
# One list entry per recorded usage: keys starting with an upper-case letter
# are treated as classes, keys starting with a lower-case letter as methods.
classes = []
methods = []

for stats_path in glob.glob(STATISTICS_DATA_DIR):
    with open(stats_path, "r", encoding="utf-8") as stats_file:
        project_data = json.load(stats_file)

    for file_data in project_data.values():
        if "sklearn" not in file_data:
            continue
        for key, value in file_data["sklearn"].items():
            first_char = key[0]
            if first_char.isupper():
                # Entries carrying a "base_class_0" key are not counted as usages.
                if "base_class_0" not in value:
                    # The class name is everything before the first underscore.
                    classes.append(key.split("_")[0])
            if first_char.islower():
                # The method name is the key without its last "_"-separated part.
                methods.append("_".join(key.split("_")[:-1]))

class_data = Counter(classes)
df_classes = (
    pd.DataFrame.from_dict(class_data, orient="index")
    .reset_index()
    .rename(columns={'index': 'Class', 0: 'Count'})
    .sort_values(by=['Count'], ascending=False)
)

print("Number of ML classes: ", len(class_data))
df_classes



Number of ML classes:  81


Unnamed: 0,Class,Count
0,StandardScaler,44
4,LogisticRegression,40
8,KMeans,30
11,MinMaxScaler,25
33,Pipeline,21
...,...,...
35,Perceptron,1
32,SGDOneClassSVM,1
25,ShuffleSplit,1
20,SVR,1


In [5]:
# Usage count per method name, most frequently used first.
method_data = Counter(methods)
df_methods = (
    pd.DataFrame.from_dict(method_data, orient="index")
    .reset_index()
    .rename(columns={'index': 'Method', 0: 'Count'})
    .sort_values(by=['Count'], ascending=False)
)

print("Number of ML methods: ", len(df_methods))
df_methods

Number of ML methods:  83


Unnamed: 0,Method,Count
0,train_test_split,143
1,confusion_matrix,68
16,f1_score,68
12,mean_squared_error,65
51,load_breast_cancer,43
...,...,...
58,davies_bouldin_score,1
4,make_circles,1
44,mean_squared_log_error,1
47,resample,1


Compute the number of options that can be set according to the API data

In [6]:
# Look up, for every used class and method, how many parameters the API data
# lists for it ("#HP" for classes, "#Args" for methods).
classes = df_classes["Class"].to_list()
methods = df_methods["Method"].to_list()

# Index the API entries by name once instead of scanning the whole list for
# every lookup. `setdefault` keeps the first entry per name, i.e. the entry a
# first-match linear search would return.
sklearn_entries_by_name = {}
for api_entry in sklearn_data:
    sklearn_entries_by_name.setdefault(api_entry["name"], api_entry)

# Classes and methods are validated the same way, and all unknown names are
# reported together in one descriptive error (instead of a bare StopIteration).
unknown_names = [
    name for name in classes + methods if name not in sklearn_entries_by_name
]
if unknown_names:
    raise LookupError("Could not find: " + ", ".join(unknown_names))

class_options = [len(sklearn_entries_by_name[name]["params"]) for name in classes]
method_options = [len(sklearn_entries_by_name[name]["params"]) for name in methods]

# `classes` / `methods` were taken from the frames in row order, so the lists
# line up with the rows positionally.
df_classes["#HP"] = class_options
df_methods["#Args"] = method_options
df_classes

Unnamed: 0,Class,Count,#HP
0,StandardScaler,44,3
4,LogisticRegression,40,15
8,KMeans,30,9
11,MinMaxScaler,25,3
33,Pipeline,21,3
...,...,...,...
35,Perceptron,1,16
32,SGDOneClassSVM,1,12
25,ShuffleSplit,1,4
20,SVR,1,11


In [7]:
# Display the method table, which carries the "#Args" column assigned in the previous cell.
df_methods

Unnamed: 0,Method,Count,#Args
0,train_test_split,143,6
1,confusion_matrix,68,5
16,f1_score,68,7
12,mean_squared_error,65,5
51,load_breast_cancer,43,2
...,...,...,...
58,davies_bouldin_score,1,2
4,make_circles,1,5
44,mean_squared_log_error,1,5
47,resample,1,5


Compute average number of options used per class/method

In [8]:
# Average number of options set per usage of each class.
#
# Every statistics file is parsed exactly once and the per-usage option counts
# are grouped by class name, instead of re-reading all files for every class.
options_used_by_class = {}

for stats_path in glob.glob(STATISTICS_DATA_DIR):
    with open(stats_path, "r", encoding="utf-8") as stats_file:
        project_data = json.load(stats_file)

    for file_data in project_data.values():
        for module_name, data in file_data.get("sklearn", {}).items():
            if not module_name[0].isupper():
                continue
            # Skip entries with a "base_class_0" key, consistent with the cells
            # that count classes and hyperparameters; otherwise "base_class_0"
            # itself would be counted as an option.
            # NOTE(review): presumably these are subclass definitions - confirm.
            if "base_class_0" in data:
                continue
            # "variable" and "params" are bookkeeping keys, not options.
            used = sum(1 for param in data if param not in ("variable", "params"))
            class_name = module_name.split("_")[0]
            options_used_by_class.setdefault(class_name, []).append(used)

avg_class_options = []
for ml_class in classes:
    usage_counts = options_used_by_class.get(ml_class, [])
    if usage_counts:
        avg_class_options.append(round(sum(usage_counts) / len(usage_counts), 2))
    else:
        # No recorded usage: the average is undefined, so avoid dividing by zero.
        avg_class_options.append(float("nan"))

df_classes["AvgOptionsUsed"] = avg_class_options
df_classes

Unnamed: 0,Class,Count,#HP,AvgOptionsUsed
0,StandardScaler,44,3,0.20
4,LogisticRegression,40,15,1.12
8,KMeans,30,9,2.30
11,MinMaxScaler,25,3,0.52
33,Pipeline,21,3,1.00
...,...,...,...,...
35,Perceptron,1,16,1.00
32,SGDOneClassSVM,1,12,1.00
25,ShuffleSplit,1,4,3.00
20,SVR,1,11,1.00


In [9]:
# Average number of arguments passed per call of each method.
#
# Every statistics file is parsed exactly once and the per-call argument counts
# are grouped by method name, instead of re-reading all files for every method.
args_used_by_method = {}

for stats_path in glob.glob(STATISTICS_DATA_DIR):
    with open(stats_path, "r", encoding="utf-8") as stats_file:
        project_data = json.load(stats_file)

    for file_data in project_data.values():
        for module_name, data in file_data.get("sklearn", {}).items():
            if not module_name[0].islower():
                continue
            # The method name is the key without its last "_"-separated part.
            method_name = "_".join(module_name.split("_")[:-1])
            # "variable" and "params" are bookkeeping keys, not arguments.
            used = sum(1 for arg in data if arg not in ("variable", "params"))
            args_used_by_method.setdefault(method_name, []).append(used)

avg_method_args = []
for method in methods:
    usage_counts = args_used_by_method.get(method, [])
    if usage_counts:
        avg_method_args.append(round(sum(usage_counts) / len(usage_counts), 2))
    else:
        # No recorded call: the average is undefined, so avoid dividing by zero.
        avg_method_args.append(float("nan"))

df_methods["AvgArgsUsed"] = avg_method_args
df_methods

Unnamed: 0,Method,Count,#Args,AvgArgsUsed
0,train_test_split,143,6,3.68
1,confusion_matrix,68,5,2.10
16,f1_score,68,7,2.79
12,mean_squared_error,65,5,2.51
51,load_breast_cancer,43,2,0.51
...,...,...,...,...
58,davies_bouldin_score,1,2,2.00
4,make_circles,1,5,3.00
44,mean_squared_log_error,1,5,2.00
47,resample,1,5,3.00


Compute most used Option/Arg

In [10]:
# Most frequently set option(s) per class, and how often they were set.
#
# Every statistics file is parsed exactly once; option occurrences are tallied
# in one Counter per class instead of re-reading all files for every class.
option_counts_by_class = {}

for stats_path in glob.glob(STATISTICS_DATA_DIR):
    with open(stats_path, "r", encoding="utf-8") as stats_file:
        project_data = json.load(stats_file)

    for file_data in project_data.values():
        for module_name, data in file_data.get("sklearn", {}).items():
            if not module_name[0].isupper():
                continue
            # Skip entries with a "base_class_0" key, consistent with the cells
            # that count classes and hyperparameters; otherwise "base_class_0"
            # could be reported as the most used option.
            # NOTE(review): presumably these are subclass definitions - confirm.
            if "base_class_0" in data:
                continue
            class_name = module_name.split("_")[0]
            option_counter = option_counts_by_class.setdefault(class_name, Counter())
            for param in data.keys():
                if param == "variable":
                    continue
                # The "params" key is reported under the label "default".
                option_counter["default" if param == "params" else param] += 1

most_used_class_option = []
most_used_class_option_num = []

for ml_class in classes:
    ranked_options = option_counts_by_class.get(ml_class, Counter()).most_common()
    if not ranked_options:
        # Nothing recorded for this class.
        most_used_class_option.append("None")
        most_used_class_option_num.append(0)
        continue
    top_count = ranked_options[0][1]
    # Report every option tied for first place, in first-seen order.
    tied_options = [name for name, count in ranked_options if count == top_count]
    most_used_class_option.append(", ".join(tied_options))
    most_used_class_option_num.append(top_count)

df_classes["Most Used HP"] = most_used_class_option
df_classes["Most Used HP Number"] = most_used_class_option_num
df_classes

Project:  statistics\optbinning_params.json KBinsDiscretizer
Project:  statistics\creme_params.json Perceptron
Project:  statistics\creme_params.json SGDOneClassSVM


Unnamed: 0,Class,Count,#HP,AvgOptionsUsed,Most Used HP,Most Used HP Number
0,StandardScaler,44,3,0.20,default,38
4,LogisticRegression,40,15,1.12,default,27
8,KMeans,30,9,2.30,n_clusters,30
11,MinMaxScaler,25,3,0.52,"feature_range, default",12
33,Pipeline,21,3,1.00,steps,21
...,...,...,...,...,...,...
35,Perceptron,1,16,1.00,**kwargs,1
32,SGDOneClassSVM,1,12,1.00,**kwargs,1
25,ShuffleSplit,1,4,3.00,"n_splits, test_size, random_state",1
20,SVR,1,11,1.00,kernel,1


In [11]:
# Most frequently passed argument(s) per method, and how often they were passed.
#
# Every statistics file is parsed exactly once; argument occurrences are tallied
# in one Counter per method instead of re-reading all files for every method.
arg_counts_by_method = {}

for stats_path in glob.glob(STATISTICS_DATA_DIR):
    with open(stats_path, "r", encoding="utf-8") as stats_file:
        project_data = json.load(stats_file)

    for file_data in project_data.values():
        for module_name, data in file_data.get("sklearn", {}).items():
            if not module_name[0].islower():
                continue
            # The method name is the key without its last "_"-separated part.
            method_name = "_".join(module_name.split("_")[:-1])
            arg_counter = arg_counts_by_method.setdefault(method_name, Counter())
            for param in data.keys():
                if param == "variable":
                    continue
                # The "params" key is reported under the label "default".
                arg_counter["default" if param == "params" else param] += 1

most_used_method_args = []
most_used_method_args_num = []

for method in methods:
    ranked_args = arg_counts_by_method.get(method, Counter()).most_common()
    if not ranked_args:
        # Keep both lists the same length as `methods`; appending to only one
        # of them makes the column assignments below fail with a length mismatch.
        most_used_method_args.append("None")
        most_used_method_args_num.append(0)
        continue
    top_count = ranked_args[0][1]
    # Report every argument tied for first place, in first-seen order.
    tied_args = [name for name, count in ranked_args if count == top_count]
    most_used_method_args.append(", ".join(tied_args))
    most_used_method_args_num.append(top_count)

df_methods["Most Used Args"] = most_used_method_args
df_methods["Most Used Args Number"] = most_used_method_args_num
df_methods

Unnamed: 0,Method,Count,#Args,AvgArgsUsed,Most Used Args,Most Used Args Number
0,train_test_split,143,6,3.68,*arrays_0,143
1,confusion_matrix,68,5,2.10,"y_true, y_pred",68
16,f1_score,68,7,2.79,"y_true, y_pred",68
12,mean_squared_error,65,5,2.51,"y_true, y_pred",65
51,load_breast_cancer,43,2,0.51,default,25
...,...,...,...,...,...,...
58,davies_bouldin_score,1,2,2.00,"X, labels",1
4,make_circles,1,5,3.00,"n_samples, factor, noise",1
44,mean_squared_log_error,1,5,2.00,"y_true, y_pred",1
47,resample,1,5,3.00,"*arrays_0, replace, n_samples",1


In [12]:
# Export the class table as LaTeX. `escape=True` is set explicitly because the
# default differs between pandas versions, and the column names / values
# contain LaTeX special characters ("#", "_") that must be escaped.
print(df_classes.to_latex(index=False, escape=True))

\begin{tabular}{lrrrlr}
\toprule
                        Class &  Count &  \#HP &  AvgOptionsUsed &                                                                                                                Most Used HP &  Most Used HP Number \\
\midrule
               StandardScaler &     44 &    3 &            0.20 &                                                                                                                     default &                   38 \\
           LogisticRegression &     40 &   15 &            1.12 &                                                                                                                     default &                   27 \\
                       KMeans &     30 &    9 &            2.30 &                                                                                                                  n\_clusters &                   30 \\
                 MinMaxScaler &     25 &    3 &            0.52 &                           

In [13]:
# Export the method table as LaTeX. `escape=True` is set explicitly because the
# default differs between pandas versions, and the column names / values
# contain LaTeX special characters ("#", "_") that must be escaped.
print(df_methods.to_latex(index=False, escape=True))

\begin{tabular}{lrrrlr}
\toprule
                         Method &  Count &  \#Args &  AvgArgsUsed &                                            Most Used Args &  Most Used Args Number \\
\midrule
               train\_test\_split &    143 &      6 &         3.68 &                                                 *arrays\_0 &                    143 \\
               confusion\_matrix &     68 &      5 &         2.10 &                                            y\_true, y\_pred &                     68 \\
                       f1\_score &     68 &      7 &         2.79 &                                            y\_true, y\_pred &                     68 \\
             mean\_squared\_error &     65 &      5 &         2.51 &                                            y\_true, y\_pred &                     65 \\
             load\_breast\_cancer &     43 &      2 &         0.51 &                                                   default &                     25 \\
                  roc\_a

Compute most common class options and method args

In [14]:
# Count how often each hyperparameter (class option) and each method argument
# name occurs across all statistics files.
class_options = []
method_args = []

# Bookkeeping keys that are not options/arguments themselves.
NON_OPTION_KEYS = ("params", "variable")

for stats_path in glob.glob(STATISTICS_DATA_DIR):
    with open(stats_path, "r", encoding="utf-8") as stats_file:
        project_data = json.load(stats_file)

    for file_data in project_data.values():
        for module_name, value in file_data.get("sklearn", {}).items():
            if module_name[0].isupper():
                # Entries carrying a "base_class_0" key are skipped entirely.
                if "base_class_0" in value:
                    continue
                class_options.extend(
                    param for param in value.keys() if param not in NON_OPTION_KEYS
                )
            elif module_name[0].islower():
                method_args.extend(
                    param for param in value.keys() if param not in NON_OPTION_KEYS
                )

class_option_data = Counter(class_options)
method_arg_data = Counter(method_args)

df_class_options = (
    pd.DataFrame.from_dict(class_option_data, orient="index")
    .reset_index()
    .rename(columns={'index': 'Hyperparameter', 0: 'Count'})
    .sort_values(by=['Count'], ascending=False)
)

# The counter is keyed by hyperparameter name, so its length is the number of
# distinct hyperparameters (not the number of classes).
print("Number of distinct hyperparameters: ", len(class_option_data))
df_class_options

project:  statistics\creme_params.json
Number of ML classes:  116


Unnamed: 0,Hyperparameter,Count
1,random_state,55
7,n_clusters,33
13,n_components,32
22,n_splits,26
49,**kwargs,24
...,...,...
56,bin_seeding,1
47,eta0,1
91,weights_init,1
92,means_init,1


In [15]:
# Usage count per method-argument name, most frequently used first.
# NOTE: relies on `method_arg_data` as computed in the preceding cell; an
# earlier cell binds the same name to a different, per-method counter.
df_method_args = (
    pd.DataFrame.from_dict(method_arg_data, orient="index")
    .reset_index()
    .rename(columns={'index': 'Method Arguments', 0: 'Count'})
    .sort_values(by=['Count'], ascending=False)
)

# The counter is keyed by argument name, so its length is the number of
# distinct method arguments (not the number of methods).
print("Number of distinct method arguments: ", len(method_arg_data))
df_method_args

Number of ML methods:  110


Unnamed: 0,Method Arguments,Count
4,y_true,432
5,y_pred,359
0,*arrays_0,160
3,random_state,142
2,test_size,135
...,...,...
76,flip_y,1
77,bias,1
79,mean,1
80,cov,1


Compute most common value for class options and method args

In [16]:
# For every hyperparameter: number of distinct values observed ("Value Range")
# and the most frequently observed value.
class_options = df_class_options["Hyperparameter"]

# Every statistics file is parsed exactly once; the observed values are grouped
# by hyperparameter name instead of re-reading all files per hyperparameter.
wanted_options = set(class_options)
values_by_option = {}

for stats_path in glob.glob(STATISTICS_DATA_DIR):
    with open(stats_path, "r", encoding="utf-8") as stats_file:
        project_data = json.load(stats_file)

    for file_data in project_data.values():
        for module_name, value in file_data.get("sklearn", {}).items():
            if not module_name[0].isupper():
                continue
            # Entries carrying a "base_class_0" key are skipped entirely.
            if "base_class_0" in value:
                continue
            for param, param_value in value.items():
                if param in wanted_options:
                    values_by_option.setdefault(param, []).append(param_value["value"])

class_option_value_range = []
most_common_class_option_value = []

for class_option in class_options:
    class_option_values = Counter(values_by_option.get(class_option, []))
    class_option_value_range.append(len(class_option_values))
    top_value = class_option_values.most_common(1)
    # Guard against a hyperparameter without any recorded value.
    most_common_class_option_value.append(top_value[0][0] if top_value else None)

df_class_options["Value Range"] = class_option_value_range
df_class_options["Most Common Value"] = most_common_class_option_value
df_class_options

Unnamed: 0,Hyperparameter,Count,Value Range,Most Common Value
1,random_state,55,12,0
7,n_clusters,33,13,n_clusters
13,n_components,32,13,2
22,n_splits,26,12,n_splits
49,**kwargs,24,9,**params
...,...,...,...,...
56,bin_seeding,1,1,True
47,eta0,1,1,LEARNING_RATE
91,weights_init,1,1,gmm_weights
92,means_init,1,1,centroids


In [17]:
# For every method argument: number of distinct values observed ("Value Range")
# and the most frequently observed value.
methods_args = df_method_args["Method Arguments"]

# Every statistics file is parsed exactly once; the observed values are grouped
# by argument name instead of re-reading all files per argument.
wanted_args = set(methods_args)
values_by_arg = {}

for stats_path in glob.glob(STATISTICS_DATA_DIR):
    with open(stats_path, "r", encoding="utf-8") as stats_file:
        project_data = json.load(stats_file)

    for file_data in project_data.values():
        for module_name, value in file_data.get("sklearn", {}).items():
            if not module_name[0].islower():
                continue
            for param, param_value in value.items():
                if param in wanted_args:
                    values_by_arg.setdefault(param, []).append(param_value["value"])

method_arg_value_range = []
most_common_method_arg_value = []

for arg in methods_args:
    method_arg_values = Counter(values_by_arg.get(arg, []))
    method_arg_value_range.append(len(method_arg_values))
    top_value = method_arg_values.most_common(1)
    # Guard against an argument without any recorded value.
    most_common_method_arg_value.append(top_value[0][0] if top_value else None)

df_method_args["Value Range"] = method_arg_value_range
df_method_args["Most Common Value"] = most_common_method_arg_value
df_method_args

Unnamed: 0,Method Arguments,Count,Value Range,Most Common Value
4,y_true,432,111,y_true
5,y_pred,359,124,y_pred
0,*arrays_0,160,43,X
3,random_state,142,18,42
2,test_size,135,20,0.1
...,...,...,...,...
76,flip_y,1,1,noise_fraction
77,bias,1,1,features_options['bias']
79,mean,1,1,mean
80,cov,1,1,cov


In [18]:
# Export the hyperparameter table as LaTeX. `escape=True` is set explicitly
# because the default differs between pandas versions, and the values contain
# LaTeX special characters (e.g. "_") that must be escaped.
print(df_class_options.to_latex(index=False, escape=True))

\begin{tabular}{lrrl}
\toprule
                 Hyperparameter &  Count &  Value Range &                                                                        Most Common Value \\
\midrule
                   random\_state &     55 &           12 &                                                                                        0 \\
                     n\_clusters &     33 &           13 &                                                                               n\_clusters \\
                   n\_components &     32 &           13 &                                                                                        2 \\
                       n\_splits &     26 &           12 &                                                                                 n\_splits \\
                       **kwargs &     24 &            9 &                                                                                 **params \\
                          steps &     21 &           2

In [19]:
# Export the method-argument table as LaTeX. `escape=True` is set explicitly
# because the default differs between pandas versions, and the values contain
# LaTeX special characters (e.g. "_") that must be escaped.
print(df_method_args.to_latex(index=False, escape=True))

\begin{tabular}{lrrl}
\toprule
    Method Arguments &  Count &  Value Range &                                                                                 Most Common Value \\
\midrule
              y\_true &    432 &          111 &                                                                                            y\_true \\
              y\_pred &    359 &          124 &                                                                                            y\_pred \\
           *arrays\_0 &    160 &           43 &                                                                                                 X \\
        random\_state &    142 &           18 &                                                                                                42 \\
           test\_size &    135 &           20 &                                                                                               0.1 \\
           *arrays\_1 &    105 &           26 &                  