In [412]:
import glob
import json
from collections import Counter
import pandas as pd

# Path of the JSON file describing the scikit-learn API; it is parsed with
# json.load and each entry is later looked up by its "name" / "params" fields.
SKLEARN_DATA = "modules/sklearn_default_values.json"
# Glob pattern matching the per-project statistics files (each one is opened
# and parsed as JSON by the analysis cells below).
STATISTICS_DATA_DIR = "statistics/*"

Load scikit-learn API data

In [413]:
# Parse the scikit-learn API description into memory; later cells search it
# by entry name to find out how many parameters each class/function exposes.
with open(SKLEARN_DATA, mode="r", encoding="utf-8") as api_handle:
    sklearn_data = json.load(api_handle)

Compute the number of used ML classes and methods

In [414]:
# Walk every per-project statistics file and record each scikit-learn usage:
#   * keys starting with an uppercase letter are treated as classes; the class
#     name is the text before the first underscore,
#   * keys starting with a lowercase letter are treated as functions; the
#     function name is the key with its last "_"-separated part removed.
classes = []
methods = []

for stats_path in glob.glob(STATISTICS_DATA_DIR):
    with open(stats_path, "r", encoding="utf-8") as stats_handle:
        project_data = json.load(stats_handle)

    for file_data in project_data.values():
        if "sklearn" not in file_data.keys():
            continue

        for key, value in file_data["sklearn"].items():
            first_char = key[0]
            if first_char.isupper():
                # Entries that carry a "base_class_0" field are not counted.
                if "base_class_0" not in value:
                    classes.append(key.split("_")[0])
            elif first_char.islower():
                methods.append("_".join(key.split("_")[:-1]))

# Frequency table of the classes, most frequently used first.
class_data = Counter(classes)
df_classes = (
    pd.DataFrame.from_dict(class_data, orient="index")
    .reset_index()
    .rename(columns={"index": "Class", 0: "Count"})
    .sort_values(by=["Count"], ascending=False)
)

print("Number of ML classes: ", len(class_data))
df_classes



Number of ML classes:  81


Unnamed: 0,Class,Count
0,StandardScaler,44
4,LogisticRegression,40
8,KMeans,30
11,MinMaxScaler,25
33,Pipeline,21
...,...,...
35,Perceptron,1
32,SGDOneClassSVM,1
25,ShuffleSplit,1
20,SVR,1


In [415]:
# Frequency table of the collected function names, most frequently used first.
method_data = Counter(methods)
df_methods = (
    pd.DataFrame.from_dict(method_data, orient="index")
    .reset_index()
    .rename(columns={"index": "Method", 0: "Count"})
    .sort_values(by=["Count"], ascending=False)
)

print("Number of ML methods: ", len(df_methods))
df_methods

Number of ML methods:  81


Unnamed: 0,Method,Count
0,train_test_split,122
1,confusion_matrix,67
16,f1_score,65
12,mean_squared_error,60
25,check_array,41
...,...,...
53,davies_bouldin_score,1
54,calinski_harabasz_score,1
57,hamming_loss,1
59,normalized_mutual_info_score,1


Compute the number of options that can be set for each class/method according to the API data

In [416]:
# Index the API description by entry name once instead of rescanning the whole
# list for every class/method. setdefault keeps the FIRST entry for a given
# name, which is the entry a front-to-back linear search would return.
api_by_name = {}
for api_entry in sklearn_data:
    api_by_name.setdefault(api_entry["name"], api_entry)


def _count_api_params(name):
    """Return the number of parameters the API data lists for ``name``.

    Raises:
        LookupError: if the API data contains no entry called ``name``. The
            message names the missing entry so that the failure is actionable
            for classes and methods alike.
    """
    if name not in api_by_name:
        raise LookupError(f"Could not find {name!r} in the scikit-learn API data")
    return len(api_by_name[name]["params"])


# Use the row order of the (sorted) data frames so that the computed values
# line up with the rows when they are assigned as new columns below.
classes = df_classes["Class"].to_list()
methods = df_methods["Method"].to_list()

class_options = [_count_api_params(ml_class) for ml_class in classes]
method_options = [_count_api_params(method) for method in methods]

df_classes["#HP"] = class_options
df_methods["#Args"] = method_options
df_classes

Unnamed: 0,Class,Count,#HP
0,StandardScaler,44,3
4,LogisticRegression,40,15
8,KMeans,30,9
11,MinMaxScaler,25,3
33,Pipeline,21,3
...,...,...,...
35,Perceptron,1,16
32,SGDOneClassSVM,1,12
25,ShuffleSplit,1,4
20,SVR,1,11


In [417]:
# Show the methods table, which now also carries the "#Args" column.
df_methods

Unnamed: 0,Method,Count,#Args
0,train_test_split,122,6
1,confusion_matrix,67,5
16,f1_score,65,7
12,mean_squared_error,60,5
25,check_array,41,13
...,...,...,...
53,davies_bouldin_score,1,2
54,calinski_harabasz_score,1,2
57,hamming_loss,1,3
59,normalized_mutual_info_score,1,3


Compute average number of options used per class/method

In [418]:
# For every class instantiation, count how many options were recorded for it
# (every key of the entry except the bookkeeping keys "variable" and "params").
# Each statistics file is read exactly once and the counts are grouped by
# class name, rather than re-reading every file once per class.
options_used_per_class = {}

for project in glob.glob(STATISTICS_DATA_DIR):
    with open(project, "r", encoding="utf-8") as project_file:
        project_data = json.load(project_file)

    for file_data in project_data.values():
        for module_name, data in file_data.get("sklearn", {}).items():
            # Only keys starting with an uppercase letter denote classes.
            if not module_name[:1].isupper():
                continue
            # Same filter as the cell that builds the "Count" column: entries
            # carrying "base_class_0" are not counted there, so they must not
            # contribute to the average either (and "base_class_0" itself is
            # not an option).
            if "base_class_0" in data:
                continue

            n_options = sum(1 for param in data if param not in ("variable", "params"))
            class_name = module_name.split("_")[0]
            options_used_per_class.setdefault(class_name, []).append(n_options)

# Average per class, in the row order of df_classes. A class without any
# recorded usage yields NaN instead of a ZeroDivisionError.
avg_class_options = []
for ml_class in classes:
    counts = options_used_per_class.get(ml_class, [])
    if counts:
        avg_class_options.append(round(sum(counts) / len(counts), 2))
    else:
        avg_class_options.append(float("nan"))

df_classes["AvgOptionsUsed"] = avg_class_options
df_classes

In [None]:
# For every function call, count how many arguments were recorded for it
# (every key of the entry except the bookkeeping keys "variable" and "params").
# Each statistics file is read exactly once and the counts are grouped by
# function name, rather than re-reading every file once per method.
args_used_per_method = {}

for project in glob.glob(STATISTICS_DATA_DIR):
    with open(project, "r", encoding="utf-8") as project_file:
        project_data = json.load(project_file)

    for file_data in project_data.values():
        for module_name, data in file_data.get("sklearn", {}).items():
            # Only keys starting with a lowercase letter denote functions.
            if not module_name[:1].islower():
                continue

            n_args = sum(1 for arg in data if arg not in ("variable", "params"))
            # The function name is the key without its last "_"-separated part.
            method_name = "_".join(module_name.split("_")[:-1])
            args_used_per_method.setdefault(method_name, []).append(n_args)

# Average per method, in the row order of df_methods. A method without any
# recorded usage yields NaN instead of a ZeroDivisionError.
avg_method_args = []
for method in methods:
    counts = args_used_per_method.get(method, [])
    if counts:
        avg_method_args.append(round(sum(counts) / len(counts), 2))
    else:
        avg_method_args.append(float("nan"))

df_methods["AvgArgsUsed"] = avg_method_args
df_methods

Compute the most used options/arguments per class/method

In [None]:
# Count, per class, how often each option name was recorded. The "variable"
# key is ignored and the "params" key is reported under the label "default".
# Each statistics file is read exactly once.
option_usage_per_class = {}

for project in glob.glob(STATISTICS_DATA_DIR):
    with open(project, "r", encoding="utf-8") as project_file:
        project_data = json.load(project_file)

    for file_data in project_data.values():
        for module_name, data in file_data.get("sklearn", {}).items():
            # Only keys starting with an uppercase letter denote classes.
            if not module_name[:1].isupper():
                continue
            # Same filter as the cell that builds the "Count" column; it also
            # keeps "base_class_0" from being listed as an option.
            if "base_class_0" in data:
                continue

            class_name = module_name.split("_")[0]
            usage = option_usage_per_class.setdefault(class_name, Counter())
            for param in data:
                if param == "variable":
                    continue
                usage["default" if param == "params" else param] += 1

# most_common() yields (name, count) tuples sorted by descending count; only
# the names are joined, because str.join raises TypeError on tuples. A class
# with no recorded option gets the placeholder "None".
most_used_class_option = []
for ml_class in classes:
    ranked = option_usage_per_class.get(ml_class, Counter()).most_common()
    if ranked:
        most_used_class_option.append(", ".join(name for name, _ in ranked))
    else:
        most_used_class_option.append("None")

df_classes["Most Used HP"] = most_used_class_option
df_classes

In [None]:
# Count, per function, how often each argument name was recorded. The
# "variable" key is ignored and the "params" key is reported under the label
# "default". Each statistics file is read exactly once.
arg_usage_per_method = {}

for project in glob.glob(STATISTICS_DATA_DIR):
    with open(project, "r", encoding="utf-8") as project_file:
        project_data = json.load(project_file)

    for file_data in project_data.values():
        for module_name, data in file_data.get("sklearn", {}).items():
            # Only keys starting with a lowercase letter denote functions.
            if not module_name[:1].islower():
                continue

            # The function name is the key without its last "_"-separated part.
            method_name = "_".join(module_name.split("_")[:-1])
            usage = arg_usage_per_method.setdefault(method_name, Counter())
            for param in data:
                if param == "variable":
                    continue
                usage["default" if param == "params" else param] += 1

# most_common() yields (name, count) tuples sorted by descending count; only
# the names are joined, because str.join raises TypeError on tuples. A method
# with no recorded argument gets the placeholder "None".
most_used_method_args = []
for method in methods:
    ranked = arg_usage_per_method.get(method, Counter()).most_common()
    if ranked:
        most_used_method_args.append(", ".join(name for name, _ in ranked))
    else:
        most_used_method_args.append("None")

df_methods["Most Used Args"] = most_used_method_args
df_methods

In [None]:
# Print the class table as LaTeX source; index=False leaves out the row index.
print(df_classes.to_latex(index=False))  

In [None]:
# Print the method table as LaTeX source; index=False leaves out the row index.
print(df_methods.to_latex(index=False))  