In [1]:
import glob
import json
from collections import Counter
import pandas as pd

# Path to the JSON catalogue of scikit-learn modules; the notebook reads each
# entry's "name" and "params" keys from it.
SKLEARN_DATA = "../data/sklearn/modules/sklearn_modules.json"
# Glob patterns selecting the statistics files of each library's projects
# (one JSON file per match is opened when a pattern is consumed).
# NOTE(review): only the sklearn pattern is used in the cells visible here;
# the tensorflow/pytorch patterns are presumably used further down -- confirm.
ALL_SKLEARN_PROJECTS = "statistics/sklearn/statistics/*"
ALL_TENSORFLOW_PROJECTS = "statistics/tensorflow/statistics/*"
ALL_PYTORCH_PROJECTS = "statistics/pytorch/statistics/*"

In [2]:
# Parse the scikit-learn module catalogue into memory; later cells search it
# by each entry's "name" key.
with open(SKLEARN_DATA, encoding="utf-8") as sklearn_file:
    raw_catalogue = sklearn_file.read()
    sklearn_data = json.loads(raw_catalogue)

Count how often each scikit-learn module occurs across all sklearn projects, and how many of those occurrences were recorded without any options.

In [3]:
# Tally every module occurrence found in the sklearn project statistics files,
# and separately tally the occurrences whose recorded data is empty
# (i.e. the module was used without any options).
all_modules = []
modules_without_options = []

for project in glob.glob(ALL_SKLEARN_PROJECTS):
    # Parse the whole file first so the handle is released before processing.
    with open(project, "r", encoding="utf-8") as project_file:
        project_data = json.load(project_file)

    # project_data maps a file key to a mapping of module key -> module data.
    for file_data in project_data.values():
        for module, module_data in file_data.items():
            # Only the text before the first "_" is kept as the module name.
            # NOTE(review): presumably keys are "<ModuleName>_<suffix>"; a name
            # that itself contains "_" would be truncated here -- confirm.
            module_name = module.split("_")[0]
            all_modules.append(module_name)
            if not module_data:
                modules_without_options.append(module_name)

# One row per module name, indexed by name. (The former bare `.reindex()` calls
# had no arguments and therefore changed nothing, so they were dropped.)
df_all = pd.DataFrame.from_dict(
    Counter(all_modules), orient="index", columns=["total_number"]
)
df_without_option = pd.DataFrame.from_dict(
    Counter(modules_without_options), orient="index", columns=["without_options"]
)

# Left join on the module-name index: modules that never occurred without
# options come back as NaN, which is turned into an integer 0.
result = df_all.join(df_without_option)
result["without_options"] = result["without_options"].fillna(0).astype(int)
df_total = result.sort_values(by=["total_number"], ascending=False)
df_total.head(10)

Unnamed: 0,total_number,without_options
Pipeline,156,37
StandardScaler,43,28
LogisticRegression,33,18
KMeans,32,0
MinMaxScaler,22,5
LinearRegression,20,12
LabelEncoder,15,14
KFold,11,0
PCA,11,2
GridSearchCV,11,0


In [4]:
# Keep only the modules for which every occurrence came without options,
# ordered by occurrence count (descending).
never_configured = result["total_number"].eq(result["without_options"])
df_equal = result.loc[never_configured].sort_values(
    by=["total_number"],
    ascending=False,
)
df_equal

Unnamed: 0,total_number,without_options
GaussianNB,2,2
Perceptron,2,2
RobustScaler,2,2
OneClassSVM,1,1
SGDOneClassSVM,1,1
DummyRegressor,1,1
ExtraTreesClassifier,1,1
KBinsDiscretizer,1,1
LabelBinarizer,1,1


In [5]:
# For every module in df_equal, look up how many parameters the sklearn
# catalogue lists for it and attach that count as a new column.
names = list(df_equal.index)

# Index the catalogue by module name once instead of rescanning it for each
# name. setdefault keeps the first entry when a name occurs more than once,
# which matches what a first-match linear search would return.
modules_by_name = {}
for sklearn_module in sklearn_data:
    modules_by_name.setdefault(sklearn_module["name"], sklearn_module)

# Fail with a message naming the offending modules rather than the bare
# StopIteration that next(filter(...)) would raise.
missing_names = [name for name in names if name not in modules_by_name]
if missing_names:
    raise KeyError(f"modules not found in the sklearn catalogue: {missing_names}")

options_provided = [len(modules_by_name[name]["params"]) for name in names]

print(options_provided)

# assign() returns a new frame, so re-running this cell stays idempotent.
df_equal = df_equal.assign(options_provided=options_provided)

df_equal


[2, 16, 5, 10, 12, 3, 18, 4, 3]


Unnamed: 0,total_number,without_options,options_provided
GaussianNB,2,2,2
Perceptron,2,2,16
RobustScaler,2,2,5
OneClassSVM,1,1,10
SGDOneClassSVM,1,1,12
DummyRegressor,1,1,3
ExtraTreesClassifier,1,1,18
KBinsDiscretizer,1,1,4
LabelBinarizer,1,1,3


In [6]:
# Order modules by how many of their occurrences carried no options
# (descending) and show the ten highest.
df_without = result.sort_values(
    by=["without_options"],
    ascending=False,
)
df_without.iloc[:10]

Unnamed: 0,total_number,without_options
Pipeline,156,37
StandardScaler,43,28
LogisticRegression,33,18
LabelEncoder,15,14
LinearRegression,20,12
MinMaxScaler,22,5
SGDRegressor,4,3
RandomForestClassifier,6,2
GaussianNB,2,2
OneHotEncoder,10,2
