In [34]:
import glob
import json
import os

import pandas as pd

# Glob patterns selecting the per-project statistics files, one pattern per
# framework. The scikit-learn pattern is expanded with glob.glob() in the
# analysis cells and every match is parsed as JSON; the TensorFlow and PyTorch
# patterns are presumably the analogous inputs for those frameworks.
ALL_PYTORCH_PROJECTS = "statistics/pytorch/statistics/*"
ALL_SKLEARN_PROJECTS = "statistics/sklearn/statistics/*"
ALL_TENSORFLOW_PROJECTS = "statistics/tensorflow/statistics/*"

# JSON catalogue of scikit-learn modules. It is loaded as a sequence of
# entries that are read through their "name" and "params" keys.
SKLEARN_DATA = "../data/sklearn/modules/sklearn_modules.json"

In [35]:
# Load the scikit-learn module catalogue. Each entry is read below through its
# "name" key and its "params" list.
with open(SKLEARN_DATA, "r", encoding="utf-8") as sklearn_file:
    sklearn_data = json.load(sklearn_file)

# Module whose option usage is summarised in this cell.
TARGET_MODULE = "KMeans"

# Resolve the catalogue entry once instead of rescanning the catalogue for
# every module of every file (only TARGET_MODULE is ever reported here).
# The first matching entry wins.
target_entry = next(
    (entry for entry in sklearn_data if entry["name"] == TARGET_MODULE),
    None,
)

# One record per TARGET_MODULE occurrence found in the project statistics.
data = []

for project in glob.glob(ALL_SKLEARN_PROJECTS):
    # os.path.basename strips the directory part on every platform; splitting
    # on a literal backslash only worked for Windows-style paths.
    project_name = os.path.basename(project).split(".")[0]
    project_name = project_name.replace("statistics_", "")

    with open(project, "r", encoding="utf-8") as project_file:
        project_data = json.load(project_file)

    for file_data in project_data.values():
        for module, module_data in file_data.items():
            # Only the text before the first underscore of the key is the
            # module name.
            module_name = module.split("_")[0]
            if module_name != TARGET_MODULE:
                continue

            if target_entry is None:
                # Fail with an explicit message rather than a bare
                # StopIteration from next().
                raise KeyError(
                    f"{TARGET_MODULE!r} is missing from {SKLEARN_DATA}"
                )

            total_options = len(target_entry["params"])
            options_used = len(module_data)

            data.append({
                "project": project_name,
                "total_options": total_options,
                "options_used": options_used,
                # Guard against a catalogue entry that lists no parameters.
                "portion": options_used / total_options if total_options else 0.0,
            })

In [36]:
# Distinct module names that occur anywhere in the scikit-learn project
# statistics files.
modules = set()

for project in glob.glob(ALL_SKLEARN_PROJECTS):
    with open(project, "r", encoding="utf-8") as project_file:
        project_data = json.load(project_file)

    for file_data in project_data.values():
        # Only the text before the first underscore of each key is kept as
        # the module name.
        modules.update(module.split("_")[0] for module in file_data)


print(modules)


{'OneHotEncoder', 'AgglomerativeClustering', 'LinearSVC', 'StratifiedKFold', 'BayesianGaussianMixture', 'PCA', 'MinMaxScaler', 'StandardScaler', 'Ridge', 'LabelEncoder', 'LinearRegression', 'KMeans', 'LatentDirichletAllocation', 'HuberRegressor', 'LabelBinarizer', 'MeanShift', 'TfidfVectorizer', 'KDTree', 'GridSearchCV', 'DecisionTreeClassifier', 'MaxAbsScaler', 'LeaveOneGroupOut', 'GaussianNB', 'SVC', 'RandomForestClassifier', 'NearestNeighbors', 'GroupKFold', 'KFold', 'TruncatedSVD', 'MultiLabelBinarizer', 'KernelDensity', 'GaussianMixture', 'SVR', 'DBSCAN', 'MLPClassifier', 'PolynomialFeatures', 'ParameterSampler', 'Lasso', 'KernelPCA', 'LogisticRegression', 'TSNE', 'KNeighborsClassifier', 'DecisionTreeRegressor', 'GaussianProcessRegressor', 'RandomForestRegressor', 'CountVectorizer'}


In [37]:
# Usage records grouped by module name. Every name collected in `modules`
# gets a list, even if it stays empty.
all_data = {name: [] for name in modules}

# Index the catalogue by module name once. setdefault keeps the first entry
# for a given name, matching what a linear first-match scan would return.
sklearn_by_name = {}
for entry in sklearn_data:
    sklearn_by_name.setdefault(entry["name"], entry)

# Single pass over the project files: each file is opened and parsed once
# instead of once per module name. Records are appended in project -> file ->
# module order, so each per-module list keeps the same ordering.
for project in glob.glob(ALL_SKLEARN_PROJECTS):
    # os.path.basename strips the directory part on every platform; splitting
    # on a literal backslash only worked for Windows-style paths.
    project_name = os.path.basename(project).split(".")[0]
    project_name = project_name.replace("statistics_", "")

    with open(project, "r", encoding="utf-8") as project_file:
        project_data = json.load(project_file)

    for file, file_data in project_data.items():
        for module, module_data in file_data.items():
            # Only the text before the first underscore of the key is the
            # module name.
            module_name = module.split("_")[0]
            if module_name not in all_data:
                continue

            if module_name not in sklearn_by_name:
                # Fail with an explicit message rather than a bare
                # StopIteration from next().
                raise KeyError(
                    f"{module_name!r} is missing from {SKLEARN_DATA}"
                )

            params = sklearn_by_name[module_name]["params"]
            total_options = len(params)

            default_counter = 0
            custom_counter = 0

            for option in module_data:
                try:
                    # params entries are indexed as [0] = option name and
                    # [1] = a string whose text after the last "=" is the
                    # default value.
                    param = next((p for p in params if p[0] == option), None)
                    # NOTE(review): the default is always a string here; if the
                    # statistics files store values as numbers/booleans they
                    # can never compare equal - confirm the value schema.
                    uses_default = (
                        param is not None
                        and param[1].split("=")[-1] == module_data[option]
                    )
                except (LookupError, TypeError, AttributeError):
                    # Best effort: a malformed catalogue entry or option
                    # container counts the option as customised. Unlike a bare
                    # except, this lets KeyboardInterrupt/SystemExit through.
                    uses_default = False

                if uses_default:
                    default_counter += 1
                else:
                    # Also covers options that are not listed in the catalogue.
                    custom_counter += 1

            all_data[module_name].append({
                "project": project_name,
                "file": file,
                "total_options": total_options,
                "options_used": len(module_data),
                # The misspelled key is kept on purpose: renaming it would
                # change the schema of the written JSON file.
                "defualt_options_used": default_counter,
                "custom_options_used": custom_counter
            })


# Persist the grouped records. sort_keys makes the output independent of the
# iteration order of the `modules` set.
with open("sklearn_modules_data.json", "w", encoding="utf-8") as outfile:
    json.dump(all_data, outfile, sort_keys=True, indent=4)