In [50]:
from typing import List, Tuple, Dict
import json
import plotly.graph_objs as go
import glob
import re

from ml_object import SklearnModule

SKL_MODULES_FILE = "../data/sklearn/modules/sklearn_modules.json"
SKLEARN_PROJECTS = "statistics/sklearn/statistics/*"

In [51]:
with open(SKL_MODULES_FILE) as f:
    skl_modules_json = json.load(f)

In [52]:
DEFAULT_REGEX = re.compile(r".+=.+")

def check_if_option_exists(option, params):
    param_names = [param[0] for param in params]
    if option[0] in param_names:
        return True
    else:
        return False

def get_key_value_pairs(data: Dict) -> List[Tuple]:
    return [(key, value) for key, value in data.items()]


def check_for_default_values(module_name: str, option_data: List[Tuple], sklearn_data: Dict) -> List[Tuple]:
    options = []

    for key in sklearn_data:
        if key["name"] == module_name:
            params = key["params"]
            for option in option_data:
                for param in params:
                    if option[0] == param[0]:
                        default = param[1]
                        if DEFAULT_REGEX.search(default):
                            default_parts = default.split("=")
                            default_value = default_parts[1]
                            if default_value == option[1]:
                                options.append((option[0], option[1], "default"))
                            else:
                                options.append((option[0], option[1], "custom"))
                        else:
                            options.append((option[0], option[1], "required"))
                    else: 
                        if check_if_option_exists(option, params):
                            continue
                        else:
                            options.append((option[0], option[1], "unknown"))

    return options

In [53]:
def create_modules(projects):

    all_objects: List[SklearnModule] = []

    for project in glob.glob(projects):
        
        with open(project) as f:
            project_json = json.load(f)

            for key in project_json:
                file = project_json[key]
                if file:
                    for module in file:
                        sklearn_object_parts = module.split("_")
                        name = sklearn_object_parts[0]
                        line_nr = sklearn_object_parts[1]
                        option_data = get_key_value_pairs(file[module])
                        final_option_data = check_for_default_values(name, option_data, skl_modules_json)

                        all_objects.append(SklearnModule(name=name, file_name=file, line_nr=line_nr, options=final_option_data))

    return all_objects

In [54]:
def get_counter(modules):
    default_counter = 0
    custom_counter = 0
    required_counter = 0
    option_counter = 0
    unknown_counter = 0

    for sklearn_obj in modules:
        for option in sklearn_obj.options:
            option_counter += 1
            if option[2] == "default":
                default_counter += 1
            if option[2] == "custom":
                custom_counter += 1
            if option[2] == "required":
                required_counter += 1
            if option[2] == "unknown":
                unknown_counter += 1

    return {
        "default": default_counter,
        "total": option_counter,
        "custom": custom_counter,
        "required": required_counter,
    }


In [55]:
all_objects = create_modules(SKLEARN_PROJECTS)
counter = get_counter(all_objects)

print("Number of option: ", counter)

params = ["total", "default", "custom", "required"]
values = [counter["total"], counter["default"], counter["custom"], counter["required"]]

data = [go.Bar(
    x=params,
    y=values
)]

fig = go.Figure(data=data)
fig.update_layout(
    autosize=False,
    width=800,
    height=600,
)
fig.show()

Number of option:  {'default': 44, 'total': 276, 'custom': 192, 'required': 9}


In [56]:
labels = ["total options", "default options", "custom options", "not specified in api"]
values = [counter["total"], counter["default"], counter["custom"], counter["required"]]

fig = go.Figure(data=[go.Pie(labels=labels, values=values, textinfo='percent+value')])
fig.update_layout(
    autosize=False,
    width=800,
    height=600,
)
fig.show()

In [57]:
import pandas as pd

df = pd.read_json("statistics/sklearn/statistics/statistics_alignarr.json")

df.head(5)

Unnamed: 0,alignarr.py,alignment_annotation/compute_agreement.py,evaluation.py,name_linking.py
CountVectorizer_430,"{'min_df': '1', 'ngram_range': '(1, 1)', 'max_...",,,
CountVectorizer_431,{},,,
DBSCAN_118,,,,"{'eps': '0.1', 'min_samples': '1', 'metric': '..."
