In [252]:
from typing import List
import json
import plotly.graph_objs as go

from sklearn_object import SklearnObject
from sklearn_object import get_key_value_pairs, check_for_default_values

STATISTICS_DIR = "data/statistics/"
SKL_MODULES_FILE = "data/sklearn/modules/sklearn_modules.json"
TEST_PROJECT = "data/statistics/statistics_alignarr.json"
ALL_PROJECTS = "data/statistics/*"


In [253]:
with open(SKL_MODULES_FILE) as f:
    skl_modules_json = json.load(f)

with open(TEST_PROJECT) as f:
    project_json = json.load(f)

print(project_json)


{'alignarr.py': {'CountVectorizer_430': {'min_df': '1', 'ngram_range': '(1, 1)', 'max_features': '5000'}, 'CountVectorizer_431': {}}, 'alignment_annotation/compute_agreement.py': {}, 'evaluation.py': {}, 'name_linking.py': {'DBSCAN_118': {'eps': '0.1', 'min_samples': '1', 'metric': 'precomputed'}}}


Analyze specific repository

In [254]:
objects: List[SklearnObject] = []

for key in project_json:
    file = project_json[key]
    if file:
        for module in file:
            sklearn_object_parts = module.split("_")
            name = sklearn_object_parts[0]
            line_nr = sklearn_object_parts[1]
            option_data = get_key_value_pairs(file[module])
            final_option_data = check_for_default_values(name, option_data, skl_modules_json)

            objects.append(SklearnObject(name=name, file_name=file, line_nr=line_nr, options=final_option_data))
            

for sklearn_obj in objects:
    print("================")
    print(sklearn_obj.name)
    print(sklearn_obj.line_nr)
    print(sklearn_obj.file_name)
    print(sklearn_obj.options)


CountVectorizer
430
{'CountVectorizer_430': {'min_df': '1', 'ngram_range': '(1, 1)', 'max_features': '5000'}, 'CountVectorizer_431': {}}
[('min_df', '1', 'default'), ('ngram_range', '(1, 1)', 'default'), ('max_features', '5000', 'custom')]
CountVectorizer
431
{'CountVectorizer_430': {'min_df': '1', 'ngram_range': '(1, 1)', 'max_features': '5000'}, 'CountVectorizer_431': {}}
[]
DBSCAN
118
{'DBSCAN_118': {'eps': '0.1', 'min_samples': '1', 'metric': 'precomputed'}}
[('eps', '0.1', 'custom'), ('min_samples', '1', 'custom'), ('metric', 'precomputed', 'custom')]


In [255]:
default_counter = 0
custom_counter = 0
option_counter = 0
required_counter = 0

for sklearn_obj in objects:
    for option in sklearn_obj.options:
        option_counter += 1
        if option[2] == "default":
            default_counter += 1
        if option[2] == "custom":
            custom_counter += 1
        if option[2] == "required":
            required_counter += 1

print("Number of option: ", option_counter)

params = ["total", "default", "custom", "required"]
values = [option_counter, default_counter, custom_counter, required_counter]

data = [go.Bar(
    x=params,
    y=values
)]

fig = go.Figure(data=data)
fig.show()


Number of option:  6


Analyze all Repositories

In [256]:
all_objects: List[SklearnObject] = []

for project in glob.glob(ALL_PROJECTS):
    
    with open(project) as f:
        project_json = json.load(f)

        for key in project_json:
            file = project_json[key]
            if file:
                for module in file:
                    sklearn_object_parts = module.split("_")
                    name = sklearn_object_parts[0]
                    line_nr = sklearn_object_parts[1]
                    option_data = get_key_value_pairs(file[module])
                    final_option_data = check_for_default_values(name, option_data, skl_modules_json)

                    all_objects.append(SklearnObject(name=name, file_name=file, line_nr=line_nr, options=final_option_data))
            


In [257]:
default_counter = 0
custom_counter = 0
required_counter = 0
option_counter = 0

for sklearn_obj in all_objects:
    for option in sklearn_obj.options:
        option_counter += 1
        if option[2] == "default":
            default_counter += 1
        if option[2] == "custom":
            custom_counter += 1
        if option[2] == "required":
            required_counter += 1

print("Number of option: ", option_counter)

params = ["total", "default", "custom", "required"]
values = [option_counter, default_counter, custom_counter, required_counter]

data = [go.Bar(
    x=params,
    y=values
)]

fig = go.Figure(data=data)
fig.show()



Number of option:  245


In [258]:
labels = ["# default options", "# custom options", "# options without default value"]
values = [default_counter, custom_counter, required_counter]

fig = go.Figure(data=[go.Pie(labels=labels, values=values)])
fig.show()