In [62]:
from typing import List, Tuple, Dict

STATISTICS_DIR = "data/statistics/"
SKL_MODULES_FILE = "data/sklearn/modules/sklearn_modules.json"
ALIGNARR_PROJECT = "data/statistics/statistics_alignarr.json"


In [63]:
import glob
import json

with open(SKL_MODULES_FILE) as f:
    skl_modules_json = json.load(f)

with open(ALIGNARR_PROJECT) as f:
    project_json = json.load(f)

print(project_json)


{'alignarr.py': {'CountVectorizer_430': {'min_df': '1', 'ngram_range': '(1, 1)', 'max_features': '5000'}, 'CountVectorizer_431': {}}, 'alignment_annotation/compute_agreement.py': {}, 'evaluation.py': {}, 'name_linking.py': {'DBSCAN_118': {'eps': '0.1', 'min_samples': '1', 'metric': 'precomputed'}}}


In [64]:
class SklearnObject:
    def __init__(self, name, file_name, line_nr, options):
        self.name: str = name
        self.file_name: str = file_name
        self.line_nr: int = line_nr
        self.options: List[Tuple] = options   


In [65]:
def get_key_value_pairs(data: Dict) -> List[Tuple]:
    return [(key, value) for key, value in data.items()]

In [66]:
def check_for_default_values(module_name: str, option_data: List[Tuple], sklearn_data: Dict) -> List[Tuple]:
    options = []

    for key in sklearn_data:
        if key["name"] == module_name:
            params = key["params"]
            #print("params: ", params)
            for option in option_data:
                for param in params:
                    if option[0] == param[0]:
                        default = param[1]
                        default_parts = default.split("=")
                        default_value = default_parts[1]
                        if default_value == option[1]:
                            options.append((option[0], option[1], "default"))
                        else:
                            options.append((option[0], option[1], "custom"))

    return options


In [67]:
objects: List[SklearnObject] = []

for key in project_json:
    file = project_json[key]
    if file:
        for module in file:
            sklearn_object_parts = module.split("_")
            name = sklearn_object_parts[0]
            line_nr = sklearn_object_parts[1]
            option_data = get_key_value_pairs(file[module])
            final_option_data = check_for_default_values(name, option_data, skl_modules_json)

            objects.append(SklearnObject(name=name, file_name=file, line_nr=line_nr, options=final_option_data))
            

for sklearn_obj in objects:
    print("================")
    print(sklearn_obj.name)
    print(sklearn_obj.line_nr)
    print(sklearn_obj.file_name)
    print(sklearn_obj.options)


CountVectorizer
430
{'CountVectorizer_430': {'min_df': '1', 'ngram_range': '(1, 1)', 'max_features': '5000'}, 'CountVectorizer_431': {}}
[('min_df', '1', 'default'), ('ngram_range', '(1, 1)', 'default'), ('max_features', '5000', 'custom')]
CountVectorizer
431
{'CountVectorizer_430': {'min_df': '1', 'ngram_range': '(1, 1)', 'max_features': '5000'}, 'CountVectorizer_431': {}}
[]
DBSCAN
118
{'DBSCAN_118': {'eps': '0.1', 'min_samples': '1', 'metric': 'precomputed'}}
[('eps', '0.1', 'custom'), ('min_samples', '1', 'custom'), ('metric', 'precomputed', 'custom')]


In [68]:
import plotly.graph_objs as go
import pandas as pd

default_counter = 0
custom_counter = 0
option_counter = 0

for sklearn_obj in objects:
    for option in sklearn_obj.options:
        option_counter += 1
        if option[2] == "default":
            default_counter += 1
        if option[2] == "custom":
            custom_counter += 1

print("Number of option: ", option_counter)

params = ["total", "default", "custom"]
values = [option_counter, default_counter, custom_counter]

data = [go.Bar(
    x=params,
    y=values
)]

fig = go.Figure(data=data)
fig.show()


Number of option:  6
