In [5]:
import glob
import json
import re
from xmlrpc.client import boolean
import plotly.graph_objs as go
import pandas as pd
from typing import List, Dict, Tuple
from ml_object import SklearnModule
from operator import *

# JSON file that is loaded in the driver cell and passed to create_modules()
# as the reference data for module parameters.
SKL_MODULES_FILE = "../data/sklearn/modules/sklearn_modules.json"
# Glob pattern selecting the per-project JSON files that create_modules() reads.
ALL_PROJECTS = "statistics/sklearn/statistics/*"
# Matches a string containing "=" with at least one character on each side;
# check_for_default_values() uses it to decide whether a parameter has a default.
DEFAULT_REGEX = re.compile(r".+=.+")

# JSON file whose entries sort_by_option_type() ranks in the driver cell.
SKLEARN_OPTION_COUNT = "statistics/sklearn/sklearn_option_count.json"

In [6]:
def option_exists(option: List, params: List) -> bool:
    """Report whether an option's name matches any parameter name.

    Args:
        option: Sequence whose first element is the option name.
        params: Sequences whose first element is a parameter name.

    Returns:
        True if ``option[0]`` equals the first element of any entry in
        ``params``; False otherwise, including when ``params`` is empty.
    """
    # The membership test already yields a bool, so no if/else wrapper is needed.
    return option[0] in (param[0] for param in params)

def get_key_value_pairs(data: Dict) -> List[Tuple]:
    """Return the entries of ``data`` as a list of ``(key, value)`` tuples.

    The tuples appear in the mapping's own iteration order.
    """
    return list(data.items())


def check_for_default_values(module_name: str, option_data: List[Tuple], sklearn_data: Dict) -> List[Tuple]:
    """Classify each option of a module against the module's known parameters.

    Args:
        module_name: Name of the module whose parameters are looked up.
        option_data: ``(option name, option value)`` pairs to classify.
        sklearn_data: Iterable of module descriptions; each is indexed with
            ``"name"`` and ``"params"``, and each parameter is indexed with
            ``[0]`` (name) and ``[1]`` (description string).

    Returns:
        A list of ``(option name, option value, category)`` tuples, where the
        category is one of:

        * ``"default"``  - the parameter description matches DEFAULT_REGEX and
          the text after the first "=" equals the option value,
        * ``"custom"``   - the description matches DEFAULT_REGEX but the value differs,
        * ``"required"`` - the description does not match DEFAULT_REGEX,
        * ``"unknown"``  - no parameter of the module has the option's name.

        The list is empty when no entry of ``sklearn_data`` is named ``module_name``.
    """
    options = []

    for module in sklearn_data:
        if module["name"] != module_name:
            continue

        params = module["params"]
        for option in option_data:
            option_name, option_value = option[0], option[1]
            matched = False

            for param in params:
                if param[0] != option_name:
                    continue
                matched = True

                default = param[1]
                if DEFAULT_REGEX.search(default):
                    # Split on the first "=" only, so a default value that itself
                    # contains "=" is compared in full instead of being truncated.
                    default_value = default.split("=", 1)[1]
                    category = "default" if default_value == option_value else "custom"
                else:
                    category = "required"
                options.append((option_name, option_value, category))

            # Report an unmatched option exactly once. Deciding this inside the
            # parameter loop would add one "unknown" entry per parameter and
            # none at all when the parameter list is empty.
            if not matched:
                options.append((option_name, option_value, "unknown"))

    return options


def create_modules(module_data, projects) -> List:
    """Build one SklearnModule per module entry found in the project files.

    Args:
        module_data: Reference module descriptions, forwarded unchanged to
            ``check_for_default_values``.
        projects: Glob pattern selecting the project JSON files to read.

    Returns:
        A list of ``SklearnModule`` objects, one for every module entry of
        every non-empty file entry in every matched project file.
    """
    all_objects: List[SklearnModule] = []

    for project in glob.glob(projects):
        with open(project) as f:
            project_json = json.load(f)
        # The parsed JSON is all that is needed from here on, so the file is
        # closed before the entries are processed.

        for file_name, file_modules in project_json.items():
            if not file_modules:
                continue

            for module, module_options in file_modules.items():
                # The module key is split on "_": first part is the module
                # name, second part is the line number.
                sklearn_object_parts = module.split("_")
                name = sklearn_object_parts[0]
                line_nr = sklearn_object_parts[1]

                option_data = get_key_value_pairs(module_options)
                final_option_data = check_for_default_values(name, option_data, module_data)

                # Pass the file's key as file_name, not the whole dict of the
                # file's modules.
                all_objects.append(
                    SklearnModule(
                        name=name,
                        file_name=file_name,
                        line_nr=line_nr,
                        options=final_option_data,
                    )
                )

    return all_objects


def get_counter(ml_modules) -> Dict:
    """Count options per module name, broken down by category.

    Args:
        ml_modules: Iterable of objects exposing ``name`` and ``options``;
            each option is indexed with ``[2]`` to obtain its category.

    Returns:
        A dict mapping each module name to a dict with the keys ``"total"``,
        ``"default"``, ``"custom"``, ``"required"`` and ``"unknown"``.
        ``"total"`` counts every option, including options whose category is
        none of the four named ones. Names appear in order of first occurrence.
    """
    categories = ("default", "custom", "required", "unknown")
    results = {}

    # One pass over the modules: counts are accumulated per name as they are
    # met, rather than re-filtering the whole list for every distinct name.
    for ml_module in ml_modules:
        counts = results.get(ml_module.name)
        if counts is None:
            counts = {
                "total": 0,
                "default": 0,
                "custom": 0,
                "required": 0,
                "unknown": 0,
            }
            results[ml_module.name] = counts

        for option in ml_module.options:
            counts["total"] += 1
            category = option[2]
            if category in categories:
                counts[category] += 1

    return results

In [7]:
def get_modules_without_options(modules_option_count) -> List:
    """List the module names whose ``"total"`` count is zero.

    Args:
        modules_option_count: Mapping of module name to a dict that has a
            ``"total"`` entry.

    Returns:
        The matching module names, in the mapping's iteration order.
    """
    return [
        module_name
        for module_name, counts in modules_option_count.items()
        if counts["total"] == 0
    ]

def sort_by_option_type(file_path, sort_criteon="default", list_size=5):
    """Return the top entries of a JSON count file, ranked by one count.

    Args:
        file_path: Path of a JSON file holding a mapping of name to counts.
        sort_criteon: Key inside each counts dict to rank by.
        list_size: Maximum number of entries to return.

    Returns:
        A list of ``(name, counts)`` tuples sorted by ``counts[sort_criteon]``
        in descending order, cut to the first ``list_size`` entries.
    """
    with open(file_path) as f:
        option_counts = json.load(f)

    def criterion_count(entry):
        # entry is a (name, counts) pair; rank by the selected count.
        return entry[1][sort_criteon]

    ranked = sorted(option_counts.items(), key=criterion_count, reverse=True)
    return ranked[:list_size]



In [8]:
# Load the reference module descriptions used to classify options.
with open(SKL_MODULES_FILE) as f:
    sklearn_data = json.load(f)

# Build the module objects from all project files, then count their options
# per module name and collect the names that have no options at all.
sklearn_modules = create_modules(sklearn_data, ALL_PROJECTS)
sklearn_option_count = get_counter(sklearn_modules)
sklearn_modules_without_options = get_modules_without_options(sklearn_option_count)

# NOTE(review): the rankings below are read from the SKLEARN_OPTION_COUNT file,
# not from the in-memory `sklearn_option_count` computed above, and nothing in
# the visible cells writes that file - confirm it is up to date.
print("TOTAL NUMBER")
print("ML algorithms with the most default options:")
most_default_options = sort_by_option_type(SKLEARN_OPTION_COUNT, "default", 5)
print(most_default_options)
print("==============================")
print("ML algorithms with the most custom options:")
most_custom_options = sort_by_option_type(SKLEARN_OPTION_COUNT, "custom", 5)
print(most_custom_options)
print("==============================")
print("ML algorithms with the most required options:")
most_required_options = sort_by_option_type(SKLEARN_OPTION_COUNT, "required", 5)
print(most_required_options)


# TODO: Calculate average number of options per module

ML algorithms with the most default options:
[('MinMaxScaler', {'custom': 0, 'default': 9, 'required': 0, 'total': 9, 'unknown': 0}), ('TSNE', {'custom': 14, 'default': 5, 'required': 0, 'total': 19, 'unknown': 0}), ('KFold', {'custom': 12, 'default': 4, 'required': 0, 'total': 16, 'unknown': 0}), ('TfidfVectorizer', {'custom': 23, 'default': 4, 'required': 0, 'total': 27, 'unknown': 0}), ('CountVectorizer', {'custom': 4, 'default': 3, 'required': 0, 'total': 7, 'unknown': 0})]
ML algorithms with the most custom options:
[('KMeans', {'custom': 38, 'default': 2, 'required': 0, 'total': 40, 'unknown': 0}), ('TfidfVectorizer', {'custom': 23, 'default': 4, 'required': 0, 'total': 27, 'unknown': 0}), ('TSNE', {'custom': 14, 'default': 5, 'required': 0, 'total': 19, 'unknown': 0}), ('KFold', {'custom': 12, 'default': 4, 'required': 0, 'total': 16, 'unknown': 0}), ('StratifiedKFold', {'custom': 12, 'default': 1, 'required': 0, 'total': 19, 'unknown': 6})]
ML algorithms with the most required 