In [27]:
import glob
import json
import re
import plotly.graph_objs as go
from typing import List, Dict, Tuple
from sklearn_object import SklearnObject
import pandas as pd

STATISTICS_DIR = "data/statistics/"
SKL_MODULES_FILE = "data/sklearn/modules/sklearn_modules.json"
ALL_PROJECTS = "data/statistics/*"


In [28]:
DEFAULT_REGEX = re.compile(r".+=.+")

def check_if_option_exists(option, params):
    param_names = [param[0] for param in params]
    if option[0] in param_names:
        return True
    else:
        return False

def get_key_value_pairs(data: Dict) -> List[Tuple]:
    return [(key, value) for key, value in data.items()]


def check_for_default_values(module_name: str, option_data: List[Tuple], sklearn_data: Dict) -> List[Tuple]:
    options = []

    for key in sklearn_data:
        if key["name"] == module_name:
            params = key["params"]
            for option in option_data:
                for param in params:
                    if option[0] == param[0]:
                        default = param[1]
                        if DEFAULT_REGEX.search(default):
                            default_parts = default.split("=")
                            default_value = default_parts[1]
                            if default_value == option[1]:
                                options.append((option[0], option[1], "default"))
                            else:
                                options.append((option[0], option[1], "custom"))
                        else:
                            options.append((option[0], option[1], "required"))
                    else: 
                        if check_if_option_exists(option, params):
                            continue
                        else:
                            options.append((option[0], option[1], "unknown"))

    return options

In [29]:
with open(SKL_MODULES_FILE) as f:
    skl_modules_json = json.load(f)

In [30]:
all_objects: List[SklearnObject] = []

for project in glob.glob(ALL_PROJECTS):
    
    with open(project) as f:
        project_json = json.load(f)

        for key in project_json:
            file = project_json[key]
            if file:
                for module in file:
                    sklearn_object_parts = module.split("_")
                    name = sklearn_object_parts[0]
                    if name == "LinearRegression":
                        print(project)
                        print(key)
                    line_nr = sklearn_object_parts[1]
                    option_data = get_key_value_pairs(file[module])
                    final_option_data = check_for_default_values(name, option_data, skl_modules_json)
                    all_objects.append(SklearnObject(name=name, file_name=file, line_nr=line_nr, options=final_option_data))


data/statistics/statistics_100DaysofMLCodeChallenge.json
Regression/Multiple-Linear-Regression/multiple_linear_regression.py
data/statistics/statistics_100DaysofMLCodeChallenge.json
Regression/Polynomial-Regression/polynomial_regression.py
data/statistics/statistics_100DaysofMLCodeChallenge.json
Regression/Polynomial-Regression/polynomial_regression.py
data/statistics/statistics_100DaysofMLCodeChallenge.json
Regression/Simple-Linear-Regression/simple_linear_regression.py
data/statistics/statistics_ECBM6040-Project.json
utils/lane.py
data/statistics/statistics_paper_boltzmann_generators.json
software/deep_boltzmann/networks/plot.py


In [31]:
results = {}

modules = {module.name for module in all_objects}

for module in modules:
    default_counter = 0
    custom_counter = 0
    required_counter = 0
    option_counter = 0
    unknown_counter = 0
    target_modules = list(filter(lambda x: x.name == module, all_objects)) 
    for target_module in target_modules:
        for option in target_module.options:
            option_counter += 1
            if option[2] == "default":
                default_counter += 1
            if option[2] == "custom":
                custom_counter += 1
            if option[2] == "required":
                required_counter += 1
            if option[2] == "unknown":
                unknown_counter += 1

    results[module] = {
        "total": option_counter,
        "default": default_counter,
        "custom": custom_counter,
        "required": required_counter, 
        "unknown": unknown_counter,
    }


with open("data/modules_option_number.json", "w") as f:
    json.dump(results, f, sort_keys=True, indent=4)
   

In [32]:
modules_without_options = []

for key in results:
    module = results[key]
    if module["total"] == 0:
        modules_without_options.append(key)

print(len(modules_without_options))
print(modules_without_options)

7
['MaxAbsScaler', 'LinearRegression', 'LabelBinarizer', 'GaussianNB', 'LabelEncoder', 'MultiLabelBinarizer', 'LeaveOneGroupOut']


In [33]:
df = pd.DataFrame()
df["name"] = results.keys()
df["default"] = [i["default"] for i in results.values()]
df["custom"] = [i["default"] for i in results.values()]

df.head()

Unnamed: 0,name,default,custom
0,KernelPCA,0,0
1,LinearSVC,0,0
2,OneHotEncoder,0,0
3,CountVectorizer,3,3
4,ParameterSampler,0,0


In [34]:
df.sort_values(by=["default"], ascending=False).head(5)

Unnamed: 0,name,default,custom
14,MinMaxScaler,9,9
23,TSNE,5,5
28,TfidfVectorizer,4,4
25,KFold,4,4
3,CountVectorizer,3,3


In [35]:
df.sort_values(by=["custom"], ascending=False).head(5)

Unnamed: 0,name,default,custom
14,MinMaxScaler,9,9
23,TSNE,5,5
28,TfidfVectorizer,4,4
25,KFold,4,4
3,CountVectorizer,3,3
