In [117]:
import glob
import json
import pandas as pd
from collections import Counter

#pd.set_option('display.max_rows', None)

SKLEARN_DATA = "../data/sklearn/modules/sklearn_modules.json"

ALL_SKLEARN_PROJECTS = "statistics/sklearn/statistics/*"
ALL_TENSORFLOW_PROJECTS = "statistics/tensorflow/statistics/*"
ALL_PYTORCH_PROJECTS = "statistics/pytorch/statistics/*"
SKLEARN_RESULTS = "statistics/sklearn/results/"

Load Sklearn API Data

In [118]:
with open(SKLEARN_DATA, "r", encoding="utf-8") as sklearn_file:
    sklearn_data = json.load(sklearn_file)

Identify projects that incorporate Scikit Learn ML algorithms

In [119]:
projects_with_algorithms = []
projects_without_algorithms = []

for project in glob.glob(ALL_SKLEARN_PROJECTS):
    contains_ml_algorithm = False
    project_name = project.split("\\")[-1].split(".")[0]
    project_name = project_name.replace("statistics_", "")

    with open(project, "r", encoding="utf-8") as project_file:
        project_data = json.load(project_file)

    for file in project_data:
        file_data = project_data[file]

        for module in file_data:
            module_name = module.split("_")[0]
            sklearn_module = next(filter(lambda x: x["name"] == module_name, sklearn_data))
            contains_ml_algorithm = True

        if contains_ml_algorithm:
            projects_with_algorithms.append(project_name)
            break

    if not contains_ml_algorithm:
        projects_without_algorithms.append(project_name)


print("Projects with ML algorithms: ", len(projects_with_algorithms))
print("Projects without ML algorithms: ",len(projects_without_algorithms))
print("Total projects number: ",len(projects_with_algorithms) + len(projects_without_algorithms))

with open(SKLEARN_RESULTS + "projects_without_mla.txt", "w", encoding="utf-8") as source:
    for project in projects_without_algorithms:
        source.write(project + "\n")

with open(SKLEARN_RESULTS + "projects_with_mla.txt", "w", encoding="utf-8") as source:
    for project in projects_with_algorithms:
        source.write(project + "\n")

Projects with ML algorithms:  85
Projects without ML algorithms:  72
Total projects number:  157


Identify all ML algorithms used in projects with ML algorithms

In [120]:
projects_files = [f"statistics/sklearn/statistics/statistics_{name}.json" for name in projects_with_algorithms]

ml_algorithms = []

for project in projects_files:
    
    # Get project data
    with open(project, "r", encoding="utf-8") as project_file:
        project_data = json.load(project_file)

    # Check each file
    for file in project_data:
        file_data = project_data[file]

        # Extract each ML algorithm
        for module in file_data:    
            module_name = module.split("_")[0]
            ml_algorithms.append(module_name)

data = Counter(ml_algorithms)
df_algo = pd.DataFrame.from_dict(data, orient="index").reset_index()
df_algo = df_algo.rename(columns={'index':'algorithm', 0:'count'})
df_aglo= df_algo.sort_values(by=['count'], ascending=False)

print("Number of ML algorithms: ", len(data))
df_algo

Number of ML algorithms:  77


Unnamed: 0,algorithm,count
0,StandardScaler,42
1,DecisionTreeClassifier,3
2,KNeighborsClassifier,2
3,SVC,5
4,LogisticRegression,32
...,...,...
72,MLPRegressor,1
73,IsotonicRegression,2
74,LabelBinarizer,1
75,ParameterSampler,1


Identify the number of options that can by set regarding the API data

In [121]:
possible_options = []

algos = df_algo["algorithm"].to_list()

for algo in algos:
    sklearn_module = next(filter(lambda x: x["name"] == algo, sklearn_data))
    possible_options.append(len(sklearn_module["params"]))

df_algo["options"] = possible_options

df_algo.head()

Unnamed: 0,algorithm,count,options
0,StandardScaler,42,3
1,DecisionTreeClassifier,3,12
2,KNeighborsClassifier,2,8
3,SVC,5,15
4,LogisticRegression,32,15


Identify the average number of options used per algorithm

In [122]:
avg_options_used = []

for algo in algos:
    algo_options_used = []
    for project in projects_files:
        with open(project, "r", encoding="utf-8") as project_file:
            project_data = json.load(project_file)

        for file in project_data:
            file_data = project_data[file]

            for module in file_data:
                module_name = module.split("_")[0]
                if algo == module_name:
                    module_data = file_data[module]
                    algo_options_used.append(len(module_data))


    avg_options_used.append(round((sum(algo_options_used) / len(algo_options_used)),2))

df_algo["avg_options_used"] = avg_options_used
df_algo.head()

Unnamed: 0,algorithm,count,options,avg_options_used
0,StandardScaler,42,3,0.4
1,DecisionTreeClassifier,3,12,1.33
2,KNeighborsClassifier,2,8,2.5
3,SVC,5,15,2.0
4,LogisticRegression,32,15,1.28


Identify most used option for each Algoritm

In [123]:
# Calculate most used parameter for each ML algorithm

most_used_options = []

for algo in algos:
    algo_options = []
    for project in projects_files:
        with open(project, "r", encoding="utf-8") as project_file:
            project_data = json.load(project_file)

        for file in project_data:
            file_data = project_data[file]

            for module in file_data:
                module_name = module.split("_")[0]
                if algo == module_name:
                    module_data = file_data[module]
                    for param, value in module_data.items():
                        if param == "params" and value == "default":
                            algo_options.append("defaults")
                        else:
                            algo_options.append(param)
    
    data = Counter(algo_options)
    try:
        most_used_options.append(data.most_common(1)[0][0])
    except IndexError:
        most_used_options.append("None")

df_algo["Most Used HP"] = most_used_options
df_algo.head(10)

Unnamed: 0,algorithm,count,options,avg_options_used,Most Used HP
0,StandardScaler,42,3,0.4,defaults
1,DecisionTreeClassifier,3,12,1.33,criterion
2,KNeighborsClassifier,2,8,2.5,n_neighbors
3,SVC,5,15,2.0,kernel
4,LogisticRegression,32,15,1.28,solver
5,GaussianNB,2,2,0.0,
6,RandomForestClassifier,6,18,2.33,n_estimators
7,AgglomerativeClustering,3,8,2.0,n_clusters
8,KMeans,32,9,2.25,n_clusters
9,LabelEncoder,12,0,0.08,defaults


Convert Dataframe into Latex Table

In [124]:
print(df_algo.to_latex(index=False))  

\begin{tabular}{lrrrl}
\toprule
                 algorithm &  count &  options &  avg\_options\_used &         Most Used HP \\
\midrule
            StandardScaler &     42 &        3 &              0.40 &             defaults \\
    DecisionTreeClassifier &      3 &       12 &              1.33 &            criterion \\
      KNeighborsClassifier &      2 &        8 &              2.50 &          n\_neighbors \\
                       SVC &      5 &       15 &              2.00 &               kernel \\
        LogisticRegression &     32 &       15 &              1.28 &               solver \\
                GaussianNB &      2 &        2 &              0.00 &                 None \\
    RandomForestClassifier &      6 &       18 &              2.33 &         n\_estimators \\
   AgglomerativeClustering &      3 &        8 &              2.00 &           n\_clusters \\
                    KMeans &     32 &        9 &              2.25 &           n\_clusters \\
              LabelEnco

Identify all Options used in projects with ML algorithms

In [125]:
projects_files = [f"statistics/sklearn/statistics/statistics_{name}.json" for name in projects_with_algorithms]

options = []

for project in projects_files:
    
    # Get project data
    with open(project, "r", encoding="utf-8") as project_file:
        project_data = json.load(project_file)

    # Check each file
    for file in project_data:
        file_data = project_data[file]

        # Extract each ML algorithm
        for module in file_data:    
            module_name = module.split("_")[0]
            module_data = file_data[module]
            for param, value in module_data.items():
                if param == "params" and value == "default":
                    continue
                options.append(param)

data = Counter(options)
df_options = pd.DataFrame.from_dict(data, orient="index").reset_index()
df_options = df_options.rename(columns={'index':'Hyperparameter', 0:'count'})
df_options = df_options.sort_values(by=['count'], ascending=False)

print("Number of unique hyperparameter: ", len(data))
df_options.head(10)

Number of unique hyperparameter:  118


Unnamed: 0,Hyperparameter,count
1,random_state,54
7,n_clusters,36
13,n_components,31
45,steps,28
22,n_splits,26
23,shuffle,14
15,param_grid,13
14,estimator,12
12,feature_range,12
31,max_iter,12


Calculate Value Range for Options

In [126]:
# TODO: Calculate value range

options = df_options["Hyperparameter"]

num_options_value_range = []
most_common_option_values = []

for option in options:
    value_range = []
    for project in projects_files:
    
        # Get project data
        with open(project, "r", encoding="utf-8") as project_file:
            project_data = json.load(project_file)

        # Check each file
        for file in project_data:
            file_data = project_data[file]

            # Extract each ML algorithm
            for module in file_data:    
                module_name = module.split("_")[0]
                module_data = file_data[module]
                for param, value in module_data.items():
                    if option == param:
                        value_range.append(value)

                    #if param == "max_features" and value == "None":
                    #    print(project)
                    #    print(file)

                    #if param == "n_clusters":
                    #    print(module_name)

    option_data = Counter(value_range)
    num_options_value_range.append(option_data)

    most_common_counter = option_data.most_common(1)

    most_common_values = [x for x in most_common_counter]
    most_common_option_values.append(most_common_values)

num_value_range = [len(x) for x in num_options_value_range]

df_options["num_value_range"] = num_value_range
df_options["most_common_values"] = most_common_option_values
df_options.head(10)

Unnamed: 0,Hyperparameter,count,num_value_range,most_common_values
1,random_state,54,12,"[(0, 32)]"
7,n_clusters,36,13,"[(n_clusters, 8)]"
13,n_components,31,14,"[(2, 12)]"
45,steps,28,23,"[(('tokenize', feature_extraction.BagOfWords(l..."
22,n_splits,26,12,"[(n_splits, 8)]"
23,shuffle,14,3,"[(True, 8)]"
15,param_grid,13,5,"[(regressor[1], 6)]"
14,estimator,12,7,"[(regressor[0](), 6)]"
12,feature_range,12,2,"[((0, 1), 9)]"
31,max_iter,12,6,"[(100, 4)]"


Convert Dataframe into Latex Table

In [127]:
print(df_options.to_latex(index=False)) 

\begin{tabular}{lrrl}
\toprule
                 Hyperparameter &  count &  num\_value\_range &                                 most\_common\_values \\
\midrule
                   random\_state &     54 &               12 &                                          [(0, 32)] \\
                     n\_clusters &     36 &               13 &                                  [(n\_clusters, 8)] \\
                   n\_components &     31 &               14 &                                          [(2, 12)] \\
                          steps &     28 &               23 & [(('tokenize', feature\_extraction.BagOfWords(lo... \\
                       n\_splits &     26 &               12 &                                    [(n\_splits, 8)] \\
                        shuffle &     14 &                3 &                                        [(True, 8)] \\
                     param\_grid &     13 &                5 &                                [(regressor[1], 6)] \\
                    

Find projects that use Scikit Learn Hyperparameter Tuning

In [128]:
tuning = ["GridSearchCV", "RandomizedSearchCV", "HalvingGridSearchCV", "HalvingRandomSearchCV"]

projects_with_hpo = set()

for project in projects_files:
    
    # Get project data
    with open(project, "r", encoding="utf-8") as project_file:
        project_data = json.load(project_file)

    # Check each file
    for file in project_data:
        file_data = project_data[file]

        # Extract each ML algorithm
        for module in file_data:    
            module_name = module.split("_")[0]
            module_data = file_data[module]
            if module_name in tuning:
                projects_with_hpo.add(project)
                print(project)
                print(file)

print("Scikit Learn Projects with Hyperparameter Tuning: ", len(projects_with_hpo))
print("Scikit Learn Projects without Hyperparameter Tuning: ", len(projects_with_algorithms) - len(projects_with_hpo))

statistics/sklearn/statistics/statistics_100DaysofMLCodeChallenge.json
Model-Selection-&-Boosting/Model-Selection/grid_search.py
statistics/sklearn/statistics/statistics_Conditional_Density_Estimation.json
cde/density_estimator/BaseDensityEstimator.py
statistics/sklearn/statistics/statistics_lua-ffi-lightGBM.json
examples/python-guide/sklearn_example.py
statistics/sklearn/statistics/statistics_lua-ffi-lightGBM.json
tests/python_package_test/test_sklearn.py
statistics/sklearn/statistics/statistics_regression_data_poisoning.json
src/attacks.py
statistics/sklearn/statistics/statistics_regression_data_poisoning.json
src/main.py
statistics/sklearn/statistics/statistics_regression_data_poisoning.json
src/visualisation/poison_warfarin.py
statistics/sklearn/statistics/statistics_regression_data_poisoning.json
src/visualisation/poison_warfarin.py
statistics/sklearn/statistics/statistics_regression_data_poisoning.json
src/visualisation/poison_warfarin.py
statistics/sklearn/statistics/statistics_