In [55]:
import glob
import json
import os
from collections import Counter
from typing import Dict, List, Tuple

import pandas as pd

In [56]:
# Load the repository identifiers. NOTE(review): repos.json is assumed to hold
# names that match the file names under ../data/statistics -- confirm.
with open("repos.json", "r", encoding="utf-8") as src:
    repos = json.load(src)

print("Number repos: ", len(repos))

# Keep only the statistics files whose file name is a known repository.
repo_files = set()

for x in glob.glob("../data/statistics/*"):
    # os.path.basename honours the platform's path separator(s); splitting on
    # "\\" only yields the file name on Windows and silently matches nothing
    # on POSIX systems.
    file_name = os.path.basename(x)
    if file_name in repos:
        repo_files.add(x)

print("Number files: ", len(repo_files))

Number repos:  982
Number files:  982


In [57]:
def df_to_latex(df: pd.DataFrame) -> None:
    """Print ``df`` rendered as LaTeX, leaving the index out of the table."""
    latex_source = df.to_latex(index=False)
    print(latex_source)

def get_module(name, data):
    """Return the first entry of ``data`` whose ``"name"`` value equals ``name``.

    Raises:
        StopIteration: if no entry matches.
    """
    matches = (entry for entry in data if name == entry["name"])
    return next(matches)

In [58]:
def get_classes(library_name: str, library_dir: str, files: List) -> List[Tuple[str, int]]:
    """Count how often each class of ``library_name`` appears in the project files.

    Every path in ``files`` is read as JSON and traversed as
    ``{source_file: {library: {key: ...}}}``. Keys of the requested library
    that start with an uppercase character are treated as class usages; the
    text after the last underscore is dropped to obtain the class name
    (``"KMeans_12"`` -> ``"KMeans"``, ``"Some_Class_4"`` -> ``"Some_Class"``).

    Args:
        library_name: Library whose entries are counted (e.g. ``"sklearn"``).
        library_dir: Unused; kept so existing callers keep working.
        files: Iterable of paths to project statistics JSON files.

    Returns:
        ``(class_name, count)`` tuples, most frequent first.
    """
    class_counts = Counter()

    for project in files:
        with open(project, "r", encoding="utf-8") as project_file:
            project_data = json.load(project_file)

        for file_data in project_data.values():
            module_data = file_data.get(library_name)
            if not module_data:
                continue

            for key in module_data:
                # key[:1] (rather than key[0]) keeps an empty key from raising
                # IndexError; "".isupper() is simply False.
                if key[:1].isupper():
                    # Strip the trailing "_<suffix>"; keys without an
                    # underscore are used unchanged.
                    class_counts[key.rsplit("_", 1)[0]] += 1

    return class_counts.most_common()

# Rank the sklearn classes by usage and keep the 30 most frequent ones.
classes = get_classes("sklearn", "../modules/sklearn_estimators.json", repo_files)
top_classes = [class_name for class_name, _ in classes[:30]]
top_classes_count = [count for _, count in classes[:30]]

In [59]:
def get_params(library_name: str, library_dir: str, files, classes) -> List:
    """Find the three most frequently set parameters for each requested class.

    Every path in ``files`` is read as JSON and traversed as
    ``{source_file: {library: {key: {param_name: ...}}}}``. Keys of the
    requested library that start with an uppercase character are mapped to a
    class name by dropping the text after the last underscore. For matching
    classes every entry name except ``"variable"`` and ``"params"`` is counted
    as a parameter.

    Each file is opened and parsed exactly once, regardless of how many
    classes are requested.

    Args:
        library_name: Library whose entries are inspected (e.g. ``"sklearn"``).
        library_dir: Unused; kept so existing callers keep working.
        files: Iterable of paths to project statistics JSON files.
        classes: Class names to collect parameters for.

    Returns:
        One list per entry of ``classes`` (same order), each holding up to
        three ``(param_name, count)`` tuples, most frequent first. Classes
        that never occur yield an empty list.
    """
    classes = list(classes)
    if not classes:
        return []

    # One counter per requested class, filled during a single pass over files.
    usage = {class_name: Counter() for class_name in classes}

    for project in files:
        with open(project, "r", encoding="utf-8") as project_file:
            project_data = json.load(project_file)

        for file_data in project_data.values():
            module_data = file_data.get(library_name)
            if not module_data:
                continue

            for key, data in module_data.items():
                # key[:1] (rather than key[0]) tolerates an empty key.
                if not key[:1].isupper():
                    continue

                param_counter = usage.get(key.rsplit("_", 1)[0])
                if param_counter is None:
                    # Class was not requested.
                    continue

                # "variable" and "params" are bookkeeping entries, not
                # parameter names.
                param_counter.update(
                    name for name in data if name not in ("variable", "params")
                )

    return [usage[class_name].most_common(3) for class_name in classes]


# For each top class, split its (parameter, count) pairs into two parallel lists.
params = get_params("sklearn", "../modules/sklearn_estimators.json", repo_files, top_classes)

top_params = [[param_name for param_name, _ in item] for item in params]
top_params_count = [[count for _, count in item] for item in params]


In [60]:
# Assemble the summary table (one row per algorithm) and emit it as LaTeX.
df = pd.DataFrame(
    {
        "Algorithm": top_classes,
        "Algorithm Count": top_classes_count,
        "Top Hyperparameters": top_params,
        "Top Hyperparameter Count": top_params_count,
    }
)

df_to_latex(df)

\begin{tabular}{lrll}
\toprule
               Algorithm &  Algorithm Count &                             Top Hyperparameters & Top Hyperparameter Count \\
\midrule
                    TSNE &               54 &                [n\_components, init, perplexity] &             [49, 25, 25] \\
                  KMeans &               49 &                [n\_clusters, init, random\_state] &             [48, 21, 17] \\
          StandardScaler &               47 &                 [with\_mean, with\_std, **kwargs] &               [10, 3, 1] \\
                     PCA &               45 &              [n\_components, whiten, svd\_solver] &               [37, 3, 2] \\
                   KFold &               35 &               [n\_splits, shuffle, random\_state] &             [35, 35, 27] \\
      LogisticRegression &               30 &                       [C, random\_state, solver] &              [16, 12, 7] \\
            LabelEncoder &               29 &                                     

  print(df.to_latex(index=False))
