In [6]:
import pandas as pd
import json
import glob
from collections import Counter
from typing import List, Dict
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [7]:
# Lower-cased raw type name -> coarse category used for the charts.
_TYPE_CATEGORIES = {
    **dict.fromkeys(("str", "joinedstr"), "Text"),
    **dict.fromkeys(("int", "float", "complex"), "Numeric"),
    **dict.fromkeys(("list", "tuple", "listcomp", "generatorexp"), "Sequence"),
    **dict.fromkeys(("dict", "dictcomp", "kwargs"), "Mapping"),
    **dict.fromkeys(("set", "setcomp"), "Set"),
    **dict.fromkeys(("lambda", "subscript"), "Call"),
    **dict.fromkeys(("binop", "boolop", "unaryop", "compare", "ifexp"), "Operation"),
    "none": "None Type",
    **dict.fromkeys(
        ("method argument", "starred", "variable", "attribute", "yield"), "Variable"
    ),
    "bool": "Bool",
}


def assign_category(type_name: str) -> str:
    """Map a raw type name onto a coarse category label.

    The lookup is case-insensitive and requires an exact match on the whole
    name. The previous implementation tested ``in ("bool")``, which is a
    substring test against the string "bool" (the parentheses do not make a
    tuple), so names such as "", "o" or "bo" were wrongly reported as "Bool".

    Args:
        type_name: Raw type name, e.g. "Str", "ListComp" or "Method Argument".

    Returns:
        The category label, or ``type_name`` unchanged (original casing) when
        the name does not belong to any known category.
    """
    return _TYPE_CATEGORIES.get(type_name.lower(), type_name)

In [11]:
def get_value_types(library_name: str, project_dir: str) -> pd.DataFrame:
    """Count the categories of values passed to parameters of one library.

    Every file matched by ``project_dir`` is loaded as JSON and walked as
    ``project[file][library][key][param]``. Only entries whose ``key`` starts
    with an upper-case character are considered (presumably class names --
    TODO confirm against the extraction step), and the bookkeeping entries
    "variable" and "params" are skipped. The "type" field of each remaining
    parameter is mapped through ``assign_category`` and tallied.

    Args:
        library_name: Library key to look up inside each file entry,
            e.g. "sklearn".
        project_dir: Glob pattern selecting the per-project JSON files.

    Returns:
        DataFrame with the columns "Type" and "Count", one row per category in
        first-seen order. Both columns are present even when nothing matched,
        so callers can index ``df["Type"]`` / ``df["Count"]`` unconditionally.
    """
    value_types = []

    for project in glob.glob(project_dir):
        with open(project, "r", encoding="utf-8") as project_file:
            project_data = json.load(project_file)

        for file_data in project_data.values():
            if library_name not in file_data:
                continue

            for key, data in file_data[library_name].items():
                # key[:1] instead of key[0]: an empty key is skipped, not an IndexError.
                if not key[:1].isupper():
                    continue

                for param, param_data in data.items():
                    if param in ("variable", "params"):
                        continue
                    value_types.append(assign_category(str(param_data["type"])))

    type_data = Counter(value_types)
    # Explicit columns keep the schema stable for an empty tally; building the
    # frame via from_dict(...).reset_index() dropped the "Count" column then.
    return pd.DataFrame(list(type_data.items()), columns=["Type", "Count"])


# Per-library tallies of the value categories found in the statistics files.
df_sklearn = get_value_types("sklearn", "data/statistics/*")
df_tf = get_value_types("tensorflow", "data/statistics/*")
df_torch = get_value_types("torch", "data/statistics/*")

# One pie chart per library, side by side in a single row.
pie_panels = [
    (df_sklearn, "scikit learn"),
    (df_tf, "Tensorflow"),
    (df_torch, "Pytorch"),
]

fig = make_subplots(
    rows=1,
    cols=len(pie_panels),
    specs=[[{"type": "domain"} for _ in pie_panels]],
)
for column, (frame, caption) in enumerate(pie_panels, start=1):
    fig.add_trace(
        go.Pie(labels=frame["Type"], values=frame["Count"], title=caption),
        1,
        column,
    )

fig.update_traces(textposition="inside")
fig.update_layout(
    uniformtext_minsize=12,
    uniformtext_mode="hide",
    width=800,
    height=400,
    autosize=False,
    legend_title="Types of Values",
    legend={"bordercolor": "Black", "borderwidth": 1},
)
fig.show()
fig.write_image("value_types.pdf")


data/statistics\ieee-dsmp-2018-paper_params.json tfidf_classify.py
data/statistics\UQ360_params.json tests/test_MetamodelClassification.py
data/statistics\UQ360_params.json tests/test_MetamodelClassification.py
data/statistics\UQ360_params.json tests/test_MetamodelRegression.py
data/statistics\UQ360_params.json tests/test_MetamodelRegression.py
data/statistics\UQ360_params.json tests/test_MetamodelRegression.py
data/statistics\UQ360_params.json tests/test_MetamodelRegression.py
data/statistics\UQ360_params.json tests/test_MetamodelRegression.py


In [9]:
def get_most_common_type(values: List) -> str:
    """Return the most frequent trailing element among ``values``.

    Only the last element of each item is counted. On a tie the candidate
    that was seen first wins. An empty ``values`` raises ``IndexError``.
    """
    tally = Counter(entry[-1] for entry in values)
    ranked = tally.most_common(1)
    winner, _occurrences = ranked[0]
    return winner

In [10]:
from collections import OrderedDict 

def get_types_of_variables(library_name: str, project_dir: str):
    """Count the categories of the values behind variable-typed parameters.

    Every file matched by ``project_dir`` is loaded as JSON and walked as
    ``project[file][library][key][param]``. Only entries whose ``key`` starts
    with an upper-case character are considered (presumably class names --
    TODO confirm against the extraction step), and the bookkeeping entries
    "variable" and "params" are skipped. Of the remaining parameters, only
    those whose "type" is exactly "variable" are counted: the most common
    trailing element of their "possible_values" is categorised via
    ``assign_category``. When no possible values are recorded, the parameter
    is reported on stdout and counted as "Unknown".

    Args:
        library_name: Library key to look up inside each file entry,
            e.g. "sklearn".
        project_dir: Glob pattern selecting the per-project JSON files.

    Returns:
        A tuple ``(df, type_data_portion)``. ``df`` is a DataFrame with the
        columns "Type" and "Count" (both present even when nothing matched).
        ``type_data_portion`` is an OrderedDict mapping each category to its
        share as a percentage string, ordered from most to least frequent.
        The mapping is also printed.
    """
    variable_types = []

    for project in glob.glob(project_dir):
        with open(project, "r", encoding="utf-8") as project_file:
            project_data = json.load(project_file)

        for file, file_data in project_data.items():
            if library_name not in file_data:
                continue

            for key, data in file_data[library_name].items():
                # key[:1] instead of key[0]: an empty key is skipped, not an IndexError.
                if not key[:1].isupper():
                    continue

                for param, param_data in data.items():
                    if param in ("variable", "params"):
                        continue
                    if param_data["type"] != "variable":
                        continue

                    if param_data["possible_values"]:
                        most_common_type = get_most_common_type(param_data["possible_values"])
                        variable_types.append(assign_category(most_common_type))
                    else:
                        # Nothing to infer the type from: report it and keep it in the tally.
                        print("Project: ", project)
                        print("File: ", file)
                        print("Param: ", param)
                        print("Value: ", param_data["value"])
                        variable_types.append("Unknown")

    type_data = Counter(variable_types)
    # Summed once up front instead of once per category inside the comprehension.
    total = sum(type_data.values())
    type_data_portion = OrderedDict(
        (name, str(round(count / total * 100.0, 3)) + '%')
        for name, count in type_data.most_common()
    )
    # Explicit columns keep the schema stable for an empty tally; building the
    # frame via from_dict(...).reset_index() dropped the "Count" column then.
    df = pd.DataFrame(list(type_data.items()), columns=["Type", "Count"])
    print(type_data_portion)
    return df, type_data_portion


# Per-library tallies (and percentage breakdowns) for variable-typed parameters.
df_sklearn_variables, type_data_sklearn = get_types_of_variables("sklearn", "data/statistics/*")
df_tf_variables, type_data_tf = get_types_of_variables("tensorflow", "data/statistics/*")
df_torch_variables, type_data_torch = get_types_of_variables("torch", "data/statistics/*")

# One pie chart per library, side by side in a single row.
pie_panels = [
    (df_sklearn_variables, "scikit learn"),
    (df_tf_variables, "Tensorflow"),
    (df_torch_variables, "Pytorch"),
]

fig = make_subplots(
    rows=1,
    cols=len(pie_panels),
    specs=[[{"type": "domain"} for _ in pie_panels]],
)
for column, (frame, caption) in enumerate(pie_panels, start=1):
    fig.add_trace(
        go.Pie(labels=frame["Type"], values=frame["Count"], title=caption),
        1,
        column,
    )

fig.update_traces(textposition="inside")
fig.update_layout(
    width=800,
    height=400,
    autosize=False,
    legend_title="Types of Variable Values",
    legend={"bordercolor": "Black", "borderwidth": 1},
)
fig.show()

fig.write_image("variable_types.pdf")

Project:  data/statistics\3PU_params.json
File:  code/utils/pc_util.py
Param:  n_neighbors
Value:  k
Project:  data/statistics\adversarial-logistic_params.json
File:  spam.py
Param:  C
Value:  C_
Project:  data/statistics\aircraft-localization_params.json
File:  winning-entries/round1/3rd Place/calc_sensor_shift2_params.py
Param:  alpha
Value:  ridge_alpha
Project:  data/statistics\An-Information-Retrieval-Approach-to-Building-Datasets-for-Hate-Speech-Detection_params.json
File:  codes/active_learning.py
Param:  analyzer
Value:  text_preprocessing
Project:  data/statistics\An-Information-Retrieval-Approach-to-Building-Datasets-for-Hate-Speech-Detection_params.json
File:  codes/active_learning.py
Param:  solver
Value:  small_data_solver
Project:  data/statistics\An-Information-Retrieval-Approach-to-Building-Datasets-for-Hate-Speech-Detection_params.json
File:  codes/active_learning.py
Param:  C
Value:  small_data_C_parameter
Project:  data/statistics\An-Information-Retrieval-Approach-to