In [1]:
import pandas as pd
import json
import glob
from collections import Counter
from typing import List, Dict
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
# Coarse category -> lower-cased type names that belong to it.
_TYPE_NAMES_BY_CATEGORY = {
    "Text": ("str", "joinedstr"),
    "Numeric": ("int", "float", "complex"),
    "Sequence": ("list", "tuple", "listcomp", "generatorexp"),
    "Mapping": ("dict", "dictcomp", "kwargs"),
    "Set": ("set", "setcomp"),
    "Call": ("lambda", "subscript"),
    "Operation": ("binop", "boolop", "unaryop", "compare", "ifexp"),
    "None Type": ("none",),
    "Variable": ("methodargument", "starred", "variable", "attribute"),
    "Bool": ("bool",),
}

# Inverted view used for the actual lookup: lower-cased type name -> category.
_CATEGORY_BY_TYPE_NAME = {
    name: category
    for category, names in _TYPE_NAMES_BY_CATEGORY.items()
    for name in names
}


def assign_category(type_name: str) -> str:
    """Map a raw type name onto a coarse category label.

    The comparison is case-insensitive and requires an exact match of the
    whole name. The earlier implementation tested ``in ("bool")``, which is a
    substring test against the string ``"bool"``, so inputs such as ``""``
    or ``"o"`` were wrongly reported as ``"Bool"``.

    Args:
        type_name: Name of the type to classify.

    Returns:
        The category label for a known type name; otherwise ``type_name``
        itself, unchanged (original casing preserved).
    """
    return _CATEGORY_BY_TYPE_NAME.get(type_name.lower(), type_name)

In [3]:
def get_value_types(library_name: str, project_dir: str) -> pd.DataFrame:
    """Count the categories of parameter values recorded for one library.

    Every path matched by ``project_dir`` is loaded as JSON. The data is read
    as ``file -> library -> key -> param -> {"type": ...}``. Only entries
    under ``library_name`` whose key starts with an upper-case character are
    considered (presumably class names - TODO confirm against the producer of
    the statistics files). The ``"variable"`` and ``"params"`` entries are
    skipped; every other entry's ``"type"`` is mapped through
    ``assign_category`` and tallied.

    Args:
        library_name: Library key to select inside each file entry.
        project_dir: Glob pattern selecting the JSON statistics files.

    Returns:
        A DataFrame with the columns ``Type`` and ``Count``, one row per
        category in order of first occurrence. Both columns are present even
        when nothing matched, so ``df["Count"]`` never raises ``KeyError``.
    """
    type_counts = Counter()

    for project_path in glob.glob(project_dir):
        # Only the parsing needs the file handle; release it before traversing.
        with open(project_path, "r", encoding="utf-8") as project_file:
            project_data = json.load(project_file)

        for file_data in project_data.values():
            for key, data in file_data.get(library_name, {}).items():
                # key[:1] instead of key[0]: an empty key is skipped, not an IndexError.
                if not key[:1].isupper():
                    continue

                for param, param_data in data.items():
                    if param in ("variable", "params"):
                        continue
                    type_counts[assign_category(str(param_data["type"]))] += 1

    # Explicit columns keep the schema stable for an empty tally as well.
    return pd.DataFrame(list(type_counts.items()), columns=["Type", "Count"])


# Value-category counts per library, each computed from the same statistics files.
df_sklearn = get_value_types("sklearn", "data/statistics/*")
df_tf = get_value_types("tensorflow", "data/statistics/*")
df_torch = get_value_types("torch", "data/statistics/*")

# One pie chart per library, laid out side by side in "domain"-type subplots.
pie_specs = [[{"type": "domain"} for _ in range(3)]]
fig = make_subplots(rows=1, cols=3, specs=pie_specs)

pie_sources = (
    (df_sklearn, "scikit learn"),
    (df_tf, "Tensorflow"),
    (df_torch, "Pytorch"),
)
for pie_col, (pie_frame, pie_title) in enumerate(pie_sources, start=1):
    pie_trace = go.Pie(labels=pie_frame["Type"], values=pie_frame["Count"], title=pie_title)
    fig.add_trace(pie_trace, 1, pie_col)

fig.update_traces(textposition="inside")

# Fixed-size canvas with a boxed legend.
legend_box = {"bordercolor": "Black", "borderwidth": 1}
fig.update_layout(
    uniformtext_minsize=12,
    uniformtext_mode="hide",
    width=800,
    height=400,
    autosize=False,
    legend_title="Types of Values",
    legend=legend_box,
)

fig.show()
fig.write_image("value_types.png")


In [4]:
def get_most_common_type(values: List) -> str:
    """Return the most frequent last element among the entries of ``values``.

    Each entry is indexed with ``[-1]``, and the value that occurs most often
    in that position is returned. Raises ``IndexError`` when ``values`` is
    empty, because there is no most common element to pick.
    """
    tally = Counter(entry[-1] for entry in values)
    winner, _occurrences = tally.most_common(1)[0]
    return winner

In [5]:
def get_types_of_variables(library_name: str, project_dir: str) -> pd.DataFrame:
    """Count the resolved categories of parameters whose value is a variable.

    Every path matched by ``project_dir`` is loaded as JSON. The data is read
    as ``file -> library -> key -> param -> {"type": ..., "possible_values": ...}``.
    Only entries under ``library_name`` whose key starts with an upper-case
    character are considered (presumably class names - TODO confirm), and the
    ``"variable"`` and ``"params"`` entries are skipped. For each remaining
    parameter whose ``"type"`` falls into the ``"Variable"`` category:

    * if ``"possible_values"`` is non-empty, the most common type among them
      (via ``get_most_common_type``) is categorised and counted;
    * otherwise ``"Unknown"`` is counted.

    The resulting counter is printed before the frame is built.

    Args:
        library_name: Library key to select inside each file entry.
        project_dir: Glob pattern selecting the JSON statistics files.

    Returns:
        A DataFrame with the columns ``Type`` and ``Count``, one row per
        category in order of first occurrence. Both columns are present even
        when nothing matched, so ``df["Count"]`` never raises ``KeyError``.
    """
    type_counts = Counter()

    for project_path in glob.glob(project_dir):
        # Only the parsing needs the file handle; release it before traversing.
        with open(project_path, "r", encoding="utf-8") as project_file:
            project_data = json.load(project_file)

        for file_data in project_data.values():
            for key, data in file_data.get(library_name, {}).items():
                # key[:1] instead of key[0]: an empty key is skipped, not an IndexError.
                if not key[:1].isupper():
                    continue

                for param, param_data in data.items():
                    if param in ("variable", "params"):
                        continue
                    if assign_category(str(param_data["type"])) != "Variable":
                        continue

                    possible_values = param_data["possible_values"]
                    if possible_values:
                        resolved = assign_category(get_most_common_type(possible_values))
                    else:
                        resolved = "Unknown"
                    type_counts[resolved] += 1

    print(type_counts)

    # Explicit columns keep the schema stable for an empty tally as well.
    return pd.DataFrame(list(type_counts.items()), columns=["Type", "Count"])


# Resolved variable-type counts per library, each computed from the same statistics files.
df_sklearn_variables = get_types_of_variables("sklearn", "data/statistics/*")
df_tf_variables = get_types_of_variables("tensorflow", "data/statistics/*")
df_torch_variables = get_types_of_variables("torch", "data/statistics/*")

# One pie chart per library, laid out side by side in "domain"-type subplots.
variable_pie_specs = [[{"type": "domain"} for _ in range(3)]]
fig = make_subplots(rows=1, cols=3, specs=variable_pie_specs)

variable_pie_sources = (
    (df_sklearn_variables, "scikit learn"),
    (df_tf_variables, "Tensorflow"),
    (df_torch_variables, "Pytorch"),
)
for pie_col, (pie_frame, pie_title) in enumerate(variable_pie_sources, start=1):
    pie_trace = go.Pie(labels=pie_frame["Type"], values=pie_frame["Count"], title=pie_title)
    fig.add_trace(pie_trace, 1, pie_col)

fig.update_traces(textposition="inside")

# Default canvas size; only the minimum label size and the boxed legend are set.
variable_legend_box = {"bordercolor": "Black", "borderwidth": 1}
fig.update_layout(
    uniformtext_minsize=12,
    legend_title="Types of Variable Values",
    legend=variable_legend_box,
)

fig.show()
fig.write_image("variable_types.pdf")

Counter({'Unknown': 141, 'Variable': 120, 'Call': 82, 'Constant': 37, 'Mapping': 15, 'Name': 14, 'Operation': 10, 'Sequence': 10})
Counter({'Unknown': 4702, 'Call': 1911, 'Variable': 653, 'Operation': 245, 'Constant': 237, 'Sequence': 148, 'Mapping': 73, 'Name': 68, 'Text': 3})
Counter({'Unknown': 37211, 'Variable': 9839, 'Call': 6958, 'Operation': 2635, 'Constant': 1625, 'Name': 793, 'Sequence': 515, 'Mapping': 8, 'Text': 3})


In [6]:
# NOTE: this cell relies on df_sklearn_variables from the previous cell.
# The calls below are kept commented out; uncomment them to recompute the
# frames here instead of reusing the ones already in the kernel.
#df_sklearn_variables = get_types_of_variables("sklearn", "data/statistics/*")
#df_tf_variables = get_types_of_variables("tensorflow", "data/statistics/*")
#df_torch_variables = get_types_of_variables("torch", "data/statistics/*")


# Render the scikit-learn variable-type counts as a LaTeX tabular string,
# leaving out the row index.
df_sklearn_variables.to_latex(index=False)



'\\begin{tabular}{lr}\n\\toprule\n     Type &  Count \\\\\n\\midrule\n  Unknown &    141 \\\\\n Variable &    120 \\\\\n     Call &     82 \\\\\nOperation &     10 \\\\\n Constant &     37 \\\\\n     Name &     14 \\\\\n  Mapping &     15 \\\\\n Sequence &     10 \\\\\n\\bottomrule\n\\end{tabular}\n'