In [16]:
import pandas as pd
import json
import glob
from collections import Counter
from typing import List, Dict
from collections import OrderedDict
import warnings

# Silence every FutureWarning raised after this point so the printed tables
# below are not interleaved with warning text.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [17]:
# Lower-cased type name -> coarse category label. Built once at import time
# from disjoint groups so each name maps to exactly one label.
_CATEGORY_BY_TYPE = {
    name: label
    for label, names in (
        ("Text", ("str", "joinedstr")),
        ("Numeric", ("int", "float", "complex")),
        ("Sequence", ("list", "tuple", "listcomp", "generatorexp")),
        ("Mapping", ("dict", "dictcomp", "kwargs")),
        ("Set", ("set", "setcomp")),
        ("Call", ("lambda", "subscript")),
        ("Operation", ("binop", "boolop", "unaryop", "compare", "ifexp")),
        # Only the exact name "none" is matched; e.g. "NoneType" is not in
        # this table and is therefore returned unchanged by assign_category.
        ("None Type", ("none",)),
        ("Variable", ("method argument", "starred", "variable", "attribute", "yield")),
        ("Bool", ("bool",)),
    )
    for name in names
}


def assign_category(type_name: str) -> str:
    """Map a raw type name onto a coarse category label.

    The lookup is case-insensitive and matches whole names only. The previous
    implementation tested ``type_name.lower() in ("bool")``, which is a
    substring check against the string ``"bool"`` and therefore labelled
    ``""``, ``"o"``, ``"oo"`` etc. as ``"Bool"``; that is fixed here.

    Args:
        type_name: The type name to categorise, e.g. ``"int"`` or ``"ListComp"``.

    Returns:
        The category label for known names, otherwise ``type_name`` unchanged
        (with its original casing).
    """
    return _CATEGORY_BY_TYPE.get(type_name.lower(), type_name)

In [18]:
def get_value_types_all_methods(library_name: str, project_dir: str) -> pd.DataFrame:
    """Tabulate the value-type categories recorded for one library.

    Every file matched by ``project_dir`` is loaded as JSON. For each top-level
    entry in that JSON, the sub-entry named ``library_name`` (if present) is
    scanned; only keys that start with an upper-case letter are considered
    (presumably class-like API names -- confirm against the data producer).
    Within such an entry every item except ``"variable"`` and ``"params"`` is
    categorised via ``assign_category`` on its ``"type"`` field.

    Args:
        library_name: Key of the library section to read, e.g. ``"sklearn"``.
        project_dir: Glob pattern selecting the JSON statistics files.

    Returns:
        A DataFrame with columns ``Type``, ``Count`` and ``portion`` (share of
        the total as a string rounded to one decimal, with a trailing ``%``),
        sorted by ``Count`` descending. When nothing matches, an empty frame
        with those three columns is returned.
    """
    value_types = []

    for project in glob.glob(project_dir):
        with open(project, "r", encoding="utf-8") as project_file:
            project_data = json.load(project_file)

        for file_data in project_data.values():
            if library_name not in file_data:
                continue

            for key, data in file_data[library_name].items():
                # key[:1] rather than key[0]: an empty key is skipped instead
                # of raising IndexError.
                if not key[:1].isupper():
                    continue

                for param, param_data in data.items():
                    if param in ("variable", "params"):
                        continue
                    value_types.append(assign_category(str(param_data["type"])))

    type_data = Counter(value_types)
    if not type_data:
        # Without this guard the frame has no "Count" column and sort_values
        # below raises KeyError.
        return pd.DataFrame(columns=["Type", "Count", "portion"])

    total = sum(type_data.values())
    # Percentages in most_common() order, i.e. by descending count, which is
    # the row order after the sort below (ties share the same percentage).
    portion = [
        str(round(count / total * 100.0, 1)) + '%'
        for _, count in type_data.most_common()
    ]

    df = pd.DataFrame.from_dict(type_data, orient="index").reset_index()
    df = df.rename(columns={'index': 'Type', 0: 'Count'})
    df = df.sort_values(by=['Count'], ascending=False)
    df["portion"] = portion
    return df


# Build one summary table per library from the same set of statistics files.
df_sklearn = get_value_types_all_methods(
    library_name="sklearn", project_dir="../data/statistics/*"
)
df_tf = get_value_types_all_methods(
    library_name="tensorflow", project_dir="../data/statistics/*"
)
df_torch = get_value_types_all_methods(
    library_name="torch", project_dir="../data/statistics/*"
)

# Emit the three tables as LaTeX, one after another, each followed by a newline.
print(*(table.to_latex() for table in (df_sklearn, df_tf, df_torch)), sep="\n")

\begin{tabular}{llrl}
\toprule
{} &       Type &  Count & portion \\
\midrule
0 &   Variable &   1510 &   33.5\% \\
1 &    Numeric &   1414 &   31.4\% \\
2 &       Text &    574 &   12.7\% \\
5 &       Bool &    398 &    8.8\% \\
3 &       Call &    237 &    5.3\% \\
4 &  Operation &    134 &    3.0\% \\
6 &   Sequence &    121 &    2.7\% \\
7 &   NoneType &     67 &    1.5\% \\
8 &    Mapping &     53 &    1.2\% \\
\bottomrule
\end{tabular}

\begin{tabular}{llrl}
\toprule
{} &       Type &  Count & portion \\
\midrule
0 &   Variable &  13237 &   35.1\% \\
2 &       Text &   9113 &   24.2\% \\
1 &    Numeric &   4531 &   12.0\% \\
4 &       Call &   4003 &   10.6\% \\
5 &   Sequence &   3133 &    8.3\% \\
3 &       Bool &   2212 &    5.9\% \\
7 &  Operation &    771 &    2.0\% \\
8 &   NoneType &    539 &    1.4\% \\
6 &    Mapping &    190 &    0.5\% \\
\bottomrule
\end{tabular}

\begin{tabular}{llrl}
\toprule
{} &       Type &  Count & portion \\
\midrule
1 &   Variable &  81679 &   

In [19]:
def get_value_types_ml_methods(library_name: str, library_dir: str, project_dir: str) -> pd.DataFrame:
    """Tabulate value-type categories for a library, restricted to listed classes.

    ``library_dir`` is opened as a single JSON file (despite its name) that
    holds a list of objects with a ``"name"`` field; those names form the
    allow-list. Every file matched by ``project_dir`` is loaded as JSON and the
    section named ``library_name`` of each top-level entry is scanned. A key is
    considered when it starts with an upper-case letter and the part before its
    last underscore (the whole key if it has none) is in the allow-list. Within
    such an entry every item except ``"variable"`` and ``"params"`` is
    categorised via ``assign_category`` on its ``"type"`` field.

    Args:
        library_name: Key of the library section to read, e.g. ``"sklearn"``.
        library_dir: Path of the JSON file listing the class names to keep.
        project_dir: Glob pattern selecting the JSON statistics files.

    Returns:
        A DataFrame with columns ``Type``, ``Count`` and ``portion`` (share of
        the total as a string rounded to two decimals, with a trailing ``%``),
        sorted by ``Count`` descending. When nothing matches, an empty frame
        with those three columns is returned.
    """
    with open(library_dir, "r", encoding="utf-8") as library_file:
        library_data = json.load(library_file)
    # A set gives constant-time membership tests inside the nested loops.
    class_names = {entry["name"] for entry in library_data}

    value_types = []

    for project in glob.glob(project_dir):
        with open(project, "r", encoding="utf-8") as project_file:
            project_data = json.load(project_file)

        for file_data in project_data.values():
            if library_name not in file_data:
                continue

            for key, data in file_data[library_name].items():
                # key[:1] rather than key[0]: an empty key is skipped instead
                # of raising IndexError.
                if not key[:1].isupper():
                    continue

                # Drop the trailing "_<suffix>" if there is one; equivalent to
                # the former split/join on "_" for every possible key.
                class_name = key.rsplit("_", 1)[0]
                if class_name not in class_names:
                    continue

                for param, param_data in data.items():
                    if param in ("variable", "params"):
                        continue
                    value_types.append(assign_category(str(param_data["type"])))

    type_data = Counter(value_types)
    if not type_data:
        # Without this guard the frame has no "Count" column and sort_values
        # below raises KeyError.
        return pd.DataFrame(columns=["Type", "Count", "portion"])

    total = sum(type_data.values())
    # Percentages in most_common() order, i.e. by descending count, which is
    # the row order after the sort below (ties share the same percentage).
    portion = [
        str(round(count / total * 100.0, 2)) + '%'
        for _, count in type_data.most_common()
    ]

    df = pd.DataFrame.from_dict(type_data, orient="index").reset_index()
    df = df.rename(columns={'index': 'Type', 0: 'Count'})
    df = df.sort_values(by=['Count'], ascending=False)
    df["portion"] = portion
    return df


# Same tabulation as before, but limited to the classes listed in each
# library's JSON file. Note: this rebinds df_sklearn, df_tf and df_torch.
df_sklearn = get_value_types_ml_methods(
    library_name="sklearn",
    library_dir="../data/library_data/sklearn_estimators.json",
    project_dir="../data/statistics/*",
)
df_tf = get_value_types_ml_methods(
    library_name="tensorflow",
    library_dir="../data/library_data/tensorflow_optimizer.json",
    project_dir="../data/statistics/*",
)
df_torch = get_value_types_ml_methods(
    library_name="torch",
    library_dir="../data/library_data/torch_optimizer.json",
    project_dir="../data/statistics/*",
)

# Emit the three tables as LaTeX, one after another, each followed by a newline.
print(*(table.to_latex() for table in (df_sklearn, df_tf, df_torch)), sep="\n")

\begin{tabular}{llrl}
\toprule
{} &       Type &  Count & portion \\
\midrule
2 &    Numeric &    678 &  34.04\% \\
0 &   Variable &    601 &  30.17\% \\
1 &       Text &    335 &  16.82\% \\
4 &       Bool &    135 &   6.78\% \\
3 &       Call &     79 &   3.97\% \\
5 &  Operation &     64 &   3.21\% \\
6 &   NoneType &     51 &   2.56\% \\
8 &    Mapping &     34 &   1.71\% \\
7 &   Sequence &     15 &   0.75\% \\
\bottomrule
\end{tabular}

\begin{tabular}{llrl}
\toprule
{} &       Type &  Count & portion \\
\midrule
1 &    Numeric &    196 &  47.23\% \\
0 &   Variable &    160 &  38.55\% \\
3 &       Call &     33 &   7.95\% \\
2 &       Bool &     20 &   4.82\% \\
4 &  Operation &      4 &   0.96\% \\
5 &   NoneType &      2 &   0.48\% \\
\bottomrule
\end{tabular}

\begin{tabular}{llrl}
\toprule
{} &       Type &  Count & portion \\
\midrule
0 &   Variable &   3879 &  57.42\% \\
2 &    Numeric &   1474 &  21.82\% \\
4 &   Sequence &    641 &   9.49\% \\
3 &       Call &    467 &   

In [20]:
def get_most_common_type(values: List) -> str:
    """Return the type that occurs most often among ``values``.

    The type of each item is taken from its last element (``item[-1]``).

    Args:
        values: Sequence of indexable items whose last element is a type name.

    Returns:
        The single most frequent type; ``"Multiple Types"`` when two or more
        types share the highest count; ``"Unknown"`` when ``values`` is empty
        (previously this raised ``IndexError``).
    """
    if not values:
        return "Unknown"

    type_counts = Counter(item[-1] for item in values)

    # The two highest counts are enough to tell whether the top spot is shared.
    ranked = type_counts.most_common(2)
    if len(ranked) > 1 and ranked[0][1] == ranked[1][1]:
        return "Multiple Types"
    return ranked[0][0]

In [21]:
from collections import OrderedDict 

def get_types_of_variables_ml_methods(library_name: str, library_dir: str, project_dir: str) -> pd.DataFrame:
    """Tabulate the resolved types of variable-valued entries for listed classes.

    ``library_dir`` is opened as a single JSON file (despite its name) that
    holds a list of objects with a ``"name"`` field; those names form the
    allow-list. Every file matched by ``project_dir`` is loaded as JSON and the
    section named ``library_name`` of each top-level entry is scanned. A key is
    considered when it starts with an upper-case letter and the part before its
    last underscore (the whole key if it has none) is in the allow-list.

    Within such an entry, only items whose ``"type"`` equals ``"variable"``
    (exact, case-sensitive comparison) are counted. If the item has a non-empty
    ``"possible_values"`` list, the result of ``get_most_common_type`` on it is
    categorised with ``assign_category``; otherwise the item is counted as
    ``"Unknown"``.

    Args:
        library_name: Key of the library section to read, e.g. ``"sklearn"``.
        library_dir: Path of the JSON file listing the class names to keep.
        project_dir: Glob pattern selecting the JSON statistics files.

    Returns:
        A DataFrame with columns ``Type``, ``Count`` and ``portion`` (share of
        the total as a string rounded to three decimals, with a trailing
        ``%``), sorted by ``Count`` descending. When nothing matches, an empty
        frame with those three columns is returned.
    """
    with open(library_dir, "r", encoding="utf-8") as library_file:
        library_data = json.load(library_file)
    # A set gives constant-time membership tests inside the nested loops.
    class_names = {entry["name"] for entry in library_data}

    variable_types = []

    for project in glob.glob(project_dir):
        with open(project, "r", encoding="utf-8") as project_file:
            project_data = json.load(project_file)

        for file_data in project_data.values():
            if library_name not in file_data:
                continue

            for key, data in file_data[library_name].items():
                # key[:1] rather than key[0]: an empty key is skipped instead
                # of raising IndexError.
                if not key[:1].isupper():
                    continue

                # Drop the trailing "_<suffix>" if there is one; equivalent to
                # the former split/join on "_" for every possible key.
                class_name = key.rsplit("_", 1)[0]
                if class_name not in class_names:
                    continue

                for param, param_data in data.items():
                    if param in ("variable", "params"):
                        continue
                    if param_data["type"] != "variable":
                        continue

                    possible_values = param_data["possible_values"]
                    if possible_values:
                        most_common_type = get_most_common_type(possible_values)
                        variable_types.append(assign_category(most_common_type))
                    else:
                        variable_types.append("Unknown")

    type_data = Counter(variable_types)
    if not type_data:
        # Without this guard the frame has no "Count" column and sort_values
        # below raises KeyError.
        return pd.DataFrame(columns=["Type", "Count", "portion"])

    total = sum(type_data.values())
    # Percentages in most_common() order, i.e. by descending count, which is
    # the row order after the sort below (ties share the same percentage).
    portion = [
        str(round(count / total * 100.0, 3)) + '%'
        for _, count in type_data.most_common()
    ]

    df = pd.DataFrame.from_dict(type_data, orient="index").reset_index()
    df = df.rename(columns={'index': 'Type', 0: 'Count'})
    df = df.sort_values(by=['Count'], ascending=False)
    df["portion"] = portion

    return df

# Resolve what the variable-valued entries of each library's listed classes
# most commonly turn out to be.
df_sklearn_variables = get_types_of_variables_ml_methods(
    library_name="sklearn",
    library_dir="../data/library_data/sklearn_estimators.json",
    project_dir="../data/statistics/*",
)
df_tf_variables = get_types_of_variables_ml_methods(
    library_name="tensorflow",
    library_dir="../data/library_data/tensorflow_optimizer.json",
    project_dir="../data/statistics/*",
)
df_torch_variables = get_types_of_variables_ml_methods(
    library_name="torch",
    library_dir="../data/library_data/torch_optimizer.json",
    project_dir="../data/statistics/*",
)

# Emit the three tables as LaTeX, one after another, each followed by a newline.
print(
    *(
        table.to_latex()
        for table in (df_sklearn_variables, df_tf_variables, df_torch_variables)
    ),
    sep="\n",
)

\begin{tabular}{llrl}
\toprule
{} &            Type &  Count &  portion \\
\midrule
0 &        Variable &    157 &  37.381\% \\
4 &            Call &     86 &  20.476\% \\
1 &         Numeric &     62 &  14.762\% \\
3 &  Multiple Types &     39 &   9.286\% \\
2 &       Operation &     35 &   8.333\% \\
5 &         Unknown &     29 &   6.905\% \\
6 &        Sequence &     10 &   2.381\% \\
7 &            Text &      2 &   0.476\% \\
\bottomrule
\end{tabular}

\begin{tabular}{llrl}
\toprule
{} &            Type &  Count &  portion \\
\midrule
0 &         Numeric &     43 &  35.537\% \\
2 &        Variable &     26 &  21.488\% \\
3 &  Multiple Types &     26 &  21.488\% \\
1 &            Call &     13 &  10.744\% \\
4 &         Unknown &     10 &   8.264\% \\
5 &       Operation &      2 &   1.653\% \\
6 &        Sequence &      1 &   0.826\% \\
\bottomrule
\end{tabular}

\begin{tabular}{llrl}
\toprule
{} &            Type &  Count &  portion \\
\midrule
2 &        Variable &    542 &  42