In [None]:
# default_exp evaluation.core

In [None]:
# hide
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import tensorflow as tf
# Let TensorFlow grow GPU memory on demand instead of reserving it all up front.
# Iterating over the (possibly empty) device list keeps this cell runnable on
# CPU-only machines, and applies the same setting to every visible GPU.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, enable=True)

# Evaluation

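> Metrics for evaluating code-generation models: per-token mean probabilities, probabilities of closing tokens by distance from their opening token, error rates by token category, and cross entropy.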

In [None]:
# export
import numpy as np
import pandas as pd
import tensorflow as tf

from collections import Counter, defaultdict
from icodegen.data.transforms import (
    code_token_randomizer,
    line_randomizer,
    java_comment_remover,
    transform_df,
)
from icodegen.model.core import Model, RNNModel
from pathlib import Path
from scipy import stats
from typing import Dict, List, Optional

In [None]:
# hide
# Setting up testing data
from transformers import GPT2TokenizerFast, TFGPT2LMHeadModel
# NOTE(review): RNNModel is already imported in the export cell above; this second import is redundant.
from icodegen.model.core import RNNModel

# Using tiny-gpt2 for just quick tests since it is... tiny :)
trnsfr_tokenizer = GPT2TokenizerFast.from_pretrained("sshleifer/tiny-gpt2")
# The RNN fixture below shares the transformer's backend tokenizer (and therefore its vocabulary size).
tokenizer = trnsfr_tokenizer.backend_tokenizer
trnsfr = TFGPT2LMHeadModel.from_pretrained("sshleifer/tiny-gpt2")
# NOTE(review): the construction below is commented out and `TransformerModel` is not imported,
# yet later test cells reference `trnsfr_model` — presumably they fail with NameError on a fresh kernel.
# trnsfr_model = TransformerModel(tokenizer, trnsfr)

# Small GRU model used as the RNN test fixture.
rnn_type = "gru"
n_layers = 1
vocab_size = tokenizer.get_vocab_size()
embedding_dim = 128
rnn_units = 128
batch_size = 1
out_path = "/tmp"
gru_model = RNNModel(
    rnn_type,
    n_layers,
    vocab_size,
    embedding_dim,
    rnn_units,
    batch_size,
    out_path,
    tokenizer,
)

# Two toy "methods" with nested parentheses, used by the checks below.
df_fake = pd.DataFrame(
    ["aaaa(bb(aaaa(bb()()ccc)dd)()ccc)dd", "aaaa(bb()ccccc)dd"], columns=["code"]
)

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at sshleifer/tiny-gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [None]:
# export
def get_mean_probs(df: pd.DataFrame, model: Model, n: Optional[int] = None):
    """
    Average, per vocabulary token, the probability the model assigned to that
    token at every position where it occurs in the scored methods.

    :param df: the pandas dataframe whose ``code`` column holds the methods to score
    :param model: the model used to tokenize the methods and generate the probabilities
    :param n: the number of methods to evaluate. If none, the entire dataframe will be used
    :returns: returns a dictionary mapping each token in the model's vocab to its mean
        probability (``nan`` for tokens that never occur in the scored methods)
    """
    limit = len(df) if n is None else n
    vocab_size = model.tokenizer.get_vocab_size()

    # per-token running totals, indexed by token id
    occurrences = [0] * vocab_size
    prob_totals = [0.0] * vocab_size
    for method in df.code.values[:limit]:
        encoded = model.tokenize(method)
        position_probs = model.get_probs(encoded)[0].numpy()

        # pair each token id with the probability row at the same position
        for token_id, row in zip(encoded["input_ids"][0], position_probs):
            occurrences[token_id] += 1
            prob_totals[token_id] += row[token_id]

    occurrences = np.array(occurrences)
    prob_totals = np.array(prob_totals)

    # divide only where the token was seen; unseen tokens keep the nan pre-fill
    means = np.divide(
        prob_totals,
        occurrences,
        out=np.full(occurrences.shape, np.nan),
        where=occurrences != 0,
    )
    return {
        model.tokenizer.id_to_token(token_id): mean
        for token_id, mean in enumerate(means)
    }

In [None]:
# Regression check for get_mean_probs with the transformer fixture: compares the non-nan
# mean probabilities for `df_fake` against previously recorded values.
# NOTE(review): `trnsfr_model` is not assigned in any visible cell (its construction is
# commented out in the setup cell), so this presumably raises NameError on a fresh kernel.
NON_NAN_PROBS_MEAN = np.array(
    [
        2.01237513e-05,
        1.98944481e-05,
        2.01449202e-05,
        2.04353437e-05,
        2.02043060e-05,
        2.02826177e-05,
        2.09888076e-05,
        2.07051467e-05,
        1.98100976e-05,
        2.02152678e-05,
        2.02035244e-05,
        2.10283021e-05,
    ]
)

mean_probs = np.array(list(get_mean_probs(df_fake, trnsfr_model).values()))
# Keep only tokens that actually occur in df_fake (unseen tokens are nan).
non_nan_idx = np.argwhere(~np.isnan(mean_probs)).flatten()
non_nan_mean_prob = mean_probs[non_nan_idx]

assert np.isclose(non_nan_mean_prob, NON_NAN_PROBS_MEAN, atol=1.0e-6).all()

In [None]:
# Regression check for get_mean_probs with the GRU fixture.
# NOTE(review): `gru_model` is built in the setup cell with no visible seeding or weight
# loading — confirm these recorded values are reproducible. The absolute tolerance of 1e-6
# is roughly 5% of values near 2e-5, so the check is fairly loose.
NON_NAN_PROBS_MEAN = np.array(
    [
        1.99270412e-05,
        1.99168703e-05,
        1.98815596e-05,
        1.99057849e-05,
        1.98800869e-05,
        1.98893995e-05,
        1.98797388e-05,
        1.98960342e-05,
        1.99086674e-05,
        1.98605580e-05,
        1.98807957e-05,
        1.98842057e-05,
    ]
)

mean_probs = np.array(list(get_mean_probs(df_fake, gru_model).values()))
# Keep only tokens that actually occur in df_fake (unseen tokens are nan).
non_nan_idx = np.argwhere(~np.isnan(mean_probs)).flatten()
non_nan_mean_prob = mean_probs[non_nan_idx]

assert np.isclose(non_nan_mean_prob, NON_NAN_PROBS_MEAN, atol=1.0e-6).all()

In [None]:
# export
def find_parens(toks: List[str], opening: str, closing: str) -> Dict[int, int]:
    """
    Get the indices for the opening and closing tokens.
    From https://stackoverflow.com/a/29992065/5768407
    by user Baltasarq (https://stackoverflow.com/users/266978/baltasarq).

    :param toks: the tokenized version of a method
    :param opening: the opening token that will be matched against the closing token
    :param closing: the closing token that will be matched against the opening token
    :returns: returns a dictionary with the opening token indices as the keys and the closing token indices as the values
    :raises IndexError: if a closing token appears with no unmatched opening token before it,
        or if an opening token is never closed
    """
    matches = {}
    # indices of opening tokens that have not been closed yet (innermost last)
    open_stack = []

    for i, tok in enumerate(toks):
        if tok == opening:
            open_stack.append(i)
        elif tok == closing:
            if not open_stack:
                # a closing token with nothing left to close: its *opening* partner is missing
                raise IndexError("No matching opening parens at: " + str(i))
            matches[open_stack.pop()] = i

    if open_stack:
        # an opening token that was never closed: its *closing* partner is missing
        raise IndexError("No matching closing parens at: " + str(open_stack.pop()))

    return matches


def _get_dist_probs(
    mthd: str, model: Model, opening: str, closing: str
) -> Dict[int, float]:
    """
    For one method, average the probability of each closing token grouped by how
    far (in tokens) it sits from its matching opening token.

    :param mthd: the method whose opening/closing pairs are measured
    :param model: the model used to tokenize the method and generate the probabilities
    :param opening: the opening token of each pair
    :param closing: the closing token of each pair; its probability is what gets averaged
    :returns: returns a dictionary with the distance between the opening and closing tokens as keys and their mean probabilities as values
    """
    # WARNING: Careful when using different tokenizers since HF tokenizers lib have diff API then HF transformers lib tokenizers... You will need to update this when using custom model and tokenizer...

    # locate matching pairs on the raw tokenizer output
    # NOTE(review): pair indices come from `tokenizer.encode` but are used to index the output of
    # `model.tokenize` — this assumes both produce the same token positions; confirm.
    tokens = model.tokenizer.encode(mthd).tokens
    pairs = find_parens(tokens, opening, closing)

    encoded = model.tokenize(mthd)
    position_probs = model.get_probs(encoded)[0].numpy()

    # accumulate, per distance, the summed probability and the number of pairs
    totals = defaultdict(float)
    pair_counts = Counter()
    for open_idx, close_idx in pairs.items():
        distance = close_idx - open_idx
        totals[distance] += position_probs[close_idx][
            encoded["input_ids"][0][close_idx]
        ]
        pair_counts[distance] += 1

    return {
        distance: totals[distance] / count for distance, count in pair_counts.items()
    }


def mean_dist_probs(
    df: pd.DataFrame,
    model: Model,
    opening: Optional[str] = "<{>",
    closing: Optional[str] = "<}>",
    n: Optional[int] = None,
) -> pd.DataFrame:
    """
    Summarise, over a dataframe of methods, the probability of closing tokens as a
    function of their distance from the matching opening token.

    :param df: the pandas dataframe containing each method to have the model predict on
    :param model: the model used to generate the predictions
    :param opening: the opening token used for calculating the distance between opening and closing tokens
    :param closing: the closing token used for calculating the distance between opening and closing tokens as well as the token to get the mean probability of
    :param n: the number of methods to evaluate. If none, the entire dataframe will be used
    :returns: returns a dataframe sorted by ``dist`` with, for each distance, the mean, standard
        deviation, median and median absolute deviation of the per-method mean probabilities
    """
    limit = len(df) if n is None else n

    # one {distance: mean probability} dictionary per method
    per_method = [
        _get_dist_probs(mthd, model, opening, closing)
        for mthd in df.iloc[:limit].code.values
    ]

    # every distance observed in at least one method, in ascending order
    distances = sorted({dist for probs in per_method for dist in probs})

    summary = {
        "dist": [],
        "mean_prob": [],
        "std_prob": [],
        "med_prob": [],
        "mad_prob": [],
    }
    for dist in distances:
        # methods without a pair at this distance contribute nan, which the nan-aware
        # statistics below ignore
        samples = np.array([probs.get(dist, np.nan) for probs in per_method])
        summary["dist"].append(dist)
        summary["mean_prob"].append(np.nanmean(samples))
        summary["std_prob"].append(np.nanstd(samples))
        summary["med_prob"].append(np.nanmedian(samples))
        summary["mad_prob"].append(
            stats.median_abs_deviation(samples, nan_policy="omit")
        )

    return pd.DataFrame(summary)

In [None]:
# Regression check for mean_dist_probs with the GRU fixture, using "(" / ")" as the pair.
# NOTE(review): for the distances whose std is 0 the recorded mean and median differ
# (e.g. 1.976e-05 vs 2.072e-05); the assertions only pass or fail within atol=1e-6,
# which is about 5% of these values — confirm the recorded numbers are intended.
DIST_DF = pd.DataFrame(
    {
        "dist": [6, 10, 16],
        "mean_prob": [
            1.98822217e-05,
            1.97613608e-05,
            1.97816771e-05,
        ],
        "std_prob": [
            4.93400876e-09,
            0.00000000e00,
            0.00000000e00,
        ],
        "med_prob": [
            2.04683793e-05,
            2.07205376e-05,
            1.97817026e-05,
        ],
        "mad_prob": [
            4.93400876e-09,
            0.00000000e00,
            0.00000000e00,
        ],
    }
)
df_dist = mean_dist_probs(df_fake, gru_model, opening="(", closing=")")

assert (DIST_DF.dist.values == df_dist.dist.values).all()
assert np.isclose(DIST_DF.mean_prob.values, df_dist.mean_prob.values, atol=1.0e-6).all()
assert np.isclose(DIST_DF.std_prob.values, df_dist.std_prob.values, atol=1.0e-6).all()
assert np.isclose(DIST_DF.med_prob.values, df_dist.med_prob.values, atol=1.0e-6).all()
assert np.isclose(DIST_DF.mad_prob.values, df_dist.mad_prob.values, atol=1.0e-6).all()

In [None]:
# Regression check for mean_dist_probs with the transformer fixture, using "(" / ")" as the pair.
# NOTE(review): `trnsfr_model` is not assigned in any visible cell, so this presumably raises
# NameError on a fresh kernel. The expected values are identical to those in the GRU check —
# confirm they were actually recorded from the transformer model.
DIST_DF = pd.DataFrame(
    {
        "dist": [6, 10, 16],
        "mean_prob": [
            1.98822217e-05,
            1.97613608e-05,
            1.97816771e-05,
        ],
        "std_prob": [
            4.93400876e-09,
            0.00000000e00,
            0.00000000e00,
        ],
        "med_prob": [
            2.04683793e-05,
            2.07205376e-05,
            1.97817026e-05,
        ],
        "mad_prob": [
            4.93400876e-09,
            0.00000000e00,
            0.00000000e00,
        ],
    }
)
df_dist = mean_dist_probs(df_fake, trnsfr_model, opening="(", closing=")")

assert (DIST_DF.dist.values == df_dist.dist.values).all()
assert np.isclose(DIST_DF.mean_prob.values, df_dist.mean_prob.values, atol=1.0e-6).all()
assert np.isclose(DIST_DF.std_prob.values, df_dist.std_prob.values, atol=1.0e-6).all()
assert np.isclose(DIST_DF.med_prob.values, df_dist.med_prob.values, atol=1.0e-6).all()
assert np.isclose(DIST_DF.mad_prob.values, df_dist.mad_prob.values, atol=1.0e-6).all()

In [None]:
# export
# Taxonomy of Java tokens used to group error rates by category.
# Outer keys are category names; each inner dictionary maps the tokenizer's special-token
# spelling (e.g. "<{>") to the source text it stands for (e.g. "{").
# The error-rate functions in this module look tokens up by the special-token spelling.
token_taxonomy = {
  "blocks": {
    "<{>": "{",
    "<}>": "}",
    "<[>": "[",
    "<]>": "]",
    "<(>": "(",
    "<)>": ")",
    "<;>": ";",
    "<return>": "return"
  },
  "exceptions": {
    "<catch>": "catch",
    "<try>": "try",
    "<finally>": "finally",
    "<throw>": "throw",
    "<throws>": "throws"
  },
  "oop": {
    "<class>": "class",
    "<instanceof>": "instanceof",
    "<interface>": "interface",
    "<private>": "private",
    "<protected>": "protected",
    "<public>": "public",
    "<abstract>": "abstract",
    "<extends>": "extends",
    "<package>": "package",
    "<this>": "this",
    "<implements>": "implements",
    "<import>": "import",
    "<new>": "new",
    "<super>": "super"
  },
  "tests": {
    "<assert>": "assert"
  },
  "declarations": {
    "<native>": "native",
    "<static>": "static",
    "<synchronized>": "synchronized",
    "<transient>": "transient",
    "<volatile>": "volatile",
    "<void>": "void",
    "<final>": "final",
    "<enum>": "enum"
  },
  "conditionals": {
    "<else>": "else",
    "<if>": "if",
    "<switch>": "switch",
    "<case>": "case",
    "<default>": "default"
  },
  "loops": {
    "<break>": "break",
    "<do>": "do",
    "<for>": "for",
    "<while>": "while",
    "<continue>": "continue"
  },
  "operators": {
    "<=>": "=",
    "<+>": "+",
    "<->": "-",
    "<*>": "*",
    "</>": "/",
    "<%>": "%",
    "<++>": "++",
    "<-->": "--",
    "<!>": "!",
    "<==>": "==",
    "<!=>": "!=",
    "<greater_equal>": ">=",
    "<lesser_equal>": "<=",
    "<&&>": "&&",
    "<||>": "||",
    "<?>": "?",
    "<:>": ":",
    "<~>": "~",
    "<double_lesser>": "<<",
    "<double_greater>": ">>",
    "<triple_greater>": ">>>",
    "<&>": "&",
    "<^>": "^",
    "<|>": "|"
  },
  "datatypes": {
    "<byte>": "byte",
    "<char>": "char",
    "<float>": "float",
    "<boolean>": "boolean",
    "<double>": "double",
    "<int>": "int",
    "<long>": "long",
    "<short>": "short",
    "<strictfp>": "strictfp"
  },
  "extra_tokens": {
    "<@>": "@",
    "<...>": "...",
    "<null>": "null",
    "<true>": "true",
    "<false>": "false",
    "<n>": "\n"
  }
}

In [None]:
# export
ERROR_THRESHOLD = 0.5


def get_error_rates(
    df: pd.DataFrame,
    model: Model,
    n: Optional[int] = None,
    taxonomy: Optional[Dict[str, Dict[str, str]]] = None,
    threshold: Optional[float] = None,
):
    """
    Get the error rate of each token in a taxonomy, plus summary statistics per category.

    An occurrence of a token counts as an error when the probability the model assigned
    to it is below ``threshold``.

    :param df: the pandas dataframe whose ``code`` column holds the methods to score
    :param model: the model used to tokenize the methods and generate the probabilities
    :param n: the number of methods to evaluate. If none, the entire dataframe will be used
    :param taxonomy: mapping of category name to a dictionary keyed by token. If none, the
        module-level ``token_taxonomy`` is used. The mapping is only read, never modified
    :param threshold: probability below which an occurrence is an error. If none,
        ``ERROR_THRESHOLD`` is used
    :returns: returns a new dictionary of the form
        ``{category: {token: {"error_rate", "count"}, ..., "stats": {...}}}``. Tokens that never
        occur, or that the tokenizer does not know, get an error rate of ``nan`` and a count of 0
    """
    if n is None:
        n = len(df)
    if taxonomy is None:
        taxonomy = token_taxonomy
    if threshold is None:
        threshold = ERROR_THRESHOLD

    # per-token occurrence and error counters, indexed by token id
    vocab_size = model.tokenizer.get_vocab_size()
    cnts = [0] * vocab_size
    err_cnts = [0] * vocab_size
    for mthd in df.code.values[:n]:
        # tokenize the method and generate the probabilities for the model's predictions
        inputs = model.tokenize(mthd)
        probs = model.get_probs(inputs)[0].numpy()

        for idx, p in zip(inputs["input_ids"][0], probs):
            cnts[idx] += 1
            if p[idx] < threshold:
                err_cnts[idx] += 1

    cnts = np.array(cnts)
    err_cnts = np.array(err_cnts)

    # error rate per token id; tokens that never occur keep the nan pre-fill
    mean_errs = np.divide(
        err_cnts, cnts, out=np.full(cnts.shape, np.nan), where=cnts != 0
    )

    # Build a fresh result instead of writing into (a shallow copy of) the taxonomy:
    # a shallow copy shares the inner dictionaries, so writing results into it would
    # overwrite the caller's taxonomy and break the next call.
    error_taxonomy = {}
    for cat, tokens in taxonomy.items():
        cat_report = {}
        errs = []
        for token in tokens:
            idx = model.tokenizer.token_to_id(token)
            if idx is None:
                # token is not in this tokenizer's vocabulary
                rate, count = np.nan, 0
            else:
                rate, count = mean_errs[idx], cnts[idx]
            cat_report[token] = {"error_rate": rate, "count": count}
            errs.append(rate)

        errs = np.array(errs)
        cat_report["stats"] = {
            "mean_error_rate": np.nanmean(errs),
            "stdev_error_rate": np.nanstd(errs),
            "median_error_rate": np.nanmedian(errs),
            "mad_error_rate": stats.median_abs_deviation(errs, nan_policy="omit"),
        }
        error_taxonomy[cat] = cat_report

    return error_taxonomy

In [None]:
# Smoke run of get_error_rates on the first 10 buggy methods with a trained GRU checkpoint.
# NOTE(review): the dataset and checkpoint paths are absolute and machine-specific, so this
# cell only runs where that directory layout exists. It also rebinds the name `model`.
bugfix_path = Path("/home/jovyan/work/dvc-icodegen/datasets/controlled/testbeds/_ts_bug_fix")
df_buggy = pd.read_json(bugfix_path / "buggy.jsonl", orient="records", lines=True)[
    :10
]
model = RNNModel.from_path("/home/jovyan/work/dvc-icodegen/models/gru_layers1_vocab10000_embed256_units512")
err_tax = get_error_rates(df_buggy, model)
err_tax

{'blocks': {'<{>': {'error_rate': 1.0, 'count': 27},
  '<}>': {'error_rate': 1.0, 'count': 27},
  '<[>': {'error_rate': 1.0, 'count': 3},
  '<]>': {'error_rate': 1.0, 'count': 3},
  '<(>': {'error_rate': 1.0, 'count': 81},
  '<)>': {'error_rate': 0.8888888888888888, 'count': 81},
  '<;>': {'error_rate': 1.0, 'count': 53},
  '<return>': {'error_rate': 1.0, 'count': 10},
  'stats': {'mean_error_rate': 0.9861111111111112,
   'stdev_error_rate': 0.03674654598700822,
   'median_error_rate': 1.0,
   'mad_error_rate': 0.0}},
 'exceptions': {'<catch>': {'error_rate': 1.0, 'count': 1},
  '<try>': {'error_rate': 1.0, 'count': 1},
  '<finally>': {'error_rate': nan, 'count': 0},
  '<throw>': {'error_rate': nan, 'count': 0},
  '<throws>': {'error_rate': 1.0, 'count': 1},
  'stats': {'mean_error_rate': 1.0,
   'stdev_error_rate': 0.0,
   'median_error_rate': 1.0,
   'mad_error_rate': 0.0}},
 'oop': {'<class>': {'error_rate': 1.0, 'count': 1},
  '<instanceof>': {'error_rate': nan, 'count': 0},
  '<in

In [None]:
# export
ERROR_THRESHOLD = 0.5


def get_error_rates_df(
    df: pd.DataFrame,
    model: Model,
    bs: int = 16,
    n: Optional[int] = None,
    taxonomy: Optional[Dict[str, Dict[str, str]]] = None,
    threshold: Optional[float] = None,
):
    """
    Get, for every method, one error score per taxonomy category.

    An occurrence of a taxonomy token counts as an error when the probability the model
    assigned to it is below ``threshold``. A category's score for a method is the number
    of such errors divided by the vocabulary size (not by the number of occurrences).

    :param df: the pandas dataframe whose ``code`` column holds the methods to score
    :param model: the model used to tokenize the methods and generate the probabilities
    :param bs: currently unused (methods are scored one at a time); kept so existing
        callers keep working
    :param n: the number of methods to evaluate. If none, the entire dataframe will be used
    :param taxonomy: mapping of category name to a dictionary keyed by token. If none, the
        module-level ``token_taxonomy`` is used
    :param threshold: probability below which an occurrence is an error. If none,
        ``ERROR_THRESHOLD`` is used
    :returns: returns a dataframe with one row per method, one column per category and a
        final ``code`` column holding the method
    """
    if n is None:
        n = len(df)
    if taxonomy is None:
        taxonomy = token_taxonomy
    if threshold is None:
        threshold = ERROR_THRESHOLD

    vocab_size = model.tokenizer.get_vocab_size()

    # reverse lookup built once: token -> categories that contain it
    categories_of = {}
    for cat, tokens in taxonomy.items():
        for token in tokens:
            categories_of.setdefault(token, []).append(cat)

    rows = []
    for mthd in df.code.values[:n]:
        # tokenize the method and generate the probabilities for the model's predictions
        inputs = model.tokenize(mthd)
        probs = model.get_probs(inputs)[0].numpy()

        err_counts = dict.fromkeys(taxonomy, 0)
        for idx, p in zip(inputs["input_ids"][0], probs):
            if p[idx] < threshold:
                tok = model.tokenizer.id_to_token(idx)
                for cat in categories_of.get(tok, ()):
                    err_counts[cat] += 1

        # Equal to the mean of a vocab-sized vector of per-token error counts, without
        # allocating that vector for every category of every method.
        rows.append(
            {cat: np.float64(count) / vocab_size for cat, count in err_counts.items()}
        )

    error_df = pd.DataFrame(rows)
    error_df["code"] = df.code.values[:n]

    return error_df

In [None]:
# Load the first 100 buggy methods and a trained RNN checkpoint for the per-method error rates below.
# NOTE(review): absolute, machine-specific paths; this cell rebinds `bugfix_path`, `df_buggy` and `model`.
bugfix_path = Path("/home/jovyan/work/dvc-icodegen/datasets/controlled/testbeds/_ts_bug_fix")
df_buggy = pd.read_json(bugfix_path / "buggy.jsonl", orient="records", lines=True)[
    :100
]
model = RNNModel.from_path("/home/jovyan/work/dvc-icodegen/models/controlled/rnns/rnn_layers1_vocab10000_embed256_units1024")

In [None]:
# Per-method error scores for each taxonomy category, summarised with describe().
err_tax = get_error_rates_df(df_buggy, model)
err_tax.describe()

Unnamed: 0,blocks,exceptions,oop,tests,declarations,conditionals,loops,operators,datatypes,extra_tokens
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,0.002949,5.2e-05,0.000232,1.3e-05,8.7e-05,0.000141,7e-05,0.000415,0.000124,0.00106
std,0.000782,0.000102,0.000196,5.8e-05,0.000101,0.000213,0.000124,0.000299,0.000187,0.00042
min,0.0013,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0003
25%,0.00255,0.0,0.0001,0.0,0.0,0.0,0.0,0.0002,0.0,0.0008
50%,0.0029,0.0,0.0002,0.0,0.0001,0.0001,0.0,0.00035,5e-05,0.001
75%,0.0035,0.0001,0.0003,0.0,0.0001,0.0002,0.0001,0.000525,0.0002,0.0013
max,0.0049,0.0005,0.0015,0.0003,0.0007,0.0017,0.0007,0.0016,0.0008,0.0028


In [None]:
# export
def get_mean_cross_entropy(df: pd.DataFrame, model: Model, n: Optional[int] = None):
    """
    Summarise the cross entropy of a model's predictions over a pandas dataframe of methods.

    :param df: the pandas dataframe whose ``code`` column holds the methods to score
    :param model: the model used to tokenize the methods and generate the probabilities
    :param n: the number of methods to evaluate. If none, the entire dataframe will be used
    :returns: returns a dictionary with the ``mean``, ``median``, ``std`` and ``mad`` (median
        absolute deviation) of the cross entropy losses pooled across all scored methods
    """
    limit = len(df) if n is None else n

    def _method_losses(mthd):
        # cross entropy between the method's own token ids (labels) and the predicted probabilities
        encoded = model.tokenize(mthd)
        token_probs = model.get_probs(encoded)[0].numpy()
        return tf.keras.losses.sparse_categorical_crossentropy(
            encoded["input_ids"], token_probs
        ).numpy()

    # pool the losses of every method into one flat array before summarising
    pooled = np.concatenate([_method_losses(mthd) for mthd in df.code.values[:limit]])
    return {
        "mean": np.mean(pooled),
        "median": np.median(pooled),
        "std": np.std(pooled),
        "mad": stats.median_abs_deviation(pooled),
    }

In [None]:
# Check get_mean_cross_entropy against a reference computed inline with the GRU fixture.
# NOTE(review): the reference loop repeats the same steps as the function body, so these
# assertions verify consistency between the two rather than independently known values.
cross_entropy_losses = []
for mthd in df_fake.code.values:
    inputs = gru_model.tokenize(mthd)
    probs = gru_model.get_probs(inputs)[0].numpy()

    losses = tf.keras.losses.sparse_categorical_crossentropy(
        inputs["input_ids"], probs
    ).numpy()
    cross_entropy_losses.append(losses)

CROSS_ENTROPY_MEAN = np.mean(np.concatenate(cross_entropy_losses))
CROSS_ENTROPY_MEDIAN = np.median(np.concatenate(cross_entropy_losses))
CROSS_ENTROPY_STD = np.std(np.concatenate(cross_entropy_losses))
CROSS_ENTROPY_MAD = stats.median_abs_deviation(np.concatenate(cross_entropy_losses))
cross_entropy = get_mean_cross_entropy(df_fake, gru_model)

assert np.isclose(CROSS_ENTROPY_MEAN, cross_entropy["mean"], atol=1.0e-6)
assert np.isclose(CROSS_ENTROPY_MEDIAN, cross_entropy["median"], atol=1.0e-6)
assert np.isclose(CROSS_ENTROPY_STD, cross_entropy["std"], atol=1.0e-6)
assert np.isclose(CROSS_ENTROPY_MAD, cross_entropy["mad"], atol=1.0e-6)

In [None]:
# Check get_mean_cross_entropy against a reference computed inline with the transformer fixture.
# NOTE(review): `trnsfr_model` is not assigned in any visible cell (its construction is
# commented out in the setup cell), so this presumably raises NameError on a fresh kernel.
cross_entropy_losses = []
for mthd in df_fake.code.values:
    inputs = trnsfr_model.tokenize(mthd)
    probs = trnsfr_model.get_probs(inputs)[0].numpy()

    losses = tf.keras.losses.sparse_categorical_crossentropy(
        inputs["input_ids"], probs
    ).numpy()
    cross_entropy_losses.append(losses)

CROSS_ENTROPY_MEAN = np.mean(np.concatenate(cross_entropy_losses))
CROSS_ENTROPY_MEDIAN = np.median(np.concatenate(cross_entropy_losses))
CROSS_ENTROPY_STD = np.std(np.concatenate(cross_entropy_losses))
CROSS_ENTROPY_MAD = stats.median_abs_deviation(np.concatenate(cross_entropy_losses))
cross_entropy = get_mean_cross_entropy(df_fake, trnsfr_model)

assert np.isclose(CROSS_ENTROPY_MEAN, cross_entropy["mean"], atol=1.0e-6)
assert np.isclose(CROSS_ENTROPY_MEDIAN, cross_entropy["median"], atol=1.0e-6)
assert np.isclose(CROSS_ENTROPY_STD, cross_entropy["std"], atol=1.0e-6)
assert np.isclose(CROSS_ENTROPY_MAD, cross_entropy["mad"], atol=1.0e-6)

In [None]:
# export
def get_mean_cross_entropy_df(df: pd.DataFrame, model: Model, bs: int = 16, n: Optional[int] = None):
    """
    Get the mean cross entropy of each method in a pandas dataframe.

    :param df: the pandas dataframe whose ``code`` column holds the methods to score
    :param model: the model whose tokenizer and underlying network are used for scoring
    :param bs: the number of methods to send through the model at once
    :param n: the number of methods to evaluate. If none, the entire dataframe will be used
    :returns: returns a dataframe with a ``code`` column and a ``y_cross_entropy`` column holding
        the cross entropy averaged over each method's tokens
    """
    if n is None:
        n = len(df)
    # slice once so a final partial batch never reaches past the first n methods
    codes = df.code.values[:n]

    def _batch_losses(id_rows):
        # id_rows must all have the same length so they can be stacked into one tensor
        inputs = tf.stack(id_rows, axis=0)
        logits = model.model(inputs)
        probs = tf.nn.softmax(logits).numpy()
        # cross entropy between the token ids (labels) and the predicted probabilities
        losses = tf.keras.losses.sparse_categorical_crossentropy(
            inputs, probs
        ).numpy()
        return np.mean(losses, axis=1)

    cross_entropy_losses = []
    for start in range(0, len(codes), bs):
        batch = ["<sos>" + mthd for mthd in codes[start:start + bs]]
        id_rows = [enc.ids for enc in model.tokenizer.encode_batch(batch)]

        if len({len(ids) for ids in id_rows}) <= 1:
            cross_entropy_losses.extend(_batch_losses(id_rows))
        else:
            # The tokenizer did not pad to a common length, so the batch cannot be stacked
            # (tf.stack raises InvalidArgumentError). Score the sequences one at a time.
            # NOTE(review): assumes model.model accepts a batch of one — confirm for models
            # built with a fixed batch size.
            for ids in id_rows:
                cross_entropy_losses.extend(_batch_losses([ids]))

    new_df = pd.DataFrame(
        list(zip(codes, cross_entropy_losses)),
        columns=["code", "y_cross_entropy"]
    )

    return new_df

In [None]:
# Load the first 10 buggy methods and a trained RNN checkpoint for the cross-entropy run below.
# NOTE(review): absolute, machine-specific paths; this cell rebinds `bugfix_path`, `df_buggy` and `model`.
bugfix_path = Path("/home/jovyan/work/dvc-icodegen/datasets/controlled/testbeds/_ts_bug_fix")
df_buggy = pd.read_json(bugfix_path / "buggy.jsonl", orient="records", lines=True)[
    :10
]
model = RNNModel.from_path("/home/jovyan/work/dvc-icodegen/models/controlled/rnns/rnn_layers1_vocab10000_embed256_units1024")

In [None]:
# Per-method mean cross entropy for the buggy methods, summarised with describe().
cross_entropy = get_mean_cross_entropy_df(df_buggy, model)
cross_entropy.describe()

Unnamed: 0,y_cross_entropy
count,10.0
mean,5.607144
std,1.313763
min,3.711073
25%,4.687577
50%,5.793782
75%,6.330061
max,8.028292


In [None]:
# Same metric on the toy fixture. The recorded output of this cell is an InvalidArgumentError:
# the two methods in `df_fake` encode to different lengths (23 vs 13), so they could not be
# stacked into one batch tensor.
cross_entropy = get_mean_cross_entropy_df(df_fake, gru_model)
cross_entropy

InvalidArgumentError: Shapes of all inputs must match: values[0].shape = [23] != values[1].shape = [13] [Op:Pack]

In [None]:
# export
# Transformations keyed by the name used to label their results.
# Only comment removal is currently enabled; the two randomizers are commented out.
_TRANSFORMs = {
#     "randomized_tokens": code_token_randomizer,
#     "randomized_lines": line_randomizer,
    "comments_removed": java_comment_remover,
}

In [None]:
# Scratch cell: list repetition yields an alternating False/True sequence of length 20.
[False, True] * 10

[False,
 True,
 False,
 True,
 False,
 True,
 False,
 True,
 False,
 True,
 False,
 True,
 False,
 True,
 False,
 True,
 False,
 True,
 False,
 True]

In [None]:
# export
def _get_metrics(df, model):
    """
    Compute the per-method metrics that are currently enabled for a dataframe of methods.

    :param df: the pandas dataframe whose ``code`` column holds the methods to score
    :param model: the model used to generate the predictions
    :returns: returns a dictionary with the per-method error rates under ``"error_taxonomy"``
        and the per-method cross entropy under ``"mean_cross_entropy"``
    """
    # The mean-probability and distance metrics are not part of the result at the moment.
    metrics = {}
    metrics["error_taxonomy"] = get_error_rates_df(df, model)
    metrics["mean_cross_entropy"] = get_mean_cross_entropy_df(df, model)
    return metrics


# def _long_range(data_dir, model, n=None):
#     long_range_results = {}

#     df_buggy = pd.read_json(data_dir / "buggy.jsonl", orient="records", lines=True)[:n]
#     long_range_results["buggy"] = _get_metrics(df_buggy, model)
#     del df_buggy

#     df_fixed = pd.read_json(data_dir / "fixed.jsonl", orient="records", lines=True)[:n]
#     long_range_results["fixed"] = _get_metrics(df_fixed, model)
#     del df_fixed

#     df_codesearchnet = pd.read_json(
#         data_dir / "codesearchnet_java" / "test.jsonl", orient="records", lines=True
#     )[:n]
#     long_range_results["codesearchnet_original"] = _get_metrics(df_codesearchnet, model)

#     for transform in _TRANSFORMs:
#         df_transformed = transform_df(df_codesearchnet, _TRANSFORMs[transform])
#         long_range_results["codesearchnet_" + transform] = _get_metrics(
#             df_transformed, model
#         )
#         del df_transformed

#     return long_range_results


def _pair_treatments(control_df, treatment_df):
    """Interleave control and treatment rows by their original position and label each row via ``x_treatment``."""
    labelled = pd.concat(
        [
            control_df.assign(x_treatment=False),
            treatment_df.assign(x_treatment=True),
        ]
    )
    # Both frames carry the same index labels, so every label appears twice. A stable sort
    # keeps the control row ahead of its treatment row; the label column was attached before
    # sorting, so it stays correct whatever the row order or the frame lengths.
    return labelled.sort_index(kind="mergesort").reset_index(drop=True)


def _long_range(bigclone_path, bugfix_path, codesearchnet_path, model, out_path, n=None):
    """
    Run the long-range evaluation on the bug-fix testbed and write the results as JSON lines.

    :param bigclone_path: path to the bigclone testbed (currently unused, see TODO below)
    :param bugfix_path: directory containing ``buggy.jsonl`` and ``fixed.jsonl``
    :param codesearchnet_path: path to the codesearchnet testbed (currently unused)
    :param model: the model used to generate the predictions
    :param out_path: directory that receives ``bug_fix_error_taxonomy.jsonl`` and
        ``bug_fix_cross_entropy.jsonl``
    :param n: the number of methods to read from each file. If none, all methods are used
    :returns: returns a dictionary holding the two written dataframes, keyed by
        ``"bug_fix_error_taxonomy"`` and ``"bug_fix_cross_entropy"``
    """
    long_range_results = {}

    # TODO add bigclone data
    # TODO add codesearchnet data (original and each transformation in _TRANSFORMs)

    df_buggy = pd.read_json(bugfix_path / "buggy.jsonl", orient="records", lines=True)[
        :n
    ]
    buggy_metrics = _get_metrics(df_buggy, model)

    df_fixed = pd.read_json(bugfix_path / "fixed.jsonl", orient="records", lines=True)[
        :n
    ]
    fixed_metrics = _get_metrics(df_fixed, model)

    # buggy methods are the control group (x_treatment False), fixed ones the treatment
    outputs = {
        "bug_fix_error_taxonomy": "error_taxonomy",
        "bug_fix_cross_entropy": "mean_cross_entropy",
    }
    for file_stem, metric in outputs.items():
        paired_df = _pair_treatments(buggy_metrics[metric], fixed_metrics[metric])
        paired_df.to_json(out_path / (file_stem + ".jsonl"), orient="records", lines=True)
        long_range_results[file_stem] = paired_df

    return long_range_results


def _counterfactual(control_results, treatment_results):
    pass


def evaluate(data_path, model_path, n: Optional[int] = 10):
    """
    Function for evaluating models related to the library.

    Every sub-directory of ``model_path`` is loaded as an RNN model and run through the
    long-range evaluation on the testbeds under ``data_path / "controlled/testbeds"``.

    :param data_path: root directory of the datasets
    :param model_path: directory containing one sub-directory per trained model
    :param n: the number of methods to read from each testbed file. If none, all are used
    :returns: returns a dictionary of the form ``{model directory name: {"long_range": ...}}``
    """
    results = defaultdict(dict)
    testbed_path = data_path / "controlled/testbeds"
    bigclone_path = testbed_path / "_ts_bigclone_types"
    bugfix_path = testbed_path / "_ts_bug_fix"
    codesearchnet_path = testbed_path / "codesearchnet"

    # These model folders will need to contain the config of the model as well
    # to differentiate them
    for m_path in model_path.glob("*/"):
        # Before Python 3.11 a trailing separator in the pattern does not restrict the
        # matches to directories, so stray files are filtered out explicitly.
        if not m_path.is_dir():
            continue
        print(m_path)
        # TODO: dispatch on the folder name once transformer models are supported
        model = RNNModel.from_path(m_path)

        # Long-Range Interactions
        # NOTE(review): the output directory is the bug-fix testbed itself, so each model
        # presumably overwrites the files written for the previous one — confirm intent.
        results[m_path.name]["long_range"] = _long_range(
            bigclone_path, bugfix_path, codesearchnet_path, model, bugfix_path, n=n
        )

    return dict(results)


# Driver: run the evaluation for every model under the controlled RNN directory.
# NOTE(review): these statements sit in the same `# export` cell as the function definitions
# above, so they presumably end up in the exported module and run on import — consider
# moving them to a separate, non-exported cell. The paths are absolute and machine-specific.
path = Path("/home/jovyan/work")
data_path = path / "dvc-icodegen/datasets"
model_path = path / "dvc-icodegen/models/controlled/rnns/"
results = evaluate(data_path, model_path)
        # Long-Range Interactions
#         results[m_path.name]["long_range"] = _long_range(
#             bigclone_path, bugfix_path, codesearchnet_path, model
#         )

#     return results
    # Counterfactuals


#         results[m_path]["counterfactual"] = _counterfactual(data_dir, model)
# _counterfactual(control_results, treatment_results)

# Save results in json format
# Long-Range Interactions
#     long_range_results = _long_range(data_dir, models)
#     long_range_results

#     # Counterfactuals
#     counterfactual_results = []
#     counterfactual_results
#     for transform in _TRANSFORMs:
#         pass
# _counterfactual(control_results, treatment_results)

/home/jovyan/work/dvc-icodegen/models/controlled/rnns/rnn_layers1_vocab10000_embed256_units1024
/home/jovyan/work/dvc-icodegen/models/controlled/rnns/gru_layers1_vocab10000_embed256_units512


In [None]:
results

{'gru_layers1_vocab10000_embed256_units512': {'long_range': {'buggy': {'error_taxonomy': {'blocks': {'<{>': {'error_rate': 1.0,
       'count': 28},
      '<}>': {'error_rate': 1.0, 'count': 28},
      '<[>': {'error_rate': 1.0, 'count': 3},
      '<]>': {'error_rate': 1.0, 'count': 3},
      '<(>': {'error_rate': 1.0, 'count': 82},
      '<)>': {'error_rate': 0.8658536585365854, 'count': 82},
      '<;>': {'error_rate': 1.0, 'count': 53},
      '<return>': {'error_rate': 1.0, 'count': 12},
      'stats': {'mean_error_rate': 0.9832317073170731,
       'stdev_error_rate': 0.04436473235016844,
       'median_error_rate': 1.0,
       'mad_error_rate': 0.0}},
     'exceptions': {'<catch>': {'error_rate': 1.0, 'count': 1},
      '<try>': {'error_rate': 1.0, 'count': 1},
      '<finally>': {'error_rate': nan, 'count': 0},
      '<throw>': {'error_rate': nan, 'count': 0},
      '<throws>': {'error_rate': 1.0, 'count': 1},
      'stats': {'mean_error_rate': 1.0,
       'stdev_error_rate': 0.0,

In [None]:
# long_range_results = _long_range(Path("/tmp"), model, n=100)

In [None]:
# hide
# Convert the project's notebooks into library modules (see the "Converted ..." output).
from nbdev.export import notebook2script

notebook2script()

Converted 00_data.core.ipynb.
Converted 01_data.transforms.ipynb.
Converted 02_model.core.ipynb.
Converted 04_evaluation.core.ipynb.
Converted index.ipynb.
