In [None]:
# default_exp evaluation.core

In [None]:
# hide
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Evaluation

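> Metrics for evaluating code-generation models: per-token mean probabilities, mean probabilities of closing tokens by distance from their opening token, and cross-entropy statistics.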

In [None]:
# export
import numpy as np
import pandas as pd
import tensorflow as tf

from collections import Counter, defaultdict
from icodegen.model.core import Model
from scipy import stats
from typing import Dict, List, Optional

In [None]:
# hide
# Setting up testing data
from transformers import GPT2TokenizerFast, TFGPT2LMHeadModel
from icodegen.model.core import TransformerModel, RNNModel

# Using tiny-gpt2 for just quick tests since it is... tiny :)
trnsfr_tokenizer = GPT2TokenizerFast.from_pretrained("sshleifer/tiny-gpt2")
# The model wrappers below are given the tokenizer's `backend_tokenizer`, not the
# transformers-level tokenizer object itself.
tokenizer = trnsfr_tokenizer.backend_tokenizer
trnsfr = TFGPT2LMHeadModel.from_pretrained("sshleifer/tiny-gpt2")
trnsfr_model = TransformerModel(tokenizer, trnsfr)

# Settings for a small GRU model; they are passed positionally to RNNModel below,
# in exactly this order.
# NOTE(review): no random seed is set in this cell. If RNNModel initializes its
# weights randomly, the hard-coded expected values in the test cells further down
# only hold within their tolerance - confirm.
rnn_type = "gru"
n_layers = 1
vocab_size = tokenizer.get_vocab_size()
embedding_dim = 128
rnn_units = 128
batch_size = 1
out_path = "/tmp"
gru_model = RNNModel(
    rnn_type,
    n_layers,
    vocab_size,
    embedding_dim,
    rnn_units,
    batch_size,
    out_path,
    tokenizer,
)

# Two small synthetic "methods" with nested parentheses, stored in a `code` column;
# the test cells in this notebook evaluate the models on this frame.
df_fake = pd.DataFrame(
    ["aaaa(bb(aaaa(bb()()ccc)dd)()ccc)dd", "aaaa(bb()ccccc)dd"], columns=["code"]
)

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at sshleifer/tiny-gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [None]:
# Display the fixture dataframe so its contents are visible in the notebook.
df_fake

Unnamed: 0,code
0,aaaa(bb(aaaa(bb()()ccc)dd)()ccc)dd
1,aaaa(bb()ccccc)dd


In [None]:
# export
def get_mean_probs(df: pd.DataFrame, model: Model, n: Optional[int] = None):
    """
    Compute, for every token in the model's vocabulary, the mean probability the
    model assigned to that token at the positions where it occurs in the data.

    :param df: the pandas dataframe whose `code` column holds each method to have the model predict on
    :param model: the model used to tokenize the methods and generate the predictions
    :param n: the number of methods to evaluate. If none, the entire dataframe will be used
    :returns: returns a dictionary mapping each token in the model's vocab to its mean probability;
              tokens that never occur in the evaluated methods map to NaN
    """
    limit = len(df) if n is None else n

    # one slot per vocabulary id: how often the token was seen and the running
    # total of the probabilities read for it
    vocab_size = model.tokenizer.get_vocab_size()
    occurrences = [0] * vocab_size
    prob_totals = [0.0] * vocab_size

    for method in df.code.values[:limit]:
        encoded = model.tokenize(method)
        token_probs = model.get_probs(encoded)[0].numpy()

        # pair every token id with the probability row at the same position and
        # read off the probability stored for that very token id
        for tok_id, row in zip(encoded["input_ids"][0], token_probs):
            occurrences[tok_id] += 1
            prob_totals[tok_id] += row[tok_id]

    occurrences = np.array(occurrences)
    prob_totals = np.array(prob_totals)

    # divide only where a token was actually seen; unseen tokens keep the NaN
    # that the output buffer was pre-filled with
    averaged = np.divide(
        prob_totals,
        occurrences,
        out=np.full(occurrences.shape, np.nan),
        where=occurrences != 0,
    )

    id_to_token = model.tokenizer.id_to_token
    return {id_to_token(tok_id): value for tok_id, value in enumerate(averaged)}

In [None]:
# Regression check for get_mean_probs with the tiny GPT-2 wrapper: expected mean
# probabilities for the tokens that actually occur in df_fake.
NON_NAN_PROBS_MEAN = np.array(
    [
        2.01237513e-05,
        1.98944481e-05,
        2.01449202e-05,
        2.04353437e-05,
        2.02043060e-05,
        2.02826177e-05,
        2.09888076e-05,
        2.07051467e-05,
        1.98100976e-05,
        2.02152678e-05,
        2.02035244e-05,
        2.10283021e-05,
    ]
)

mean_probs = np.array(list(get_mean_probs(df_fake, trnsfr_model).values()))
# Tokens that never occur come back as NaN, so only the non-NaN entries are compared.
non_nan_idx = np.argwhere(~np.isnan(mean_probs)).flatten()
non_nan_mean_prob = mean_probs[non_nan_idx]

# NOTE(review): atol=1e-6 is coarse relative to expected values of about 2e-5.
assert np.isclose(non_nan_mean_prob, NON_NAN_PROBS_MEAN, atol=1.0e-6).all()

In [None]:
# Regression check for get_mean_probs with the GRU model: expected mean
# probabilities for the tokens that actually occur in df_fake.
# NOTE(review): gru_model is built without a visible random seed; presumably these
# values only match within the tolerance below - confirm.
NON_NAN_PROBS_MEAN = np.array(
    [
        1.99270412e-05,
        1.99168703e-05,
        1.98815596e-05,
        1.99057849e-05,
        1.98800869e-05,
        1.98893995e-05,
        1.98797388e-05,
        1.98960342e-05,
        1.99086674e-05,
        1.98605580e-05,
        1.98807957e-05,
        1.98842057e-05,
    ]
)

mean_probs = np.array(list(get_mean_probs(df_fake, gru_model).values()))
# Tokens that never occur come back as NaN, so only the non-NaN entries are compared.
non_nan_idx = np.argwhere(~np.isnan(mean_probs)).flatten()
non_nan_mean_prob = mean_probs[non_nan_idx]

assert np.isclose(non_nan_mean_prob, NON_NAN_PROBS_MEAN, atol=1.0e-6).all()

In [None]:
# export
def find_parens(toks: List[str], opening: str, closing: str) -> Dict[int, int]:
    """
    Get the indices for the opening and closing tokens.
    From https://stackoverflow.com/a/29992065/5768407
    by user Baltasarq (https://stackoverflow.com/users/266978/baltasarq).

    Tokens equal to neither `opening` nor `closing` are ignored. A token is compared
    against `opening` first, so if `opening == closing` every such token is treated
    as an opening token.

    :param toks: the tokenized version of a method
    :param opening: the opening token that will be matched against the closing token
    :param closing: the closing token that will be matched against the opening token
    :returns: returns a dictionary with the opening token indices as the keys and the closing token indices as the values
    :raises IndexError: if a closing token has no preceding unmatched opening token,
                        or if an opening token is never closed
    """
    pairs = {}
    # indices of opening tokens that have not been closed yet (innermost last)
    open_stack = []

    for i, tok in enumerate(toks):
        if tok == opening:
            open_stack.append(i)
        elif tok == closing:
            if not open_stack:
                # the closing token at i has nothing to pair with
                raise IndexError("No matching opening parens at: " + str(i))
            pairs[open_stack.pop()] = i

    if open_stack:
        # report the innermost opening token that was never closed
        raise IndexError("No matching closing parens at: " + str(open_stack.pop()))

    return pairs


def _get_dist_probs(
    mthd: str, model: Model, opening: str, closing: str
) -> Dict[int, float]:
    """
    For one method, group its matched opening/closing token pairs by the distance
    between them and average the probability read at each closing token's position.

    :param mthd: the method to get the ranges of the opening and closing tokens and their probabilities
    :param model: the model used to tokenize the method and generate the predictions
    :param opening: the opening token used for calculating the distance between opening and closing tokens
    :param closing: the closing token used for calculating the distance between opening and closing tokens as well as the token to get the mean probability of
    :returns: returns a dictionary with the distance between the opening and closing tokens as keys and their mean probabilities as values
    """
    # WARNING: Be careful when using different tokenizers: the HF tokenizers lib has a different API than the HF transformers lib tokenizers. This will need updating when using a custom model and tokenizer.

    # locate every matched opening/closing pair in the token sequence
    tokens = model.tokenizer.encode(mthd).tokens
    pairs = find_parens(tokens, opening, closing)

    # model probabilities for the same method
    encoded = model.tokenize(mthd)
    token_probs = model.get_probs(encoded)[0].numpy()

    # per distance: running total of the closing-token probabilities and the
    # number of pairs seen at that distance
    prob_totals = defaultdict(float)
    pair_counts = Counter()
    for open_pos, close_pos in pairs.items():
        span = close_pos - open_pos
        prob_totals[span] += token_probs[close_pos][encoded["input_ids"][0][close_pos]]
        pair_counts[span] += 1

    # turn the totals into means
    return {span: prob_totals[span] / count for span, count in pair_counts.items()}


def mean_dist_probs(
    df: pd.DataFrame,
    model: Model,
    opening: Optional[str] = "<{>",
    closing: Optional[str] = "<}>",
    n: Optional[int] = None,
) -> pd.DataFrame:
    """
    For an entire pandas dataframe, get the mean probability of the closing token for each distance between matched opening and closing tokens.

    Each method is first reduced to its own per-distance means; those per-method
    means are then averaged across the methods in which the distance occurs.

    :param df: the pandas dataframe containing each method to have the model predict on
    :param model: the model used to generate the predictions
    :param opening: the opening token used for calculating the distance between opening and closing tokens
    :param closing: the closing token used for calculating the distance between opening and closing tokens as well as the token to get the mean probability of
    :param n: the number of methods to evaluate. If none, the entire dataframe will be used
    :returns: returns a dataframe with columns `dist` and `mean_prob`, sorted by `dist`
    """
    if n is None:
        n = len(df)

    # per-method {distance: mean probability} dictionaries
    subset = df.iloc[:n].copy()
    per_method = subset.code.apply(
        lambda mthd: _get_dist_probs(mthd, model, opening, closing)
    ).values

    # every distance that shows up in at least one method
    all_dists = set()
    for method_probs in per_method:
        all_dists.update(method_probs.keys())

    # merge dictionaries across methods by taking the mean of probs with the same distance;
    # methods lacking a distance contribute NaN, which nanmean ignores. Modified from
    # https://stackoverflow.com/a/10461916/5768407, users georg https://stackoverflow.com/users/989121/georg
    # and Rémy Hosseinkhan Boucher https://stackoverflow.com/users/12149730/r%c3%a9my-hosseinkhan-boucher
    merged = {}
    for dist in all_dists:
        merged[dist] = np.nanmean(
            np.array([method_probs.get(dist, np.nan) for method_probs in per_method])
        )

    df_dist = pd.DataFrame(
        {"dist": list(merged.keys()), "mean_prob": list(merged.values())}
    )
    return df_dist.sort_values("dist").reset_index(drop=True)

In [None]:
# Regression check for mean_dist_probs with the GRU model, using "(" and ")" as the
# opening/closing tokens: expected distances and their mean probabilities for df_fake.
DIST_DF = pd.DataFrame(
    {
        "dist": [6, 10, 16],
        "mean_prob": [
            1.98822217e-05,
            1.97613608e-05,
            1.97816771e-05,
        ],
    }
)
df_dist = mean_dist_probs(df_fake, gru_model, opening="(", closing=")")

# Distances must match exactly; probabilities only within the tolerance.
assert (DIST_DF.dist.values == df_dist.dist.values).all()
assert np.isclose(DIST_DF.mean_prob.values, df_dist.mean_prob.values, atol=1.0e-6).all()

In [None]:
# Regression check for mean_dist_probs with the tiny GPT-2 wrapper, using "(" and ")"
# as the opening/closing tokens.
# NOTE(review): these expected probabilities are identical to the ones used for the
# GRU model in the previous cell; presumably this passes only because atol=1e-6 is
# coarse relative to values of about 2e-5 - confirm and regenerate if needed.
DIST_DF = pd.DataFrame(
    {
        "dist": [6, 10, 16],
        "mean_prob": [
            1.98822217e-05,
            1.97613608e-05,
            1.97816771e-05,
        ],
    }
)
df_dist = mean_dist_probs(df_fake, trnsfr_model, opening="(", closing=")")

# Distances must match exactly; probabilities only within the tolerance.
assert (DIST_DF.dist.values == df_dist.dist.values).all()
assert np.isclose(DIST_DF.mean_prob.values, df_dist.mean_prob.values, atol=1.0e-6).all()

In [None]:
# TODO: Move these visualizations into their own module.
# TODO: Simplify the binning below into a helper that takes the bin edges and labels as arguments.
# df_dist["bin"] = pd.cut(
#     df_dist.dist, bins=[0, 10, 20], labels=["0-10", "11-20"], include_lowest=True
# )
# df_dist = df_dist.sort_values("dist")

# bars = {}
# for x in df_dist.bin.unique():
#     bars[x] = sum(df_dist.loc[df_dist.bin == x].mean_prob.values)

# plt.bar(bars.keys(), bars.values())

In [None]:
# export
def get_mean_cross_entropy(df: pd.DataFrame, model: Model, n: Optional[int] = None):
    """
    Get summary statistics of the cross entropy for a model on an entire pandas dataframe

    :param df: the pandas dataframe containing each method to have the model predict on
    :param model: the model used to generate the predictions
    :param n: the number of methods to evaluate. If none, the entire dataframe will be used
    :returns: returns a dictionary with the `mean`, `median`, `std` and `mad` (median absolute
              deviation) of the cross entropy losses, pooled over all evaluated methods
    """
    limit = len(df) if n is None else n

    def _losses_for(method):
        # tokenize the method and get the probabilities for each token from the model
        encoded = model.tokenize(method)
        token_probs = model.get_probs(encoded)[0].numpy()
        # cross entropy between the token ids (as labels) and the probabilities
        return tf.keras.losses.sparse_categorical_crossentropy(
            encoded["input_ids"], token_probs
        ).numpy()

    # pool the losses of every method into a single flat array before summarizing
    pooled = np.concatenate([_losses_for(method) for method in df.code.values[:limit]])

    summary = {}
    summary["mean"] = np.mean(pooled)
    summary["median"] = np.median(pooled)
    summary["std"] = np.std(pooled)
    summary["mad"] = stats.median_abs_deviation(pooled)
    return summary

In [None]:
# Recompute the losses inline for the GRU model, mirroring get_mean_cross_entropy,
# and check that the helper reports the same summary statistics.
cross_entropy_losses = []
for mthd in df_fake.code.values:
    inputs = gru_model.tokenize(mthd)
    probs = gru_model.get_probs(inputs)[0].numpy()
    cross_entropy_losses.append(
        tf.keras.losses.sparse_categorical_crossentropy(
            inputs["input_ids"], probs
        ).numpy()
    )

flat_losses = np.concatenate(cross_entropy_losses)
CROSS_ENTROPY_MEAN = np.mean(flat_losses)
CROSS_ENTROPY_MEDIAN = np.median(flat_losses)
CROSS_ENTROPY_STD = np.std(flat_losses)
CROSS_ENTROPY_MAD = stats.median_abs_deviation(flat_losses)
cross_entropy = get_mean_cross_entropy(df_fake, gru_model)

expected_stats = {
    "mean": CROSS_ENTROPY_MEAN,
    "median": CROSS_ENTROPY_MEDIAN,
    "std": CROSS_ENTROPY_STD,
    "mad": CROSS_ENTROPY_MAD,
}
for stat_name, expected in expected_stats.items():
    assert np.isclose(expected, cross_entropy[stat_name], atol=1.0e-6)

In [None]:
# Recompute the losses inline for the tiny GPT-2 wrapper, mirroring
# get_mean_cross_entropy, and check that the helper reports the same summary statistics.
cross_entropy_losses = []
for mthd in df_fake.code.values:
    inputs = trnsfr_model.tokenize(mthd)
    probs = trnsfr_model.get_probs(inputs)[0].numpy()
    cross_entropy_losses.append(
        tf.keras.losses.sparse_categorical_crossentropy(
            inputs["input_ids"], probs
        ).numpy()
    )

flat_losses = np.concatenate(cross_entropy_losses)
CROSS_ENTROPY_MEAN = np.mean(flat_losses)
CROSS_ENTROPY_MEDIAN = np.median(flat_losses)
CROSS_ENTROPY_STD = np.std(flat_losses)
CROSS_ENTROPY_MAD = stats.median_abs_deviation(flat_losses)
cross_entropy = get_mean_cross_entropy(df_fake, trnsfr_model)

expected_stats = {
    "mean": CROSS_ENTROPY_MEAN,
    "median": CROSS_ENTROPY_MEDIAN,
    "std": CROSS_ENTROPY_STD,
    "mad": CROSS_ENTROPY_MAD,
}
for stat_name, expected in expected_stats.items():
    assert np.isclose(expected, cross_entropy[stat_name], atol=1.0e-6)

In [None]:
# export
def _get_metrics(df, model):
    """
    Run every evaluation metric in this module on one dataframe.

    :param df: the pandas dataframe containing each method to have the model predict on
    :param model: the model used to generate the predictions
    :returns: returns a dictionary with the results of `get_mean_probs` (`mean_probs`),
              `mean_dist_probs` with its default opening/closing tokens (`dist_mean`)
              and `get_mean_cross_entropy` (`mean_cross_entropy`)
    """
    # dict entries are evaluated top to bottom, so the metrics run in this order
    return {
        "mean_probs": get_mean_probs(df, model),
        "dist_mean": mean_dist_probs(df, model),
        "mean_cross_entropy": get_mean_cross_entropy(df, model),
    }

In [None]:
# export
def _long_range(data_dir, model, n=None):
    """
    Compute the evaluation metrics on each dataset found under `data_dir`.

    Reads `buggy.jsonl`, `fixed.jsonl` and `codesearchnet_java/test.jsonl` (JSON lines),
    evaluates the model on each, and additionally on every transformed version of the
    codesearchnet data.

    :param data_dir: the directory holding the datasets; joined with `/`, so presumably a `pathlib.Path` - confirm with callers
    :param model: the model used to generate the predictions
    :param n: the number of rows of each dataset to evaluate. If none, all rows are used
    :returns: returns a dictionary mapping each dataset name to its metrics dictionary
    """

    def _read_jsonl(path):
        # load a JSON-lines file and keep only the first n rows
        return pd.read_json(path, orient="records", lines=True)[:n]

    long_range_results = {}

    for key, filename in (("buggy", "buggy.jsonl"), ("fixed", "fixed.jsonl")):
        frame = _read_jsonl(data_dir / filename)
        long_range_results[key] = _get_metrics(frame, model)
        # drop the frame before the next dataset is loaded
        del frame

    df_codesearchnet = _read_jsonl(data_dir / "codesearchnet_java" / "test.jsonl")
    long_range_results["codesearchnet_original"] = _get_metrics(df_codesearchnet, model)

    # NOTE(review): `_TRANSFORMs` and `transform_df` are neither defined nor imported
    # in the visible part of this file - confirm they are in scope where this runs.
    for transform in _TRANSFORMs:
        long_range_results["codesearchnet_" + transform] = _get_metrics(
            transform_df(df_codesearchnet, _TRANSFORMs[transform]), model
        )

    return long_range_results

In [None]:
# hide
from nbdev.export import notebook2script

# Run nbdev's notebook-to-module conversion; the output below lists the notebooks
# it converted.
notebook2script()

Converted 00_data.core.ipynb.
Converted 01_data.transforms.ipynb.
Converted 02_model.core.ipynb.
Converted 04_evaluation.core.ipynb.
Converted index.ipynb.
