## Load Validation Data

In [35]:
import pandas as pd

# Load the pickled evaluation examples into a DataFrame.
# NOTE(review): unpickling can execute arbitrary code, so only load this
# file if it comes from a trusted source.
df = pd.read_pickle("../../eval_examples.pkl")
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,input,label,prediction
0,<s>math<extra_id_5><extra_id_2><extra_id_5>\n\...,object,object
1,<s><Mock<extra_id_5><extra_id_3><extra_id_5>):...,tuple_nonempty,tuple_nonempty
2,<s>losses<extra_id_5><extra_id_2><extra_id_5>]...,object,object
3,<s>filename<extra_id_5><extra_id_2><extra_id_5...,str_empty,str_nonempty
4,<s>cert<extra_id_5><extra_id_2><extra_id_5> Ch...,tuple_nonempty,
...,...,...,...
11345,<s>models<extra_id_5><extra_id_2><extra_id_5>c...,object,object
11346,<s>self<extra_id_5><extra_id_2><extra_id_5>.it...,object,object
11347,<s>result<extra_id_5><extra_id_2><extra_id_5> ...,object,object
11348,<s>widget_attrs<extra_id_5><extra_id_2><extra_...,dict_nonempty,dict_nonempty


## Overall Accuracy

In [36]:
def accuracy(df):
    """Return the fraction of rows whose prediction equals the label.

    Args:
        df: DataFrame with "label" and "prediction" columns.

    Returns:
        Number of rows where ``label == prediction`` divided by the total
        number of rows, as a float. An empty frame yields NaN instead of
        raising ZeroDivisionError, so filtered sub-frames with no rows can
        still be reported.
    """
    total = len(df)
    if total == 0:
        # No examples to score: the ratio is undefined.
        return float("nan")
    # Sum the boolean mask rather than materialising the filtered frame.
    correct = int((df["label"] == df["prediction"]).sum())
    return correct / total

# Accuracy over every loaded example; shown as the cell's output.
accuracy(df)

0.8014096916299559

## Per-kind Accuracy (name vs attribute vs call)

In [37]:
# Split the examples by which sentinel pair occurs in the input text and
# report the accuracy of each subset (name / attribute / call).
name_mask = df["input"].str.contains("<extra_id_5><extra_id_2>")
name_df = df[name_mask]
print(f"Name     : {accuracy(name_df)}")

attribute_mask = df["input"].str.contains("<extra_id_5><extra_id_4>")
attribute_df = df[attribute_mask]
print(f"Attribute: {accuracy(attribute_df)}")

call_mask = df["input"].str.contains("<extra_id_5><extra_id_3>")
call_df = df[call_mask]
print(f"Call     : {accuracy(call_df)}")

Name     : 0.801017354877319
Attribute: 0.8408619975134687
Call     : 0.7603195739014648


## Per-category Accuracy (i.e., comparing different abstracted values)

In [60]:
# Distinct ground-truth labels; each one is treated as a category below.
all_categories = set(df["label"])

num_categories = len(all_categories)
print(f"{num_categories} categories")

def accuracy_per_category(df, category):
    """Return the accuracy over the rows whose label equals ``category``.

    Args:
        df: DataFrame with "label" and "prediction" columns.
        category: Label value selecting the rows to score.

    Returns:
        Whatever ``accuracy`` returns for the selected rows.
    """
    in_category = df["label"] == category
    return accuracy(df[in_category])

def most_common_mispredictions(df):
    """Summarise the five most frequent wrong predictions in ``df``.

    Args:
        df: DataFrame with "label" and "prediction" columns.

    Returns:
        A string of ``"<prediction>=<share>, "`` entries, most frequent
        first, where share is that prediction's count divided by the number
        of mispredicted rows, rounded to two decimals. Each entry, including
        the last, is followed by ", ". Returns "" when there are no
        mispredictions.
    """
    mispredictions = df.loc[df["label"] != df["prediction"], "prediction"]
    total = len(mispredictions)
    top = mispredictions.value_counts().head(5)

    parts = []
    for position, predicted in enumerate(top.index):
        # The index holds prediction labels, so counts must be read by
        # position with .iloc; integer keys via [] are deprecated as
        # positional lookups on a label-indexed Series.
        share = round(top.iloc[position] / total, 2)
        parts.append(f"{predicted}={share}, ")
    return "".join(parts)

# Widen column display so the mispredictions summary is not truncated.
pd.options.display.max_colwidth = 100

# One row per category: accuracy, number of examples, and the most common
# wrong predictions for that category.
raw_category_accuracies = []
for c in all_categories:
    category_rows = df[df["label"] == c]
    raw_category_accuracies.append(
        [
            c,
            accuracy_per_category(df, c),
            len(category_rows),
            most_common_mispredictions(category_rows),
        ]
    )

category_accuracies = pd.DataFrame(
    raw_category_accuracies,
    columns=["category", "accuracy", "count", "mispredictions"],
)
# Last expression: display the table, best-predicted categories first.
category_accuracies.sort_values(by="accuracy", ascending=False)
    


22 categories


Unnamed: 0,category,accuracy,count,mispredictions
21,callable,0.97936,2907,"str_nonempty=0.32, None=0.18, object=0.17, resource=0.08, int_neg=0.07,"
19,object,0.911176,3400,"str_nonempty=0.31, None=0.22, list_nonempty=0.13, dict_nonempty=0.1, callable=0.07,"
7,str_nonempty,0.873964,1206,"None=0.28, str_empty=0.16, object=0.16, int_pos=0.13, callable=0.05,"
10,dict_nonempty,0.74537,432,"dict_empty=0.6, str_nonempty=0.11, list_nonempty=0.1, None=0.09, int_pos=0.04,"
4,resource,0.732394,71,"callable=0.47, dict_nonempty=0.16, object=0.11, tuple_nonempty=0.05, None=0.05,"
5,list_nonempty,0.667442,430,"list_empty=0.41, str_nonempty=0.17, tuple_nonempty=0.1, object=0.08, None=0.08,"
14,int_pos,0.651282,390,"int_zero=0.25, str_nonempty=0.25, int_neg=0.15, None=0.1, object=0.06,"
17,False,0.613079,367,"True=0.73, str_nonempty=0.06, None=0.05, int_pos=0.04, int_neg=0.04,"
13,tuple_nonempty,0.607735,181,"list_nonempty=0.38, tuple_empty=0.14, str_nonempty=0.13, None=0.1, object=0.07,"
15,,0.587537,674,"str_nonempty=0.37, object=0.18, int_pos=0.1, dict_nonempty=0.08, list_nonempty=0.05,"
