In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from os import listdir
import matplotlib.pyplot as plt
sns.set(style="ticks", font_scale=1.2)
plt.rc("axes.spines", top=False, right=False)

from utils import TASK_META_VAR_MAP, ITEM_META_VAR_MAP, N_LAYERS

OUTPUT measures (4): ['output_entropy', 'output_rank_correct', 'output_logprob_correct', 'output_logprobdiff']
PROCESS measures (15): ['auc_entropy', 'layer_biggest_change_entropy', 'auc_rank_correct', 'layer_biggest_change_rank_correct', 'auc_logprob_correct', 'layer_biggest_change_logprob_correct', 'auc_logprobdiff_pos', 'auc_logprobdiff_neg', 'layer_biggest_change_logprobdiff', 'auc_boosting_pos', 'auc_boosting_neg', 'layer_argmax_boosting', 'twostage_magnitude', 'twostage_magnitude_latter34', 'twostage_layer']


# 0. Helper functions and global variables for plotting

In [2]:
TASK = "vision"

# Define list of models.
MODELS = ["vit_base_patch16_224", "vit_small_patch16_224"]
VOCAB_SIZE = 16

TASK_META_VARS = TASK_META_VAR_MAP[TASK]
ITEM_META_VARS = ITEM_META_VAR_MAP[TASK]

# Define output IVs (from the final layer).
OUTPUT_IV_MAP = {
    "entropy": "output_entropy",
    "rank_correct": "output_rank_correct",
    "logprob_correct": "output_logprob_correct",
}
OLD_OUTPUT_IVS = ["entropy", "rank_correct", "logprob_correct"]
OUTPUT_IVS = [OUTPUT_IV_MAP[i] for i in OLD_OUTPUT_IVS]
print(f"OUTPUT measures ({len(OUTPUT_IVS)}):", OUTPUT_IVS)

METRIC_MAP = {
    "Entropy": "entropy",
    "Layerwise Reciprocal Rank": "rank_correct",
    "Log Probability": "logprob_correct",
    "Probability": "prob_correct"
}

# Define process IVs based purely on probabilities.
PROB_IVS = []
for output_iv in OUTPUT_IVS:
    # Get base name of metric (e.g., "entropy")
    iv = output_iv.replace("output_", "")
    # Add AUC and biggest change measures.
    PROB_IVS.append(f"auc_{iv}")
    PROB_IVS.append(f"layer_biggest_change_{iv}")

PROCESS_IVS = PROB_IVS
print(f"PROCESS measures ({len(PROCESS_IVS)}):", PROCESS_IVS)

OUTPUT measures (3): ['output_entropy', 'output_rank_correct', 'output_logprob_correct']
PROCESS measures (6): ['auc_entropy', 'layer_biggest_change_entropy', 'auc_rank_correct', 'layer_biggest_change_rank_correct', 'auc_logprob_correct', 'layer_biggest_change_logprob_correct']


# 1. Read and process model outputs

## Read data and do basic processing

In [3]:
def read_model_data(model="vit_base_patch16_224", logprob=False):
    print(f"Reading model data for the following model: {model}")
    data_dir = f"../../src/vision/model-vs-human/raw-data/metrics/{model}"
    datasets = listdir(data_dir)
    entropy_df = pd.concat([
        pd.read_csv(f"{data_dir}/{dataset}/Entropy.csv")
        for dataset in datasets
    ])
    rank_df = pd.concat([
        pd.read_csv(f"{data_dir}/{dataset}/Layerwise Reciprocal Rank.csv")
        for dataset in datasets
    ])
    prob_df = pd.concat([
        pd.read_csv(f"{data_dir}/{dataset}/Probability.csv")
        for dataset in datasets
    ])
    prob_metric = "prob_correct"
    if logprob:
        prob_df["metric_name"] = "Log Probability"
        prob_df["value"] = np.log(prob_df["value"])
        prob_metric = "logprob_correct"
    
    dfs = [entropy_df, rank_df, prob_df]
    index = ITEM_META_VARS + ["layer"]
    for i, df in enumerate(dfs):
        metric = df["metric_name"].values[0] 
        df = df.drop(columns=["metric_name"]).rename(columns={
            "value": METRIC_MAP[metric],
            "subj": "model",
            "stimulus": "item_id"
        }).set_index(index)
        dfs[i] = df
        
    df = dfs[0].merge(dfs[1], on=index).merge(dfs[2], on=index).reset_index()
    
    # Add column for whether the layer is the final layer.
    # This is model-dependent.
    max_layer = df["layer"].max()
    df["is_final_layer"] = df["layer"] == max_layer

    return df

In [4]:
df = pd.concat([
    read_model_data(model=model, logprob=True)
    for model in MODELS
])
df.head()

Reading model data for the following model: vit_base_patch16_224
Reading model data for the following model: vit_small_patch16_224


Unnamed: 0,model,item_id,dataset_name,layer,entropy,rank_correct,logprob_correct,is_final_layer
0,vit_base_patch16_224,0001_lp_dnn_40_chair_10_n04099969_7577.png,low-pass,0,2.365237,0.25,-2.346391,False
1,vit_base_patch16_224,0001_lp_dnn_40_chair_10_n04099969_7577.png,low-pass,1,2.550632,0.333333,-2.239163,False
2,vit_base_patch16_224,0001_lp_dnn_40_chair_10_n04099969_7577.png,low-pass,2,2.595342,0.166667,-2.617676,False
3,vit_base_patch16_224,0001_lp_dnn_40_chair_10_n04099969_7577.png,low-pass,3,2.64383,0.111111,-2.792522,False
4,vit_base_patch16_224,0001_lp_dnn_40_chair_10_n04099969_7577.png,low-pass,4,2.650365,0.090909,-3.162579,False


## Compute all metrics

In [5]:
def get_index_of_biggest_change(vals, negative_change=False):
    if negative_change:
        vals = -vals
    changes = [vals[i+1] - vals[i] for i in range(0, len(vals)-1)]
    idx = np.argmax(changes)
    return idx

def area_above_0(vals):
    return sum([v for v in vals if v > 0])

def area_below_0(vals):
    return sum([abs(v) for v in vals if v < 0])

def sum_of_neg_vals(vals):
    return sum([v for v in vals if v < 0])

def get_auc(vals, iv):
    if iv == "entropy":
        return vals.sum()
    elif iv == "rank_correct":
        return (vals - (1/VOCAB_SIZE)).sum()
    elif iv == "logprob_correct":
        return abs(vals).sum()
    else:
        raise ValueError(f"Unrecognized independent variable: {iv}")
        
def compute_all_metrics(df, task=TASK):
    index = ITEM_META_VAR_MAP[task]
    if "model" not in index:
        index = ["model"] + index
    
    # Get data corresponding to final layer for output measures.
    final_layer_df = df[df.is_final_layer].set_index(index)

    df = df.set_index(index)
    
    results = []
    
    for ind in df.index.unique():
        model = ind[0]
        item_data = df.loc[ind]
        # There should be one value per layer
        assert len(item_data) == N_LAYERS[model]

        # ~~~~~~~~~~~~~~~~~~~~~ Initialize metadata about the stimulus.
        meta_data = {k: item_data[k].values[0] for k in TASK_META_VARS}
        for i, index_var in enumerate(index):
            meta_data[index_var] = ind[i]

        # ~~~~~~~~~~~~~~~~~~~~~ (1) Add output metrics from final layer.
        output_data = {}
        for output_iv in OLD_OUTPUT_IVS:
            new_iv = OUTPUT_IV_MAP[output_iv]
            output_data[new_iv] = final_layer_df.loc[ind][output_iv]
            
        # ~~~~~~~~~~~~~~~~~~~~~ (2) Add static metrics from midpoint layer (another baseline).
        static_data = {}
        for output_iv in OLD_OUTPUT_IVS:
            new_iv = OUTPUT_IV_MAP[output_iv].replace("output_", "midpoint_")
            midpoint = int(N_LAYERS[model]/2) - 1
            static_data[new_iv] = item_data[item_data.layer==midpoint].loc[ind][output_iv]
            
        # ~~~~~~~~~~~~~~~~~~~~~ (3) Compute processing metrics.
        process_data = {}

        for var_name, clean_var_name in OUTPUT_IV_MAP.items():
            # Get base name of metric. Should be one of the following: 
            # "entropy", "rank_correct", "logprob_correct", "logprobdiff"
            iv = clean_var_name.replace("output_", "")

            # Add AUC measure(s).
            data = item_data[var_name]
            process_data[f"auc_{iv}"] = get_auc(data, iv)

            # Add biggest change measure.
            layer_biggest_change = get_index_of_biggest_change(
                data,
                negative_change=(iv == "entropy")
            )
            process_data[f"layer_biggest_change_{iv}"] = layer_biggest_change

        # Combine meta data, output metrics, and processing metrics.
        res = meta_data | output_data | static_data | process_data
        results.append(res)
        
    results = pd.DataFrame(results)
    return results

In [6]:
# Compute all model predictor metrics.
metrics = compute_all_metrics(df, task=TASK)
print(metrics.model.unique())
metrics.sort_values(by=ITEM_META_VAR_MAP[TASK]).head()

  item_data = df.loc[ind]


['vit_base_patch16_224' 'vit_small_patch16_224']


Unnamed: 0,model,item_id,dataset_name,output_entropy,output_rank_correct,output_logprob_correct,midpoint_entropy,midpoint_rank_correct,midpoint_logprob_correct,auc_entropy,layer_biggest_change_entropy,auc_rank_correct,layer_biggest_change_rank_correct,auc_logprob_correct,layer_biggest_change_logprob_correct
13440,vit_base_patch16_224,0001_cl_dnn_cr_oven_40_n04111531_14126.png,colour,0.012701,1.0,-0.001226,2.495817,0.090909,-3.672761,23.940528,9,5.332576,6,23.820913,5
7360,vit_base_patch16_224,0001_cop_dnn_c30_knife_10_n03041632_7380.png,contrast,0.162731,1.0,-0.0246,2.64925,0.142857,-2.703982,26.173969,9,6.892857,5,21.93917,9
4800,vit_base_patch16_224,0001_eid_dnn_1-10-10_oven_10_n04111531_17072.png,eidolonI,0.005365,1.0,-0.000485,2.507821,0.0625,-4.597938,26.01407,9,3.871266,8,29.818613,6
15520,vit_base_patch16_224,0001_eid_dnn_16-0-10_oven_10_n04111531_13542.png,eidolonIII,1.789495,0.071429,-4.660355,2.515134,0.0625,-4.567651,29.444332,10,0.552615,6,41.084239,6
10720,vit_base_patch16_224,0001_eid_dnn_64-3-10_car_10_n04285008_8113.png,eidolonII,2.451736,0.0625,-4.302301,2.521321,0.071429,-3.922789,30.703561,10,0.058471,4,47.739221,7


In [7]:
print("Checking that all metrics are in the columns:")
missing = False
for iv in OUTPUT_IVS + PROCESS_IVS:
    if iv not in metrics.columns:
        print("Missing the following IV:", iv)
        missing = True
if not missing:
    print("  ==> All metrics are in the columns! :)")

Checking that all metrics are in the columns:
  ==> All metrics are in the columns! :)


In [8]:
metrics.to_csv(
    f"../../data/model_output/processed/{TASK}_metrics_logit_lens.csv", index=False
)

# 4. Combine with human data

In [9]:
def zscore_col(df, col, group="subject_id"):
    ppt_means = df.groupby(group)[col].mean()
    ppt_stds = df.groupby(group)[col].std()
    df[f"{col}_zscore"] = df.apply(
        lambda r: (r[col]-ppt_means.loc[r[group]]) / ppt_stds.loc[r[group]],
        axis=1
    )
    return df

def combine_model_human_data(model_df, task=TASK):
    assert model_df.model.nunique()==1
    
    # Grab the IVs by excluding all meta variables.
    meta_vars = TASK_META_VAR_MAP[TASK] + ITEM_META_VAR_MAP[TASK]
    ivs = [c for c in model_df.columns if not c in meta_vars]
    
    data = {}
    
    # Read human trial-level data.
    for dataset in model_df.dataset_name.unique():
        model_data = model_df[model_df.dataset_name==dataset]
        
        human_folder = f"../../src/vision/model-vs-human/raw-data/{dataset}"
        trial_df = pd.concat([
            pd.read_csv(f"{human_folder}/{f}") for f in listdir(human_folder)
        ]).rename(columns={"subj": "subject_id", "imagename": "item_id"})
        trial_df["response_correct"] = (
            trial_df["category"] == trial_df["object_response"]
        )
        # Add z-scored RTs within participant.
        trial_df = zscore_col(trial_df, "rt", group="subject_id")
        # Edit some image names.
        if dataset in ["cue-conflict", "edge", "silhouette"]:
            trial_df["item_id"] = trial_df.item_id.apply(
                lambda i: "_".join(i.split("_")[6:])
            )
        else:
            trial_df["item_id"] = trial_df.item_id.apply(
                lambda i: "_".join(i.split("_")[3:])
            )
            model_data["item_id"] = model_data.item_id.apply(
                lambda i: "_".join(i.split("_")[3:])
            )
        index = "item_id"
        model_means = model_data.groupby(index)[ivs].mean().reset_index()
        trial_df = trial_df.merge(model_means, on=index)
        data[dataset] = trial_df
    return data

for model in metrics.model.unique():
    print(model)
    data = combine_model_human_data(metrics[metrics.model==model], task=TASK)
    for dataset, dataset_trials in data.items():
        print("  ***", dataset)
        dataset_trials.to_csv(
            f"../../data/human_model_combined/logit_lens/{TASK}_{dataset}_{model}.csv", 
            index=False
        )

vit_base_patch16_224


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_data["item_id"] = model_data.item_id.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_data["item_id"] = model_data.item_id.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_data["item_id"] = model_data.item_id.apply(
A value is trying to be set on a copy of a slice from a

  *** low-pass
  *** false-colour
  *** cue-conflict
  *** power-equalisation
  *** eidolonI
  *** edge
  *** phase-scrambling
  *** contrast
  *** high-pass
  *** stylized
  *** eidolonII
  *** rotation
  *** silhouette
  *** colour
  *** sketch
  *** eidolonIII
  *** uniform-noise
vit_small_patch16_224


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_data["item_id"] = model_data.item_id.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_data["item_id"] = model_data.item_id.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_data["item_id"] = model_data.item_id.apply(
A value is trying to be set on a copy of a slice from a

  *** low-pass
  *** false-colour
  *** cue-conflict
  *** power-equalisation
  *** eidolonI
  *** edge
  *** phase-scrambling
  *** contrast
  *** high-pass
  *** stylized
  *** eidolonII
  *** rotation
  *** silhouette
  *** colour
  *** sketch
  *** eidolonIII
  *** uniform-noise
