In [None]:
import warnings; warnings.filterwarnings("ignore")

In [None]:
import os, sys, json
import numpy as np
import pandas as pd
import seaborn as sns

from copy import copy
from glob import glob
from tqdm.auto import tqdm as tqdm
import matplotlib.pyplot as plt

In [None]:
# Make the local `model_opts` directory importable and pull in its helpers.
# NOTE(review): the star import hides where names come from; in the visible
# code only `get_model_options` is used — consider importing it explicitly.
sys.path.append('model_opts')
from model_options import *
# Options dict for the 'imagenet' model/train type (schema is defined in
# model_options and is not visible from this notebook).
model_options = get_model_options(model_type='imagenet', train_type='imagenet')
# The 'model_name' entry of every option, in dict iteration order.
model_names = [model_options[model_option]['model_name'] for model_option in model_options]

In [None]:
# Collapse the raw responses to one row per (Subj, ImageType, Image), holding
# the mean Rating and mean RT of that combination.
all_response_data = (pd.read_csv('aesthetic_responses.csv')
                 .groupby(['Subj','ImageType','Image'])
                 .agg({'Rating': 'mean', 'RT': 'mean'}).reset_index())
# Positional rename: the three group keys followed by the two aggregated columns.
all_response_data.columns = ['subject','image_type','image_name','rating','reaction_time']
# Group-level target used by the model analyses: mean rating per image across subjects.
response_data = all_response_data.groupby(['image_type','image_name'])['rating'].mean().reset_index()

In [None]:
# Display the per-subject table.
all_response_data

### Response Statistics

In [None]:
from scipy.stats import pearsonr

# Leave-one-subject-out correlation: for each subject, correlate their ratings
# with the mean rating of all *other* subjects, separately per image type.
oracle_corr_dictlist = []
data_i = copy(all_response_data)  # shallow copy; the frame is only read below
for image_type in data_i['image_type'].unique():
    data_i_subset = data_i[data_i['image_type'] == image_type]
    for subject in data_i_subset['subject'].unique():
        # Mean rating per image over everyone except the held-out subject
        # (groupby returns the images sorted by image_name).
        group_data_i = (data_i_subset[data_i_subset['subject'] != subject].groupby('image_name')['rating']
                        .mean().reset_index()['rating']).to_numpy()
        # Held-out subject's ratings, in the row order of all_response_data.
        # NOTE(review): the two arrays are paired by position, not by image_name.
        # This assumes every subject rated every image of the type and that rows
        # are already sorted by image_name — confirm, otherwise pearsonr either
        # raises on a length mismatch or correlates misaligned images.
        subject_data_i = data_i_subset[data_i_subset['subject'] == subject]['rating'].to_numpy()
        oracle_corr_dictlist.append({'subject': subject, 'image_type': image_type, 
                                     'oracle_corr': pearsonr(subject_data_i, group_data_i)[0]})

oracle_corrs = pd.DataFrame(oracle_corr_dictlist)

In [None]:
# Average leave-one-out correlation per image type.
oracle_corrs.groupby(['image_type'])['oracle_corr'].mean().reset_index()

In [None]:
from toolbox.reliability import split_half

# One split-half value per image type, computed on the subject-by-image rating matrix.
splithalf_corr_dictlist = []

data_i = copy(all_response_data).drop('reaction_time', axis=1)
for image_type in data_i['image_type'].unique():
    data_i_subset = data_i[data_i['image_type'] == image_type]
    # Reshape to a 2-D array with one row per subject and one column per image.
    data_i_subset = data_i_subset.pivot(index='subject', columns='image_name', values='rating').to_numpy()
    # NOTE(review): split_half is a project helper not visible here; presumably
    # element [0] of its result is the reliability estimate over n_splits
    # random splits — confirm against toolbox.reliability.
    splithalf_corr_dictlist.append({'image_type': image_type, 'splithalf_r': split_half(data_i_subset, n_splits=10000)[0]})

splithalf_corrs = pd.DataFrame(splithalf_corr_dictlist)

In [None]:
# Display the split-half table.
splithalf_corrs

### Data Processing

In [None]:
def process_model_data(model_name, orient='wide'):
    """Load one model's per-layer metrics and join them with the mean image ratings.

    Reads ``feature_maps/<model_name>.csv`` and ``sparsity/<model_name>.csv``,
    merges them on image / model / layer, attaches the notebook-level
    ``response_data`` ratings by ``image_name`` and numbers the layers
    1..N in order of first appearance.

    Parameters
    ----------
    model_name : str
        File stem of the model's CSVs in ``feature_maps/`` and ``sparsity/``.
    orient : {'wide', 'long'}, default 'wide'
        'wide' returns one column per metric; 'long' melts the metric columns
        into ``metric`` / ``value`` pairs.

    Returns
    -------
    pandas.DataFrame
        Identifier columns (image_name, image_type, model, model_layer,
        model_layer_index, rating) followed by the metric columns (wide) or
        by ``metric`` and ``value`` (long).

    Raises
    ------
    ValueError
        If ``orient`` is neither 'wide' nor 'long' (previously this silently
        returned None).
    """
    if orient not in ('wide', 'long'):
        raise ValueError("orient must be 'wide' or 'long', got {!r}".format(orient))

    # image_type and model_layer_index are dropped from both files: image_type is
    # re-attached from response_data and the layer index is recomputed below.
    model_data = (pd.read_csv('feature_maps/{}.csv'.format(model_name))
                  .drop(['image_type','model_layer_index'], axis = 1))
    # mean_activity is also dropped here so the merge does not duplicate that column.
    sparsity_data = (pd.read_csv('sparsity/{}.csv'.format(model_name))
                    .drop(['image_type','model_layer_index', 'mean_activity'], axis = 1))
    model_data = pd.merge(model_data, sparsity_data, on = ['image_name','model','model_layer'])
    data_wide = pd.merge(model_data, response_data, on = 'image_name')

    # 1-based layer position in order of first appearance; a dict lookup replaces
    # the row-wise apply + list.index scan of the original.
    model_layers = data_wide['model_layer'].unique().tolist()
    layer_positions = {layer: position for position, layer in enumerate(model_layers, start=1)}
    data_wide['model_layer_index'] = data_wide['model_layer'].map(layer_positions)

    # Identifier columns first, metric columns after (original order preserved).
    id_columns = ['image_name','image_type','model', 'model_layer', 'model_layer_index', 'rating']
    data_wide = data_wide[id_columns + [col for col in data_wide.columns.to_list() if col not in id_columns]]

    if orient == 'wide':
        return data_wide

    # Only build the long table when it is actually requested.
    return pd.melt(data_wide, id_vars=id_columns,
                   var_name = 'metric', value_name='value')

In [None]:
# Example model for the exploratory cells that follow.
target_model = 'alexnet'
data_wide = process_model_data(target_model)

In [None]:
# Display the wide per-image / per-layer table.
data_wide

In [None]:
def process_corr_data(data_wide, orient='long'):
    """Correlate every numeric metric with the rating, per model / image type / layer.

    Parameters
    ----------
    data_wide : pandas.DataFrame
        Wide table with the identifier columns 'model', 'image_type',
        'model_layer', 'model_layer_index', a numeric 'rating' column and one
        numeric column per metric. Non-numeric columns (e.g. 'image_name')
        are ignored.
    orient : {'wide', 'long'}, default 'long'
        'wide' returns one correlation column per metric; 'long' melts them
        into ``metric`` / ``corr`` pairs.

    Returns
    -------
    pandas.DataFrame
        One row per group (wide) or per group and metric (long).

    Raises
    ------
    ValueError
        If ``orient`` is neither 'wide' nor 'long' (previously this silently
        returned None).
    """
    if orient not in ('wide', 'long'):
        raise ValueError("orient must be 'wide' or 'long', got {!r}".format(orient))

    id_columns = ['model','image_type','model_layer', 'model_layer_index']
    # Restrict to numeric columns explicitly: older pandas dropped string columns
    # such as image_name silently inside corrwith, newer releases do not.
    numeric_columns = [col for col in data_wide.select_dtypes(include='number').columns
                       if col not in id_columns]
    # 'rating' correlates with itself (always 1), so it is dropped from the result.
    corr_data_wide = (data_wide[id_columns + numeric_columns]
                      .groupby(id_columns)
                      .corrwith(data_wide['rating'])
                      .reset_index().drop('rating', axis = 1))

    if orient == 'wide':
        return corr_data_wide

    return pd.melt(corr_data_wide, id_vars = id_columns,
                   var_name = 'metric', value_name='corr')

In [None]:
# Per-layer metric/rating correlations for the example model, in long format.
corr_data_long = process_corr_data(data_wide)

In [None]:
# Sanity checks: which image types, layer indices and models are present.
corr_data_long.image_type.unique()

In [None]:
corr_data_long.model_layer_index.unique()

In [None]:
corr_data_long.model.unique()

In [None]:
def max_transform(df, group_vars, measure_var = 'score', deduplicate=True):
    """Keep the rows of ``df`` that attain the group-wise maximum of ``measure_var``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the grouping columns and ``measure_var``.
    group_vars : str or iterable of str
        Column name(s) to group by. A single string is treated as one column
        name (previously it was split into its characters).
    measure_var : str, default 'score'
        Column whose per-group maximum selects the rows.
    deduplicate : bool, default True
        If True, keep only the first row per (group, maximum) so ties do not
        produce several rows for one group.

    Returns
    -------
    pandas.DataFrame
        The selected rows. The index is reset before de-duplication, so it may
        have gaps when tied rows are dropped.
    """
    if isinstance(group_vars, str):
        group_vars = [group_vars]
    elif not isinstance(group_vars, list):
        group_vars = list(group_vars)

    # The string alias is pandas' own reducer; the builtin `max` only worked
    # through an implicit translation.
    group_max = df.groupby(group_vars)[measure_var].transform('max')
    max_df = df[group_max == df[measure_var]].reset_index(drop=True)

    if deduplicate:
        max_df = max_df[~max_df.duplicated(group_vars + [measure_var])]

    return max_df

In [None]:
# Best-correlating layer for every (model, image_type, metric) combination.
corr_max = max_transform(corr_data_long, group_vars=['model','image_type', 'metric'], measure_var = 'corr')

In [None]:
# Peak correlations for the 'lsc' image type, weakest first.
corr_max[corr_max['image_type'] == 'lsc'].sort_values(by='corr')

In [None]:
# Name of the layer whose mean_activity correlates best with 'lsc' ratings.
corr_max[(corr_max['image_type'] == 'lsc') & (corr_max['metric'] == 'mean_activity')]['model_layer'].iloc[0]

In [None]:
corr_data_long

In [None]:
# Correlation as a function of layer position, one panel per image type.
g = sns.relplot(x='model_layer_index',y='corr', kind='line', hue='metric', col = 'image_type', data = corr_data_long)
# One tick per layer index, labelled with the layer names.
# NOTE(review): the labels come from the global data_wide, so this only lines
# up while data_wide and corr_data_long describe the same model.
[ax.set_xticks(np.sort(corr_data_long['model_layer_index'].unique())) for ax in g.axes.flat];
[ax.set_xticklabels(data_wide['model_layer'].unique().tolist(), rotation=90) for ax in g.axes.flat];

In [None]:
# Same profile restricted to layers whose name contains 'Conv'.
g = sns.relplot(x='model_layer',y='corr', kind='line', hue='metric', col = 'image_type', 
                data = corr_data_long.loc[corr_data_long['model_layer'].str.contains('Conv')]);

In [None]:
# Same profile restricted to layers whose name contains 'MaxPool'.
g = sns.relplot(x='model_layer',y='corr', kind='line', hue='metric', col = 'image_type', 
                data = corr_data_long.loc[corr_data_long['model_layer'].str.contains('MaxPool')]);

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV
from scipy.stats import pearsonr

In [None]:
# Linear trend of the mean_activity/rating correlation across layer index ('lsc' images only).
data_i = corr_data_long[(corr_data_long['image_type'] == 'lsc') & (corr_data_long['metric'] == 'mean_activity')]
# scikit-learn expects a 2-D feature matrix, hence the reshape to a single column.
x = data_i['model_layer_index'].to_numpy().reshape(-1,1)
y = data_i['corr'].to_numpy()
regression = LinearRegression().fit(x,y)
# Slope: change in correlation per layer step.
regression.coef_

In [None]:
# Scatter + fitted line for the depth trend. x and y are passed by keyword
# because current seaborn no longer accepts them positionally; x is flattened
# from its (n, 1) regression shape to a plain vector.
sns.regplot(x=x.ravel(), y=y);

In [None]:
# Ridge regression of 'lsc' ratings on the mean activity of every layer.
data_i = data_wide[(data_wide['image_type'] == 'lsc')]
# Ratings repeat for every layer, so take them from a single layer's rows.
# NOTE(review): 'Conv2d-1' is hard-coded here; a later cell uses the first
# layer name found in the data instead — confirm this name exists for the model.
y = data_i[(data_i['model_layer']=='Conv2d-1')]['rating'].to_numpy()
# Design matrix: one column per layer, rows paired with y by position
# (assumes every layer lists the images in the same order — TODO confirm).
X = np.stack([data_i[(data_i['model_layer']==model_layer)]['mean_activity'].to_numpy() 
              for model_layer in data_i['model_layer'].unique()], axis = 1)

In [None]:
# In-sample R^2 (scored on the same data the model was fitted to).
regression = Ridge(alpha=1.0).fit(X,y)
regression.score(X,y)

In [None]:
# Squared correlation between y and the stored cross-validation values.
# NOTE(review): per the scikit-learn docs, cv_values_ holds per-point
# predictions (not squared errors) only when `scoring` is set — presumably
# the reason scoring='r2' is passed; confirm for the installed version.
regression = RidgeCV(alphas=[1.0], store_cv_values=True, scoring='r2').fit(X,y)
pearsonr(y, regression.cv_values_.squeeze())[0]**2

In [None]:
# Reload the example model in long (metric/value) format.
# NOTE(review): the call passes the literal 'alexnet' rather than target_model.
target_model = 'alexnet'
data_long = process_model_data('alexnet', orient='long')

In [None]:
# Mean activity across layers for two specific images.
(data_long[(data_long['image_name'].isin(['art_105.jpg','art_109.jpg'])) & (data_long['metric'] == 'mean_activity')]
 .groupby('image_name')['value'].mean())

In [None]:
# Images ranked by their mean activity averaged over layers.
data_long[(data_long['metric'] == 'mean_activity')].groupby('image_name')['value'].mean().sort_values()

In [None]:
# Images ranked by their layer-averaged 'mean_cosine_to_imagenet' value.
data_long[(data_long['metric'] == 'mean_cosine_to_imagenet')].groupby('image_name')['value'].mean().sort_values()

In [None]:
# Range checks per metric: minimum, then spread.
data_long.groupby('metric')['value'].min()

In [None]:
data_long.groupby('metric')['value'].std()

In [None]:
# Same in-sample ridge fit as on the wide table, built from the long table.
data_i = data_long[(data_long['image_type'] == 'lsc') & (data_long['metric'] == 'mean_activity')]
# Ratings repeat for every layer; take them from the first layer present.
y = data_i[(data_i['model_layer']==data_i['model_layer'].unique()[0])]['rating'].to_numpy()
X = np.stack([data_i[(data_i['model_layer']==model_layer)]['value'].to_numpy() 
              for model_layer in data_i['model_layer'].unique()], axis = 1)
regression = Ridge(alpha=1.0).fit(X,y)
regression.score(X,y)

In [None]:
# Every CSV in feature_maps/ is one model; the model name is the file stem.
# basename/splitext (instead of splitting on '/' and '.') keeps this correct
# for any path separator and for model names that contain a dot.
model_csvs = glob('feature_maps/*.csv')
target_models = [os.path.splitext(os.path.basename(csv))[0] for csv in model_csvs]

# Cached concatenation of the wide tables of all models; delete the file to rebuild.
output_file = 'raw_results.csv'
if os.path.exists(output_file):
    raw_results = pd.read_csv(output_file)
else:
    results_dflist = []
    iterator = tqdm(target_models)
    for target_model in iterator:
        iterator.set_description(target_model)
        data_wide = process_model_data(target_model)

        results_dflist.append(data_wide)

    raw_results = pd.concat(results_dflist)
    raw_results.to_csv(output_file, index = None)

In [None]:
# Every CSV in feature_maps/ is one model; the model name is the file stem.
# basename/splitext (instead of splitting on '/' and '.') keeps this correct
# for any path separator and for model names that contain a dot.
model_csvs = glob('feature_maps/*.csv')
target_models = [os.path.splitext(os.path.basename(csv))[0] for csv in model_csvs]

# Cached per-layer metric/rating correlations of all models; delete the file to rebuild.
output_file = 'correlation_results.csv'
if os.path.exists(output_file):
    corr_results = pd.read_csv(output_file)
else:
    results_dflist = []
    iterator = tqdm(target_models)
    for target_model in iterator:
        iterator.set_description(target_model)
        data_wide = process_model_data(target_model)
        corr_data_long = process_corr_data(data_wide)

        results_dflist.append(corr_data_long)

    corr_results = pd.concat(results_dflist)
    corr_results.to_csv(output_file, index = None)

In [None]:
# Every CSV in feature_maps/ is one model; the model name is the file stem.
# basename/splitext (instead of splitting on '/' and '.') keeps this correct
# for any path separator and for model names that contain a dot.
model_csvs = glob('feature_maps/*.csv')
target_models = [os.path.splitext(os.path.basename(csv))[0] for csv in model_csvs]

# Cached per-(model, image_type, metric) summary; delete the file to rebuild.
output_file = 'analysis_results.csv'
if os.path.exists(output_file):
    results = pd.read_csv(output_file)
else:
    results_dictlist = []
    iterator = tqdm(target_models)
    for target_model in iterator:
        iterator.set_description(target_model)
        data_wide = process_model_data(target_model)
        model_layers = data_wide['model_layer'].unique()
        corr_results_subset = corr_results[corr_results['model']==target_model]
        corr_max = max_transform(corr_results_subset, group_vars=['model','image_type', 'metric'], measure_var = 'corr')

        # Iterate over this model's own metrics and image types. The original
        # read them from `corr_data_long`, a leftover global describing whichever
        # model happened to be processed last.
        for metric in tqdm(corr_results_subset['metric'].unique(), leave=False):
            for image_type in tqdm(corr_results_subset['image_type'].unique(), leave=False):
                # Layer with the highest metric/rating correlation. Depth is the
                # 1-based 'model_layer_index' that process_corr_data emits; the
                # original read a 'model_layer_relative' column it never produces.
                corr_max_subset = corr_max[(corr_max['image_type'] == image_type) & (corr_max['metric'] == metric)]
                corr_max_layer = corr_max_subset['model_layer'].iloc[0]
                corr_max_depth = corr_max_subset['model_layer_index'].iloc[0]
                corr_max_value = corr_max_subset['corr'].iloc[0]

                # Slope of the correlation across layer index.
                data_i = corr_results_subset[(corr_results_subset['image_type'] == image_type) &
                                             (corr_results_subset['metric'] == metric)]
                x, y = data_i['model_layer_index'].to_numpy().reshape(-1,1), data_i['corr'].to_numpy()
                regression = LinearRegression().fit(x,y)
                corr_depth_coef = regression.coef_[0]

                # Ridge regression of the ratings on the metric of all layers at once.
                # Ratings repeat per layer, so they are taken from the first layer;
                # rows are paired with the feature columns by position.
                data_i = data_wide[(data_wide['image_type'] == image_type)]
                y = data_i[(data_i['model_layer']==model_layers[0])]['rating'].to_numpy()
                X = np.stack([data_i[(data_i['model_layer']==model_layer)][metric].to_numpy()
                              for model_layer in model_layers], axis = 1)
                # With `scoring` set, cv_values_ holds per-point cross-validation
                # predictions; their squared correlation with y is the score.
                regression = RidgeCV(alphas=[1.0], store_cv_values=True, scoring='r2').fit(X,y)
                ridge_pearson_r2_gcv = pearsonr(y, regression.cv_values_.squeeze())[0]**2

                results_dictlist.append({'model': target_model, 'image_type': image_type, 'metric': metric,
                                         'ridge_pearson_r2_gcv': ridge_pearson_r2_gcv,
                                         'corr_max_value': corr_max_value,
                                         'corr_max_layer': corr_max_layer,
                                         'corr_max_depth': corr_max_depth,
                                         'model_depth': len(model_layers),
                                         'corr_max_relative_depth': corr_max_depth / len(model_layers),
                                         'corr_depth_coef': corr_depth_coef})

    results = pd.DataFrame(results_dictlist)
    results.to_csv(output_file, index = None)

In [None]:
# Round the ridge scores to 5 decimals in place (idempotent on re-run).
results['ridge_pearson_r2_gcv'] = np.round(results['ridge_pearson_r2_gcv'], 5)

In [None]:
# Best model per image type by peak layer correlation, metric = mean_activity.
max_transform(results[results['metric'] == 'mean_activity'], group_vars = ['metric', 'image_type'],
              measure_var = 'corr_max_value').sort_values(by=['image_type','corr_max_value'])

In [None]:
# Same ranking for metric = sparseness.
max_transform(results[results['metric'] == 'sparseness'], group_vars = ['metric', 'image_type'],
              measure_var = 'corr_max_value').sort_values(by=['image_type','corr_max_value'])

In [None]:
# Best model per image type by cross-validated ridge score, metric = mean_activity.
max_transform(results[results['metric'] == 'mean_activity'], group_vars = ['metric', 'image_type'],
              measure_var = 'ridge_pearson_r2_gcv').sort_values(by=['image_type','ridge_pearson_r2_gcv'])

In [None]:
# Same ranking for metric = sparseness.
max_transform(results[results['metric'] == 'sparseness'], group_vars = ['metric', 'image_type'],
              measure_var = 'ridge_pearson_r2_gcv').sort_values(by=['image_type','ridge_pearson_r2_gcv'])

In [None]:
# Average relative depth of the best-correlating layer, per image type.
results[results['metric'] == 'mean_activity'].groupby(['image_type'])['corr_max_relative_depth'].mean()

In [None]:
results[results['image_type'] == 'lsc'].sort_values(by='corr_max_value')

In [None]:
# NOTE(review): 'ridge_standard_r2_nocv' is not among the columns written by
# the cell that builds `results` in this notebook; this raises KeyError unless
# a cached analysis_results.csv happens to contain it — confirm or remove.
results.sort_values(by='ridge_standard_r2_nocv')

In [None]:
results.sort_values(by='ridge_pearson_r2_gcv')

In [None]:
# Per-metric summaries over all models and image types: mean and max of the
# peak layer correlation, then mean and max of the ridge score.
results.groupby(['metric'])['corr_max_value'].mean().sort_values()

In [None]:
results.groupby(['metric'])['corr_max_value'].max().sort_values()

In [None]:
results.groupby(['metric'])['ridge_pearson_r2_gcv'].mean().sort_values()

In [None]:
results.groupby(['metric'])['ridge_pearson_r2_gcv'].max().sort_values()

In [None]:
# Human leave-one-out correlations again, for comparison with the model scores below.
oracle_corrs

In [None]:
oracle_corrs.groupby(['image_type'])['oracle_corr'].mean().reset_index()

In [None]:
# Per-image-type summaries of the same four quantities.
results.groupby(['image_type'])['corr_max_value'].mean().sort_values()

In [None]:
results.groupby(['image_type'])['corr_max_value'].max().sort_values()

In [None]:
results.groupby(['image_type'])['ridge_pearson_r2_gcv'].mean().sort_values()

In [None]:
results.groupby(['image_type'])['ridge_pearson_r2_gcv'].max().sort_values()

In [None]:
results[results.image_type == 'lsc'].groupby(['metric'])['corr_max_value'].mean().sort_values()

In [None]:
# Where in the network (relative depth) the best-correlating layer sits.
results.groupby(['metric'])['corr_max_relative_depth'].mean().sort_values()

In [None]:
results.groupby(['image_type'])['corr_max_relative_depth'].mean().sort_values()

In [None]:
# Per-model ranking on 'lsc' images. Note this squares the *mean* correlation
# across metrics, not the individual correlations.
(results[results.image_type == 'lsc'].groupby(['model'])['corr_max_value'].mean()**2).sort_values()

In [None]:
results[results.image_type == 'lsc'].groupby(['model'])['ridge_pearson_r2_gcv'].mean().sort_values()

In [None]:
# Peak layer correlation against number of layers, one panel per metric x image type.
sns.relplot(x='model_depth', y='corr_max_value', row = 'metric', col = 'image_type', 
            data = results[results.metric.isin(['mean_activity','sparseness'])]);

In [None]:
# Same layout for the cross-validated ridge score.
sns.relplot(x='model_depth', y='ridge_pearson_r2_gcv', row = 'metric', col = 'image_type', 
            data = results[results.metric.isin(['mean_activity','sparseness'])]);

In [None]:
# Per-model metadata indexed by model name, with the train_type column removed.
# NOTE(review): the query keeps rows whose train_type is NOT "imagenet", while
# the model options near the top of the notebook request train_type='imagenet'
# — confirm the inequality is intended.
model_metadata = (pd.read_csv('model_opts/model_metadata.csv').rename(columns={'model_name': 'model'})
                  .query('train_type != "imagenet"').set_index('model')).drop('train_type', axis = 1)#.to_dict(orient='index'))

# Disabled earlier approach that built the table from a dict of metadata;
# kept for reference only.
# metadata_dictlist = []
# for model in model_metadata:
#     metadata_i = {**model_metadata[model]}
#     metadata_i.pop('layer_metadata', None)
#     metadata_dictlist.append({'model': model, **metadata_i, 
#                               'imagenet_top1': model_metadata[model]['imagenet_top1'], 
#                               'imagenet_top5': model_metadata[model]['imagenet_top5']})
    
# model_params = pd.DataFrame(metadata_dictlist)

In [None]:
# Attach the model metadata to the ridge scores (inner join on model name, so
# models without a metadata row are dropped).
results_plus = pd.merge(results[['model','image_type','metric','ridge_pearson_r2_gcv']], model_metadata, on = ['model'])

In [None]:
results_plus

In [None]:
# Per-model ridge score for mean_activity on 'lsc' images.
(results_plus[(results_plus['image_type'] == 'lsc') & (results_plus['metric'] == 'mean_activity')]
 .groupby('model')['ridge_pearson_r2_gcv'].mean().sort_values())

In [None]:
# Correlation of every metadata column with the ridge score, within each
# (image_type, metric) group; the score's self-correlation column is dropped.
# NOTE(review): results_plus still contains the string column 'model'; whether
# corrwith skips or rejects non-numeric columns depends on the pandas version — confirm.
param_corrs = (results_plus.groupby(['image_type','metric']).corrwith(results_plus['ridge_pearson_r2_gcv'])
             .reset_index().drop('ridge_pearson_r2_gcv', axis = 1))

In [None]:
param_corrs[param_corrs['image_type'] == 'lsc']

In [None]:
param_corrs[param_corrs['image_type'] == 'fac']

In [None]:
corr_results

In [None]:
from sklearn.metrics import explained_variance_score as ev_score
from sklearn.metrics import r2_score 

def pearson_r2_score(y_true, y_pred):
    """Return the squared Pearson correlation between ``y_true`` and ``y_pred``."""
    correlation, _ = pearsonr(y_true, y_pred)
    return correlation ** 2

# Scorers applied to the cross-validation predictions of every cumulative ridge fit.
scoring_metrics = {'ev_score': ev_score, 'pearson_r2': pearson_r2_score}

model_csvs = glob('feature_maps/*.csv')
#target_models = [csv.split('/')[1].split('.')[0] for csv in model_csvs]
target_models = ['alexnet','vgg16','resnet18']

# Stepwise analysis: for each model / metric / image type, refit the ridge
# regression on layers 1..k for k = 1..N and score each fit.
output_file = 'stepwise_regressions.csv'
if os.path.exists(output_file):
    reg_results = pd.read_csv(output_file)
else:
    results_dictlist = []
    iterator = tqdm(target_models)
    for target_model in iterator:
        iterator.set_description(target_model)
        data_wide = process_model_data(target_model)
        data_long = process_model_data(target_model, orient='long')
        model_layers = data_wide['model_layer'].unique()

        for metric in tqdm(data_long['metric'].unique(), leave=False):
            for image_type in tqdm(data_long['image_type'].unique(), leave=False):
                # The image subset and the target do not depend on the layer,
                # so they are computed once instead of once per layer step.
                data_i = data_wide[(data_wide['image_type'] == image_type)]
                y = data_i[(data_i['model_layer']==model_layers[0])]['rating'].to_numpy()

                # Feature columns accumulated layer by layer, so each step only
                # extracts the newly added layer.
                layer_features = []
                for model_layer_index, model_layer in enumerate(tqdm(model_layers, leave = False)):
                    layer_features.append(data_i[(data_i['model_layer']==model_layer)][metric].to_numpy())
                    X = np.stack(layer_features, axis = 1)

                    # With `scoring` set, cv_values_ holds per-point
                    # cross-validation predictions rather than squared errors.
                    regression = RidgeCV(alphas=[1.0], store_cv_values=True, scoring='r2').fit(X,y)
                    y_pred = regression.cv_values_.squeeze()
                    for scoring_metric in scoring_metrics:
                        score = scoring_metrics[scoring_metric](y, y_pred)
                        results_dictlist.append({'model': target_model, 'image_type': image_type, 'metric': metric,
                                                 'score_type': scoring_metric, 'score': score,
                                                 'model_layer': model_layer,
                                                 'model_depth': len(model_layers),
                                                 'model_layer_index': model_layer_index + 1,
                                                 'model_layer_depth': (model_layer_index + 1) / len(model_layers)})

    reg_results = pd.DataFrame(results_dictlist)
    # Writing the cache is disabled, so this is recomputed on every run unless
    # the CSV already exists.
    #reg_results.to_csv(output_file, index = None)

In [None]:
reg_results

In [None]:
# Best explained-variance score per (metric, score_type, image_type);
# max_transform uses its default measure_var='score' here.
reg_results_subset = reg_results[(reg_results['score_type'] == 'ev_score')]
max_transform(reg_results_subset, group_vars = ['metric', 'score_type','image_type'])

In [None]:
# Same, restricted to the 'lsc' and 'fac' image types.
reg_results_subset = reg_results[(reg_results['score_type'] == 'ev_score') & (reg_results['image_type'].isin(['lsc','fac']))]
max_transform(reg_results_subset, group_vars = ['metric', 'score_type','image_type'])

In [None]:
# NOTE(review): this cell repeats the previous one with different line breaks only.
reg_results_subset = reg_results[(reg_results['score_type'] == 'ev_score') & 
                                 (reg_results['image_type'].isin(['lsc','fac']))]
max_transform(reg_results_subset, group_vars = ['metric', 'score_type','image_type'])

In [None]:
# Models to show in the stepwise-regression plot.
# NOTE(review): this overwrites the notebook-level target_models; when
# reg_results is computed by this notebook (not read from the CSV) it only
# contains 'alexnet', 'vgg16' and 'resnet18', so most names here match nothing.
target_models = ['alexnet','vgg19','resnet18', 'densenet121','resnet101','resnet152']

plotting_subset = reg_results[(reg_results['metric'].isin(['mean_activity','sparseness','distance_to_imagenet_mean'])) &
                              (reg_results['score_type'] == 'ev_score') & (reg_results['model'].isin(target_models))]
# Score as a function of relative depth of the last included layer.
sns.relplot(x = 'model_layer_depth', y = 'score', hue = 'model', col = 'image_type', row = 'metric', style = 'score_type',  
            kind = 'line', ci = False, data = plotting_subset);

In [None]:
# NOTE(review): target_models is assigned but not used in this cell's filter,
# so every model in reg_results is plotted.
target_models = ['alexnet','vgg19','resnet18', 'densenet121','resnet101','resnet152']

plotting_subset = reg_results[(reg_results['metric'].isin(['mean_activity','sparseness','distance_to_imagenet_mean'])) &
                              (reg_results['score_type'] == 'ev_score')]
p = sns.relplot(x = 'model_layer_depth', y = 'score', hue = 'model', col = 'image_type', row = 'metric', style = 'score_type',  
            kind = 'line', ci = False, data = plotting_subset);
# Clip the y-axis so scores below -0.1 do not dominate the scale.
p.set(ylim=(-0.1, 1.0));

In [None]:
max_transform(plotting_subset, measure_var = 'score', group_vars = ['image_type','metric'])

In [None]:
# Score of the full-depth fit (all layers included) against number of layers,
# ignoring fits that scored -0.1 or lower.
plotting_subset = reg_results[(reg_results['metric'].isin(['mean_activity','sparseness','distance_to_imagenet_mean'])) &
                              (reg_results['score_type'] == 'ev_score') & (reg_results['model_layer_depth'] == 1.0) & 
                              (reg_results['score'] > -0.1)]

sns.lmplot(x = 'model_depth', y = 'score', col = 'image_type', hue = 'metric', data = plotting_subset);

In [None]:
# Per-layer correlations of vgg19 on 'lsc' images for the two main metrics.
corr_subset = corr_results[(corr_results['model'] == 'vgg19') & (corr_results['image_type'].isin(['lsc'])) &
                           (corr_results['metric'].isin(['mean_activity','sparseness']))]

In [None]:
# Best layer per metric within that subset.
max_transform(corr_subset, group_vars = ['metric'], measure_var = 'corr')

In [None]:
# Image-level data of alexnet on 'lsc' images for the per-layer scatter plots below.
plot_data = process_model_data('alexnet')
plot_data = plot_data[plot_data['image_type'] == 'lsc']

In [None]:
# Rating vs. mean activity, one panel per layer: linear fit, independent x-axes.
sns.lmplot(x = 'mean_activity', y = 'rating', col = 'model_layer_index', order = 1, col_wrap = 6, 
            data=plot_data, sharex = False);

In [None]:
# Same with a shared x-axis, to compare activity ranges across layers.
sns.lmplot(x = 'mean_activity', y = 'rating', col = 'model_layer_index', order = 1, col_wrap = 6, 
            data=plot_data, sharex = True);

In [None]:
# Second-order polynomial fit instead of a straight line.
sns.lmplot(x = 'mean_activity', y = 'rating', col = 'model_layer_index', order = 2, col_wrap = 6, 
            data=plot_data, sharex = False);

In [None]:
# Rating vs. sparseness: independent x-axes, then shared.
sns.lmplot(x = 'sparseness', y = 'rating', col = 'model_layer_index', order = 1, col_wrap = 6, 
            data=plot_data, sharex = False);

In [None]:
sns.lmplot(x = 'sparseness', y = 'rating', col = 'model_layer_index', order = 1, col_wrap = 6, 
            data=plot_data, sharex = True);

In [None]:
# Sanity check: for equally sized arrays, the mean of the per-array means
# equals the mean over all values pooled together.
# NOTE(review): no random seed is set, so the printed numbers differ on every run.
arr1 = np.random.randn(3)
arr2 = np.random.randn(3)
arr3 = np.random.randn(3)
moving_average = np.mean(np.array((np.mean(arr1), np.mean(arr2), np.mean(arr3))))
combined_average = np.mean(np.stack((arr1,arr2,arr3)))
print('moving average:', moving_average)
print('combined average:', combined_average)