In [None]:
import warnings; warnings.filterwarnings("ignore")

In [None]:
import os, sys, json
import numpy as np
import pandas as pd
import pingouin as pg
import seaborn as sns

from copy import copy
from glob import glob
from tqdm.auto import tqdm as tqdm
import matplotlib.pyplot as plt

In [None]:
# Make the local `model_opts` directory importable so `model_options` resolves.
sys.path.append('model_opts')
# NOTE(review): the star import hides which names this notebook takes from
# model_options; `get_model_options` (and possibly other helpers used later)
# presumably come from here - confirm and import them explicitly.
from model_options import *
# Indexed later by model string to read the 'model_name' and 'train_type' entries.
model_options = get_model_options()

In [None]:
# (Re)load the rpy2 IPython extension; the %%R cells below depend on it.
%reload_ext rpy2.ipython

In [None]:
%%R --noreturn 
# Attach pacman, then use its p_load helper to load tidyverse for the R cells.
library('pacman')
p_load('tidyverse') 

In [None]:
all_response_data = (pd.read_csv('response/vessel_subject_data.csv')
                 .groupby(['Subj','ImageType','Image'])
                 .agg({'Rating': 'mean', 'RT': 'mean'}).reset_index())
all_response_data.columns = ['subject','image_type','image_name','rating','reaction_time']
response_data = all_response_data.groupby(['image_type','image_name'])['rating'].mean().reset_index()

In [None]:
all_response_data

### Data Processing

In [None]:
def process_model_data(model_string, orient='wide'):
    """Load one model's per-layer image metrics and attach the mean human rating.

    Reads ``metrics/vessel/<model_string>.csv``, drops ``model_layer_index``,
    renames ``image`` to ``image_name`` and merges the result with the
    notebook-level ``response_data`` frame on ``image_name`` (pandas' default
    inner join, so images missing from either side are dropped).
    ``model_layer_depth`` is shifted by +1 (presumably converting a 0-based
    depth to a 1-based one - confirm against the metric files).

    Parameters
    ----------
    model_string : str
        Base name of the metric CSV, e.g. ``'alexnet_imagenet'``.
    orient : {'wide', 'long'}, default 'wide'
        ``'wide'`` keeps one column per metric; ``'long'`` melts every
        non-identifier column into ``metric`` / ``value`` pairs.

    Returns
    -------
    pandas.DataFrame
        Identifier columns first (image, model, layer, rating), followed by
        the metric columns (wide) or by ``metric`` and ``value`` (long).

    Raises
    ------
    ValueError
        If ``orient`` is neither ``'wide'`` nor ``'long'``. Previously such a
        call silently returned ``None``.
    """
    if orient not in ('wide', 'long'):
        raise ValueError("orient must be 'wide' or 'long', got {!r}".format(orient))

    model_data = (pd.read_csv('metrics/vessel/{}.csv'.format(model_string))
                  .drop(['model_layer_index'], axis = 1)
                  .rename(columns={'image': 'image_name'}))
    data_wide = pd.merge(model_data, response_data, on = 'image_name')
    data_wide['model_layer_depth'] = data_wide['model_layer_depth'] + 1

    # Identifier columns lead; everything else is treated as a metric.
    id_columns = ['image_name','image_type','model','train_type','model_layer','model_layer_depth','rating']
    metric_columns = [col for col in data_wide.columns.to_list() if col not in id_columns]
    data_wide = data_wide[id_columns + metric_columns]

    if orient == 'wide':
        return data_wide

    # The long frame is only built when it is actually requested.
    return pd.melt(data_wide, id_vars=id_columns,
                   var_name = 'metric', value_name='value')

In [None]:
# Single-model example used to try out the helper cells that follow.
target_model = 'alexnet_imagenet'
data_wide = process_model_data(target_model)

In [None]:
data_wide

In [None]:
def process_corr_data(data_wide, orient='long'):
    """Correlate every metric column with ``rating`` within each layer group.

    Rows are grouped by model, train type, image type, layer and layer depth;
    inside each group every remaining column is correlated with
    ``data_wide['rating']`` via ``corrwith``. The trivial ``rating`` column of
    the result is dropped.

    Parameters
    ----------
    data_wide : pandas.DataFrame
        Wide frame holding the grouping columns, a ``rating`` column and one
        column per metric.
    orient : {'long', 'wide'}, default 'long'
        ``'wide'`` returns one correlation column per metric; ``'long'``
        melts them into ``metric`` / ``corr`` pairs.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If ``orient`` is neither ``'wide'`` nor ``'long'``. Previously such a
        call silently returned ``None``.
    """
    if orient not in ('wide', 'long'):
        raise ValueError("orient must be 'wide' or 'long', got {!r}".format(orient))

    id_columns = ['model','train_type','image_type','model_layer', 'model_layer_depth']
    corr_data_wide = (data_wide.groupby(id_columns)
                      .corrwith(data_wide['rating'])
                      .reset_index()
                      .drop('rating', axis = 1))

    if orient == 'wide':
        return corr_data_wide

    return pd.melt(corr_data_wide, id_vars = id_columns,
                   var_name = 'metric', value_name='corr')

In [None]:
# Long-format layer-wise correlations for the example model loaded earlier.
process_corr_data(data_wide)

In [None]:
def max_transform(df, group_vars, measure_var = 'score', transform = max, deduplicate=True):
    """Keep the rows whose ``measure_var`` equals its per-group extreme.

    Parameters
    ----------
    df : pandas.DataFrame
    group_vars : str or list of str
        Column(s) defining the groups; a single name is wrapped in a list.
    measure_var : str, default 'score'
        Column whose per-group extreme selects the rows.
    transform : callable or str, default ``max``
        Reducer handed to ``GroupBy.transform``. It used to be accepted but
        ignored; it is now honoured, and the default reproduces the previous
        behaviour exactly.
    deduplicate : bool, default True
        When several rows tie within a group, keep only the first of them.

    Returns
    -------
    pandas.DataFrame
        Selected rows in their original order. The index is reset before
        de-duplication, so it can have gaps when ties are removed.
    """
    if not isinstance(group_vars, list):
        group_vars = [group_vars]

    # Broadcast each group's extreme back onto its rows, then keep the matches.
    group_extreme = df.groupby(group_vars)[measure_var].transform(transform)
    max_df = df[group_extreme == df[measure_var]].reset_index(drop=True)

    if deduplicate:
        max_df = max_df[~max_df.duplicated(group_vars + [measure_var])]

    return max_df

def min_transform(df, group_vars, measure_var = 'score', transform = min, deduplicate=True):
    """Keep the rows whose ``measure_var`` equals its per-group minimum.

    Parameters
    ----------
    df : pandas.DataFrame
    group_vars : str or list of str
        Column(s) defining the groups; a single name is wrapped in a list.
    measure_var : str, default 'score'
        Column whose per-group extreme selects the rows.
    transform : callable or str, default ``min``
        Reducer handed to ``GroupBy.transform``. The parameter used to default
        to ``max`` while the body always applied ``min``; the default now
        states what the function does and an explicit value is honoured.
        Calls that omit ``transform`` behave exactly as before.
    deduplicate : bool, default True
        When several rows tie within a group, keep only the first of them.

    Returns
    -------
    pandas.DataFrame
        Selected rows in their original order. The index is reset before
        de-duplication, so it can have gaps when ties are removed.
    """
    if not isinstance(group_vars, list):
        group_vars = [group_vars]

    # Broadcast each group's extreme back onto its rows, then keep the matches.
    group_extreme = df.groupby(group_vars)[measure_var].transform(transform)
    min_df = df[group_extreme == df[measure_var]].reset_index(drop=True)

    if deduplicate:
        min_df = min_df[~min_df.duplicated(group_vars + [measure_var])]

    return min_df

In [None]:
# Model identifiers are the base names of the per-model metric files.
model_csvs = glob('metrics/vessel/*.csv')
# Take the LAST path component: index 1 of 'metrics/vessel/<model>.csv' is the
# directory name 'vessel', which made every entry identical and pointed
# process_model_data at a non-existent 'metrics/vessel/vessel.csv'.
target_models = [csv.split('/')[-1].split('.')[0] for csv in model_csvs]

# Concatenated wide data for all models, cached on disk after the first run.
output_file = 'results/raw_results2.csv'
if os.path.exists(output_file):
    raw_results = pd.read_csv(output_file)
else:
    results_dflist = []
    iterator = tqdm(target_models)
    for target_model in iterator:
        iterator.set_description(target_model)
        data_wide = process_model_data(target_model)

        results_dflist.append(data_wide)

    raw_results = pd.concat(results_dflist)
    raw_results.to_csv(output_file, index = False)

In [None]:
# Model identifiers are the base names of the per-model metric files.
model_csvs = glob('metrics/vessel/*.csv')
# Take the LAST path component: index 1 of 'metrics/vessel/<model>.csv' is the
# directory name 'vessel', which made every entry identical and pointed
# process_model_data at a non-existent 'metrics/vessel/vessel.csv'.
target_models = [csv.split('/')[-1].split('.')[0] for csv in model_csvs]

# Long-format layer-wise correlations for all models, cached on disk after the first run.
output_file = 'results/correlation_results2.csv'
if os.path.exists(output_file):
    corr_results = pd.read_csv(output_file)
else:
    results_dflist = []
    iterator = tqdm(target_models)
    for target_model in iterator:
        iterator.set_description(target_model)
        data_wide = process_model_data(target_model)
        corr_data_long = process_corr_data(data_wide)

        results_dflist.append(corr_data_long)

    corr_results = pd.concat(results_dflist)
    corr_results.to_csv(output_file, index = False)

# Derived magnitudes are recomputed on every run (they are not part of the cached file).
corr_results['corr_abs'] = abs(corr_results['corr'])
corr_results['corr_sq'] = corr_results['corr']**2

In [None]:
from pandas.core.base import PandasObject
# NOTE(review): placeholder - the body is just `...`, so the function returns
# None. Nothing else in the visible notebook calls it; looks like leftover
# scaffolding for a custom pandas method - confirm before relying on it.
def your_fun(df):
    ...
# Expose the placeholder as an attribute on PandasObject.
PandasObject.your_fun = your_fun

In [None]:
# Distribution of layer-wise correlations: one panel per (metric, image type), coloured by train type.
sns.displot(x = 'corr', hue = 'train_type', col = 'image_type', row = 'metric', kde = True, data = corr_results);

In [None]:
# Minimum layer-wise 'sparseness' correlation per model on 'lsc' images, taskonomy rows excluded, rounded to 5 places.
np.round((corr_results.query("train_type not in '{}' & image_type == '{}' & metric == '{}'".format('taskonomy','lsc','sparseness'))
 .groupby(['model','train_type','image_type','metric'])['corr'].min().reset_index().sort_values(by='corr')),5)

In [None]:
# Maximum layer-wise 'mean_activity' correlation per taskonomy model on 'lsc' images.
(corr_results.query("train_type in '{}' & image_type == '{}' & metric == '{}'".format('taskonomy','lsc','mean_activity'))
 .groupby(['model','train_type','image_type','metric'])['corr'].max().reset_index().sort_values(by='corr'))

In [None]:
# Best squared correlation over layers for each (model, train_type, image_type, metric); handed to the R cell below.
data_out = (corr_results.groupby(['model','train_type','image_type','metric'])['corr_sq'].max()
             .reset_index().sort_values(by='corr_sq'))

In [None]:
%%R -i data_out -o train_type_stats

pacman::p_load('rstatix')
        
# For each (image_type, metric): paired t-test of corr_sq between train types
# (taskonomy excluded) with Bonferroni-adjusted p-values, joined to the paired
# Cohen's d for the same comparison.
# NOTE(review): data_out already holds one row per
# (model, train_type, image_type, metric), so the inner max filter keeps every
# row. The paired tests presumably rely on the models appearing in the same
# order within each train type - confirm.
train_type_stats <- data_out %>% filter(train_type != 'taskonomy') %>%
    group_by(model, train_type, image_type, metric) %>%
    filter(corr_sq == max(corr_sq)) %>% group_by(image_type, metric) %>%
    {left_join(t_test(., corr_sq ~ train_type, paired = TRUE) %>% 
                   adjust_pvalue(method = 'bonferroni') %>% add_significance('p.adj'),
               cohens_d(., corr_sq ~ train_type, paired = TRUE))} %>% 
    select(image_type, metric, group1, group2, p.adj, p.adj.signif, effsize, magnitude)

In [None]:
# Table exported from the R cell above (-o train_type_stats).
train_type_stats

In [None]:
# Per-model maximum signed correlation ('mean_activity', 'lsc', taskonomy excluded), compared across train types.
plot_data = (corr_results.query("train_type not in 'taskonomy' & image_type == 'lsc' & metric == 'mean_activity'")
      .groupby(['model','train_type','image_type','metric'])['corr'].max().reset_index().sort_values(by='corr'))
pg.plot_paired(data = plot_data, dv = 'corr', within = 'train_type', subject = 'model');

In [None]:
# Same comparison on the absolute correlation, so strongly negative layers count too.
plot_data = (corr_results.query("train_type not in 'taskonomy' & image_type == 'lsc' & metric == 'mean_activity'")
      .groupby(['model','train_type','image_type','metric'])['corr_abs'].max().reset_index().sort_values(by='corr_abs'))
pg.plot_paired(data = plot_data, dv = 'corr_abs', within = 'train_type', subject = 'model');

In [None]:
# NOTE(review): these imports sit mid-notebook; LinearRegression and RidgeCV
# imported here are also what the later regression cells rely on.
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV
from sklearn.metrics import explained_variance_score as ev_score
from sklearn.metrics import r2_score 
from sklearn.preprocessing import scale
from scipy.stats import pearsonr

# Squared Pearson correlation between targets and predictions.
def pearson_r2_score(y_true, y_pred):
    return pearsonr(y_true, y_pred)[0]**2

# Not referenced again anywhere in this cell.
scoring_metrics = {'ev_score': ev_score, 'pearson_r2': pearson_r2_score}

# Model identifiers = metric file base names, skipping any that contain 'googlenet'.
model_csvs = glob('metrics/vessel/*.csv')
target_models = [csv.split('/')[-1].split('.')[0] for csv in model_csvs]
target_models = [model for model in target_models if 'googlenet' not in model]

# Per (model, metric, image_type) regression summary, cached on disk after the first run.
output_file = 'results/analysis_results2.csv'
if os.path.exists(output_file):
    reg_results = pd.read_csv(output_file)

if not os.path.exists(output_file):
    results_dictlist = []
    iterator = tqdm(target_models)
    for target_model in iterator:
        iterator.set_description(target_model) 
        data_wide = process_model_data(target_model)
        model_layers = data_wide['model_layer'].unique()
        model_name = model_options[target_model]['model_name']
        train_type = model_options[target_model]['train_type']
        # Layer-wise metric/rating correlations for this model (long format).
        corr_results_subset = process_corr_data(data_wide)
        # NOTE(review): `min_max_transform` is not defined in this notebook (only
        # max_transform / min_transform are). Presumably it arrives through
        # `from model_options import *` - confirm, otherwise this line raises NameError.
        corr_max = min_max_transform(corr_results_subset, group_vars=['model','train_type','image_type', 'metric'], 
                                 measure_var = 'corr')

        data_long = process_model_data(target_model, orient = 'long')
        for metric in tqdm(data_long['metric'].unique(), leave=False):
            for image_type in tqdm(data_long['image_type'].unique(), leave=False):
                # First selected row for this (image_type, metric) supplies the "best layer" fields.
                corr_max_subset = corr_max[(corr_max['image_type'] == image_type) & (corr_max['metric'] == metric)]
                corr_max_layer = corr_max_subset['model_layer'].iloc[0]
                corr_max_depth = corr_max_subset['model_layer_depth'].iloc[0]
                corr_max_score = corr_max_subset['corr'].iloc[0]

                # regressing correlation value on depth
                data_i = corr_results_subset[(corr_results_subset['image_type'] == image_type) & 
                                             (corr_results_subset['metric'] == metric)]
                x, y = data_i['model_layer_depth'].to_numpy().reshape(-1,1), data_i['corr'].to_numpy()
                regression = LinearRegression().fit(x,y)
                corr_depth_coef = regression.coef_[0]

                # regressing metric across layers on aesthetic rating
                data_i = data_wide[(data_wide['image_type'] == image_type)]
                # Ratings are read from the first layer's rows; X gets one column per layer.
                # Assumes every layer lists the images in the same order - TODO confirm.
                y = data_i[(data_i['model_layer']==model_layers[0])]['rating'].to_numpy()
                X = np.stack([data_i[(data_i['model_layer']==model_layer)][metric].to_numpy()
                              for model_layer in model_layers], axis = 1)
                alpha_values = np.array([0.01, 0.5, 1.0, 1.5, 3.0, 5.0, 10.0])
                regression = RidgeCV(alphas=alpha_values, store_cv_values=True, 
                                     scoring='explained_variance').fit(X,y)
                ridge_gcv_score, ridge_gcv_alpha = regression.best_score_, regression.alpha_
                # argmax picks the largest signed coefficient (not the largest magnitude).
                ridge_max_layer = model_layers[np.argmax(regression.coef_)]
                # NOTE(review): this is a 0-based position in model_layers, whereas
                # corr_max_depth comes from model_layer_depth, which process_model_data
                # shifted by +1 - the two "relative depth" columns below are therefore
                # not on the same scale.
                ridge_max_depth = np.argmax(regression.coef_)
                
                #regression ridge coefficient on depth
                x, y = data_i['model_layer_depth'].unique().reshape(-1,1), regression.coef_
                regression = LinearRegression().fit(x,y)
                ridge_depth_coef = regression.coef_[0]
                
                results_dictlist.append({'model': model_name, 'train_type': train_type, 
                                         'image_type': image_type, 'metric': metric,
                                         'model_depth': len(model_layers),
                                         'ridge_gcv_score': ridge_gcv_score,
                                         'ridge_gcv_alpha': ridge_gcv_alpha,
                                         'ridge_max_layer': ridge_max_layer,
                                         'ridge_max_depth': ridge_max_depth,
                                         'ridge_depth_coef': ridge_depth_coef,
                                         'ridge_max_relative_depth': ridge_max_depth / len(model_layers),
                                         'corr_max_score': corr_max_score, 
                                         'corr_max_layer': corr_max_layer,
                                         'corr_max_depth': corr_max_depth,
                                         'corr_depth_coef': corr_depth_coef,
                                         'corr_max_relative_depth': corr_max_depth / len(model_layers)})

    reg_results = pd.DataFrame(results_dictlist)
    reg_results.to_csv(output_file, index = None)
    
# Rounding is applied after loading/saving, so the cached file keeps full precision.
reg_results = np.round(reg_results, 5)

In [None]:
%%R -h 600 -w 750 --res 100 -i reg_results

# Per (model, metric, image_type): slope = ImageNet score minus random-weights
# score (taskonomy rows excluded); train_type is relabelled for the axis.
pairplot_data <- reg_results %>%
    filter(train_type != 'taskonomy') %>%
    mutate(score = ridge_gcv_score) %>%
    group_by(model, metric, image_type) %>%
    mutate(slope = (score[train_type=='imagenet'] - score[train_type=='random'])/(2-1)) %>%
    mutate(train_type = fct_recode(as.factor(train_type), ImageNet = 'imagenet', Random = 'random'))

# Paired plot of ridge scores by weight type; rows with ridge_gcv_score <= -0.5
# are left out of the figure, and lines are coloured by the sign of the slope.
pairplot_data %>%
    filter(ridge_gcv_score > -0.5) %>%
    ggplot(aes(train_type, ridge_gcv_score)) + 
    facet_grid(vars(metric), vars(image_type)) +
    geom_boxplot(width = 0.15) + geom_point() + 
    geom_line(aes(group=model, col = slope > 0)) +
    theme_bw() + ylab('Score') + xlab('Weights') + guides(color = FALSE)
#https://stackoverflow.com/questions/40745163/jupyter-notebook-rpy2-rmagics-how-to-set-the-default-plot-size

In [None]:
# Paired ImageNet-vs-random ridge scores for 'mean_activity' on 'lsc' images.
pairplot_data = (reg_results.query("train_type != '{}' & image_type == '{}' & metric == '{}'"
                                   .format('taskonomy','lsc','mean_activity')))
pg.plot_paired(data = pairplot_data, dv = 'ridge_gcv_score', within = 'train_type', subject = 'model');

In [None]:
# Taskonomy models ranked by ridge score for 'mean_activity' on 'lsc' images.
reg_results.query("image_type == 'lsc' & metric == 'mean_activity' & train_type == 'taskonomy'").sort_values(by='ridge_gcv_score')

In [None]:
# Mean ridge score per metric over all rows.
reg_results.groupby(['metric'])['ridge_gcv_score'].mean()

In [None]:
# Same, with random-weight models excluded.
reg_results[reg_results['train_type'] != 'random'].groupby(['metric'])['ridge_gcv_score'].mean()

In [None]:
# Mean slope of ridge coefficient over layer depth, ImageNet-trained models only.
reg_results[reg_results['train_type'] == 'imagenet'].groupby(['metric','image_type'])['ridge_depth_coef'].mean()

In [None]:
%%R -h 650 -w 1000 --res 100 -i reg_results -o plot_data

# Taskonomy models on 'lsc' images (names containing 'random_weights' removed).
# `rank` is computed over all remaining rows at once, not per facet.
plot_data <- reg_results %>% mutate(score = ridge_gcv_score) %>% 
    filter(train_type == 'taskonomy') %>%
    filter(image_type %in% c('lsc')) %>%
    filter(!str_detect(model,'random_weights')) %>%
    mutate(rank = sprintf("%02i", as.integer(rank(score))))
    
# Horizontal bar chart of score by rank, with model names as tick labels.
# NOTE(review): the axis label says 'Pearson R' but `score` is ridge_gcv_score.
# NOTE(review): theme_bw() is added after theme(...), which presumably resets
# the legend settings given there - confirm.
plot_data %>% 
      {ggplot(., aes(rank, score)) +
      geom_bar(stat = 'identity', position = 'identity') + 
      xlab('Model') + ylab('Score (Pearson R)') + labs(fill = 'Task Cluster') +
      scale_x_discrete(labels = with(., model %>% set_names(rank))) +
      facet_wrap(~interaction(metric, image_type, sep = ' | '), scales = 'free') +
      geom_label(aes(y = 0.7, label = round(score, 3)), show.legend = FALSE) +
      theme(legend.position="bottom",
            legend.justification="center", 
            legend.box.margin=margin(-12,0,0,0)) +
      coord_flip(ylim = c(0,0.75), clip = 'on') + theme_bw()}
    
#print(with(plot_data, model %>% set_names(rank)))

In [None]:
# Overwrites the plot_data frame exported by the R cell with the Python-side taskonomy subset.
plot_data = reg_results.query("train_type == 'taskonomy'")

# Ridge score per taskonomy model, one panel per (metric, image type).
sns.catplot(x='ridge_gcv_score', y='model', col = 'image_type', row = 'metric', kind = 'bar', dodge = False, 
            data=plot_data);

In [None]:
from scipy.stats import pearsonr, spearmanr
x = reg_results.query("image_type == 'lsc' & metric == 'sparseness' & train_type == 'imagenet'")['model_depth']
y = reg_results.query("image_type == 'lsc' & metric == 'sparseness' & train_type == 'imagenet'")['ridge_gcv_score']
sns.scatterplot(x,y); spearmanr(x,y), pearsonr(x,y)

In [None]:
from scipy.stats import pearsonr, spearmanr
x = reg_results.query("image_type == 'lsc' & metric == 'mean_activity' & train_type == 'imagenet'")['model_depth']
y = reg_results.query("image_type == 'lsc' & metric == 'mean_activity' & train_type == 'imagenet'")['ridge_gcv_score']
sns.scatterplot(x,y); spearmanr(x,y), pearsonr(x,y)

In [None]:
# Row with the highest ridge score for each (metric, image_type), sorted by that score.
(max_transform(reg_results, group_vars = ['metric', 'image_type'], measure_var = 'ridge_gcv_score')
 .sort_values(by=['ridge_gcv_score']))

### Regression Permutations

In [None]:
# Reference fit for the control analyses below: one model, 'lsc' images, 'mean_activity'.
target_model = 'densenet201_imagenet'
data_wide = process_model_data(target_model)
model_layers = data_wide['model_layer'].unique()
model_name = model_options[target_model]['model_name']
train_type = model_options[target_model]['train_type']

data_i = data_wide[(data_wide['image_type'] == 'lsc')]
# Ratings are read from the first layer's rows; X gets one column per layer.
# Assumes every layer lists the images in the same order - TODO confirm.
y = data_i[(data_i['model_layer']==model_layers[0])]['rating'].to_numpy()
X = np.stack([data_i[(data_i['model_layer']==model_layer)]['mean_activity'].to_numpy() 
              for model_layer in model_layers], axis = 1)
alpha_values = np.array([0.01, 0.5, 1.0, 1.5, 3.0, 5.0, 10.0])
regression = RidgeCV(alphas=alpha_values, store_cv_values=True, 
                     scoring='explained_variance').fit(X,y)
# Unpermuted score, printed for comparison with the control distributions that follow.
ridge_gcv_score = regression.best_score_; print(ridge_gcv_score)

In [None]:
permuted_ridge_gcv_scores = []
for i in tqdm(range(100)):
    y = np.random.permutation(data_i[(data_i['model_layer']==model_layers[0])]['rating'].to_numpy())
    X = np.stack([data_i[(data_i['model_layer']==model_layer)]['mean_activity'].to_numpy() 
              for model_layer in model_layers], axis = 1)
    alpha_values = np.array([0.01, 0.5, 1.0, 1.5, 3.0, 5.0, 10.0])
    regression = RidgeCV(alphas=alpha_values, store_cv_values=True, 
                         scoring='explained_variance').fit(X,y)
    permuted_ridge_gcv_scores.append(regression.best_score_)

In [None]:
sns.displot(permuted_ridge_gcv_scores);

In [None]:
random_activity_gcv_scores = np.zeros(100)
for i in tqdm(range(100)):
    y = data_i[(data_i['model_layer']==model_layers[0])]['rating'].to_numpy()
    X = np.stack([data_i[(data_i['model_layer']==model_layer)]['mean_activity'].to_numpy() 
              for model_layer in model_layers], axis = 1)
    X = np.random.randn(*X.shape)
    alpha_values = np.array([0.01, 0.5, 1.0, 1.5, 3.0, 5.0, 10.0])
    regression = RidgeCV(alphas=alpha_values, store_cv_values=True, 
                         scoring='explained_variance').fit(X,y)
    random_activity_gcv_scores[i] = regression.best_score_

In [None]:
sns.displot(random_activity_gcv_scores);

### Combo Regressions

In [None]:
# Single-model example of the combined regression run for every model further down.
target_model = 'alexnet_imagenet'
data_wide = process_model_data(target_model)
model_layers = data_wide['model_layer'].unique()
model_name = model_options[target_model]['model_name']
train_type = model_options[target_model]['train_type']

In [None]:
data_i = data_wide[(data_wide['image_type'] == 'lsc')]
y = data_i[(data_i['model_layer']==model_layers[0])]['rating'].to_numpy()
# One column per layer for each metric; the two blocks are placed side by side,
# so the predictor matrix has 2 * len(model_layers) columns.
X1 = np.stack([data_i[(data_i['model_layer']==model_layer)]['mean_activity'].to_numpy() 
              for model_layer in model_layers], axis = 1)
X2 = np.stack([data_i[(data_i['model_layer']==model_layer)]['sparseness'].to_numpy() 
              for model_layer in model_layers], axis = 1)
X = np.concatenate([X1, X2], axis = 1)
alpha_values = np.array([0.01, 0.5, 1.0, 1.5, 3.0, 5.0, 10.0])
regression = RidgeCV(alphas=alpha_values, store_cv_values=True, 
                     scoring='explained_variance').fit(X,y)
ridge_gcv_score = regression.best_score_; print(ridge_gcv_score)

In [None]:
# Model identifiers = metric file base names, skipping any that contain 'googlenet'.
model_csvs = glob('metrics/vessel/*.csv')
target_models = [csv.split('/')[-1].split('.')[0] for csv in model_csvs]
target_models = [model for model in target_models if 'googlenet' not in model]

# Combined (mean_activity + sparseness) ridge score per model and image type,
# cached on disk after the first run.
output_file = 'results/combo_regressions2.csv'
if os.path.exists(output_file):
    combo_reg_results = pd.read_csv(output_file)

if not os.path.exists(output_file):
    results_dictlist = []
    iterator = tqdm(target_models)
    for target_model in iterator:
        iterator.set_description(target_model) 
        data_wide = process_model_data(target_model)
        model_layers = data_wide['model_layer'].unique()
        model_name = model_options[target_model]['model_name']
        train_type = model_options[target_model]['train_type']

        for image_type in tqdm(data_wide['image_type'].unique(), leave=False):
            data_i = data_wide[(data_wide['image_type'] == image_type)]
            # Ratings come from the first layer's rows; each metric contributes one column per layer.
            y = data_i[(data_i['model_layer']==model_layers[0])]['rating'].to_numpy()
            X1 = np.stack([data_i[(data_i['model_layer']==model_layer)]['mean_activity'].to_numpy() 
                           for model_layer in model_layers], axis = 1)
            X2 = np.stack([data_i[(data_i['model_layer']==model_layer)]['sparseness'].to_numpy() 
                           for model_layer in model_layers], axis = 1)
            X = np.concatenate([X1, X2], axis = 1)
            alpha_values = np.array([0.01, 0.5, 1.0, 1.5, 3.0, 5.0, 10.0])
            regression = RidgeCV(alphas=alpha_values, store_cv_values=True, 
                                 scoring='explained_variance').fit(X,y)
            ridge_gcv_score, ridge_gcv_alpha = regression.best_score_, regression.alpha_

            # 'metric' is fixed to 'combo' so these rows can sit next to the single-metric results.
            results_dictlist.append({'model': model_name, 'train_type': train_type, 
                                     'image_type': image_type, 'metric': 'combo',
                                     'model_depth': len(model_layers),
                                     'ridge_gcv_score': ridge_gcv_score,
                                     'ridge_gcv_alpha': ridge_gcv_alpha})
            
    combo_reg_results = pd.DataFrame(results_dictlist)
    combo_reg_results.to_csv(output_file, index = None)

In [None]:
# Best combined-metric row per image type.
(max_transform(combo_reg_results, measure_var = 'ridge_gcv_score', group_vars = ['image_type', 'metric'])
 .sort_values(by='ridge_gcv_score'))

In [None]:
# Best single-metric rows for comparison, showing columns up to 'ridge_gcv_alpha'.
(max_transform(reg_results, measure_var = 'ridge_gcv_score', group_vars = ['image_type','metric'])
 .sort_values(by='ridge_gcv_score')).loc[:,:'ridge_gcv_alpha']

### Stepwise Regressions

In [None]:
# Stepwise (cumulative) ridge regressions: for each metric and image type, refit
# the ridge model while adding one layer at a time in model_layers order, and
# record the score after each addition.
model_csvs = glob('metrics/vessel/*.csv')
#target_models = [csv.split('/')[-1].split('.')[0] for csv in model_csvs]
target_models = ['alexnet_imagenet','alexnet_random']

output_file = 'results/stepwise_regressions2.csv'
if os.path.exists(output_file):
    step_reg_results = pd.read_csv(output_file)
else:
    results_dictlist = []
    iterator = tqdm(target_models)
    for target_model in iterator:
        iterator.set_description(target_model)
        data_wide = process_model_data(target_model)
        data_long = process_model_data(target_model, orient='long')
        model_layers = data_wide['model_layer'].unique()
        # Look these up per model: they were previously never assigned in this
        # loop, so every row silently inherited whatever an earlier cell had
        # left in `model_name` / `train_type`.
        model_name = model_options[target_model]['model_name']
        train_type = model_options[target_model]['train_type']

        for metric in tqdm(data_long['metric'].unique(), leave=False):
            for image_type in tqdm(data_long['image_type'].unique(), leave=False):
                # Rows and ratings depend only on image_type, so build them once per pass.
                data_i = data_wide[(data_wide['image_type'] == image_type)]
                y = data_i[(data_i['model_layer']==model_layers[0])]['rating'].to_numpy()

                running_model_layer_list = []
                for model_layer_index, model_layer in enumerate(tqdm(model_layers, leave = False)):
                    running_model_layer_list.append(model_layer)

                    # One column per layer included so far.
                    X = np.stack([data_i[(data_i['model_layer']==layer)][metric].to_numpy()
                                  for layer in running_model_layer_list], axis = 1)
                    # Single fixed alpha; the per-alpha CV values are never read, so they are not stored.
                    regression = RidgeCV(alphas=[1.0], scoring='explained_variance').fit(X,y)
                    ridge_gcv_score, ridge_gcv_alpha = regression.best_score_, regression.alpha_
                    results_dictlist.append({'model': model_name, 'train_type': train_type,
                                             'image_type': image_type, 'metric': metric,
                                             'score': ridge_gcv_score,
                                             'alpha': ridge_gcv_alpha,
                                             'model_layer': model_layer,
                                             'model_depth': len(model_layers),
                                             'model_layer_index': model_layer_index + 1,
                                             'model_layer_depth': (model_layer_index + 1) / len(model_layers)})

    step_reg_results = pd.DataFrame(results_dictlist)
    # Caching stays disabled, as before. The original commented-out write
    # referenced `reg_results`; the matching frame is shown here for when it is re-enabled:
    #step_reg_results.to_csv(output_file, index = False)

In [None]:
step_reg_results

In [None]:
# Best stepwise score per (metric, image_type); measure_var defaults to 'score'.
max_transform(step_reg_results, group_vars = ['metric','image_type']).sort_values(by='score')

In [None]:
# NOTE: this rebinds target_models from model strings to bare model names for plotting.
# NOTE(review): the stepwise cell only runs the two alexnet variants, so unless
# a cached CSV with more models was loaded, only 'alexnet' can match here.
target_models = ['alexnet','vgg19','resnet18', 'densenet121','resnet101','resnet152']

plotting_subset = step_reg_results[(step_reg_results['metric'].isin(['mean_activity','sparseness'])) &
                                   (step_reg_results['model'].isin(target_models))]
# Score as layers are added (x = fraction of layers included), one line per model and train type.
p = sns.relplot(x = 'model_layer_depth', y = 'score', hue = 'model', col = 'image_type', row = 'metric', style = 'train_type', 
            kind = 'line', ci = False, data = plotting_subset, facet_kws={'sharey': False});
p.set(ylim=(-0.1,1.0));

In [None]:
max_transform(plotting_subset, measure_var = 'score', group_vars = ['image_type','metric']).sort_values(by='score')

In [None]:
# model_layer_depth == 1.0 selects the final step, i.e. the fit that includes every layer;
# taskonomy rows and scores <= -0.1 are excluded.
plotting_subset = step_reg_results[(step_reg_results['metric'].isin(['mean_activity','sparseness'])) &
                                   ~(step_reg_results['train_type'].isin(['taskonomy'])) & 
                                   (step_reg_results['model_layer_depth'] == 1.0) & 
                                   (step_reg_results['score'] > -0.1)]

# Full-model score against number of layers, with a linear fit per metric.
sns.lmplot(x = 'model_depth', y = 'score', col = 'image_type', row = 'train_type', hue = 'metric', data = plotting_subset);

In [None]:
# Layer-wise correlations for vgg19 on 'lsc' images, restricted to the two metrics of interest.
corr_subset = corr_results[(corr_results['model'] == 'vgg19') & (corr_results['image_type'].isin(['lsc'])) &
                           (corr_results['metric'].isin(['mean_activity','sparseness']))]

In [None]:
# Row with the highest signed correlation for each metric.
max_transform(corr_subset, group_vars = ['metric'], measure_var = 'corr')

In [None]:
plot_data = process_model_data('alexnet_imagenet')
plot_data = plot_data[plot_data['image_type'] == 'lsc']

In [None]:
# Rating against mean activity with a first-order fit, one panel per layer depth.
sns.lmplot(x = 'mean_activity', y = 'rating', col = 'model_layer_depth', order = 1, col_wrap = 6, 
            data=plot_data, sharex = False);

### Feature Regression

In [None]:
# Example load of one model's OASIS regression output, with two columns renamed.
# NOTE(review): the loop further down reassigns model_data from the raw CSV, so
# this renamed frame is not what ends up in neural_reg_results.
model_data = (pd.read_csv('regression/oasis/{}.csv'.format('alexnet_imagenet')))
model_data = model_data.rename(columns={'score': 'ridge_gcv_score', 'penalty':'ridge_penalty'})

In [None]:
# Number of layers per (model, train_type), taken from the metric-regression results.
model_depths = reg_results.groupby(['model','train_type'])['model_depth'].mean().reset_index()

In [None]:
# Model identifiers = OASIS regression file base names, skipping any that contain 'googlenet'.
model_csvs = glob('regression/oasis/*.csv')
target_models = [csv.split('/')[-1].split('.')[0] for csv in model_csvs]
target_models = [model for model in target_models if 'googlenet' not in model]

# Concatenated OASIS regression results for all models, cached on disk after the first run.
output_file = 'results/oasis_regression.csv'
if os.path.exists(output_file):
    neural_reg_results = pd.read_csv(output_file)

if not os.path.exists(output_file):
    results_dflist = []
    iterator = tqdm(target_models)
    for target_model in iterator:
        iterator.set_description(target_model) 
        model_data = (pd.read_csv('regression/oasis/{}.csv'.format(target_model)))
        # Keep the original depth value as an index, count the distinct values
        # as the model's depth, then rescale depth by that count.
        model_data['model_layer_index'] = model_data['model_layer_depth']
        model_data['model_depth'] = len(model_data['model_layer_index'].unique())
        #model_data = model_data.merge(model_depths, on = ['model','train_type'])
        model_data['model_layer_depth'] = model_data['model_layer_depth'] / model_data['model_depth']
         
        results_dflist.append(model_data)

    neural_reg_results = pd.concat(results_dflist)
    neural_reg_results.to_csv(output_file, index = None)

In [None]:
# Best-scoring row per category over all models and layers.
max_transform(neural_reg_results, measure_var = 'score', group_vars = ['category']).sort_values(by='score')

In [None]:
# Best layer per (model, train_type, category, measurement).
reg_layer_max = max_transform(neural_reg_results, measure_var = 'score', 
                              group_vars = ['model','train_type','category', 'measurement']).sort_values(by='score')

In [None]:
# Between-measurement comparison of best-layer scores, random-weight models excluded.
pg.pairwise_ttests(data = reg_layer_max[reg_layer_max['train_type'] != 'random'], 
                   dv = 'score', between = ['measurement'], subject = 'model')

In [None]:
# Same comparison for ImageNet-trained models only, with Bonferroni adjustment.
pg.pairwise_ttests(data = reg_layer_max[reg_layer_max['train_type'] == 'imagenet'], 
                   dv = 'score', between = ['measurement'], subject = 'model', padjust = 'bonf')

In [None]:
# Maximum score per (train_type, category).
neural_reg_results.groupby(['train_type','category'])['score'].max().reset_index()

In [None]:
# Mean score over layers per (model, category), sorted ascending.
neural_reg_results.groupby(['model','category'])['score'].mean().reset_index().sort_values(by='score')

In [None]:
%%R -h 600 -w 750 --res 100 -i neural_reg_results -o pairplot_data

# Best score over layers per (model, train_type, measurement, category), then
# slope = ImageNet score minus random-weights score within each
# (model, measurement, category); taskonomy rows are excluded.
pairplot_data <- neural_reg_results %>%
    filter(train_type != 'taskonomy') %>%
    group_by(model, train_type, measurement, category) %>% 
    summarise(score = max(score)) %>%
    group_by(model, measurement, category) %>%
    mutate(slope = (score[train_type=='imagenet'] - score[train_type=='random'])/(2-1)) %>%
    mutate(train_type = fct_recode(as.factor(train_type), ImageNet = 'imagenet', Random = 'random'))

# Paired plot by weight type; lines are cyan where the slope is positive and red otherwise.
pairplot_data %>%
    ggplot(aes(train_type, score)) + 
    facet_grid(measurement~category) +
    geom_boxplot(width = 0.15) + geom_point() + 
    geom_line(aes(group=model, col = slope > 0)) +
    scale_color_manual(values= c('TRUE' = 'cyan3', 'FALSE' = 'red3')) +
    theme_bw() + ylab('Score') + xlab('Weights') + guides(color = FALSE)

In [None]:
# Inspect the alexnet rows of the frame exported by the R cell (-o pairplot_data).
pairplot_data[pairplot_data['model'] == 'alexnet']

In [None]:
%%R -h 650 -w 1000 --res 100 -i neural_reg_results -o plot_data

# Best-scoring row per (model, train_type, category), restricted to taskonomy
# models whose name does not contain 'random_weights'. `rank` is computed over
# all remaining rows at once, not per facet.
plot_data <- neural_reg_results %>% 
    group_by(model, train_type, category) %>%
    filter(score == max(score)) %>% ungroup() %>%
    filter(train_type == 'taskonomy') %>%
    filter(!str_detect(model,'random_weights')) %>%
    mutate(rank = sprintf("%03i", as.integer(rank(score))))
    
# Horizontal bar chart of score by rank, one panel per category, with model names as tick labels.
plot_data %>% {ggplot(., aes(rank, score)) +
      geom_bar(stat = 'identity', position = 'identity') + 
      xlab('Model') + ylab('Score (Pearson R)') + 
      scale_x_discrete(labels = with(., model %>% set_names(rank))) +
      facet_wrap(~category, scales = 'free') +
      #geom_label(aes(y = 0.85, label = round(score, 3)), show.legend = FALSE) +
      coord_flip(ylim = c(-0.1,1.0), clip = 'on') + theme_bw()}

In [None]:
# 'Scene' / 'beauty' rows of the frame exported by the R cell (-o plot_data), ordered by rank.
plot_data[(plot_data['category'] == 'Scene') & (plot_data['measurement'] == 'beauty')].sort_values(by='rank')

In [None]:
# Score against relative layer depth per category, random-weight models excluded.
sns.relplot(x = 'model_layer_depth', y = 'score', hue = 'category', kind = 'line', 
            data = neural_reg_results.query("train_type != 'random'"));