In [4]:
import numpy as np
import pandas as pd
from scipy.spatial import distance

def to_str(num):
    """Return ``num`` as a string, with a leading '0' when ``num`` is below 10.

    Used to build two-character segment suffixes such as '01' ... '11'.
    """
    prefix = '0' if num < 10 else ''
    return prefix + str(num)

In [16]:
# Load the full coded experiment table and print its (rows, columns) shape.
df = pd.read_csv('../data/exp_with_coding_full.csv')
print(df.shape)

# Drop rows whose `note` column equals 'wrong task'.
df = df.query("note!='wrong task'")
print(df.shape, 'after discarding wrong task')

# Drop rows in the p / r / p_r / r_p conditions whose `hit_char` is False;
# rows in every other condition are kept regardless of `hit_char`.
df = df.query("not (condition==['p','r','p_r','r_p'] and hit_char==False)")
print(df.shape, 'after discarding not hit char')

(2173, 61)
(2172, 61) after discarding wrong task
(2152, 61) after discarding not hit char


In [17]:
# Save the filtered table; the row labels are written out as a column named 'no'.
df.to_csv('../data/exp_with_coding.csv', index=True, index_label='no')

## add similarity score

In [18]:
# Reload the filtered table, restoring the saved 'no' column as the index.
# add_scores uses each row's index label (row.name) to look up its embedding.
df = pd.read_csv('../data/exp_with_coding.csv', index_col='no')
# Placeholder column; add_scores assigns the best-matching segment number per row.
df['match_seg_num'] = np.nan

In [19]:
# USE
# NOTE(review): "USE" presumably stands for Universal Sentence Encoder — confirm.
# Precomputed embedding arrays loaded from .npy files.
# `exp_embeds` is indexed in add_scores by the row labels of `df`;
# `info_embeds` is only referenced from commented-out code in add_scores.
info_embeds = np.load('../data/info_embeds2.npy')
exp_embeds = np.load('../data/exp_embeds2.npy')

In [20]:
def add_scores(row, df=df):
    """Add embedding-based similarity scores to one row of the experiment table.

    Intended for ``df.apply(add_scores, axis=1)``. The row's own embedding is
    ``exp_embeds[row.name]``; comparison embeddings are selected by filtering
    ``df`` and indexing the module-level ``exp_embeds`` array with the
    resulting index labels. All similarities are ``1 - cosine distance``.
    Columns ending in ``_z`` hold Fisher z (``arctanh``) values; their
    counterparts without ``_z`` are mapped back with ``tanh``.

    Parameters
    ----------
    row : pandas.Series
        One row of the table. ``row.name`` must be a valid integer index into
        ``exp_embeds``. Reads ``condition``, ``sub``, ``segment``,
        ``segment_num`` and ``story``.
    df : pandas.DataFrame
        Table used to look up comparison rows. The default is bound once, to
        the module-level ``df`` as it exists when this function is defined.

    Returns
    -------
    pandas.Series
        The same ``row`` with these fields assigned:

        * ``res_1_simi_self`` / ``_z`` -- only for conditions p, p_r, r, r_p,
          p0, r0: similarity to the same subject's response to the same
          segment in a condition starting with 'recall' (NaN if none exists).
        * ``res_1_simi_other`` / ``_z`` -- median similarity to other
          subjects' 'p_recall' or 'r_recall' responses to the same segment.
        * ``res_1_simi_otherseg_01`` .. ``_11`` -- median similarity to recall
          responses for each segment number 1..11 of the same story (the
          row's own subject is excluded for the row's own segment number).
        * ``match_seg_num`` -- segment number whose ``otherseg`` score is
          highest.
        * ``res_1_MAD`` / ``res_1_MD`` (+ ``_z``) -- median / mean of all
          pairwise similarities among the same-segment, same-condition
          responses (a group-level value, not specific to this row).
        * ``res_1_MAD_sub`` / ``res_1_MD_sub`` (+ ``_z``) -- median / mean
          similarity of this row to other subjects' same-condition responses.
        * ``res_1_MAD_oppo_sub`` / ``res_1_MD_oppo_sub`` (+ ``_z``) -- the same
          against the opposite condition, written only when more than one
          opposite-condition response exists.

    Raises
    ------
    KeyError
        If ``row['condition']`` has no entry in the opposite-condition map.
    ValueError
        If more than one self-recall row matches the same subject and segment.
    """
    # Wrapped in a list so cdist receives a (1, d) matrix.
    res_embed = [exp_embeds[row.name]]

    # info (disabled)
    # info_embed = info_embeds[info_22['segment'] == row['segment']]
    # row['res_1_simi_info'] = 1 - distance.cosine(res_embed, info_embed)
    # row['res_1_simi_info_z'] = np.arctanh(row['res_1_simi_info'])

    # self recall (recall_f or recall_b)
    if row['condition'] in (['p','p_r','r','r_p','p0','r0']):
        self_recall_idx = df[((df['sub'] == row['sub']) & (df['segment'] == row['segment']) & (df['condition'].str.startswith('recall')))].index
        if len(self_recall_idx) > 1:
            raise ValueError(
                'expected at most one self-recall row for sub=%r, segment=%r, found %d'
                % (row['sub'], row['segment'], len(self_recall_idx)))
        if len(self_recall_idx) == 1:
            # distance.cosine is defined for 1-D vectors only, so pass the two
            # embedding vectors themselves rather than (1, d) matrices.
            row['res_1_simi_self'] = 1 - distance.cosine(res_embed[0], exp_embeds[self_recall_idx[0]])
            row['res_1_simi_self_z'] = np.arctanh(row['res_1_simi_self'])
        else:
            # No recall from this subject for this segment: nothing to compare.
            row['res_1_simi_self'] = np.nan
            row['res_1_simi_self_z'] = np.nan

    # other recall: median (treat p_recall or r_recall as ground truth)
    if row['condition'] in ['p0','p','p_r','r_p_truc0','r_p_truc','p_recall']:
        other_recall_cond = 'p_recall'
    elif row['condition'] in ['r0','r','r_p','p_r_truc0','p_r_truc','r_recall']:
        other_recall_cond = 'r_recall'

    # Same condition set as the two branches above, so other_recall_cond is
    # always bound whenever this block runs.
    if row['condition'] in ['p0','p','p_r','r_p_truc0','r_p_truc','p_recall',
                           'r0','r','r_p','p_r_truc0','p_r_truc','r_recall']:

        other_recall_embeds = exp_embeds[ df[((df['segment']==row['segment']) & (df['condition']==other_recall_cond) & (df['sub']!=row['sub']))].index  ]
        other_recall_rs = 1 - (distance.cdist(res_embed, other_recall_embeds, 'cosine'))
        row['res_1_simi_other'] = np.tanh(np.median(np.arctanh(other_recall_rs)))
        row['res_1_simi_other_z'] = np.median(np.arctanh(other_recall_rs))

    # other segments recall: median
    for s in range(1,12):
        col = 'res_1_simi_otherseg_' + to_str(s)
        seg_recall_conds = ['p_recall','p_r_recall','recall_f','r_recall','r_p_recall','recall_b']

        if s!=row['segment_num']:
            other_seg_recall_embeds = exp_embeds[
               df[((df['story']==row['story']) & (df['segment_num']==s) & (df['condition'].isin(seg_recall_conds)))].index ]
        elif s==row['segment_num']:
            # For the row's own segment, leave out the row's own subject.
            other_seg_recall_embeds = exp_embeds[
               df[((df['story']==row['story']) & (df['segment_num']==s) & (df['condition'].isin(seg_recall_conds)) & (df['sub']!=row['sub']))].index ]

        row[col] = np.median(1 - (distance.cdist(res_embed, other_seg_recall_embeds, 'cosine')))
    # Label slice: relies on the eleven otherseg fields sitting contiguously,
    # in order, in the row. The last two characters of the winning label are
    # the zero-padded segment number.
    row['match_seg_num'] = int(row['res_1_simi_otherseg_01':'res_1_simi_otherseg_11'].astype(float).idxmax()[-2:])

    # Same-condition and opposite-condition similarity summaries. The "MAD" /
    # "MD" column names hold the median / mean of cosine similarities,
    # aggregated in Fisher-z space.
    opposite_cond_set = {'p0':'r0','r0':'p0','p':'r','r':'p','p_r':'r_p','r_p':'p_r','p_recall':'r_recall','r_recall':'p_recall',
                         'p_r_recall':'r_p_recall','r_p_recall':'p_r_recall','recall_f':'recall_b','recall_b':'recall_f',
                        'p_r_truc0':'r_p_truc0','r_p_truc0':'p_r_truc0','p_r_truc':'r_p_truc','r_p_truc':'p_r_truc'}
    opposite_cond = opposite_cond_set[row['condition']]

    if row['condition'] in ['p','r','p_r','r_p']:  # ensure hit char
        same_cond_embeds = exp_embeds[ df[((df['segment']==row['segment']) & (df['condition']==row['condition']) & (df['hit_char']))].index ]
        same_cond_other_embeds = exp_embeds[ df[((df['segment']==row['segment']) & (df['condition']==row['condition']) & (df['sub']!=row['sub']) & (df['hit_char']))].index ]
        oppo_cond_embeds = exp_embeds[ df[((df['segment']==row['segment']) & (df['condition']==opposite_cond) & (df['hit_char']))].index ]
    else:
        same_cond_embeds = exp_embeds[ df[((df['segment']==row['segment']) & (df['condition']==row['condition']))].index ]
        same_cond_other_embeds = exp_embeds[ df[((df['segment']==row['segment']) & (df['condition']==row['condition']) & (df['sub']!=row['sub']))].index ]
        oppo_cond_embeds = exp_embeds[ df[((df['segment']==row['segment']) & (df['condition']==opposite_cond))].index ]

    # pdist: every pair within the same-condition group.
    pairwise_values = 1 - (distance.pdist(same_cond_embeds, 'cosine'))
    # cdist: this row's response against each other subject's response.
    pairwise_values_other = 1 - (distance.cdist(res_embed, same_cond_other_embeds, 'cosine'))

    row['res_1_MAD'] = np.tanh(np.median(np.arctanh(pairwise_values)))
    row['res_1_MAD_z'] = np.median(np.arctanh(pairwise_values))

    row['res_1_MD'] = np.tanh(np.mean(np.arctanh(pairwise_values)))
    row['res_1_MD_z'] = np.mean(np.arctanh(pairwise_values))

    row['res_1_MAD_sub'] = np.tanh(np.median(np.arctanh(pairwise_values_other)))
    row['res_1_MAD_sub_z'] = np.median(np.arctanh(pairwise_values_other))

    row['res_1_MD_sub'] = np.tanh(np.mean(np.arctanh(pairwise_values_other)))
    row['res_1_MD_sub_z'] = np.mean(np.arctanh(pairwise_values_other))

#     row['res_1_simi_other_max'] = np.max(pairwise_values_other)

    # Opposite-condition scores are written only when at least two
    # opposite-condition responses exist for this segment.
    if len(oppo_cond_embeds) > 1:
        pairwise_values_oppo = 1 - (distance.cdist(res_embed, oppo_cond_embeds, 'cosine'))

        row['res_1_MAD_oppo_sub'] = np.tanh(np.median(np.arctanh(pairwise_values_oppo)))
        row['res_1_MAD_oppo_sub_z'] = np.median(np.arctanh(pairwise_values_oppo))

        row['res_1_MD_oppo_sub'] = np.tanh(np.mean(np.arctanh(pairwise_values_oppo)))
        row['res_1_MD_oppo_sub_z'] = np.mean(np.arctanh(pairwise_values_oppo))
    return row

# Score every row (axis=1 passes one row at a time) and rebind df to the result.
df = df.apply(add_scores, axis=1)
# Display the first scored row as a quick check.
df.head(1)

Unnamed: 0_level_0,session,trial,segment,story,segment_num,condition,with_char,sub,sub_session,res_1,res_1_simi_info,res_1_simi_info_z,res_1_simi_self,res_1_simi_self_z,res_1_simi_other,res_1_simi_other_z,res_1_MAD,res_1_MAD_z,res_1_MD,res_1_MD_z,res_1_MAD_sub,res_1_MAD_sub_z,res_1_MD_sub,res_1_MD_sub_z,res_1_MAD_oppo_sub,res_1_MAD_oppo_sub_z,res_1_MD_oppo_sub,res_1_MD_oppo_sub_z,res_1_score,res_1_conf,res_2,res_2_score,res_2_conf,res_3,res_3_score,res_3_conf,version,cond_1,cond_direction,cond_amount,base_segment,base_seg_num,segment_pair,segment_count,label,characters,hit_char,res_1_simi_otherseg_01,res_1_simi_otherseg_02,res_1_simi_otherseg_03,res_1_simi_otherseg_04,res_1_simi_otherseg_05,res_1_simi_otherseg_06,res_1_simi_otherseg_07,res_1_simi_otherseg_08,res_1_simi_otherseg_09,res_1_simi_otherseg_10,res_1_simi_otherseg_11,hit,scenes,note,match_seg_num
no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1
0,3,1,1_01,1,1,p_recall,True,0116-1,0116-1,Beth and Rob move into a new house. Beth meets Sheila and Leo as she arrives to see the house for the first time. She promises to invite them over after they finish moving in.,,,,,0.827796,1.181093,0.823528,1.167684,0.828939,1.184737,0.827796,1.181093,0.8242,1.169776,0.854203,1.271497,0.848648,1.251299,,5.0,,,,,,,exp,recall_p,f,p_recall/r_recall,1_01,1,,0,1_01 : p_recall,"Beth, Sheila, Rob, Leo",True,0.827166,0.704633,0.634748,0.69256,0.651703,0.548484,0.663553,0.607671,0.534974,0.535476,0.622353,0,101 102 152 153,,1


In [21]:
# Flag rows whose best-matching segment number equals their own segment number.
df['is_seg_matched'] = df['segment_num'] == df['match_seg_num']
# Count matched vs. unmatched rows among those whose cond_1 is 'recall_p' or 'recall_r'.
df.query("cond_1==['recall_p','recall_r']")['is_seg_matched'].value_counts()


True     1088
False      15
Name: is_seg_matched, dtype: int64

In [22]:
# Write the scored table, with the row labels saved as a column named 'no'.
df.to_csv('../data/exp_embed_use_large.csv', index=True, index_label='no')