In [1]:
import numpy as np
import pandas as pd
from scipy.spatial import distance

def to_str(num):
    if num < 10:
        return '0'+str(num)
    else:
        return str(num)

In [2]:
info = pd.read_excel('../data/rep/TheChair.xlsx', sheet_name='info')
df = pd.read_csv('../data/rep/rep.csv', index_col='no')

# add fields
df['segment'] = df.apply(
    lambda row: '3_' + to_str(row['segment_num']), axis=1)

df['cond_direction'] = np.where(df['group']=='prediction', 'f', 'b')
df.loc[df['condition'].isin(['p0','r0']), 'cond_amount'] = 'p0/r0'
df.loc[df['condition'].isin(['p','r']), 'cond_amount'] = 'p/r'

# add 'base_segment' (last watched)
df.loc[df['condition'].isin(['p0','p']), 'base_segment'] = df.apply(
    lambda row: str(row['story']) + '_' + to_str(row['segment_num']-1), axis=1)
df.loc[df['condition'].isin(['r0','r']), 'base_segment'] = df.apply(
    lambda row: str(row['story']) + '_' + to_str(row['segment_num']+1), axis=1)

df['base_seg_num'] = df['base_segment'].apply(lambda x: int(x[-2:]))

# add 'segment_pair' (from .. to ..)
df.loc[df['condition'].isin(['p0','p']), 'segment_pair'] = df.apply(
    lambda row: str(row['story']) + '_' + to_str(row['segment_num']-1) + '-' + to_str(row['segment_num']), axis=1)
df.loc[df['condition'].isin(['r0','r']), 'segment_pair'] = df.apply(
    lambda row: str(row['story']) + '_' + to_str(row['segment_num']) + '-' + to_str(row['segment_num']+1), axis=1)

# add 'segment_count' (how many segments passed)
df.loc[df['cond_direction']=='f', 'segment_count'] = df.loc[df['cond_direction']=='f', 'segment_num'] - 1
df.loc[df['cond_direction']=='b', 'segment_count'] = 13 - df.loc[df['cond_direction']=='b', 'segment_num']

# add columns for res_1_simi
for i, col in enumerate(['res_1_simi_info', 'res_1_simi_info_z',
                         'res_1_MD','res_1_MD_z',
                         'res_1_MD_sub','res_1_MD_sub_z']):
    df.insert(df.columns.get_loc('segment')+i+1, col, np.nan) 


In [3]:
print(df.shape)

df = df.query("no_char !=1 ")
print(df.shape)

df = df.query("flag != 1")
print(df.shape)

(888, 26)
(880, 26)
(878, 26)


In [4]:
def jaccard(res1, res2, mode="matched only"):
    list1 = res1.split()
    list2 = res2.split()

    if mode=="matched only":
        list1 = [e for e in list1 if e not in ['0','1','2','3']]
        list2 = [e for e in list2 if e not in ['0','1','2','3']]
    set1 = set(list1)
    set2 = set(list2)
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))

    return intersection, union

In [5]:
def add_jaccard(row, scenes1, scenes2):
    row['intersection'], row['union'] = jaccard(row[scenes1], row[scenes2])
    return row

In [6]:
df = df.apply(add_jaccard, scenes1='scenes_xx', scenes2='scenes_xz', axis=1)
df['intersection'].sum()/df['union'].sum()

0.42178447276940906

## add similarity score

In [7]:
# USE
info_embeds = np.load('../data/rep/info_embeds2.npy')
exp_embeds = np.load('../data/rep/exp_embeds2.npy')

In [8]:
def add_scores(row, df=df):
    
    # print(row.name)
    res_embed = [exp_embeds[row.name]]
    # print(row.name, row['segment'], row['condition'])
    
# info
    info_embed = info_embeds[info['segment'] == row['segment']]
    row['res_1_simi_info'] = 1 - distance.cosine(res_embed, info_embed)
    row['res_1_simi_info_z'] = np.arctanh(row['res_1_simi_info'])

# same cond median absolute difference (MAD)
    same_cond_embeds = exp_embeds[ df[((df['segment']==row['segment']) & (df['condition']==row['condition']))].index ]
    same_cond_other_embeds = exp_embeds[ df[((df['segment']==row['segment']) & (df['condition']==row['condition']) & (df['sub']!=row['sub']))].index ]
        
    pairwise_values = 1 - (distance.pdist(same_cond_embeds, 'cosine'))
    pairwise_values_other = 1 - (distance.cdist(res_embed, same_cond_other_embeds, 'cosine'))
        
    row['res_1_MD'] = np.tanh(np.mean(np.arctanh(pairwise_values)))
    row['res_1_MD_z'] = np.mean(np.arctanh(pairwise_values))
    
    row['res_1_MD_sub'] = np.tanh(np.mean(np.arctanh(pairwise_values_other)))
    row['res_1_MD_sub_z'] = np.mean(np.arctanh(pairwise_values_other))

#     print(row.name)
    return row

df = df.apply(add_scores, axis=1)
df.head(1)

  row['res_1_MD'] = np.tanh(np.mean(np.arctanh(pairwise_values)))
  row['res_1_MD_z'] = np.mean(np.arctanh(pairwise_values))
  row['res_1_MD_sub'] = np.tanh(np.mean(np.arctanh(pairwise_values_other)))
  row['res_1_MD_sub_z'] = np.mean(np.arctanh(pairwise_values_other))


Unnamed: 0_level_0,sub,story,cond,res,res_corrected,scenes_xx,scenes_xz,scenes_final,group,segment_num,...,res_1_MD_sub,res_1_MD_sub_z,cond_direction,cond_amount,base_segment,base_seg_num,segment_pair,segment_count,intersection,union
no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0522-1,3,1_r0,Segment one is Jiyoon preparing for her first ...,Jiyoon preparing for her first department meet...,1,103,1,retrodiction,1,...,0.305836,0.315945,b,p0/r0,3_02,2,3_01-02,12.0,0,1


In [9]:
df.to_csv('../data/rep/exp_embed_use_large.csv', index=True, index_label='no')