In [83]:
import ast
import os
import sys

import numpy as np
import pandas as pd

sys.path.append('../generalization')

from helpers import get_colors, get_tastes
from utils import load_yaml, display_df
from data_processing.abstract_lang import load_arbitrary_path

In [2]:
# Load the notebook settings; later cells read its 'norms_dir' and 'norms_paths' entries.
config = load_yaml('config.yaml')

In [25]:
# Language whose norms file is loaded, and the colour vocabulary for it.
lang = 'de'
colors = get_colors(lang)

# NOTE(review): the names say "english"/"eng", but the file is chosen by `lang`.
english_norms_path = os.path.join(
    '..',
    config['norms_dir'],
    config['norms_paths'][lang],
)

# The loader is asked for a tab separator, no header row, and explicit column names.
eng = load_arbitrary_path(
    english_norms_path,
    sep='\t',
    header=None,
    names=['concept_og', 'dirty', 'freq'],
)

In [26]:
# Keep only the rows whose raw feature text mentions at least one colour term.
# na=False makes missing 'dirty' values count as "no match"; without it the mask
# contains NaN and cannot be used for boolean indexing, and a NaN reaching
# check_which_color would fail on the `in` test.
eng = eng[eng['dirty'].str.contains('|'.join(colors), na=False)]

In [27]:
def check_which_color(x):
    """Return every entry of the global ``colors`` that occurs in ``x``.

    Membership is tested with ``in``, so for a string ``x`` this is a plain
    substring match. Matches come back in the order they appear in ``colors``;
    an empty list means nothing matched.
    """
    return [name for name in colors if name in x]
# Each row gets the list of colour terms found in its raw feature text.
eng['feature_og'] = eng['dirty'].apply(check_which_color)

In [28]:
# `feature_og` holds a list of matched colours per row, so explode that column
# (there is no 'color' column at this point) to get one row per colour, then
# add up the frequencies for each (concept, colour) pair.
eng = eng.explode('feature_og')
eng = eng.groupby(['concept_og', 'feature_og'])['freq'].sum().reset_index()

In [63]:
# Dutch data (per the original note — TODO confirm which file `eng` holds here).
# Steps: reshape the three association columns to long form, drop 'x' answers
# (apparently a placeholder), count distinct participants per (exemplar, answer),
# keep answers that are colour terms, and rename to the concept/feature/freq schema.
# NOTE(review): the colour test is case-insensitive, but grouping uses the answer as written.
eng = (
    eng.melt(id_vars=['exemplar', 'participant'], value_vars=['asso1', 'asso2', 'asso3'])
    .drop(columns=['variable'])
    .loc[lambda long: long['value'].str.lower() != 'x']
    .groupby(['exemplar', 'value'])['participant']
    .nunique()
    .reset_index()
    .loc[lambda counts: counts['value'].str.lower().isin(colors)]
    .rename(columns={'exemplar': 'concept_og', 'value': 'feature_og', 'participant': 'freq'})
)


In [18]:
# Keep rows whose translated response is a colour term, total the translated
# frequency per (cue, colour), and tag every remaining row as a colour feature.
eng = (
    eng.loc[lambda rows: rows['translated'].isin(colors)]
    .groupby(['cue', 'translated'])[['frequency_translated']]
    .sum()
    .reset_index()
    .assign(feature='color')
)

In [25]:
# Switch to the concept/feature/freq column names.
eng = eng.rename(
    columns={
        'cue': 'concept_en',
        'translated': 'feature_en',
        'frequency_translated': 'freq',
    }
)

Unnamed: 0,concept_en,feature_en,freq,feature
0,algae,green,17,color
1,alligator,green,37,color
2,aluminum,silver,24,color
3,ambulance,orange,5,color
4,ambulance,red,8,color
...,...,...,...,...
739,zebra,white,96,color
740,zit,red,28,color
741,zit,white,38,color
742,zucchini,green,26,color


In [37]:
# Locations of the two norm tables, relative to the notebook's working directory.
jap_path = '../data/norms/japanese/JapaneseNorms.csv'
en_path = '../data/norms/japanese/EnglishNorms.csv'

In [38]:
# Read both norm tables with pandas' default CSV settings.
en = pd.read_csv(en_path)
jap = pd.read_csv(jap_path)

In [39]:
# Display the Japanese norms table.
jap

Unnamed: 0,Concept,Feature,BR_Label,Prod_Freq
0,air conditioner,風が出る,encyclopaedic,8
1,air conditioner,機械,taxonomic,6
2,air conditioner,夏に使う,time,6
3,air conditioner,涼しい,touch,12
4,air conditioner,白,visual colour,9
...,...,...,...,...
1874,water,湖,location,2
1875,water,無味,taste,2
1876,water,無味無臭,taste,3
1877,water,雨,time,3


In [46]:
# en = en[en['Prod_Freq']>1]
# jap = jap[jap['Prod_Freq']>1]

In [47]:
# Inspect all rows recorded for a single concept ('pumpkin').
jap[jap['Concept']=='pumpkin']

Unnamed: 0,Concept,Feature,BR_Label,Prod_Freq,en_feature
66,pumpkin,オレンジ色,visual colour,7,"['orangeness', 'orange']"
67,pumpkin,緑,visual colour,8,"['green', 'viridity', 'greenness']"
68,pumpkin,皮が緑色,visual colour,6,[]
69,pumpkin,黄色,visual colour,9,"['yellow', 'yellowness']"
70,pumpkin,中がオレンジ,visual colour,2,[]
71,pumpkin,中が黄色,visual colour,3,[]


In [32]:
def _top_feature_per_concept(norms):
    """Keep, for each 'Concept', the first row after sorting by 'Prod_Freq' descending."""
    ranked = norms.sort_values(by='Prod_Freq', ascending=False)
    return ranked.groupby(['Concept']).head(1)


jap = _top_feature_per_concept(jap)
en = _top_feature_per_concept(en)

In [33]:
# Restrict the English table to rows labelled as a visual colour.
en = en.loc[en['BR_Label'].eq('visual colour')]

In [36]:
# 'en_feature' arrives as the text of a Python list (e.g. "['yellow', 'yellowness']").
# Parse it with ast.literal_eval rather than eval: the text comes from a data
# file, and eval would execute whatever expression it contains. Cells that are
# not strings (for instance already-parsed lists when this cell is re-run) are
# left untouched, so the cell is safe to run more than once.
jap['en_feature'] = jap['en_feature'].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
)

Unnamed: 0,Concept,Feature,BR_Label,Prod_Freq,en_feature
69,pumpkin,黄色,visual colour,9,"[yellow, yellowness]"


In [37]:
# Collect the features per (Concept, BR_Label): a set of English features,
# and a list of the translation lists on the Japanese side.
group_keys = ['Concept', 'BR_Label']
en_f = (
    en.groupby(group_keys)['Features']
    .apply(set)
    .reset_index()
)
jap_f = (
    jap.groupby(group_keys)['en_feature']
    .apply(list)
    .reset_index()
)

In [38]:
# Flatten each group's list of translation lists into one set of distinct translations.
jap_f['en_feature'] = jap_f['en_feature'].apply(
    lambda groups: {term for translations in groups for term in translations}
)

In [39]:
# Pair each English feature set with the Japanese translations of the same
# concept; concepts missing from either side are dropped (inner join).
merged = pd.merge(
    en_f,
    jap_f[['Concept', 'en_feature']],
    how='inner',
    on='Concept',
)

In [40]:
# Drop concepts whose set of translated features is empty.
merged = merged.loc[merged['en_feature'].apply(len).gt(0)]

In [41]:
def _english_overlap(row):
    """Fraction of the row's English features that also occur in its translated Japanese features."""
    english = row['Features']
    return len(english.intersection(row['en_feature'])) / len(english)


merged['frac_overlap'] = merged.apply(_english_overlap, axis=1)

In [42]:

# Display the merged table, including the overlap fraction.
merged

Unnamed: 0,Concept,BR_Label,Features,en_feature,frac_overlap
0,airplane,visual colour,{white},"{colorimetric whiteness, white, whiteness}",1.0
3,bomb,visual colour,{black},"{malefactor, felon, colouring, lists of colors...",1.0
4,bread,visual colour,{brown},"{brown, brownness, Aquilus}",1.0
6,cake,visual colour,{colourful},"{colorimetric whiteness, white, whiteness}",0.0
8,chair,visual colour,{different_colours},"{brown, brownness, Aquilus}",0.0
9,cockroach,visual colour,{brown},"{malefactor, felon, colouring, lists of colors...",0.0
11,curry,visual colour,{brown},"{brown, brownness, Aquilus}",1.0
12,dandelion,visual colour,{yellow},"{yellowness, yellow}",1.0
14,eel,visual colour,{grey},"{malefactor, felon, colouring, lists of colors...",0.0
15,fence,visual colour,{white},"{grayness, grey, gray, greyness}",0.0


In [318]:
# Language compared against English in the cells below.
other_lang = 'nl'

# Column names in the merged file that depend on the language code.
feat_other = f'feature_{other_lang}_en'
feat_other_og = f'feature_{other_lang}'
# For 'ja' the concept column is 'concept_en' rather than a language-specific one.
concept_other = f'concept_{other_lang}' if other_lang != 'ja' else 'concept_en'
freq_other = f'freq_{other_lang}'
frac_other = f'frac_{other_lang}'

# The data directory defaults to the original absolute location but can be
# redirected with the NORMS_DATA_DIR environment variable, so the notebook can
# run on a machine that does not have this exact path.
norms_dir = os.environ.get(
    'NORMS_DATA_DIR',
    '/dlabscratch1/veselovs/projects/llm-latent-language/data/norms',
)
df = pd.read_csv(os.path.join(norms_dir, f'augmented_merge_en_{other_lang}.csv'))
df = df.drop_duplicates()

In [319]:
# Discard rows that have no value in 'feature_en_en'.
df = df[df['feature_en_en'].notna()]
def try_eval(x):
    """Parse ``x`` as a Python literal, returning ``x`` unchanged if that is not possible.

    Parameters
    ----------
    x : object
        Usually the text of a literal such as ``"['yellow', 'yellowness']"``.
        Non-string values (NaN, already-parsed lists, ...) are returned as is.

    Returns
    -------
    object
        The parsed literal, or the original ``x`` when it is not a string or
        does not contain a valid literal.

    Notes
    -----
    Uses ``ast.literal_eval`` instead of ``eval``: the input comes from a data
    file, and ``eval`` would execute arbitrary expressions and turn plain words
    that happen to be names (e.g. ``"max"``) into Python objects.
    """
    if not isinstance(x, str):
        return x
    try:
        return ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a literal: keep the raw text, as the original fallback did.
        return x
# Run try_eval over the translated-feature column; values it cannot evaluate are kept as they were.
df[feat_other] = df[feat_other].apply(try_eval)

In [320]:
# NOTE: keep a single row per feature pairing, so that one feature does not map
# to several counterparts.

# Pass 1: for each (concept, other-language feature), keep the row with the highest English frequency.
df = (
    df.sort_values(by='freq_en', ascending=False)
    .groupby(['concept_en', feat_other_og])
    .head(1)
)

# Pass 2: for each (concept, English feature), keep the row with the highest other-language frequency.
df = (
    df.sort_values(by=freq_other, ascending=False)
    .groupby(['concept_en', 'feature_en_en'])
    .head(1)
)

In [321]:
# Summed English frequency per concept; en_concepts lists the concepts whose sum is positive.
temp = df.groupby(by='concept_en')['freq_en'].sum()
en_concepts = temp.loc[temp.gt(0)].index

In [322]:
# Spot-check a single concept; the saved output for 'octopus' has no rows.
df[df['concept_en']=='octopus']

Unnamed: 0,concept_en,feature_en_en,freq_en,concept_nl,feature_nl,freq_nl,feature_nl_en


In [323]:
# Keep only the concepts listed in en_concepts.
df = df.loc[df['concept_en'].isin(en_concepts)]

In [324]:
# A missing frequency is treated as zero in both languages.
for count_col in ('freq_en', freq_other):
    df[count_col] = df[count_col].fillna(0)

In [325]:
# Per-concept frequency totals, computed separately for each language.
df['total_en'] = df.groupby('concept_en')['freq_en'].transform('sum')
df['total_other'] = df.groupby(concept_other)[freq_other].transform('sum')

# Each row's share of its concept's total.
df['frac_en'] = df['freq_en'].div(df['total_en'])
df[frac_other] = df[freq_other].div(df['total_other'])

In [326]:
# Positive values mean the feature has a larger share in the other language than in English.
df['diff'] = df[frac_other].sub(df['frac_en'])

In [327]:
# Largest differences first, then keep only concepts with a total above 2 in both languages.
df = df.sort_values('diff', ascending=False)
df = df.loc[df['total_en'].gt(2) & df['total_other'].gt(2)]

In [328]:
# Concepts with at least one feature whose share differs by more than 0.3
# in absolute value; big_diff keeps every row of those concepts.
big_diff_concepts = df.loc[df['diff'].abs().gt(0.3), 'concept_en'].values
big_diff = df.loc[df['concept_en'].isin(big_diff_concepts)]

In [329]:
# Show all rows of one randomly drawn concept. The draw is made from the
# concepts actually present in big_diff: en_concepts also contains concepts
# that later filters removed, so drawing from it often displayed an empty frame.
# The draw is unseeded, so each run shows a different concept.
_candidates = big_diff['concept_en'].unique()
big_diff[big_diff['concept_en'] == np.random.choice(_candidates)] if len(_candidates) else big_diff

Unnamed: 0,concept_en,feature_en_en,freq_en,concept_nl,feature_nl,freq_nl,feature_nl_en,total_en,total_other,frac_en,frac_nl,diff
207732,kettle,white,0.0,waterkoker,wit,3.0,"[whiteness, white, colorimetric whiteness, hum...",3.0,6.0,0.0,0.5,0.5
207860,kettle,yellow,0.0,waterkoker,geel,1.0,"[yellow, yellowness]",3.0,6.0,0.0,0.166667,0.166667
207706,kettle,gray,0.0,waterkoker,grijs,1.0,"[gray, grey, greyness, grayness]",3.0,6.0,0.0,0.166667,0.166667
207610,kettle,brown,0.0,waterkoker,bruin,1.0,"[John Brown, Brown, brown, brownness, Aquilus]",3.0,6.0,0.0,0.166667,0.166667
207504,kettle,black,3.0,ketel,zwart,6.0,"[black, color, coloring, colouring, complexion...",3.0,6.0,1.0,1.0,0.0


In [330]:
# First 20 rows of big_diff, ordered by concept and then by ascending diff.
big_diff.sort_values(by=['concept_en', 'diff']).head(20)

Unnamed: 0,concept_en,feature_en_en,freq_en,concept_nl,feature_nl,freq_nl,feature_nl_en,total_en,total_other,frac_en,frac_nl,diff
68736,asparagus,green,44.0,asperges,groen,3.0,"[environmental awareness, environmentalism, gr...",44.0,13.0,1.0,0.230769,-0.769231
68842,asparagus,red,0.0,asperges,rood,1.0,"[red, redness]",44.0,13.0,0.0,0.076923,0.076923
68936,asparagus,white,0.0,asperges,wit,9.0,"[whiteness, white, colorimetric whiteness, hum...",44.0,13.0,0.0,0.692308,0.692308
71674,avocado,green,19.0,advocaat,groen,1.0,"[environmental awareness, environmentalism, gr...",19.0,8.0,1.0,0.125,-0.875
71568,avocado,black,0.0,advocaat,zwart,3.0,"[black, color, coloring, colouring, complexion...",19.0,8.0,0.0,0.375,0.375
71780,avocado,yellow,0.0,advocaat,geel,4.0,"[yellow, yellowness]",19.0,8.0,0.0,0.5,0.5
81343,beetle,black,11.0,kever,zwart,7.0,"[black, color, coloring, colouring, complexion...",11.0,11.0,1.0,0.636364,-0.363636
81571,beetle,green,0.0,kever,groen,1.0,"[environmental awareness, environmentalism, gr...",11.0,11.0,0.0,0.090909,0.090909
81449,beetle,brown,0.0,kever,goud,1.0,"[gold, Au, yellow, yellowness, amber, golden b...",11.0,11.0,0.0,0.090909,0.090909
81677,beetle,red,0.0,kever,rood,2.0,"[red, redness]",11.0,11.0,0.0,0.181818,0.181818
