In [1]:
import configparser
import sys
import json
import pandas as pd
import numpy as np
import random

sys.path.append('../WACs/WAC_Utils')
from wac_utils import filter_refdf_by_filelist, filter_relational_expr

In [2]:
config = configparser.ConfigParser()
my_config = '../Config/default.cfg'
with open(my_config, 'r', encoding='utf-8') as f:
    config.read_file(f)
    
dsgv_home = config.get('DSGV-PATHS', 'dsgv_home')
preproc_path = dsgv_home + '/Preproc/PreprocOut/'
translate_path = dsgv_home + '/Preproc/Translate/'

In [4]:
with open(preproc_path + 'saiapr_90-10_splits.json', 'r') as f:
    s_splits = json.load(f)

# Referring expressions data
refdf = pd.read_json(preproc_path + 'saiapr_refdf.json.gz',
                            typ='frame', orient='split', compression='gzip')
# EN test set
en_test = filter_refdf_by_filelist(refdf, s_splits['test'])
en_test = filter_relational_expr(en_test).drop(columns='tagged')
print('EN test set shape:', en_test.shape)

# Translated expressions
fr_refexp = pd.read_csv(translate_path + 'FR_testset.csv', sep=',', index_col=0, header=0)
print('Translated refexp shape:', fr_refexp.shape)

fr_all = en_test.merge(fr_refexp, how='left', left_index=True, right_index=True, suffixes=('_EN', None))
fr_all = filter_relational_expr(fr_all, lang='FR')
fr_all.tail()

EN test set shape: (10370, 6)
Translated refexp shape: (10376, 1)


Unnamed: 0,i_corpus,image_id,region_id,r_corpus,rex_id,refexp_EN,refexp
12005,0,40508,1,referit,116219,blue shirt right,chemise bleue droite
12006,0,17169,1,referit,116923,outside the moon,en dehors de la lune
12007,0,30950,1,referit,118084,water,l'eau
12008,0,32289,1,referit,118143,group of people at bottom,groupe de personnes en bas
12009,0,30069,1,referit,119663,pavement very bottom,trottoir très bas


In [5]:
save = False

if save = True:
    fr_all.to_pickle(preproc_path + 'FR_small_dataset.pkl')

In [6]:
df = pd.read_pickle(preproc_path + 'FR_small_dataset.pkl')
df

Unnamed: 0,i_corpus,image_id,region_id,r_corpus,rex_id,refexp_EN,refexp
0,0,14576,1,referit,2,seal,phoque
2,0,14576,2,referit,96551,bottom left corner,coin inférieur gauche
3,0,20909,1,referit,49,kid,enfant
4,0,20909,3,referit,35225,dirt ground,terre battue
5,0,20909,2,referit,112956,the sky,Le ciel
...,...,...,...,...,...,...,...
12005,0,40508,1,referit,116219,blue shirt right,chemise bleue droite
12006,0,17169,1,referit,116923,outside the moon,en dehors de la lune
12007,0,30950,1,referit,118084,water,l'eau
12008,0,32289,1,referit,118143,group of people at bottom,groupe de personnes en bas


In [31]:
import random

print('Length:', len(s_splits['test']))
print('1st 10 pre-shuffle:', s_splits['test'][:10])
random.shuffle(s_splits['test'])
print('1st 10 post-shuffle:', s_splits['test'][:10])

Length: 2000
1st 10 pre-shuffle: [14814, 4834, 13452, 12290, 8632, 8329, 9994, 17376, 19882, 39702]
1st 10 post-shuffle: [32827, 37235, 4424, 30310, 10897, 16431, 692, 38807, 27566, 17515]


In [32]:
whole = s_splits['test']

fr_splits = {
    'train': whole[:500],
    'val': whole[500:1000],
    'test': whole[1000:]
}
fr_splits['train']

[32827,
 37235,
 4424,
 30310,
 10897,
 16431,
 692,
 38807,
 27566,
 17515,
 39436,
 808,
 30301,
 6504,
 19973,
 32276,
 37559,
 21523,
 6207,
 23083,
 7472,
 1419,
 8956,
 9017,
 21053,
 31297,
 13343,
 40659,
 3414,
 31551,
 3353,
 4837,
 12173,
 14706,
 30050,
 22700,
 32863,
 25766,
 15758,
 40444,
 10011,
 18229,
 9994,
 11469,
 5023,
 12050,
 38966,
 2690,
 23302,
 11298,
 18423,
 17267,
 23520,
 40639,
 19347,
 26522,
 21477,
 9723,
 34198,
 25623,
 16081,
 26368,
 1715,
 23755,
 20114,
 21221,
 11574,
 31610,
 12548,
 4739,
 14233,
 18051,
 2431,
 3920,
 7038,
 24461,
 10048,
 7463,
 1303,
 7747,
 4170,
 26899,
 10186,
 32148,
 26566,
 18134,
 9581,
 37766,
 32584,
 2178,
 12290,
 27665,
 5205,
 21669,
 32257,
 16569,
 2617,
 26733,
 31334,
 39431,
 35860,
 9790,
 31719,
 31092,
 38169,
 7560,
 5124,
 27066,
 9776,
 6400,
 13702,
 31563,
 10714,
 32150,
 14409,
 21433,
 38740,
 21091,
 26432,
 14298,
 3312,
 9248,
 30955,
 4898,
 3105,
 32384,
 13104,
 10984,
 11406,
 10656,


In [34]:
save = False

if save = True:
    with open(preproc_path + 'fr_split.json', 'w') as f:
        json.dump(fr_splits, f)