# Create small DHS dataset

Subsample the DHS dataset and its corresponding embeddings to speed up processing. We'll use 10k points for now.

In [1]:
cd ../

/cephyr/users/markpett/Alvis/ImputeAwareATE


In [2]:
import os
import pandas as pd
import numpy as np
import configparser

# Number of rows (and matching embedding vectors) to keep in the small dataset.
SUBSET_SIZE = 10_000

# Seed for NumPy's global random number generator, set once up front.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

In [3]:
# Read config file.
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so check that list instead of failing later with
# an opaque KeyError on 'PATHS'.
config = configparser.ConfigParser()
if not config.read('config.ini'):
    raise FileNotFoundError(
        f"Could not read 'config.ini' from working directory {os.getcwd()}"
    )

DATA_DIR = config['PATHS']['DATA_DIR']

# Load the embedding matrix and the survey metadata table from DATA_DIR.
embeddings = np.load(os.path.join(DATA_DIR, 'ssl4eo_resnet50.npy'))
# NOTE(review): the CSV carries an unnamed first column (shown as 'Unnamed: 0'
# in the output below), presumably an index written by an earlier to_csv call.
# It is kept as a regular column here; confirm whether downstream code needs it.
df = pd.read_csv(os.path.join(DATA_DIR, 'dhs_with_imgs.csv'))

# The subsampling below indexes both objects with the same row positions, so
# they must have the same number of rows.
assert embeddings.shape[0] == df.shape[0], 'Mismatch between embeddings and metadata rows.'

df

Unnamed: 0,cluster_id,lon,lat,rural,region_id,country,survey,month,year,iwi
0,AO.Bengo.71.135,13.640789,-8.589805,False,AO.Bengo,Angola,Angola 2015-16 Standard DHS,11,2015,62.334459
1,AO.Bengo.71.158,14.122619,-7.718385,True,AO.Bengo,Angola,Angola 2015-16 Standard DHS,2,2016,8.226589
2,AO.Bengo.71.169,13.654425,-8.592545,False,AO.Bengo,Angola,Angola 2015-16 Standard DHS,10,2015,62.760211
3,AO.Bengo.71.203,13.517859,-8.652260,True,AO.Bengo,Angola,Angola 2015-16 Standard DHS,1,2016,68.211697
4,AO.Bengo.71.208,13.721998,-7.852511,True,AO.Bengo,Angola,Angola 2015-16 Standard DHS,11,2015,14.825944
...,...,...,...,...,...,...,...,...,...,...
68614,ZW.Midlands.72.37,30.008579,-20.911177,True,ZW.Midlands,Zimbabwe,Zimbabwe 2015 Standard DHS,9,2015,27.791567
68615,ZW.Midlands.72.52,29.860028,-20.402214,True,ZW.Midlands,Zimbabwe,Zimbabwe 2015 Standard DHS,10,2015,36.929878
68616,ZW.Midlands.72.69,30.172833,-20.724753,True,ZW.Midlands,Zimbabwe,Zimbabwe 2015 Standard DHS,10,2015,24.406326
68617,ZW.Midlands.72.91,29.820084,-19.453466,False,ZW.Midlands,Zimbabwe,Zimbabwe 2015 Standard DHS,7,2015,59.887344


In [7]:
# Draw SUBSET_SIZE distinct row *positions* (not index labels): they are used
# with df.iloc and with plain ndarray indexing, both of which are positional.
# A generator seeded inside this cell makes the cell idempotent, so re-running
# it always selects the same subset regardless of earlier random draws.
rng = np.random.RandomState(RANDOM_STATE)
sample_ixs = rng.choice(len(df), SUBSET_SIZE, replace=False)

# Apply the same positions to the metadata and the embeddings so rows stay aligned.
small_df = df.iloc[sample_ixs]
small_embeddings = embeddings[sample_ixs]

assert small_embeddings.shape[0] == small_df.shape[0], 'Mismatch between embeddings and metadata rows.'

small_df

Unnamed: 0,cluster_id,lon,lat,rural,region_id,country,survey,month,year,iwi
24566,GN.Mamou.53.6,-12.067588,10.384562,False,GN.Mamou,Guinea,Guinea 2005 Standard DHS,6,2005,43.994376
42456,MZ.Maputo City.7A.1105,32.560665,-25.927313,False,MZ.Maputo City,Mozambique,Mozambique 2018 MIS,3,2018,58.107414
58238,TG.Agglomération De Lomé.71.12,1.203508,6.232340,False,TG.Agglomération De Lomé,Togo,Togo 2017 MIS,10,2017,52.556917
9590,CM.Yaoundé.61.396,9.740145,4.010560,False,CM.Yaoundé,Cameroon,Cameroon 2011 Standard DHS,3,2011,50.404297
28847,KE.Siaya.8B.1451,34.295378,-0.269201,True,KE.Siaya,Kenya,Kenya 2022 Standard DHS,3,2022,33.228676
...,...,...,...,...,...,...,...,...,...,...
60878,TZ.Mtwara.6A.472,39.351897,-10.979975,True,TZ.Mtwara,Tanzania,Tanzania 2011-12 Standard AIS,1,2012,16.522514
61936,TZ.Simiyu.7B.496,34.494628,-3.557098,True,TZ.Simiyu,Tanzania,Tanzania 2015-16 Standard DHS,1,2016,13.704596
34447,MD.High Plateaux.6A.76,47.424541,-20.284811,True,MD.High Plateaux,Madagascar,Madagascar 2013 MIS,4,2013,14.156881
8901,CM.Ouest.45.129,10.541408,5.135607,False,CM.Ouest,Cameroon,Cameroon 2004 Standard DHS,7,2004,37.210448


In [4]:
# Write the subsampled metadata and embeddings next to the full-size files.
small_csv_path = os.path.join(DATA_DIR, 'small_dhs.csv')
small_embeddings_path = os.path.join(DATA_DIR, 'small_ssl4eo_resnet50.npy')

# index=False: the DataFrame index is not written to the CSV.
small_df.to_csv(small_csv_path, index=False)
np.save(small_embeddings_path, small_embeddings)