<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Script-for-creating-the-loc_dict" data-toc-modified-id="Script-for-creating-the-loc_dict-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Script for creating the loc_dict</a></span></li></ul></div>

## Script for creating the loc_dict

The `loc_dict` maps location tuples `(lat, lon)` to survey data.

```
{
    (lat, lon) : {
        'cluster': int64,
        'wealthpooled': float64,
        'wealth': float64,
        'year': int64,
        'country': str,
        'country_year': str,
        'households': int64,
        'urban': int64
    }
}
```

- `lat` and `lon` have type `np.float32`
- `country` and `country_year` are all lowercase, with underscores `_` in place of spaces
- `country_year` has an underscore `_` separating the country name from the year

In [1]:
import os
import pandas as pd
import numpy as np
from collections import defaultdict
from pprint import pprint

csv_path = '/atlas/group/poverty_data/surveys/wealthpooled_chris/original_survey_file/AllCountryWealthIndex.csv'
df = pd.read_csv(csv_path, float_precision='high')

df.rename(columns={'LATNUM': 'lat', 'LONGNUM': 'lon', 'URBAN_RURA': 'urban'}, inplace=True)

# drop all rows that have NaN in any of our desired columns
notna_cols = ['svyid', 'lat', 'lon', 'cluster', 'wealthpooled', 'wealth', 'year', 'country', 'households', 'urban']
df = df.dropna(axis=0, subset=notna_cols, how='any')

# convert 'urban' column from string to int: 'U' (urban) => 1, 'R' (rural) => 0
df['urban'] = df['urban'].map({'U': 1, 'R': 0})

df = df[~df['svyid'].str.endswith('a')]

df

Unnamed: 0,cluster,svyid,wealthpooled,wealthpooled5country,wealth,iso3,hv000,year,cname,country,region,iso3n,households,lat,lon,urban
0,1,AO2011,2.595618,2.375126,1.713497,AGO,AO5,2011,AO,Angola,Middle Africa,24,36,-12.350257,13.534922,1
1,2,AO2011,2.209620,1.999398,1.545335,AGO,AO5,2011,AO,Angola,Middle Africa,24,32,-12.360865,13.551494,1
2,3,AO2011,0.906469,0.677429,0.631730,AGO,AO5,2011,AO,Angola,Middle Africa,24,36,-12.613421,13.413085,1
3,4,AO2011,1.105359,0.890651,0.826273,AGO,AO5,2011,AO,Angola,Middle Africa,24,35,-12.581454,13.397711,1
4,5,AO2011,1.879344,1.662076,1.293282,AGO,AO5,2011,AO,Angola,Middle Africa,24,37,-12.578135,13.418748,1
5,6,AO2011,1.749317,1.536088,1.153628,AGO,AO5,2011,AO,Angola,Middle Africa,24,27,-12.575305,13.408575,1
6,7,AO2011,0.182361,-0.025417,0.072375,AGO,AO5,2011,AO,Angola,Middle Africa,24,33,-11.188879,13.838995,1
7,8,AO2011,0.691216,0.478429,0.348369,AGO,AO5,2011,AO,Angola,Middle Africa,24,30,-11.200050,13.832946,1
8,9,AO2011,0.527798,0.357196,0.341848,AGO,AO5,2011,AO,Angola,Middle Africa,24,36,-8.593237,13.643248,1
9,10,AO2011,0.891160,0.692653,0.743994,AGO,AO5,2011,AO,Angola,Middle Africa,24,35,-8.583901,13.664315,1


In [2]:
df.loc[:, 'lat'] = df.loc[:, 'lat'].astype(np.float32)
df.loc[:, 'lon'] = df.loc[:, 'lon'].astype(np.float32)
print(df.dtypes)
print(len(df))
num_dup_locs = np.sum(df.duplicated(subset=['lat', 'lon']))
print(num_dup_locs)

cluster                   int64
svyid                    object
wealthpooled            float64
wealthpooled5country    float64
wealth                  float64
iso3                     object
hv000                    object
year                      int64
cname                    object
country                  object
region                   object
iso3n                     int64
households                int64
lat                     float32
lon                     float32
urban                     int64
dtype: object
26607
0


In [3]:
loc_dict = {}
survey_ids = set(df["svyid"])

output_cols = ['cluster', 'wealthpooled', 'wealth', 'year', 'country', 'country_year', 'households', 'urban']

def svyid_to_year(x):
    digits = [str(i) for i in range(10)]
    return int("".join([c for c in x if c in digits]))

for i, survey_id in enumerate(survey_ids):
    survey_data = df.loc[df['svyid'] == survey_id].copy()

    country = survey_data["country"].iloc[0]
    year = svyid_to_year(survey_id)
    assert np.all(survey_data["country"] == country)

    # within the country name, replace spaces (" ") and apostrophes ("'") with underscores ("_")
    country = country.lower().replace(" ", "_").replace("'", "_")
    country_year = "{}_{}".format(country, year)
    
    survey_data.loc[:, 'country'] = country
    survey_data.loc[:, 'country_year'] = country_year

    if survey_id[-1] == "a":
        print("{} is AIS, skipping".format(survey_id))
        continue

    num_dup_locs = np.sum(survey_data.duplicated(subset=['lat', 'lon']))
    print('{}, {}, num_dup_locs: {}'.format(country, year, num_dup_locs))
    assert num_dup_locs == 0

    survey_data.set_index(['lat', 'lon'], inplace=True, verify_integrity=True)
    curr_loc_dict = survey_data[output_cols].to_dict('index')
    
    for (lat64, lon64), v in curr_loc_dict.items():
        loc = (np.float32(lat64), np.float32(lon64))
        if loc in loc_dict:
            raise ValueError("unexpected duplicate loc")
        else:
            loc_dict[loc] = v

malawi, 2012, num_dup_locs: 0
tanzania, 2010, num_dup_locs: 0
ghana, 2008, num_dup_locs: 0
nigeria, 2015, num_dup_locs: 0
kenya, 2014, num_dup_locs: 0
zimbabwe, 2010, num_dup_locs: 0
senegal, 2010, num_dup_locs: 0
lesotho, 2009, num_dup_locs: 0
mozambique, 2009, num_dup_locs: 0
uganda, 2009, num_dup_locs: 0
tanzania, 2007, num_dup_locs: 0
mali, 2012, num_dup_locs: 0
lesotho, 2014, num_dup_locs: 0
uganda, 2014, num_dup_locs: 0
nigeria, 2013, num_dup_locs: 0
uganda, 2011, num_dup_locs: 0
malawi, 2015, num_dup_locs: 0
tanzania, 2004, num_dup_locs: 0
democratic_republic_of_congo, 2007, num_dup_locs: 0
senegal, 2012, num_dup_locs: 0
burkina_faso, 2014, num_dup_locs: 0
kenya, 2015, num_dup_locs: 0
togo, 2013, num_dup_locs: 0
guinea, 2012, num_dup_locs: 0
mali, 2015, num_dup_locs: 0
tanzania, 2011, num_dup_locs: 0
kenya, 2008, num_dup_locs: 0
benin, 2012, num_dup_locs: 0
malawi, 2014, num_dup_locs: 0
ghana, 2014, num_dup_locs: 0
mozambique, 2011, num_dup_locs: 0
sierra_leone, 2013, num_dup_lo

In [5]:
import sys
sys.getsizeof(loc_dict)

1310816

In [6]:
import pickle
loc_dict_path = '/atlas/group/poverty_data/surveys/wealthpooled_chris/loc_dict.pkl'
with open(loc_dict_path, 'wb') as f:
    pickle.dump(loc_dict, f)