### HRSL countries from: https://ciesin.columbia.edu/data/hrsl/

In [44]:
hrsl_countries = """
Algeria
Burkina Faso
Cambodia
Ghana
Haiti
Ivory Coast
Kenya
Madagascar
Malawi
Mexico
Mozambique
The Philippines
Puerto Rico
Rwanda
South Africa
Sri Lanka
Tanzania
Thailand
Uganda
""".split("\n")
hrsl_countries

hrsl_countries_set = set(hrsl_countries)

### MICS countries

In [45]:
from pathlib import Path

mics_data = Path("../data/external/MICS/")

mics_countries = [d.name.split("_")[0] for d in mics_data.iterdir()]
mics_countries_set = set(mics_countries)

In [46]:
set(hrsl_countries) & set(mics_countries) ^ set(["Some Kenya"])

{'Malawi', 'Mexico', 'Some Kenya'}

### DHS countries
Copied from:
https://dhsprogram.com/data/File-Types-and-Names.cfm#CP_JUMP_10136

In [1]:
import pandas as pd
import numpy as np

raw_df = pd.read_clipboard()

new_df = pd.concat([raw_df[['Code', 'Country Name']], raw_df[['Code.1', 'Country Name.1']]])

new_df['code'] = np.where(pd.isnull(new_df.Code),
                       new_df['Code.1'],
                       new_df.Code)

new_df['country'] = np.where(pd.isnull(new_df['Country Name']),
                       new_df['Country Name.1'],
                       new_df['Country Name'])

new_df.drop(['Code', 'Code.1', 'Country Name', 'Country Name.1'], axis=1, inplace=True)
new_df

Unnamed: 0,code,country
0,AF,Afghanistan
1,AL,Albania
2,AO,Angola
3,AM,Armenia
4,AZ,Azerbaijan
5,BD,Bangladesh
6,BJ,Benin
7,BO,Bolivia
8,BT,Botswana
9,BR,Brazil


In [7]:
dhs_set = set(new_df.country.tolist())

In [48]:
len(dhs_set & set(mics_countries))

NameError: name 'dhs_set' is not defined

In [49]:
len(dhs_set & set(hrsl_countries))

NameError: name 'dhs_set' is not defined

In [50]:
dfs = pd.read_html("https://dhsprogram.com/data/available-datasets.cfm", header=0)

In [51]:
hyphen_yr_re = re.compile("[0-9]{4}\-([0-9]{2})")
yr_re = re.compile("[0-9]{4}")
    
m = hyphen_yr_re.search("asdf 2016-17 (55)")
m.groups()[0]

'17'

In [52]:
import re

def get_survey_year(s):
    # remove footnotes
    s = re.sub("\([0-9]+\)", "", s)
    
    # get later year from hyphenated
    hyphen_yr_re = re.compile("[0-9]{4}\-([0-9]{2})")
    yr_re = re.compile("([0-9]{4})")
    
    
    if hyphen_yr_re.search(s):
        re_match = hyphen_yr_re.search(s)
        two_digit = re_match.groups()[0]
        if int(two_digit) > 20:
            return "19" + two_digit
        else:
            return "20" + two_digit
    elif yr_re.search(s):
        re_match = yr_re.search(s)
        return re_match.groups()[0]
    else:
        print("No match ", s)
        return "None"
    


dhs_surveys = pd.concat(dfs)


dhs_surveys['year'] = dhs_surveys.Survey.apply(get_survey_year)

In [53]:
dhs_surveys[(dhs_surveys.year >= "2015") & (dhs_surveys['Survey Datasets'] == 'Data Available')]

Unnamed: 0,Survey,Type,Phase,Recode,Survey Datasets,GPS Datasets,HIV/Other Biomarkers Datasets,SPA Datasets,year
0,Afghanistan 2015,Standard DHS,DHS-VII,DHS-VII,Data Available,Not Yet Available,Not Collected,Not Applicable,2015
0,Angola 2015-16,Standard DHS,DHS-VII,DHS-VII,Data Available,Data Available,Data Available,Not Applicable,2016
0,Armenia 2015-16,Standard DHS,DHS-VII,DHS-VII,Data Available,Data Available,Not Collected,Not Applicable,2016
0,Chad 2014-15,Standard DHS,DHS-VII,DHS-VI,Data Available,Data Available,Data Available,Not Applicable,2015
0,Colombia 2015,Standard DHS,DHS-VII,DHS-VII,Data Available,Not Yet Available,Not Collected,Not Applicable,2015
0,Egypt 2015,Special,DHS-VII,--,Data Available,Not Distributed,Not Yet Available,Not Applicable,2015
0,Ethiopia 2016,Standard DHS,DHS-VII,DHS-VII,Data Available,Data Available,Data Available,Not Applicable,2016
0,Ghana 2016,MIS,DHS-VII,DHS-VII,Data Available,Data Available,Not Collected,Not Applicable,2016
0,Guatemala 2014-15 (22),Standard DHS,DHS-VII,DHS-VI,Data Available,Data Available,Not Collected,Not Applicable,2015
0,India 2015-16,Standard DHS,DHS-VII,DHS-VI,Data Available,Data Available,Data Available,Not Applicable,2016


In [54]:
def get_survey_country(s):
    c = re.split("[0-9\(]", s)[0].strip()
    return c

dhs_surveys['country'] = dhs_surveys.Survey.apply(get_survey_country)

In [55]:
relevant = dhs_surveys[(dhs_surveys.year >= "2015") & (dhs_surveys['Survey Datasets'] == 'Data Available')]

In [56]:
relevant_dhs_set = set(relevant.country.tolist())

# Overlaps

In [57]:
relevant_dhs_set

{'Afghanistan',
 'Angola',
 'Armenia',
 'Chad',
 'Colombia',
 'Egypt',
 'Ethiopia',
 'Ghana',
 'Guatemala',
 'India',
 'Kenya',
 'Liberia',
 'Madagascar',
 'Malawi',
 'Mali',
 'Myanmar',
 'Nepal',
 'Nigeria',
 'Rwanda',
 'Senegal',
 'Sierra Leone',
 'Tanzania',
 'Uganda',
 'Zimbabwe'}

In [58]:
mics_countries_set

{'Benin',
 'Cameroon',
 'Cuba',
 'Dominican Republic',
 'Egypt (Sub-national)',
 'El Salvador',
 'Guinea Bissau',
 'Guyana',
 'Kazakhstan',
 'Kenya (Bungoma County)',
 'Kenya (Kakamega County)',
 'Kenya (Turkana County)',
 'Kyrgyzstan',
 'Malawi',
 'Mali',
 'Mexico',
 'Mongolia',
 'Nepal',
 'Nigeria',
 'Pakistan (Punjab)',
 'Pakistan (Sindh)',
 'Paraguay',
 'Sao Tome and Principe',
 'Serbia',
 'Sudan',
 'Swaziland',
 'Turkmenistan',
 'Viet Nam',
 'Zimbabwe'}

In [61]:
hrsl_countries_set

{'',
 'Algeria',
 'Burkina Faso',
 'Cambodia',
 'Ghana',
 'Haiti',
 'Ivory Coast',
 'Kenya',
 'Madagascar',
 'Malawi',
 'Mexico',
 'Mozambique',
 'Puerto Rico',
 'Rwanda',
 'South Africa',
 'Sri Lanka',
 'Tanzania',
 'Thailand',
 'The Philippines',
 'Uganda'}

In [60]:
mics_countries_set & relevant_dhs_set

{'Malawi', 'Mali', 'Nepal', 'Nigeria', 'Zimbabwe'}

In [62]:
mics_countries_set & hrsl_countries_set

{'Malawi', 'Mexico'}

In [63]:
relevant_dhs_set & hrsl_countries_set

{'Ghana', 'Kenya', 'Madagascar', 'Malawi', 'Rwanda', 'Tanzania', 'Uganda'}

In [64]:
relevant_dhs_set & hrsl_countries_set & mics_countries_set

{'Malawi'}