In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import difflib as dfl

In [2]:
# Load the two raw inputs: the facility list (MFL-Dec-19.csv) and the
# Admin3 boundary shapefile.

shape_path = '../Data/Mapping layout/Admin3/uga_admbnda_adm3_UBOS_v5_cleaned [CB].shp'
path = '../Data/Health_facilities/data_used/MFL-Dec-19.csv'

shape = gpd.read_file(shape_path)
# The facility CSV is decoded with the Windows-1252 code page.
hos_19 = pd.read_csv(path, encoding="cp1252")

In [126]:
# Keep only the facilities owned by 'PFP' whose status is 'Functional'.
# ('Operional Status' is the column name as spelled in the CSV.)

is_pfp = hos_19['Ownership'] == 'PFP'
is_functional = hos_19['Operional Status'] == 'Functional'

hos_19_pfp = hos_19.loc[is_pfp & is_functional].copy()
# Give the sub-county column an identifier-friendly name.
hos_19_pfp = hos_19_pfp.rename(columns={'Sub county': 'Sub_county'})
print('We focus on PFP facilities, which account for a total of ', len(hos_19_pfp), ' functionnal facilities', sep='')

We focus on PFP facilities, which account for a total of 3035 functionnal facilities


In [127]:
# Clean the district and sub-county names by removing the literal
# ' District' / ' Subcounty' labels, then drop the columns that are not
# used further down.
# regex=False makes this a plain substring replacement regardless of the
# default of the installed pandas version. The columns are read directly
# instead of copying the whole frame first.

hos_19_pfp['district_clean'] = hos_19_pfp['District'].str.replace(' District', '', regex=False)
hos_19_pfp['sub_county_clean'] = hos_19_pfp['Sub_county'].str.replace(' Subcounty', '', regex=False)
hos_19_pfp = hos_19_pfp.drop(
    columns=['District', 'Sub_county', 'Sub county UID', 'Health Facility UID', 'Operional Status']
)

In [128]:
# Build the join key on both sides: the first five characters of the
# higher-level name, a slash, then the sub-county name
# (e.g. 'ADJUM/UKUSIJONI').
# Only the facility side is upper-cased here; the shape names are used as-is.

shape['code'] = shape['ADM1_EN'].map(lambda name: name[:5]) + "/" + shape['ADM3_EN']

district_prefix = hos_19_pfp['district_clean'].map(lambda name: name[:5])
hos_19_pfp['code'] = (district_prefix + "/" + hos_19_pfp['sub_county_clean']).str.upper()


In [129]:
# Fuzzy-match every facility code against the codes of the shapefile.
#
# difflib.get_close_matches returns the best candidate whose similarity is at
# least `cutoff`; facilities with no candidate above the cutoff get None for
# both the matched code and the certainty score.
# Many facilities share the same code, so each distinct code is matched once
# and the result is reused for every row carrying it.
#
# Still needs some attention: at the last recorded run about 9% of the rows
# (268) could not be matched.

shape_codes = shape['code'].tolist()
match_cache = {}

for code in hos_19_pfp['code'].unique():
    match_code = dfl.get_close_matches(code, shape_codes, n=1, cutoff=0.69)
    if match_code:
        # Similarity ratio between the facility code and its best match
        score = dfl.SequenceMatcher(None, code, match_code[0]).ratio()
        match_cache[code] = (match_code[0], score)
    else:
        match_cache[code] = (None, None)

codes = [match_cache[code][0] for code in hos_19_pfp['code']]
certainty = [match_cache[code][1] for code in hos_19_pfp['code']]

hos_19_pfp['match_code'] = codes
hos_19_pfp['certainty'] = certainty

# Report how many facilities are left without a match
n_null = hos_19_pfp['match_code'].isnull().sum()
print('This gives us a percentage of null of ' + str(round((n_null/len(hos_19_pfp))*100,2)) + " %",
     '\nFor a total number of null values of ' + str(n_null))

This gives us a percentage of null of 8.83 % 
For a total number of null values of 268


In [130]:
# Attach the ADM3_PCODE and the polygon of the matched shape to each facility.
# It is a left join on 'match_code', so facilities without a match keep
# null ADM3_PCODE / geometry values.

shape_code_only = shape.loc[:, ['code', 'ADM3_PCODE', 'geometry']].set_index('code')

hos_19_pfp = hos_19_pfp.merge(
    shape_code_only,
    how='left',
    left_on='match_code',
    right_on='code',
)



In [131]:
# Drop the failed matches (rows whose 'geometry' is null) and exclude the
# outlier facility levels 'No' and 'Drug Shop'.

has_geometry = hos_19_pfp['geometry'].notnull()
is_excluded_level = hos_19_pfp['Level'].isin(['No', 'Drug Shop'])

hos_19_pfp_clean = hos_19_pfp.loc[has_geometry & ~is_excluded_level].copy()

In [132]:
# Number of facilities left after removing failed matches and excluded levels
len(hos_19_pfp_clean)

2701

In [133]:
# Extract the lon/lat coordinates from the centroid of each facility's
# matched polygon. The centroid is computed once per row and reused for both
# coordinates; values are assigned positionally, so this does not depend on
# the index labels being unique.
# NOTE: the centroid of the polygon is only a stand-in for the real facility
# location.

centroids = [polygon.centroid for polygon in hos_19_pfp_clean['geometry']]

lon = [point.x for point in centroids]
lat = [point.y for point in centroids]

hos_19_pfp_clean['lon'] = lon
hos_19_pfp_clean['lat'] = lat

In [134]:
# Preview the first rows, including the new lon/lat columns
hos_19_pfp_clean.head()

Unnamed: 0,Region,Health Facility,Level,Ownership,Authority,district_clean,sub_county_clean,code,match_code,certainty,ADM3_PCODE,geometry,lon,lat
0,Karamoja,Arembwola HC II,HC II,PFP,Private,Abim,Abim,ABIM/ABIM,ABIM/ABIM,1.0,UG314101,"POLYGON ((33.53202 2.70592, 33.53191 2.70617, ...",33.616027,2.741836
1,West Nile,Anzoa Medical Bureau HC III,HC III,PFP,Private,Adjumani,Adjumani Town Council,ADJUM/ADJUMANI TOWN COUNCIL,ADJUM/ADJUMANI TOWN COUNCIL,1.0,UG301107,"POLYGON ((31.80801 3.37272, 31.80798 3.37259, ...",31.78784,3.375797
4,West Nile,Maaji C HC II,HC II,PFP,Private,Adjumani,Ukusijoni,ADJUM/UKUSIJONI,ADJUM/OKUSIJONI,0.933333,UG301114,"POLYGON ((31.40431 2.98945, 31.40430 2.98946, ...",31.570216,3.100765
10,Lango,Alelluyah Joint Maternity Clinic,HC II,PFP,Private,Alebtong,Aloi,ALEBT/ALOI,ALEBT/ALOI,1.0,UG323137,"POLYGON ((33.13058 2.26648, 33.13117 2.26675, ...",33.203181,2.29901
11,Lango,Ocan Community Clinic,HC II,PFP,Private,Alebtong,Awei,ALEBT/AWEI,ALEBT/AWEI,1.0,UG323140,"POLYGON ((33.13440 2.13399, 33.13419 2.13418, ...",33.197595,2.171717


In [135]:
# Write two CSVs: a slim table with the mapping columns only, and the full
# cleaned table without the polygon column.

export_columns = ['Health Facility', 'Level', 'Ownership', 'lon', 'lat']
hos_pfp_export = hos_19_pfp_clean.loc[:, export_columns].copy()
hos_pfp_export.to_csv('../Data/Health_facilities/data_used/pfp_facilities_map.csv')

pfp_without_geometry = hos_19_pfp_clean.drop(columns='geometry')
pfp_without_geometry.to_csv('../Data/Health_facilities/Old/Test_pfp_facilities.csv')

Outstanding issues :
- The issue with using just the centroid is that it will give us a bunch of hospitals in the middle of Lake Victoria
- My fuzzy matching still leaves 9%+ of the values unmatched, with maybe some mismatches among the matched ones

# Now mixing with the DHIS2 Facility data

In [136]:
hos_gvt_nfp_path = '../Data/Health_facilities/data_used/Facilities_DHIS2_20160412.csv'
hos_gvt_nfp_full = pd.read_csv(hos_gvt_nfp_path)

# Keep every facility that is neither 'Private For Profit' nor 'Non Functional'
not_pfp = hos_gvt_nfp_full['ownership'] != 'Private For Profit'
not_non_functional = hos_gvt_nfp_full['status'] != 'Non Functional'

# Select the mapping columns and give them the same names as the PFP export
column_names = {
    'name': 'Health Facility',
    'type': 'Level',
    'ownership': 'Ownership',
    'Lat': 'lat',
    'Long': 'lon',
}
hos_gvt_nfp = (
    hos_gvt_nfp_full
    .loc[not_pfp & not_non_functional, list(column_names)]
    .rename(columns=column_names)
)
hos_gvt_nfp.to_csv('../Data/Health_facilities/data_used/gvt_nfp_facilities_map.csv')


In [137]:
# Stack the DHIS2 facilities and the PFP facilities into one table with a
# fresh 0..n-1 index.
hosp = pd.concat(objs=[hos_gvt_nfp, hos_pfp_export], ignore_index=True)

In [138]:
# Inspect the distinct 'Level' labels of both sources.
# Only the value of the last expression in a cell is displayed, so the
# recorded output shows the DHIS2 labels only.
hos_19_pfp_clean['Level'].unique()
hos_gvt_nfp['Level'].unique()

array(['HC III', 'General Hospital', 'HC II', 'Clinic', 'HC IV', nan,
       'NR Hospital', 'RR Hospital'], dtype=object)

In [139]:
# Based on a manual check, 95%+ of the ~50 NaN levels are HC II, so the
# missing values are filled with that label.
# The result is assigned back to the column: an in-place call on the selected
# column (hosp['Level'].where(..., inplace=True)) is a chained operation that
# does not update `hosp` under pandas Copy-on-Write.

hosp['Level'] = hosp['Level'].fillna('HC II')

In [140]:
# Distinct levels of the combined table after filling the missing values
hosp['Level'].unique()

array(['HC III', 'General Hospital', 'HC II', 'Clinic', 'HC IV',
       'NR Hospital', 'RR Hospital', 'Clinc', 'Hospital',
       'Special Clincs'], dtype=object)

In [141]:
# Mapping from the raw level labels (abbreviations and misspellings) to the
# harmonised display names. Labels that are not listed here, such as
# 'Clinic' and 'Hospital', are already in their final form.
Level_dict = dict([
    ('HC II', 'Health Centre II'),
    ('HC III', 'Health Centre III'),
    ('Clinc', 'Clinic'),
    ('HC IV', 'Health Centre IV'),
    ('Special Clincs', 'Clinic'),
    ('General Hospital', 'Hospital'),
    ('NR Hospital', 'National Referral Hospital'),
    ('RR Hospital', 'Regional Referral Hospital'),
])

In [142]:
# Harmonise the level labels of the combined table using Level_dict
hosp['Level'] = hosp['Level'].replace(Level_dict)

In [143]:
# Check that only the harmonised level names remain
hosp['Level'].unique()

array(['Health Centre III', 'Hospital', 'Health Centre II', 'Clinic',
       'Health Centre IV', 'National Referral Hospital',
       'Regional Referral Hospital'], dtype=object)

In [144]:
# Write the combined facility table to both output locations
for hospital_map_path in ('../Data/Health_facilities/hospital_map.csv',
                          '../../SPRINT_2/Data/Hospitals/hospital_map.csv'):
    hosp.to_csv(hospital_map_path)