In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import timeit
import scipy
import datetime

from scipy import stats
from pandas import ExcelWriter
from pandas import ExcelFile

pd.set_option('display.max_columns', None)
idx = pd.IndexSlice

In [2]:
input_path = '../../data/input/dhis2/new_system/'
input_path_old = '../../data/input/dhis2/old_system/'

shapes_path = '../../data/shapes/district/districts_17_19_clean.shp'
pop_path = '../../data/input/demographics/'

output_path = '../../data/output/sprint3_analysis/'

# Fetch my data


## Set up

In [3]:
# Creating a dict of names to replace district names

district_name_dict = {'SEMBABULE': 'SSEMBABULE', 'MADI-OKOLLO': 'MADI OKOLLO', 'LUWEERO':'LUWERO'}

# Get the district correposnding to facility ids

xls = ExcelFile('../../data/input/dhis2/new_old_correspondance.xlsx')
df_dis = xls.parse(xls.sheet_names[1])


In [4]:
# build a small fucntion to split the string column name of the data download as pivot 

def split(strng, sep, occ):
    strng = strng.split(sep)
    return sep.join(strng[occ[1]:]), sep.join(strng[:occ[0]]), sep.join(strng[occ[0]:occ[1]])[:3]

In [5]:
# To clean the data downloaded in a pivot format

def get_clean_stack(df,drop):

    df['district']=df['orgunitlevel3'].apply(lambda x: x[:-9].upper())
    df['district'].replace(district_name_dict,inplace=True)
    
    df.set_index(['district','organisationunitid'],drop=True,inplace=True)
    
    cols = np.arange(0,9)
    df.drop(df.columns[cols],axis=1,inplace=True)
    
    cols = df.columns
    new_cols=[]
    for col in cols:
        new_cols.append(split(col,' ',[-2,-1]))
    df.columns=pd.MultiIndex.from_tuples(new_cols,names=['year','indic','month'])
    
    if drop != None:
        df.drop(drop,axis=1,inplace=True,level=2)
    
    df1=df.copy().stack(level=[0,1,2],dropna=False).reset_index()
    df1.rename(columns={0:'value'},inplace=True)
    
    return df1

In [6]:
# To clean the data downloaded in a slightly different pivot format

def get_clean_stack_newlayout(df,drop):

    month_dict={'01':'Jan','02':'Feb','03':'Mar','04':'Apr',
                '05':'May','06':'Jun','07':'Jul','08':'Aug',
                '09':'Sep','10':'Oct','11':'Nov','12':'Dec'}
    
    df['district']=df['orgunitlevel3'].apply(lambda x: x[:-9].upper())
    df['district'].replace(district_name_dict,inplace=True)

    df['year']=df['periodcode'].astype('str').apply(lambda x: x[:4])
    df['month']=df['periodcode'].astype('str').apply(lambda x: x[-2:]).replace(month_dict)
    
    if drop != None:
        if type(drop)==str:
            df1=df[~(df['month']==drop)].copy()
        if type(drop)==list:
            df1=df[~df['month'].isin(drop)].copy()
    else:
        df1=df.copy()
        
    df1.set_index(['district','organisationunitid','year','month'],drop=True,inplace=True)
        
    cols = np.arange(0,12)
    df1.drop(df1.columns[cols],axis=1,inplace=True)
    
    df2=df1.copy().stack(dropna=False).reset_index()
    df2.rename(columns={0:'value','level_4':'indic'},inplace=True)
    
    return df2

## New data


In [7]:
# to get all the new data files together
def fetch_new_data (filepaths,filepaths_no_dec,filepaths_newlayout):
    df = pd.DataFrame(columns = ['district', 'organisationunitid', 'year', 'indic', 'month', 'value'])
    for x in filepaths:
        x_df=get_clean_stack(pd.read_csv(x),drop='Dec')
        df=pd.concat([df,x_df])
    for x in filepaths_no_dec:
        x_df=get_clean_stack(pd.read_csv(x),drop=None)
        df=pd.concat([df,x_df])
    for x in filepaths_newlayout:
        x_df=get_clean_stack_newlayout(pd.read_csv(x),drop='Dec')
        df=pd.concat([df,x_df])
    return df

In [8]:
# To clean the data downloaded in a pivot format

def get_clean_stack_api(df,df_dis):
    
    df=pd.merge(df,df_dis,how='left',left_on='orgUnit',right_on='organisationunitid')
    
    month_dict={'01':'Jan','02':'Feb','03':'Mar','04':'Apr',
                '05':'May','06':'Jun','07':'Jul','08':'Aug',
                '09':'Sep','10':'Oct','11':'Nov','12':'Dec'}
    
    df['year']=df['period'].astype('str').apply(lambda x: x[:4])
    df['month']=df['period'].astype('str').apply(lambda x: x[-2:]).replace(month_dict)
    
    df.rename(columns={'dataElement':'indic'},inplace=True)
    df.drop(['period','orgUnit'],axis=1,inplace=True)
    
    return df

In [9]:
# creating the new data stack

filepaths = [input_path+'new_epi_data_by_facility.csv',
            input_path+'new_mnch_data_by_facility.csv',
            input_path+'new_lbw_data_by_facility.csv',
            input_path+'new_vitamin_data_by_facility.csv']

filepaths_no_dec = [input_path+'may_new_epi_data_by_facility.csv']

filepaths_newlayout = [input_path+'new_reporting_by_facility.csv',
                       input_path+'new_epi_data_addendum_by_facility.csv',
                       input_path+'may_new_mnch_data_by_facility.csv',
                       input_path+'new_opd_ipd_data_by_facility.csv',
                       input_path+'/mal/new_mal_cases_by_facility.csv',
                       input_path+'/mal/new_mal_tested_by_facility.csv',
                       input_path+'/mal/new_mal_treated_by_facility.csv',
                       input_path+'/tb/new_tb_by_facility.csv',
                       input_path+'new_sam_data_identified_by_facility.csv',
                       input_path+'new_mam_data_by_facility.csv']

filepaths_api = input_path+"/hiv/HIV_newInstance.csv"

new_stack = fetch_new_data (filepaths=filepaths,
                            filepaths_no_dec=filepaths_no_dec,
                            filepaths_newlayout=filepaths_newlayout)

df_dis=new_stack[['organisationunitid','district']].drop_duplicates()

hiv_new_df=get_clean_stack_api(pd.read_csv(filepaths_api),df_dis)
new_stack=pd.concat([new_stack,hiv_new_df])

  if (await self.run_code(code, result,  async_=asy)):


In [10]:
new_stack['indic'].unique()

array(['EPI - BCG doses given', 'EPI - DPT 1 to DPT3 dropout rate (%)',
       'EPI - DPT-HepB-HIB 1 doses given',
       'EPI - DPT-HepB-HIB 1 doses_Under 1',
       'EPI - DPT-HepB-HIB 2 doses_Under 1',
       'EPI - DPT-HepB-HIB 3 doses given',
       'EPI - DPT-HepB-HIB 3 doses_Under 1',
       'EPI - DPT-HepB-Hib 1 coverage  < 1 year (%)',
       'EPI - DPT-HepB-Hib 2 Coverage < 1 year (%)',
       'EPI - DPT-HepB-Hib 3 coverage (%)', 'EPI - MR 1 coverage (%)',
       'EPI - MR 1 doses given', 'EPI - MR 2 coverage (%)',
       'EPI - MR 2 doses given', 'EPI - Measles (MR1) doses_Under 1',
       'EPI - Measles 2 (MR2) doses_1-4 yrs', 'EPI - PCV 1 doses_Under 1',
       'EPI - PCV 2 doses_Under 1', 'EPI - PCV 3 coverage < 1 year (%)',
       'EPI - PCV 3 doses_Under 1', 'EPI - PCV1 Coverage < 1 year (%)',
       '105-AN01a. ANC 1st Visit for women',
       '105-AN02. ANC 4th Visit for women', '105-MA01. Admissions',
       '105-MA04a. Deliveries in unit - Total',
       '105-MA04b1

## Old data

For now extremely messy, had to be done bit by bit in random order, so quite some cleaning needed Ill do here, than put all into one nice file

In [11]:
# to get all the old data files together
def fetch_old_data (filepaths,filepaths_newlayout):
    df = pd.DataFrame(columns = ['district', 'organisationunitid', 'year', 'indic', 'month', 'value'])
    for x in filepaths:
        x_df=get_clean_stack(pd.read_csv(x),drop=None)
        df=pd.concat([df,x_df])
    for x in filepaths_newlayout:
        x_df=get_clean_stack_newlayout(pd.read_csv(x),drop=None)
        df=pd.concat([df,x_df])
    return df

In [12]:

# creating the old data stack

# For now a bit all over the placedue to download limitations 

filepaths = [input_path_old+"/epi/EPI - BCG doses given.csv",
             input_path_old+"/epi/EPI - DPT-HepB-HIB 1 doses given.csv",
             input_path_old+"/epi/EPI - DPT-HepB-HIB 3 doses given.csv",
             input_path_old+"/epi/EPI - PCV 1 doses given.csv",
             input_path_old+"/epi/EPI - PCV 3 doses given.csv",
             input_path_old+"/epi/EPI - MR 1 doses given.csv",
             input_path_old+'/mat/admission_newborn.csv',
             input_path_old+'/mat/ANC1_ANC4.csv',
             input_path_old+'/mat/births.csv',
             input_path_old+'/sam/lbw.csv',
             input_path_old+'/sam/sam_mam.csv',
             input_path_old+'/sam/lbw_abs.csv',
             input_path_old+'/sam/vitamin.csv']

filepaths_newlayout = [input_path_old + '/reporting/old_reporting_by_facility.csv',
                       input_path_old + '/epi/EPI - TT doses given.csv',
                       input_path_old + '/epi/EPI - HPV doses given.csv',
                       input_path_old + 'ipd_opd/ipd_opd.csv',
                       input_path_old + 'hiv/old_hiv_general.csv',
                       input_path_old + 'mal/old_mal_data.csv',
                       input_path_old+'/tb/old_tb_data.csv']

filepaths_api =input_path_old + 'hiv/HIV_oldInstance.csv'

old_stack = fetch_old_data (filepaths=filepaths,filepaths_newlayout=filepaths_newlayout)

df_dis=old_stack[['organisationunitid','district']].drop_duplicates()
hiv_old_df=get_clean_stack_api(pd.read_csv(filepaths_api),df_dis)

old_stack=pd.concat([old_stack,hiv_old_df])

  if (await self.run_code(code, result,  async_=asy)):


In [13]:
old_stack['indic'].unique()

array(['EPI - BCG doses given', 'EPI - DPT-HepB-HIB 1 doses given',
       'EPI - DPT-HepB-HIB 3 doses given', 'EPI - PCV 1 doses given',
       'EPI - PCV 3 doses given', 'EPI - MR 1 doses given',
       '105-2.2 Admissions', '105-2.2 Newborn deaths (0-7days)',
       '105-2.3 Postnatal Attendances',
       '105-2.1 A1:ANC 1st Visit for women',
       '105-2.1 A2:ANC 4th Visit for women',
       '105-2.2a Deliveries in unit',
       '105-2.2b Deliveries in unit(Fresh Still births)',
       '105-2.2c Deliveries in unit(Macerated still births)',
       '105-2.2d Deliveries in unit(Live Births)',
       'Percentage of babies born with low birth weight (<2500 g)',
       'Underweight rate of children under 5',
       '105-1.3 OPD Moderate Acute Malnutrition (MAM)',
       '105-1.3 OPD Severe Acute Malnutrition With Oedema',
       '105-1.3 OPD Severe Acute Malnutrition Without Oedema',
       '105-2.2 Babies Born with low birth weight (<2.5Kgs)',
       '105-2.8 Vit A Suplement 1st Dose i

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,orgunitlevel1,orgunitlevel2,orgunitlevel3,orgunitlevel4,orgunitlevel5,organisationunitid,organisationunitname,organisationunitcode,organisationunitdescription,periodid,periodname,periodcode,perioddescription,HMIS 105:1 Actual reports,HMIS 105:2.1-2.7 Actual reports,HMIS 105:2.8-2.12 Actual reports,HMIS 105:3-4 Actual reports,HMIS 105:5-6 Actual reports,HMIS 105:7-8 Actual reports,HMIS 105:1 Expected reports,HMIS 105:2.1-2.7 Expected reports,HMIS 105:2.8-2.12 Expected reports,HMIS 105:3-4 Expected reports,HMIS 105:5-6 Expected reports,HMIS 105:7-8 Expected reports
0,MOH - Uganda,Northern Region,Zombo District,Akaa Subcounty,Ayaka HC II,XikHv88zzDn,Ayaka HC II,,,201801,Jan-18,201801,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,MOH - Uganda,Northern Region,Zombo District,Akaa Subcounty,Ayaka HC II,XikHv88zzDn,Ayaka HC II,,,201802,Feb-18,201802,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,MOH - Uganda,Northern Region,Zombo District,Akaa Subcounty,Ayaka HC II,XikHv88zzDn,Ayaka HC II,,,201803,Mar-18,201803,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,MOH - Uganda,Northern Region,Zombo District,Akaa Subcounty,Ayaka HC II,XikHv88zzDn,Ayaka HC II,,,201804,Apr-18,201804,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,MOH - Uganda,Northern Region,Zombo District,Akaa Subcounty,Ayaka HC II,XikHv88zzDn,Ayaka HC II,,,201805,May-18,201805,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172027,MOH - Uganda,Central Region,Kampala District,Nakawa Division,Luzira Staff Clinic HC IV,mDMIY5uLZzf,Luzira Staff Clinic HC IV,,,201908,Aug-19,201908,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
172028,MOH - Uganda,Central Region,Kampala District,Nakawa Division,Luzira Staff Clinic HC IV,mDMIY5uLZzf,Luzira Staff Clinic HC IV,,,201909,Sep-19,201909,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
172029,MOH - Uganda,Central Region,Kampala District,Nakawa Division,Luzira Staff Clinic HC IV,mDMIY5uLZzf,Luzira Staff Clinic HC IV,,,201910,Oct-19,201910,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
172030,MOH - Uganda,Central Region,Kampala District,Nakawa Division,Luzira Staff Clinic HC IV,mDMIY5uLZzf,Luzira Staff Clinic HC IV,,,201911,Nov-19,201911,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Population data

In [14]:
pop = pd.read_csv(pop_path+'UBOS_pop_proj_2015-25.csv')

def get_clean_pop_pivot(pop):
    pop['District']=pop['District'].apply(lambda x: x.upper())
    pop['District'].replace(district_name_dict,inplace=True)

    pop['age']= pop['Single Years'].apply(lambda x: ' '.join(x.split(' ')[:1]))
    pop['age'].replace({'80+':'80'},inplace=True)
    pop['age']=pop['age'].astype('int')
    
    pop.drop(['Single Years','Male','Female','Year2','FY'],axis=1,inplace=True)
    
    pop_pivot=pop.pivot_table(index=['District','Year'], columns=['age'])
    
    pop_pivot.columns = pop_pivot.columns.droplevel(0)
    
    pop_pivot.reset_index(inplace=True,drop=False)
    
    return pop_pivot

pop_pivot=get_clean_pop_pivot(pop)
pop_totals=pop_pivot.groupby('Year').sum()

In [15]:
def export_pop_data():
    from pandas import ExcelWriter
    with ExcelWriter(output_path+'static_data.xlsx',mode='a') as writer:
        pop_pivot.to_excel(writer,sheet_name='pop_age_year')
        pop_totals.to_excel(writer,sheet_name='pop_total_age_year')
#export_pop_data()

## Data transformation and initial cleaning to be able to link up old and new datasets

In [16]:
new_var_add_dict={'Babies Born with low birth weight (<2.5Kgs)':['105-MA04b2. Deliveries in unit -Live births - less than 2.5kg',
                                                                 '105-MA04c2. Deliveries in unit - Fresh still birth - less than 2.5kg',
                                                                 '105-MA04d2. Deliveries in unit - Macerated still birth - less than 2.5kg'],
                  
                  'td1':['EPI - Td1_Dose1_Child Bearing Age', 
                         'EPI - Td1_Dose1_Pregnant Women'],
                  
                  'td2':['EPI - Td2_Dose2_Child Bearing Age', 
                         'EPI - Td2_Dose2_Pregnant Women'],
                  
                  'td3':['EPI - Td3_Dose3_Child Bearing Age', 
                         'EPI - Td3_Dose3_Pregnant Women'],
                  
                  'td4_5':['EPI - Td4_Dose4_Child Bearing Age',
                           'EPI - Td4_Dose4_Pregnant Women',
                           'EPI - Td5_Dose5_Child Bearing Age',
                           'EPI - Td5_Dose5_Pregnant Women'],
                  
                  'OPD attendance':['105-OA01. New attendance',
                                    '105-OA02. Re-attendance'],
                  
                  'pregnant women tested for HIV in labor':['105-MA15a. Women tested for HIV in labour 1st time this Pregnancy - Total (TR & TRR)',
                                                            '105-MA16a. Women re-tested for HIV in labour - Total (TR & TRR)'],
                  
                  'pregnant women tested HIV+ve in labor':['105-MA15b. Women tested for HIV in labour 1st time this Pregnancy - HIV+ (TRR)',
                                                           '105-MA16b. Women re-tested for HIV in labour - HIV+ (TRR+)'],
                  
                  'malaria cases treated':['033B-MA06. Not tested cases treated',
                                           '033B-MA07. RDT Negative Cases Treated',
                                           '033B-MA08. RDT Positive Cases Treated',
                                           '033B-MA09. Microscopy Negative Cases Treated',
                                           '033B-MA10. Microscopy Positive Cases Treated'],
                  
                  'malaria tests':['033B-MA02. Cases Tested with RDT',
                                   '033B-MA04. Cases Tested with Microscopy'],
                  
                  'Number of doses of vitamin A distributed':['105-CH01. Vit A supplement (1st Dose)',
                                                              '105-CH02. Vit A supplement (2nd Dose)'],
                  
                  'Number of SAM cases identified':['105-NA03d1. Identified malnourished clients(<10) this month - SAM using W/H or L -  Without Oedema',
                                                    '105-NA03c1. Identified malnourished clients(<10) this month - SAM using MUAC -  Without Oedema',
                                                    '105-NA03e1. Identified malnourished clients(<10) this month - SAM With Oedema'],
                  
                  'Number of MAM cases identified':['105-NA03b1. Identified malnourished clients(<10) this month - MAM using W/H or L',
                                                    '105-NA03a1. Identified malnourished clients(<10) this month - MAM using MUAC']}

old_var_add_dict={'td4_5':['EPI - TT 4 doses given', 
                           'EPI - TT 5 doses given'],
                  
                  'OPD attendance':['105-1.1 OPD New Attendance', 
                                    '105-1.1 OPD Re-Attendance'],
                  
                  'pregnant women tested for HIV in labor':['105-2.2a Women tested for HIV in labour (1st time this Pregnancy)',
                                                            '105-2.2b Women tested for HIV in labour (Retest this Pregnancy)'],
                  
                  'pregnant women tested HIV+ve in labor':['105-2.2a Women testing HIV+ in labour (1st time this Pregnancy)',
                                                           '105-2.2b Women testing HIV+ in labour (Retest this Pregnancy)'],
                  
                  'breastfeeding mothers tested HIV +ve in PNC':['105-2.3a Breastfeeding mothers newly testing HIV+(1st test)',
                                                                 '105-2.3b Breastfeeding mothers newly testing HIV+(retest)'],
                  
                  "105-TP04. Total New and relapse TB cases registered in TB treatment unit":['105-1.3 OPD New TB cases (Bacteriologically confirmed)',
                                                                                              '105-1.3 OPD New TB cases (Clinically Diagnosed)',
                                                                                              '105-1.3 OPD New TB cases (EPTB)'],
                  
                  'Number of doses of vitamin A distributed':['105-2.8 Vit A Suplement 1st Dose in Year',
                                                              '105-2.8 Vit A Suplement 2nd Dose in theYear'],
                  
                  'Number of SAM cases identified':['105-1.3 OPD Severe Acute Malnutrition Without Oedema',
                                                    '105-1.3 OPD Severe Acute Malnutrition With Oedema']}


In [17]:
new_stack_add=new_stack.pivot_table(index=['district', 'organisationunitid', 'year','month'],columns='indic',values='value').copy()

for x in new_var_add_dict.keys():
    new_stack_add[x]=new_stack_add[new_var_add_dict[x]].sum(axis=1)

new_stack_add=new_stack_add[list(new_var_add_dict.keys())].stack().reset_index()
new_stack_add.rename(columns={0:'value'},inplace=True)
new_stack=pd.concat([new_stack,new_stack_add])

In [18]:
old_stack_add=old_stack.pivot_table(index=['district', 'organisationunitid', 'year','month'],columns='indic',values='value').copy()

for x in old_var_add_dict.keys():
    old_stack_add[x]=old_stack_add[old_var_add_dict[x]].sum(axis=1)

old_stack_add=old_stack_add[list(old_var_add_dict.keys())].stack().reset_index()
old_stack_add.rename(columns={0:'value'},inplace=True)
old_stack=pd.concat([old_stack,old_stack_add])

In [19]:
#Making a dict with new vars 

new_add_metrics = pd.DataFrame(new_var_add_dict.values(),new_var_add_dict.keys())
new_add_metrics['system'] = 'new'
old_add_metrics = pd.DataFrame(old_var_add_dict.values(),old_var_add_dict.keys())
old_add_metrics['system'] = 'old'
add_metrics = pd.concat([new_add_metrics,old_add_metrics])
add_metrics.rename(columns={0:'1st var',1:'2nd var',2:'3rd var',3:'4th var',4:'5th var'},inplace=True)

In [21]:
def export_var_added():
    from pandas import ExcelWriter
    with ExcelWriter('../../data/input/dhis2/new_old_correspondance.xlsx',mode='a') as writer:
        add_metrics.to_excel(writer,sheet_name='added_var')

export_var_added()

### Clean a data issue in the old stack regarding reporting rates

In [22]:
old_stack['value'] = np.where(old_stack['indic'].isin(['HMIS 105:1 Actual reports','HMIS 105:1 Expected reports'])
                              & (old_stack['value']==4),1,old_stack['value'])

# Merge the two

## Select a subset of indicators to merge on

This is done in Excel manually, after running the small function below

In [23]:
def export_var_names():
    from pandas import ExcelWriter
    with ExcelWriter('../../data/input/dhis2/new_old_correspondance.xlsx',mode='a') as writer:
        pd.Series(old_stack['indic'].unique()).to_excel(writer,sheet_name='old_vars')
        pd.Series(new_stack['indic'].unique()).to_excel(writer,sheet_name='new_vars')

export_var_names()

Here I get back the result into a dict

In [24]:
def replace_var_names():
    from pandas import ExcelFile
    xls = ExcelFile('../../data/input/dhis2/new_old_correspondance.xlsx')
    df = xls.parse(xls.sheet_names[0])
    df.set_index('Old',drop=True,inplace=True)
    old_new_dict=df['New'].to_dict()
    old_stack.replace({'indic': old_new_dict},inplace=True) # Replacing the old names by the new
    target_indics=list(old_new_dict.values()) # Store my target indicators
    return target_indics

In [25]:
target_indics=replace_var_names()

In [26]:
target_indics

['105-AN01a. ANC 1st Visit for women',
 '105-AN02. ANC 4th Visit for women',
 '105-MA01. Admissions',
 '105-MA04a. Deliveries in unit - Total',
 '105-MA04b1. Deliveries in unit -Live births - Total',
 '105-MA04c1. Deliveries in unit - Fresh still birth - Total',
 '105-MA04d1. Deliveries in unit - Macerated still birth - Total',
 '105-MA11. Newborn deaths (0-7 days)',
 '105-PN01a. Post Natal Attendances - Mother',
 'EPI - BCG doses given',
 'EPI - DPT-HepB-HIB 1 doses given',
 'EPI - DPT-HepB-HIB 3 doses given',
 'EPI - MR 1 doses given',
 'EPI - PCV 1 doses_Under 1',
 'EPI - PCV 3 doses_Under 1',
 '105-CH01. Vit A supplement (1st Dose)',
 '105-CH02. Vit A supplement (2nd Dose)',
 'Babies Born with low birth weight (<2.5Kgs)',
 'HMIS 105:01 - OPD Monthly Report (Attendance, Referrals, Conditions,TB, Nutrition) Actual reports 1. National',
 'HMIS 105:01 - OPD Monthly Report (Attendance, Referrals, Conditions,TB, Nutrition) Expected reports 1. National',
 'EPI - HPV1 doses given',
 'EPI -

### Check there is no issue with facility ids

In [27]:
# Store the valid ids

old_ids=set(old_stack['organisationunitid'].unique())
new_ids=set(new_stack['organisationunitid'].unique())
valid_ids=list(old_ids.intersection(new_ids))


### Get only the bits of data I am interested in

In [28]:
old_stack_t=old_stack[old_stack['indic'].isin(target_indics) & old_stack['organisationunitid'].isin(valid_ids)].copy()
new_stack_t=new_stack[new_stack['indic'].isin(target_indics) & new_stack['organisationunitid'].isin(valid_ids)].copy()

In [29]:
stack_t=pd.concat([old_stack_t,new_stack_t])
stack_t.reset_index(drop=True,inplace=True)
stack_t.head()

Unnamed: 0,district,organisationunitid,year,indic,month,value
0,ZOMBO,XikHv88zzDn,2018,EPI - BCG doses given,Apr,7.0
1,ZOMBO,XikHv88zzDn,2018,EPI - BCG doses given,Aug,7.0
2,ZOMBO,XikHv88zzDn,2018,EPI - BCG doses given,Dec,5.0
3,ZOMBO,XikHv88zzDn,2018,EPI - BCG doses given,Feb,
4,ZOMBO,XikHv88zzDn,2018,EPI - BCG doses given,Jan,


In [30]:
stack_t['indic'].unique()

array(['EPI - BCG doses given', 'EPI - DPT-HepB-HIB 1 doses given',
       'EPI - DPT-HepB-HIB 3 doses given', 'EPI - PCV 1 doses_Under 1',
       'EPI - PCV 3 doses_Under 1', 'EPI - MR 1 doses given',
       '105-MA01. Admissions', '105-MA11. Newborn deaths (0-7 days)',
       '105-PN01a. Post Natal Attendances - Mother',
       '105-AN01a. ANC 1st Visit for women',
       '105-AN02. ANC 4th Visit for women',
       '105-MA04a. Deliveries in unit - Total',
       '105-MA04c1. Deliveries in unit - Fresh still birth - Total',
       '105-MA04d1. Deliveries in unit - Macerated still birth - Total',
       '105-MA04b1. Deliveries in unit -Live births - Total',
       'Number of MAM cases identified',
       'Babies Born with low birth weight (<2.5Kgs)',
       '105-CH01. Vit A supplement (1st Dose)',
       '105-CH02. Vit A supplement (2nd Dose)',
       'HMIS 105:01 - OPD Monthly Report (Attendance, Referrals, Conditions,TB, Nutrition) Actual reports 1. National',
       'HMIS 105:01 - O

### Split reporting vars from others

In [31]:
report_indics=['HMIS 105:01 - OPD Monthly Report (Attendance, Referrals, Conditions,TB, Nutrition) Actual reports 1. National',
               'HMIS 105:01 - OPD Monthly Report (Attendance, Referrals, Conditions,TB, Nutrition) Expected reports 1. National']

In [32]:
stack_t_noreport=stack_t[~stack_t['indic'].isin(report_indics)].copy()
stack_t_report=stack_t[stack_t['indic'].isin(report_indics)].copy()

# Flag outliers

## Put our data in the right format 


In [33]:
%%time

def pivot_stack(df):
    pivot_outliers=df.copy().pivot_table(index=['district', 'organisationunitid', 'indic'], columns=['year','month' ]) #,dropna=False)
    pivot_outliers.rename(columns={'value':'with_outiers'},level=0,inplace=True)
    pivot_outliers.columns.rename('type', level=0, inplace=True)
    pivot_outliers.dropna(how='all',axis=0,inplace=True) # looks like there is no all na line to drop
    return pivot_outliers

pivot_outliers=pivot_stack(stack_t_noreport)

Wall time: 22.9 s


## Replace outliers using a std deviation method

In [34]:
# Get the data in the right format

def replace_outliers(pivot_outliers,cutoff):#df
    
    pivot_no_outliers=pd.DataFrame(columns=pivot_outliers.columns,index=pivot_outliers.index)
    pivot_no_outliers.rename(columns={'with_outiers':'without_outliers'},level=0,inplace=True)
    
    for x in pivot_outliers.index: # to exclude
        values = pivot_outliers.loc[x,:].values
        if np.nanstd(values)!=0 and np.isnan(values).sum()!=len(values):
            zscore = abs(stats.zscore(values,nan_policy='omit'))
            new_values = np.where(zscore>cutoff,np.nanmedian(values),values)

        else:
            new_values = values

        pivot_no_outliers.iloc[pivot_outliers.index.get_loc(x),:] = new_values.astype('float')

    return pivot_no_outliers 


In [35]:
%%time
pivot_no_outliers = replace_outliers(pivot_outliers,cutoff=3)


  new_values = np.where(zscore>cutoff,np.nanmedian(values),values)


Wall time: 4min 58s


In [36]:
# Get the data in the right format

def replace_outliers_iqr(pivot_outliers,k):#df
    
    pivot_no_outliers=pd.DataFrame(columns=pivot_outliers.columns,index=pivot_outliers.index)
    pivot_no_outliers.rename(columns={'with_outiers':'without_outliers'},level=0,inplace=True)
    
    for x in pivot_outliers.index:
        values = pivot_outliers.loc[x,:].values
        if np.nanstd(values)!=0 and np.isnan(values).sum()!=len(values):
            Q1 = np.nanquantile(values,0.25)
            Q3 = np.nanquantile(values,0.75)
            IQR = Q3 - Q1
            LB = Q1 - k*IQR
            UB = Q3 + k*IQR
            new_values = np.where((values<LB)|(values>UB),np.nanmedian(values),values)

        else:
            new_values = values

        pivot_no_outliers.iloc[pivot_outliers.index.get_loc(x),:] = new_values.astype('float')

    return pivot_no_outliers 

In [37]:
%%time
pivot_no_outliers_iqr = replace_outliers_iqr(pivot_outliers,k=3)

  new_values = np.where((values<LB)|(values>UB),np.nanmedian(values),values)
  new_values = np.where((values<LB)|(values>UB),np.nanmedian(values),values)


Wall time: 5min 51s


## Stack the outlier corrected data

In [38]:
def pivot_stack(pivot):
    stack = pivot.stack(level=[0,1,2],dropna=False).reset_index()
    stack.rename(columns={0:'value'},inplace=True)
    stack.drop('type',axis=1,inplace=True)
    stack['value']=stack['value'].astype(dtype='float64')
    return stack

In [39]:
stack_t_noout=pivot_stack(pivot_no_outliers)
stack_t_noout_iqr=pivot_stack(pivot_no_outliers_iqr)


## Record which data points were changed

In [40]:
stack_compare = pd.merge(stack_t,stack_t_noout,how='inner',
                           left_on=['district', 'organisationunitid', 'year', 'indic', 'month'],
                           right_on=['district', 'organisationunitid', 'year', 'indic', 'month']).rename(columns={'value_x':'value_out','value_y':'value_noout'})
stack_compare.dropna(subset=['value_out','value_noout'],inplace=True)
stack_compare['changed']=np.where((stack_compare['value_out'] != stack_compare['value_noout']),True,False)
changed = stack_compare[stack_compare['changed']==True]

In [41]:
stack_compare_iqr = pd.merge(stack_t,stack_t_noout_iqr,how='inner',
                           left_on=['district', 'organisationunitid', 'year', 'indic', 'month'],
                           right_on=['district', 'organisationunitid', 'year', 'indic', 'month']).rename(columns={'value_x':'value_out','value_y':'value_noout'})
stack_compare_iqr.dropna(subset=['value_out','value_noout'],inplace=True)
stack_compare_iqr['changed']=np.where((stack_compare_iqr['value_out'] != stack_compare_iqr['value_noout']),True,False)
changed_iqr = stack_compare_iqr[stack_compare_iqr['changed']==True]

In [42]:
changed_iqr

Unnamed: 0,district,organisationunitid,year,indic,month,value_out,value_noout,changed
41,LUWERO,T4M9UgfqV5q,2019,EPI - BCG doses given,Jul,44.0,22.0,True
394,PADER,auzuV39xOTU,2018,EPI - BCG doses given,Oct,58.0,12.0,True
831,AMURIA,o7CJeTwDapk,2019,EPI - BCG doses given,Feb,20.0,4.0,True
928,MOYO,ZubUtSX9erU,2019,EPI - BCG doses given,Jan,6.0,2.0,True
1010,ABIM,ltIECAx2ppI,2018,EPI - BCG doses given,Dec,57.0,7.0,True
...,...,...,...,...,...,...,...,...
5569226,ZOMBO,aoc8dFcHiqR,2020,Number of MAM cases identified,May,17.0,1.5,True
5569243,ZOMBO,apEUhKfLxjY,2020,td3,Feb,21.0,1.0,True
5569285,ZOMBO,apEUhKfLxjY,2020,pregnant women tested for HIV in labor,May,5.0,0.0,True
5569292,ZOMBO,eZC9Ddr4KIy,2020,Babies Born with low birth weight (<2.5Kgs),Apr,7.0,1.0,True


In [43]:
changed.to_csv(output_path+'outliers_list.csv')
changed_iqr.to_csv(output_path+'outliers_list_iqr.csv')

# Export this to Tableau

In [44]:
# to check any data point below:

#stack_t[(stack_t['organisationunitid']=='JO1cLIghdBv') & 
        #(stack_t['year']=='2018') & 
        #(stack_t['indic']=='105-NA03e1. Identified malnourished clients(<10) this month - SAM With Oedema') & 
        #(stack_t['month']=='Apr')]['value'].notna()

In [45]:
#Put together my DHIS data with and without outlier

fac_stack_int=pd.DataFrame()
fac_stack_int = pd.merge(stack_t_noreport,stack_t_noout,how='left',
                           left_on=['district', 'organisationunitid', 'year', 'indic', 'month'],
                           right_on=['district', 'organisationunitid', 'year', 'indic', 'month']).rename(columns={'value_x':'value_out','value_y':'value_noout'})

fac_stack_int = pd.merge(fac_stack_int,stack_t_noout_iqr,how='left',
                           left_on=['district', 'organisationunitid', 'year', 'indic', 'month'],
                           right_on=['district', 'organisationunitid', 'year', 'indic', 'month']).rename(columns={'value':'value_noout_iqr'})

# Make a note of whcih facilities reported which didt 

fac_stack_int['reported'] = (fac_stack_int['value_out']>0).astype('int')

# Add in the reporting rate data, that did not go through theoutlier precocedure

stack_t_report.rename(columns={'value':'reported'},inplace=True)
stack_t_report.set_index(['district','organisationunitid','year' ,'indic','month'],inplace=True,drop=True)
stack_t_report.reset_index(inplace=True)

#Puts it all together 

fac_stack_final=pd.concat([stack_t_report,fac_stack_int],ignore_index=True)
fac_stack_final=pd.melt(fac_stack_final, id_vars=['district','organisationunitid','year','indic','month'], value_vars=['reported','value_out','value_noout','value_noout_iqr'])
fac_stack_final.rename(columns={'variable':'dataset'},inplace=True)

# Create a pivot

fac_pivot_final=fac_stack_final.pivot_table(index=['district','organisationunitid','year','month','dataset'], columns=['indic'],aggfunc='mean')
fac_pivot_final.columns=fac_pivot_final.columns.droplevel(0)

In [51]:
pivot_export=fac_pivot_final.copy()
pivot_export.to_csv(output_path+'corrected_data_facility.csv')

# Export the code for our framework

In [118]:
var_name_dict={'district':'id', 
               'date':'date',
               'year':'year',
               'month':'month',
               'organisationunitid':'facility_id', 
               'dataset':'type',
               'HMIS 105:01 - OPD Monthly Report (Attendance, Referrals, Conditions,TB, Nutrition) Expected reports 1. National': 'Expected 105:1 reporting',
               'HMIS 105:01 - OPD Monthly Report (Attendance, Referrals, Conditions,TB, Nutrition) Actual reports 1. National' : 'Actual 105:1 reporting',
               '105-AN01a. ANC 1st Visit for women':'1st ANC Visits',
               '105-AN02. ANC 4th Visit for women':'4th ANC Visits',
               '105-MA01. Admissions' : 'Maternity Admissions',
               '105-MA04a. Deliveries in unit - Total': 'Deliveries in unit',
               '105-MA04b1. Deliveries in unit -Live births - Total': 'Deliveries in unit - live',
               '105-MA04c1. Deliveries in unit - Fresh still birth - Total': 'Deliveries in unit - fresh stillbirth',
               '105-MA04d1. Deliveries in unit - Macerated still birth - Total': 'Deliveries in unit - macerated stillbirth',
               '105-MA11. Newborn deaths (0-7 days)':'Newborn deaths',
               '105-PN01a. Post Natal Attendances - Mother':'Postnatal Visits',
               'Babies Born with low birth weight (<2.5Kgs)' : 'Low weigh births', 
               'EPI - BCG doses given' : 'BCG',
               'EPI - DPT-HepB-HIB 1 doses given':'DPT1', 
               'EPI - DPT-HepB-HIB 3 doses given':'DPT3',
               'EPI - HPV1 doses given':'HPV1', 
               'EPI - HPV2 doses given':'HPV2',
               'EPI - MR 1 doses given':'MR1', 
               'EPI - PCV 1 doses_Under 1':'PCV1',
               'EPI - PCV 3 doses_Under 1':'PCV3',
               'td1':'TD1', 
               'td2':'TD2', 
               'td3':'TD3', 
               'td4_5':'TD4-5',
               '033B-CD01a. Malaria (diagnosed)  - Cases':'Malaria cases',
               'OPD attendance':'OPD attendance',
               'malaria cases treated': 'Malaria cases treated',
               'malaria tests':'Malaria tests',
               'pregnant women tested HIV+ve in labor':'Mat tested HIV positive',
               'pregnant women tested for HIV in labor':'Mat tested HIV',
               '033B-CD01b. Malaria (diagnosed) - Deaths':'Malaria deaths',
               '105-AN29a. Pregnant women newly tested for HIV in this pregnancy at any ANC visit (TR & TRR) - total':'ANC tested HIV',
               '105-AN30a. Pregnant Women tested HIV+ for 1st time this pregnancy (TRR) at any ANC Visit - Total':'ANC tested HIV positive',
               '105-AN32. HIV+ pregnant women initiated on ART for eMTCT at any visit irrespective of when tested HIV+ (TRR, TRR+,TRR‚àö)':'ANC initiated on ART',
               '105-HT03a1. Total Tested for HIV':'Tested HIV', 
               '105-HT03a2. Total New HIV+':'Tested HIV positive',
               '105-HT03a3. Total Linked to HIV Care':'HIV positive linked to care',
               '105-MA17. Women initiating ART in maternity - HIV+':'Mat initiated on ART',
               '105-PN03a. Breastfeedng mothers tested for HIV 1st time during Postnatal - Total (TR & TRR)':'PNC tested HIV',
               '105-PN03b. Breastfeedng mothers tested for HIV 1st time during Postnatal - TRR':'PNC tested HIV positive',
               '105-PN05a. HIV+ women initiating ART in Postnatal - Total':'PNC initiated on ART',
               '108-CI02. No. of admissions':'IPD attendance',
               '105-TP04. Total New and relapse TB cases registered in TB treatment unit':'TB cases registered',
               'Number of doses of vitamin A distributed':'Number of doses of vitamin A distributed',
               'Number of SAM cases identified':'Number of SAM cases identified',
               'Number of MAM cases identified':'Number of MAM cases identified'}

In [150]:
# 2. DATA PROCESSING: Clean the data into desired formats

data=pivot_export.reset_index(level=[0,1,2,3,4],col_level=1)

def parse_date(date):
    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
              'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    year, month = date.split('-')
    month = months.index(month)+1
    return datetime.date(year=int(year), month=month, day=1)

def get_report_types(e,a,i):
    #e = float(e)
    #a = float(a)
    #i = float(i)
    if i==1:
        y='positive_indic'
    elif e==1 and a==1 and i!=1:
        y='no_positive_indic'
    elif e==1 and a!=1 and i!=1:
        y='no_form_report'
    elif e!=1:
        y='not_expected'
    return y

def add_report_columns(data):
    for x in list(data.columns[8:]):
        data[str(x)+'_rpt']=np.vectorize(get_report_types)(data['Expected 105:1 reporting'],data['Actual 105:1 reporting'],data[x])
        data.drop(x,axis=1,inplace=True)
        data.rename(columns={str(x)+'_rpt':str(x)},inplace=True)
    return data

def clean_breakdown_data (data):
    
    # Combine the date into one column and format
    
    data['date'] = data['year'].astype(str) + '-' + data['month']
    data['date'] = data.date.apply(parse_date)
    data['date'] = pd.to_datetime(data.date)
    
    # Select and rename the metric columns
    
    data = data[list(var_name_dict.keys())]
    data.columns = list(var_name_dict.values())
    
    # Breakdown in several dfs
    
    reporting = data[data['type']=='reported'].copy()
    with_outliers = data[data['type']=='value_out'].copy()
    no_outliers_std = data[data['type']=='value_noout'].copy()
    no_outliers_iqr = data[data['type']=='value_noout_iqr'].copy()
    
    # Add the reporting status columns
    
    reporting_add = add_report_columns(reporting)

    return (reporting_add, with_outliers, no_outliers_std, no_outliers_iqr)

(facility_data_reporting, facility_data_with_outliers, facility_data_no_outliers_std, facility_data_no_outliers_iqr) = clean_breakdown_data(data)

In [164]:
facility_data_reporting.to_csv(output_path+'facility_data_reporting.csv')
facility_data_with_outliers.to_csv(output_path+'facility_data_with_outliers.csv')
facility_data_no_outliers_std.to_csv(output_path+'facility_data_no_outliers_std.csv')
facility_data_no_outliers_iqr.to_csv(output_path+'facility_data_no_outliers_iqr.csv')