In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import timeit
import scipy
import datetime

from scipy import stats
from pandas import ExcelWriter

pd.set_option('display.max_columns', None)
idx = pd.IndexSlice

In [2]:
input_path = '../../data/input/dhis2/new_system/'
input_path_old = '../../data/input/dhis2/old_system/'

shapes_path = '../../data/shapes/district/districts_17_19_clean.shp'
facility_path = '../../data/input/hospitals/original_data/'
pop_path = '../../data/input/demographics/'

output_path = '../../data/output/sprint3_analysis/'

# Fetch my data


## Set up

In [3]:
# build a small fucntion to split the string column name of the data download as pivot 

def split(strng, sep, occ):
    strng = strng.split(sep)
    return sep.join(strng[occ[1]:]), sep.join(strng[:occ[0]]), sep.join(strng[occ[0]:occ[1]])[:3]

In [4]:
# Creating a dict of names to replace district names

district_name_dict = {'SEMBABULE': 'SSEMBABULE', 'MADI-OKOLLO': 'MADI OKOLLO', 'LUWEERO':'LUWERO'}

# For a fully automated one, will need to do fuzzy matching


In [5]:
# To clean the data downloaded in a pivot format

def get_clean_stack(df,drop):

    df['district']=df['orgunitlevel3'].apply(lambda x: x[:-9].upper())
    df['district'].replace(district_name_dict,inplace=True)
    
    df.set_index(['district','organisationunitid'],drop=True,inplace=True)
    
    cols = np.arange(0,9)
    df.drop(df.columns[cols],axis=1,inplace=True)
    
    cols = df.columns
    new_cols=[]
    for col in cols:
        new_cols.append(split(col,' ',[-2,-1]))
    df.columns=pd.MultiIndex.from_tuples(new_cols,names=['year','indic','month'])
    
    if drop != None:
        df.drop(drop,axis=1,inplace=True,level=2)
    
    df1=df.copy().stack(level=[0,1,2],dropna=False).reset_index()
    df1.rename(columns={0:'value'},inplace=True)
    
    return df1

In [6]:
# To clean the data downloaded in a slightly different pivot format

def get_clean_stack_newlayout(df,drop):

    month_dict={'01':'Jan','02':'Feb','03':'Mar','04':'Apr',
                '05':'May','06':'Jun','07':'Jul','08':'Aug',
                '09':'Sep','10':'Oct','11':'Nov','12':'Dec'}
    
    df['district']=df['orgunitlevel3'].apply(lambda x: x[:-9].upper())
    df['district'].replace(district_name_dict,inplace=True)

    df['year']=df['periodcode'].astype('str').apply(lambda x: x[:4])
    df['month']=df['periodcode'].astype('str').apply(lambda x: x[-2:]).replace(month_dict)
    
    if drop != None:
        if type(drop)==str:
            df1=df[~(df['month']==drop)].copy()
        if type(drop)==list:
            df1=df[~df['month'].isin(drop)].copy()
    else:
        df1=df.copy()
        
    df1.set_index(['district','organisationunitid','year','month'],drop=True,inplace=True)
        
    cols = np.arange(0,12)
    df1.drop(df1.columns[cols],axis=1,inplace=True)
    
    df2=df1.copy().stack(dropna=False).reset_index()
    df2.rename(columns={0:'value','level_4':'indic'},inplace=True)
    
    return df2

## New data


In [7]:
# to get all the new data files together
def fetch_new_data (filepaths,filepaths_no_dec,filepaths_newlayout):
    df = pd.DataFrame(columns = ['district', 'organisationunitid', 'year', 'indic', 'month', 'value'])
    for x in filepaths:
        x_df=get_clean_stack(pd.read_csv(x),drop='Dec')
        df=pd.concat([df,x_df])
    for x in filepaths_no_dec:
        x_df=get_clean_stack(pd.read_csv(x),drop=None)
        df=pd.concat([df,x_df])
    for x in filepaths_newlayout:
        x_df=get_clean_stack_newlayout(pd.read_csv(x),drop='Dec')
        df=pd.concat([df,x_df])
    return df

In [8]:
# creating the new data stack

filepaths = [input_path+'new_epi_data_by_facility.csv',
            input_path+'new_mnch_data_by_facility.csv',
            input_path+'new_sam_data_by_facility.csv',
            input_path+'new_lbw_data_by_facility.csv',
            input_path+'new_vitamin_data_by_facility.csv']

filepaths_no_dec = [input_path+'may_new_sam_data_by_facility.csv',
                    input_path+'may_new_epi_data_by_facility.csv']

filepaths_newlayout = [input_path+'new_reporting_by_facility.csv',
                      input_path+'new_epi_data_addendum_by_facility.csv',
                      input_path+'may_new_mnch_data_by_facility.csv',
                      input_path+'new_opd_ipd_data_by_facility.csv',
                      input_path+'/HIV/new_general_hiv.csv']

new_stack = fetch_new_data (filepaths=filepaths,filepaths_no_dec=filepaths_no_dec,filepaths_newlayout=filepaths_newlayout)

  if (await self.run_code(code, result,  async_=asy)):


## Old data

For now extremely messy, had to be done bit by bit in random order, so quite some cleaning needed Ill do here, than put all into one nice file

In [9]:
# to get all the old data files together
def fetch_old_data (filepaths,filepaths_newlayout):
    df = pd.DataFrame(columns = ['district', 'organisationunitid', 'year', 'indic', 'month', 'value'])
    for x in filepaths:
        x_df=get_clean_stack(pd.read_csv(x),drop=None)
        df=pd.concat([df,x_df])
    for x in filepaths_newlayout:
        x_df=get_clean_stack_newlayout(pd.read_csv(x),drop=None)
        df=pd.concat([df,x_df])    
    return df

In [11]:

# creating the old data stack

# For now a bit all over the placedue to download limitations 

filepaths = [input_path_old+"/epi/EPI - BCG doses given.csv",
             input_path_old+"/epi/EPI - DPT-HepB-HIB 1 doses given.csv",
             input_path_old+"/epi/EPI - DPT-HepB-HIB 3 doses given.csv",
             input_path_old+"/epi/EPI - PCV 1 doses given.csv",
             input_path_old+"/epi/EPI - PCV 3 doses given.csv",
             input_path_old+"/epi/EPI - MR 1 doses given.csv",
             input_path_old+'/mat/admission_newborn.csv',
             input_path_old+'/mat/ANC1_ANC4.csv',
             input_path_old+'/mat/births.csv',
             input_path_old+'/sam/lbw.csv',
             input_path_old+'/sam/sam_mam.csv',
             input_path_old+'/sam/lbw_abs.csv',
             input_path_old+'/sam/vitamin.csv']

filepaths_newlayout = [input_path_old + '/reporting/old_reporting_by_facility.csv',
                      input_path_old + '/epi/EPI - TT doses given.csv',
                      input_path_old + '/epi/EPI - HPV doses given.csv',
                      input_path_old + 'ipd_opd/ipd_opd.csv',
                      input_path_old + 'hiv/old_hiv_general.csv']

old_stack = fetch_old_data (filepaths=filepaths,filepaths_newlayout=filepaths_newlayout)

## Population data

In [12]:
pop = pd.read_csv(pop_path+'UBOS_pop_proj_2015-25.csv')

def get_clean_pop_pivot(pop):
    pop['District']=pop['District'].apply(lambda x: x.upper())
    pop['District'].replace(district_name_dict,inplace=True)

    pop['age']= pop['Single Years'].apply(lambda x: ' '.join(x.split(' ')[:1]))
    pop['age'].replace({'80+':'80'},inplace=True)
    pop['age']=pop['age'].astype('int')
    
    pop.drop(['Single Years','Male','Female','Year2','FY'],axis=1,inplace=True)
    
    pop_pivot=pop.pivot_table(index=['District','Year'], columns=['age'])
    
    pop_pivot.columns = pop_pivot.columns.droplevel(0)
    
    pop_pivot.reset_index(inplace=True,drop=False)
    
    return pop_pivot

pop_pivot=get_clean_pop_pivot(pop)
pop_totals=pop_pivot.groupby('Year').sum()

In [13]:
def export_pop_data():
    from pandas import ExcelWriter
    with ExcelWriter(output_path+'static_data.xlsx',mode='a') as writer:
        pop_pivot.to_excel(writer,sheet_name='pop_age_year')
        pop_totals.to_excel(writer,sheet_name='pop_total_age_year')
#export_pop_data()

## Data transformation and initial cleaning to be able to link up old and new datasets

### Create a new variable aggregating lbw indicators for the new database

In [14]:
new_lbw_indics=['105-MA04b2. Deliveries in unit -Live births - less than 2.5kg',
                '105-MA04c2. Deliveries in unit - Fresh still birth - less than 2.5kg',
                '105-MA04d2. Deliveries in unit - Macerated still birth - less than 2.5kg']

In [15]:
new_lbw=new_stack.pivot_table(index=['district', 'organisationunitid', 'year','month'],columns='indic',values='value')[new_lbw_indics].copy()
new_lbw['Babies Born with low birth weight (<2.5Kgs)'] = new_lbw[new_lbw_indics].sum(axis=1)
new_lbw_stack=new_lbw['Babies Born with low birth weight (<2.5Kgs)'].reset_index()
new_lbw_stack.rename(columns={'Babies Born with low birth weight (<2.5Kgs)':'value'},inplace=True)
new_lbw_stack['indic']='Babies Born with low birth weight (<2.5Kgs)'
new_stack=pd.concat([new_stack,new_lbw_stack])

### Clean a data issue in the old stack regarding reporting rates

In [16]:
old_stack['value'] = np.where(old_stack['indic'].isin(['HMIS 105:1 Actual reports','HMIS 105:1 Expected reports'])
                              & (old_stack['value']==4),1,old_stack['value'])

### Create a new variable aggregating MUAC and W/L SAM and MAM indicators

WIP

### Clean the TD doses and create new aggregate variables to link up old and new datasets

In [17]:
new_td_indics=['EPI - Td1_Dose1_Child Bearing Age', 'EPI - Td1_Dose1_Pregnant Women',
               'EPI - Td2_Dose2_Child Bearing Age', 'EPI - Td2_Dose2_Pregnant Women',
               'EPI - Td3_Dose3_Child Bearing Age', 'EPI - Td3_Dose3_Pregnant Women',
               'EPI - Td4_Dose4_Child Bearing Age', 'EPI - Td4_Dose4_Pregnant Women',
               'EPI - Td5_Dose5_Child Bearing Age', 'EPI - Td5_Dose5_Pregnant Women']

new_td=new_stack.pivot_table(index=['district', 'organisationunitid', 'year','month'],columns='indic',values='value')[new_td_indics].copy()

In [18]:
new_td['td1'] = new_td[new_td_indics[0:2]].sum(axis=1)
new_td['td2'] = new_td[new_td_indics[2:4]].sum(axis=1)
new_td['td3'] = new_td[new_td_indics[4:6]].sum(axis=1)
new_td['td4_5'] = new_td[new_td_indics[6:10]].sum(axis=1)

In [19]:
new_td_stack=new_td[['td1','td2','td3','td4_5']].stack().reset_index()
new_td_stack.rename(columns={0:'value'},inplace=True)
new_stack=pd.concat([new_stack,new_td_stack])

In [20]:
old_td_indics=['EPI - TT 1 doses given','EPI - TT 1 doses given Pregnant',
               'EPI - TT 2 doses given','EPI - TT 2 doses given Pregnant', 
               'EPI - TT 3 doses given','EPI - TT 3 doses given Pregnant',
               'EPI - TT 4 doses given','EPI - TT 4 doses given Pregnant',
               'EPI - TT 5 doses given','EPI - TT 5 doses given Pregnant']

old_td=old_stack.pivot_table(index=['district', 'organisationunitid', 'year','month'],columns='indic',values='value')[old_td_indics].copy()

In [21]:
old_td['td4_5'] = old_td[[old_td_indics[6],old_td_indics[8]]].sum(axis=1)
old_td['indic'] = 'td4_5'

In [22]:
old_td_stack=old_td[['td4_5','indic']].reset_index()
old_td_stack.rename(columns={'td4_5':'value'},inplace=True)
old_stack=pd.concat([old_stack,old_td_stack])

### Create new aggregate for OPD 

In [23]:
new_opd_indics=['105-OA01. New attendance','105-OA02. Re-attendance']
old_opd_indics=['105-1.1 OPD New Attendance', '105-1.1 OPD Re-Attendance']

In [24]:
new_opd=new_stack.pivot_table(index=['district', 'organisationunitid', 'year','month'],columns='indic',values='value')[new_opd_indics].copy()
old_opd=old_stack.pivot_table(index=['district', 'organisationunitid', 'year','month'],columns='indic',values='value')[old_opd_indics].copy()

In [25]:
new_opd['OPD attendance'] = new_opd[new_opd_indics].sum(axis=1)
old_opd['OPD attendance'] = old_opd[old_opd_indics].sum(axis=1)

In [26]:
new_opd_stack=new_opd['OPD attendance'].reset_index()
new_opd_stack.rename(columns={'OPD attendance':'value'},inplace=True)
new_opd_stack['indic']='OPD attendance'

old_opd_stack=old_opd['OPD attendance'].reset_index()
old_opd_stack.rename(columns={'OPD attendance':'value'},inplace=True)
old_opd_stack['indic']='OPD attendance'

In [27]:
new_stack=pd.concat([new_stack,new_opd_stack])
old_stack=pd.concat([old_stack,old_opd_stack])

In [28]:
old_stack

Unnamed: 0,district,organisationunitid,year,indic,month,value
0,ZOMBO,XikHv88zzDn,2018,EPI - BCG doses given,Apr,7.0
1,ZOMBO,XikHv88zzDn,2018,EPI - BCG doses given,Aug,7.0
2,ZOMBO,XikHv88zzDn,2018,EPI - BCG doses given,Dec,5.0
3,ZOMBO,XikHv88zzDn,2018,EPI - BCG doses given,Feb,
4,ZOMBO,XikHv88zzDn,2018,EPI - BCG doses given,Jan,
...,...,...,...,...,...,...
125591,ZOMBO,vFo5oNkc7Nn,2019,OPD attendance,Mar,955.0
125592,ZOMBO,vFo5oNkc7Nn,2019,OPD attendance,May,1004.0
125593,ZOMBO,vFo5oNkc7Nn,2019,OPD attendance,Nov,934.0
125594,ZOMBO,vFo5oNkc7Nn,2019,OPD attendance,Oct,747.0


# Merge the two

## Select a subset of indicators to merge on

This is done in Excel manually, after running the small function below

In [29]:
def export_var_names():
    from pandas import ExcelWriter
    with ExcelWriter('../../data/input/dhis2/new_old_correspondance.xlsx',mode='a') as writer:
        pd.Series(old_stack['indic'].unique()).to_excel(writer,sheet_name='old_vars')
        pd.Series(new_stack['indic'].unique()).to_excel(writer,sheet_name='new_vars')

#export_var_names()

Here I get back the result into a dict

In [30]:
def replace_var_names():
    from pandas import ExcelFile
    xls = ExcelFile('../../data/input/dhis2/new_old_correspondance.xlsx')
    df = xls.parse(xls.sheet_names[0])
    df.set_index('Old',drop=True,inplace=True)
    old_new_dict=df['New'].to_dict()
    old_stack.replace({'indic': old_new_dict},inplace=True) # Replacing the old names by the new
    target_indics=list(old_new_dict.values()) # Store my target indicators
    return target_indics

In [31]:
target_indics=replace_var_names()

### Check there is no issue with facility ids

In [32]:
# Store the valid ids

old_ids=set(old_stack['organisationunitid'].unique())
new_ids=set(new_stack['organisationunitid'].unique())
valid_ids=list(old_ids.intersection(new_ids))


### Get only the bits of data I am interested in

In [33]:
old_stack_t=old_stack[old_stack['indic'].isin(target_indics) & old_stack['organisationunitid'].isin(valid_ids)].copy()
new_stack_t=new_stack[new_stack['indic'].isin(target_indics) & new_stack['organisationunitid'].isin(valid_ids)].copy()

In [34]:
stack_t=pd.concat([old_stack_t,new_stack_t])
stack_t.reset_index(drop=True,inplace=True)
stack_t.head()

Unnamed: 0,district,organisationunitid,year,indic,month,value
0,ZOMBO,XikHv88zzDn,2018,EPI - BCG doses given,Apr,7.0
1,ZOMBO,XikHv88zzDn,2018,EPI - BCG doses given,Aug,7.0
2,ZOMBO,XikHv88zzDn,2018,EPI - BCG doses given,Dec,5.0
3,ZOMBO,XikHv88zzDn,2018,EPI - BCG doses given,Feb,
4,ZOMBO,XikHv88zzDn,2018,EPI - BCG doses given,Jan,


In [35]:
stack_t['indic'].unique()

array(['EPI - BCG doses given', 'EPI - DPT-HepB-HIB 1 doses given',
       'EPI - DPT-HepB-HIB 3 doses given', 'EPI - PCV 1 doses_Under 1',
       'EPI - PCV 3 doses_Under 1', 'EPI - MR 1 doses given',
       '105-MA01. Admissions', '105-MA04a. Deliveries in unit - Total',
       '105-PN01a. Post Natal Attendances - Mother',
       '105-AN01a. ANC 1st Visit for women',
       '105-AN02. ANC 4th Visit for women',
       '105-MA04b1. Deliveries in unit -Live births - Total',
       '105-MA04c1. Deliveries in unit - Fresh still birth - Total',
       '105-MA04d1. Deliveries in unit - Macerated still birth - Total',
       '105-MA11. Newborn deaths (0-7 days)',
       '105-NA03a1. Identified malnourished clients(<10) this month - MAM using MUAC',
       '105-NA03e1. Identified malnourished clients(<10) this month - SAM With Oedema',
       '105-NA03c1. Identified malnourished clients(<10) this month - SAM using MUAC -  Without Oedema',
       'Babies Born with low birth weight (<2.5Kgs)',


### Split reporting vars from others

In [36]:
report_indics=['HMIS 105:01 - OPD Monthly Report (Attendance, Referrals, Conditions,TB, Nutrition) Actual reports 1. National',
               'HMIS 105:01 - OPD Monthly Report (Attendance, Referrals, Conditions,TB, Nutrition) Expected reports 1. National']

In [37]:
stack_t_noreport=stack_t[~stack_t['indic'].isin(report_indics)].copy()
stack_t_report=stack_t[stack_t['indic'].isin(report_indics)].copy()

# Flag outliers

## Put our data in the right format 


In [38]:
%%time

def pivot_stack(df):
    pivot_outliers=df.copy().pivot_table(index=['district', 'organisationunitid', 'indic'], columns=['year','month' ]) #,dropna=False)
    pivot_outliers.rename(columns={'value':'with_outiers'},level=0,inplace=True)
    pivot_outliers.columns.rename('type', level=0, inplace=True)
    pivot_outliers.dropna(how='all',axis=0,inplace=True) # looks like there is no all na line to drop
    return pivot_outliers

pivot_outliers=pivot_stack(stack_t_noreport)

Wall time: 7.39 s


## Replace outliers using a std deviation method

In [39]:
# Get the data in the right format

def replace_outliers(pivot_outliers,cutoff):#df
    
    pivot_no_outliers=pd.DataFrame(columns=pivot_outliers.columns,index=pivot_outliers.index)
    pivot_no_outliers.rename(columns={'with_outiers':'without_outliers'},level=0,inplace=True)
    
    for x in pivot_outliers.index: # to exclude
        values = pivot_outliers.loc[x,:].values
        if np.nanstd(values)!=0 and np.isnan(values).sum()!=len(values):
            zscore = abs(stats.zscore(values,nan_policy='omit'))
            new_values = np.where(zscore>cutoff,np.nanmedian(values),values)

        else:
            new_values = values

        pivot_no_outliers.iloc[pivot_outliers.index.get_loc(x),:] = new_values.astype('float')

    return pivot_no_outliers 


In [40]:
%%time
pivot_no_outliers = replace_outliers(pivot_outliers,cutoff=3)


  new_values = np.where(zscore>cutoff,np.nanmedian(values),values)


Wall time: 2min 28s


In [41]:
# Get the data in the right format

def replace_outliers_iqr(pivot_outliers,k):#df
    
    pivot_no_outliers=pd.DataFrame(columns=pivot_outliers.columns,index=pivot_outliers.index)
    pivot_no_outliers.rename(columns={'with_outiers':'without_outliers'},level=0,inplace=True)
    
    for x in pivot_outliers.index:
        values = pivot_outliers.loc[x,:].values
        if np.nanstd(values)!=0 and np.isnan(values).sum()!=len(values):
            Q1 = np.nanquantile(values,0.25)
            Q3 = np.nanquantile(values,0.75)
            IQR = Q3 - Q1
            LB = Q1 - k*IQR
            UB = Q3 + k*IQR
            new_values = np.where((values<LB)|(values>UB),np.nanmedian(values),values)

        else:
            new_values = values

        pivot_no_outliers.iloc[pivot_outliers.index.get_loc(x),:] = new_values.astype('float')

    return pivot_no_outliers 

In [42]:
%%time
pivot_no_outliers_iqr = replace_outliers_iqr(pivot_outliers,k=3)

  new_values = np.where((values<LB)|(values>UB),np.nanmedian(values),values)
  new_values = np.where((values<LB)|(values>UB),np.nanmedian(values),values)


Wall time: 2min 26s


## Stack the outlier corrected data

In [43]:
def pivot_stack(pivot):
    stack = pivot.stack(level=[0,1,2],dropna=False).reset_index()
    stack.rename(columns={0:'value'},inplace=True)
    stack.drop('type',axis=1,inplace=True)
    stack['value']=stack['value'].astype(dtype='float64')
    return stack

In [44]:
stack_t_noout=pivot_stack(pivot_no_outliers)
stack_t_noout_iqr=pivot_stack(pivot_no_outliers_iqr)


## Record which data points were changed

In [45]:
stack_compare = pd.merge(stack_t,stack_t_noout,how='inner',
                           left_on=['district', 'organisationunitid', 'year', 'indic', 'month'],
                           right_on=['district', 'organisationunitid', 'year', 'indic', 'month']).rename(columns={'value_x':'value_out','value_y':'value_noout'})
stack_compare.dropna(subset=['value_out','value_noout'],inplace=True)
stack_compare['changed']=np.where((stack_compare['value_out'] != stack_compare['value_noout']),True,False)
changed = stack_compare[stack_compare['changed']==True]

In [46]:
stack_compare_iqr = pd.merge(stack_t,stack_t_noout_iqr,how='inner',
                           left_on=['district', 'organisationunitid', 'year', 'indic', 'month'],
                           right_on=['district', 'organisationunitid', 'year', 'indic', 'month']).rename(columns={'value_x':'value_out','value_y':'value_noout'})
stack_compare_iqr.dropna(subset=['value_out','value_noout'],inplace=True)
stack_compare_iqr['changed']=np.where((stack_compare_iqr['value_out'] != stack_compare_iqr['value_noout']),True,False)
changed_iqr = stack_compare_iqr[stack_compare_iqr['changed']==True]

In [47]:
changed_iqr

Unnamed: 0,district,organisationunitid,year,indic,month,value_out,value_noout,changed
41,LUWERO,T4M9UgfqV5q,2019,EPI - BCG doses given,Jul,44.0,22.0,True
394,PADER,auzuV39xOTU,2018,EPI - BCG doses given,Oct,58.0,12.0,True
831,AMURIA,o7CJeTwDapk,2019,EPI - BCG doses given,Feb,20.0,4.0,True
928,MOYO,ZubUtSX9erU,2019,EPI - BCG doses given,Jan,6.0,2.0,True
1010,ABIM,ltIECAx2ppI,2018,EPI - BCG doses given,Dec,57.0,7.0,True
...,...,...,...,...,...,...,...,...
3827737,WAKISO,sokKY0XWigm,2020,OPD attendance,Jan,2156.0,475.0,True
3827745,WAKISO,t15uLLkZzQR,2020,OPD attendance,Apr,0.0,188.0,True
3827795,WAKISO,u3SLVGPpWI8,2020,OPD attendance,Apr,118020.0,1294.0,True
3827885,WAKISO,yApOnywci25,2020,OPD attendance,Apr,26.0,4970.0,True


In [48]:
changed.to_csv(output_path+'outliers_list.csv')
changed_iqr.to_csv(output_path+'outliers_list_iqr.csv')

# Export this to Tableau

In [49]:
# to check any data point below:

#stack_t[(stack_t['organisationunitid']=='JO1cLIghdBv') & 
        #(stack_t['year']=='2018') & 
        #(stack_t['indic']=='105-NA03e1. Identified malnourished clients(<10) this month - SAM With Oedema') & 
        #(stack_t['month']=='Apr')]['value'].notna()

In [50]:
#Put together my DHIS data with and without outlier

fac_stack_final = pd.merge(stack_t_noreport,stack_t_noout,how='left',
                           left_on=['district', 'organisationunitid', 'year', 'indic', 'month'],
                           right_on=['district', 'organisationunitid', 'year', 'indic', 'month']).rename(columns={'value_x':'value_out','value_y':'value_noout'})

fac_stack_final = pd.merge(fac_stack_final,stack_t_noout_iqr,how='left',
                           left_on=['district', 'organisationunitid', 'year', 'indic', 'month'],
                           right_on=['district', 'organisationunitid', 'year', 'indic', 'month']).rename(columns={'value':'value_noout_iqr'})

# Make a note of whcih facilities reported which didt 

fac_stack_final['reported'] = (fac_stack_final['value_out']>0).astype('int')

# Add in the reporting rate data, that did not go through theoutlier precocedure

stack_t_report.rename(columns={'value':'reported'},inplace=True)
stack_t_report.set_index(['district','organisationunitid','year' ,'indic','month'],inplace=True,drop=True)
#stack_t_report = stack_t_report.loc[~stack_t_report.index.duplicated(keep='first')] # Note here a weird issue of duplicates 
stack_t_report.reset_index(inplace=True)

#Puts it all together 

fac_stack_final=pd.concat([stack_t_report,fac_stack_final],ignore_index=True)

# Create a pivot

fac_pivot_final=fac_stack_final.pivot_table(index=['district','organisationunitid','year','month'], columns=['indic'],aggfunc=max)
fac_pivot_final=fac_pivot_final.stack(level=[0])

In [51]:
pivot_export=fac_pivot_final.copy()
pivot_export.to_csv(output_path+'corrected_data_facility.csv')

# Export the code for our framework

In [52]:
var_name_dict={'district':'id', 
               'date':'date',
               'organisationunitid':'facility_id', 
               'level_4':'type',
               '105-AN01a. ANC 1st Visit for women':'1st ANC Visits',
               '105-AN02. ANC 4th Visit for women':'4th ANC Visits',
               '105-MA01. Admissions' : 'Maternity Admissions',
               '105-MA04a. Deliveries in unit - Total': 'Deliveries in unit',
               '105-MA04b1. Deliveries in unit -Live births - Total': 'Deliveries in unit - live',
               '105-MA04c1. Deliveries in unit - Fresh still birth - Total': 'Deliveries in unit - fresh stillbirth',
               '105-MA04d1. Deliveries in unit - Macerated still birth - Total': 'Deliveries in unit - macerated stillbirth',
               '105-MA11. Newborn deaths (0-7 days)':'Newborn deaths',
               '105-NA03a1. Identified malnourished clients(<10) this month - MAM using MUAC' : 'MAM identified - MUAC',
               '105-NA03c1. Identified malnourished clients(<10) this month - SAM using MUAC -  Without Oedema':'SAM identified no oedema- MUAC',
               '105-NA03e1. Identified malnourished clients(<10) this month - SAM With Oedema':'SAM identified oedema- MUAC',
               '105-PN01a. Post Natal Attendances - Mother':'Postnatal Visits',
               'Babies Born with low birth weight (<2.5Kgs)' : 'Low weigh births', 
               '105-CH01. Vit A supplement (1st Dose)' : 'Vitamin A 1st dose',
               '105-CH02. Vit A supplement (2nd Dose)': 'Vitamin A 2nd dose', 
               'EPI - BCG doses given' : 'BCG',
               'EPI - DPT-HepB-HIB 1 doses given':'DPT1', 
               'EPI - DPT-HepB-HIB 3 doses given':'DPT3',
               'EPI - HPV1 doses given':'HPV1', 
               'EPI - HPV2 doses given':'HPV2',
               'EPI - MR 1 doses given':'MR1', 
               'EPI - PCV 1 doses_Under 1':'PCV1',
               'EPI - PCV 3 doses_Under 1':'PCV3',
               'HMIS 105:01 - OPD Monthly Report (Attendance, Referrals, Conditions,TB, Nutrition) Actual reports 1. National' : 'Actual 105:1 reporting',
               'HMIS 105:01 - OPD Monthly Report (Attendance, Referrals, Conditions,TB, Nutrition) Expected reports 1. National': 'Expected 105:1 reporting',
               'td1':'TD1', 
               'td2':'TD2', 
               'td3':'TD3', 
               'td4_5':'TD4-5'}    

In [53]:
# 2. DATA PROCESSING: Clean the data into desired formats

data=pivot_export.reset_index()

# a. Combine the date into one column and format

dates = []
for index, row in data.iterrows():
    dates.append(str(row['year']) + '-' + row['month'])
data['date'] = dates

def parse_date(date):
    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
              'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    year, month = date.split('-')
    month = months.index(month)+1
    return datetime.date(year=int(year), month=month, day=1)

data['date'] = data.date.apply(parse_date)
data['date'] = pd.to_datetime(data.date)

# b. Select and rename the metric columns

data = data[list(var_name_dict.keys())]
data.columns = list(var_name_dict.values())
data.head()

# c. Get dataset of data grouped according to regions and years
data_2018 = data[(data.date.dt.year == 2018) & (data.date.dt.month == 3)]
data_2019 = data[(data.date.dt.year == 2019) & (data.date.dt.month == 3)]
data_2019 = data_2019.groupby('id').sum()
data_2018 = data_2018.groupby('id').sum()
data_yrs_reg = pd.merge(data_2018, data_2019, on='id',
                        suffixes=(' 2018', ' 2019'))
data_yrs_reg.head()

# d. Get dataset of change between years
for col in list(var_name_dict.values())[4:]:
    data_yrs_reg[col] = round(
        ((data_yrs_reg[f'{col} 2019'] - data_yrs_reg[f'{col} 2018']) / data_yrs_reg[f'{col} 2018'])*100, 2)
data_change_reg = data_yrs_reg.reset_index()#[['id', '1st ANC Visits', '4th ANC Visits',
                                              #'Maternity Admissions', 'Postnatal Visits', '3rd Dose DTaP-HB-IPV-Hib', '1st Dose MR']]
data_change_reg.head()

# e. Get dataset of data grouped according to date (on national level)
data_date = data.groupby('date').sum()
data_date.head()

Unnamed: 0_level_0,1st ANC Visits,4th ANC Visits,Maternity Admissions,Deliveries in unit,Deliveries in unit - live,Deliveries in unit - fresh stillbirth,Deliveries in unit - macerated stillbirth,Newborn deaths,MAM identified - MUAC,SAM identified no oedema- MUAC,SAM identified oedema- MUAC,Postnatal Visits,Low weigh births,Vitamin A 1st dose,Vitamin A 2nd dose,BCG,DPT1,DPT3,HPV1,HPV2,MR1,PCV1,PCV3,Actual 105:1 reporting,Expected 105:1 reporting,TD1,TD2,TD3,TD4-5
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
2018-01-01,0.0,176038.5,0.0,2224.0,267340.0,0.0,2944.0,264205.5,0.0,12016.5,2799.0,646411.5,0.0,0.0,151005.0,0.0,0.0,0.0,25255.0,8500.0,0.0,0.0,0.0,4890.0,4985.0,442583.5,229105.5,53780.5,45284.0
2018-02-01,423433.5,165131.5,316484.5,1995.0,250590.5,2652.0,2599.0,247078.5,31538.5,12831.5,3429.0,625859.0,13285.0,380820.5,141497.0,361895.0,391185.5,376936.5,50387.5,22070.0,343087.0,387099.0,367292.5,4890.0,4997.0,399453.0,231508.0,53489.0,43827.0
2018-03-01,425727.0,179268.0,352734.5,2460.5,280040.0,2844.0,2993.0,274785.5,35648.5,12256.5,2911.0,676554.0,14739.0,400601.5,162795.5,398780.5,406776.5,396325.5,67353.5,27876.0,394910.0,395105.0,385444.0,4878.0,4998.0,422480.0,256159.0,59011.5,47952.0
2018-04-01,447566.5,185965.5,345645.5,1750.5,269428.0,2872.0,2687.0,264581.5,36359.5,13936.0,5002.0,660337.5,15916.0,2258657.5,2274629.0,412455.5,404305.5,397940.5,869636.5,389505.0,393164.5,396682.0,385362.5,4919.0,5040.0,730369.5,371690.0,101485.0,78466.0
2018-05-01,518355.5,200722.5,359989.0,2047.0,281531.5,2733.0,2874.0,276025.0,42941.5,14843.5,3500.0,715422.0,16113.0,551044.0,285561.5,425086.0,430737.0,401346.5,46129.0,29360.0,392128.5,422673.0,389803.5,4998.0,5044.0,452844.5,262885.0,61220.5,51857.0


In [54]:
facility_data_reporting = data[data['type']=='reported']
facility_data_with_outliers = data[data['type']=='value_out']
facility_data_no_outliers_std = data[data['type']=='value_noout']
facility_data_no_outliers_iqr = data[data['type']=='value_noout_iqr']

In [55]:
facility_data_reporting.to_csv(output_path+'facility_data_reporting.csv')
facility_data_with_outliers.to_csv(output_path+'facility_data_with_outliers.csv')
facility_data_no_outliers_std.to_csv(output_path+'facility_data_no_outliers_std.csv')
facility_data_no_outliers_iqr.to_csv(output_path+'facility_data_no_outliers_iqr.csv')