In [2]:
import pandas as pd
import numpy as np
from pandas import ExcelWriter
import openpyxl
import geopandas as gpd
import math

In [3]:
# Project-relative locations used throughout the notebook.
# Inputs: DHIS2 exports from the new and the old system.
input_path = '../../data/input/dhis2/new_system/'
input_path_old = '../../data/input/dhis2/old_system/'

# District shapefile and the folder results are written to.
shapes_path = '../../data/shapes/district/districts_17_19.shp'
output_path = '../../data/output/sprint3_analysis/'

# Some functions I'll use

In [82]:
# Load the district shapefile; its 'name_19' column is the reference list of
# district names that get_district_name_dict compares index labels against.
shapes = gpd.read_file(shapes_path)

# Creating a dict of names to replace district names

def get_district_name_dict(df, reference_names=None, cutoff=0.6):
    """Build an ``{old_name: reference_name}`` mapping for mismatched district names.

    Index labels of ``df`` that do not appear in the reference names are paired
    with the reference names that do not appear in ``df.index``. Pairs are
    chosen by string similarity (best-scoring pairs first, each name used at
    most once), so the result is deterministic and does not depend on set
    iteration order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index holds district names.
    reference_names : iterable of str, optional
        Names considered correct. Defaults to ``shapes['name_19']``.
    cutoff : float, optional
        Minimum ``difflib`` similarity ratio (0-1) for a pair to be accepted.
        Names without a sufficiently similar counterpart are left out of the
        mapping, so ``DataFrame.rename`` keeps them unchanged.

    Returns
    -------
    dict
        Mapping usable as ``df.rename(index=...)``.
    """
    import difflib

    if reference_names is None:
        reference_names = shapes['name_19']

    current = set(df.index)
    reference = set(reference_names)

    # Sort both sides so ties between equally similar pairs resolve the same way on every run.
    unmatched = sorted(current - reference, key=str)
    candidates = sorted(reference - current, key=str)

    scored = [
        (difflib.SequenceMatcher(None, str(old), str(new)).ratio(), old, new)
        for old in unmatched
        for new in candidates
    ]
    # Stable sort: best similarity first, input order preserved among equal scores.
    scored.sort(key=lambda item: item[0], reverse=True)

    district_name = {}
    taken = set()
    for score, old, new in scored:
        if score < cutoff:
            break  # every remaining pair is even less similar
        if old in district_name or new in taken:
            continue  # keep the mapping one-to-one
        district_name[old] = new
        taken.add(new)
    return district_name


In [118]:
# Read the 'Districts' column of districts.csv (in input_path) into a plain list
districts = list(pd.read_csv(input_path+'districts.csv')["Districts"])

# To clean the data downloaded by Nancy in a stacked format

def get_clean_pivot_from_stack(indicator, jan, feb, mar, apr, year='2020'):
    """Turn four stacked monthly extracts into one wide frame for a single indicator.

    Each monthly frame is filtered to the rows whose 'Data' column equals
    ``indicator``; the remaining value column is labelled with its month and
    the four months are outer-merged on 'Organisation unit'.

    Parameters
    ----------
    indicator : str
        Value of the 'Data' column to keep.
    jan, feb, mar, apr : pandas.DataFrame
        Monthly extracts. Each must contain 'Organisation unit', 'Data' and
        exactly one further (value) column.
    year : str, optional
        Label used for the first level of the output columns.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'Organisation unit', with MultiIndex columns
        ``(year, indicator, month)`` for Jan, Feb, Mar and Apr.

    Raises
    ------
    ValueError
        If a monthly frame does not have exactly one value column.
    """
    monthly_frames = {'Jan': jan, 'Feb': feb, 'Mar': mar, 'Apr': apr}

    merged = None
    for month, frame in monthly_frames.items():
        part = frame[frame['Data'] == indicator].drop('Data', axis=1)

        value_cols = [col for col in part.columns if col != 'Organisation unit']
        if len(value_cols) != 1:
            raise ValueError(
                "Expected exactly one value column for %s, found %d" % (month, len(value_cols))
            )
        # Give each month a distinct column name before merging: identical
        # names would otherwise collide through the automatic _x/_y suffixes.
        part = part.rename(columns={value_cols[0]: month})

        if merged is None:
            merged = part
        else:
            merged = merged.merge(part, on='Organisation unit', how='outer')

    merged = merged.set_index('Organisation unit', drop=True)
    merged.columns = pd.MultiIndex.from_tuples(
        [(year, indicator, month) for month in monthly_frames]
    )
    return merged


In [16]:
# Build a small function to split the string column names of the data downloaded as a pivot

def split(strng, sep, occ):
    """Cut ``strng`` into three parts at two separator positions.

    ``strng`` is split on ``sep``; ``occ[0]`` and ``occ[1]`` are slice
    positions into the resulting pieces. Returns a tuple of
    (pieces from ``occ[1]`` on, pieces before ``occ[0]``, first three
    characters of the pieces in between), each re-joined with ``sep``.
    """
    pieces = strng.split(sep)
    stop = occ[1]
    start = occ[0]
    tail = sep.join(pieces[stop:])
    head = sep.join(pieces[:start])
    middle = sep.join(pieces[start:stop])
    return tail, head, middle[:3]

In [17]:
# To clean the data downloaded in a pivot format

def get_clean_pivot(df):
    """Clean a pivot-format download into a district-indexed, MultiIndex-column frame.

    Works on a copy, so the frame passed in is left untouched and the function
    can be called again on the same raw data.

    Steps:
      1. build an upper-cased 'district' index from 'organisationunitname'
         with its last 9 characters removed (presumably a ' District'
         suffix -- TODO confirm against the raw export);
      2. drop the first four columns that remain once the index is set;
      3. rename index labels using ``get_district_name_dict``;
      4. convert every remaining column name into a 3-tuple via
         ``split(col, ' ', [-2, -1])`` and use those as MultiIndex columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw pivot download containing an 'organisationunitname' column.

    Returns
    -------
    pandas.DataFrame
        New frame indexed by 'district' with 3-level MultiIndex columns.
    """
    cleaned = df.copy()
    cleaned['district'] = cleaned['organisationunitname'].apply(lambda name: name[:-9].upper())
    cleaned = cleaned.set_index('district', drop=True)

    # The first four remaining columns are dropped by position, not by name.
    cleaned = cleaned.drop(cleaned.columns[:4], axis=1)

    cleaned = cleaned.rename(index=get_district_name_dict(cleaned))

    cleaned.columns = pd.MultiIndex.from_tuples(
        [split(col, ' ', [-2, -1]) for col in cleaned.columns]
    )
    return cleaned

# Extract maternal death

In [115]:
# Load the four monthly extracts (January to April) from input_path.
# Their 'Data' column is what the indicator filter further down works on.
df_Jan = pd.read_csv(input_path+'Mat_Post_jan.csv')
df_Feb = pd.read_csv(input_path+'Mat_Post_feb.csv')
df_Mar = pd.read_csv(input_path+'Mat_Post_mar.csv')
df_Apr = pd.read_csv(input_path+'Mat_Post_apr.csv')

In [116]:
# The full list of indicators (distinct values of the 'Data' column in the January extract)
# NOTE(review): the name `indicators` is reassigned later in the notebook with a different meaning.
indicators = df_Jan['Data'].unique()

# The one indicator we are interested in
mnch_target = '105-MA13. Maternal deaths'

In [127]:
# Extract and format the data

# Empty frame indexed by every entry of `districts`, carrying one placeholder
# column so that its columns are already a 3-level MultiIndex.
multi_index=pd.MultiIndex.from_tuples([('Test','test','test')])
mat_death=pd.DataFrame(index=districts,columns=multi_index)

# Wide Jan-Apr frame for the maternal-deaths indicator
df_months = get_clean_pivot_from_stack(indicator = mnch_target,
                                      jan = df_Jan,
                                      feb = df_Feb,
                                      mar = df_Mar,
                                      apr = df_Apr)

# Left merge on the index: every entry of `districts` is kept, with NaN where
# df_months has no matching row.
mat_death = pd.merge(mat_death, df_months, how='left',left_index=True,right_index=True) 

# Remove the placeholder column again (matched on its level-2 label 'test')
mat_death.drop('test',axis=1,inplace=True,level=2)



In [128]:
# Replace the index by an upper-cased copy with its last 9 characters removed
# (presumably a ' District' suffix -- TODO confirm), named 'district'.
# NOTE(review): mat_death is modified in place, so re-running this cell strips
# a further 9 characters from every label.
mat_death['district']=mat_death.index.map(lambda x: x[:-9].upper())
mat_death.set_index('district',drop=True,inplace=True)


In [131]:
# Rename index labels that are absent from the shapefile names, using the mapping
# returned by get_district_name_dict (in place).
mat_death.rename(index=get_district_name_dict(mat_death),inplace=True)

# Extract SAM admissions indicators

In [137]:
# The few indicators we are interested in
# (two are active; a third one is kept below, commented out)

sam_target=['108-NA01a1. Total number of SAM admissions in ITC',
            '105-NA04b1. SAM admissions(<10) in OTC this month - New using MUAC']
            #'105-NA04a1. SAM admissions(<10) in OTC this month - Old']

In [138]:
# Extract and clean: read the pivot-format download, reshape it with
# get_clean_pivot, then drop every column whose level-2 label is 'Nov' or 'Dec'.

new_sam=pd.read_csv(input_path+'sam_data_elements_pivot.csv')
new_sam=get_clean_pivot(new_sam)
dropped_months=['Nov','Dec']
new_sam.drop(dropped_months,axis=1,inplace=True,level=2)


In [139]:
# Keep only the indicators listed in sam_target, for '2020' and the months Jan-Apr

sam_adm = new_sam.loc[:,('2020',sam_target,['Jan','Feb','Mar','Apr'])]
sam_adm.head()

Unnamed: 0_level_0,2020,2020,2020,2020,2020,2020,2020,2020
Unnamed: 0_level_1,108-NA01a1. Total number of SAM admissions in ITC,108-NA01a1. Total number of SAM admissions in ITC,108-NA01a1. Total number of SAM admissions in ITC,108-NA01a1. Total number of SAM admissions in ITC,105-NA04b1. SAM admissions(<10) in OTC this month - New using MUAC,105-NA04b1. SAM admissions(<10) in OTC this month - New using MUAC,105-NA04b1. SAM admissions(<10) in OTC this month - New using MUAC,105-NA04b1. SAM admissions(<10) in OTC this month - New using MUAC
Unnamed: 0_level_2,Jan,Feb,Mar,Apr,Jan,Feb,Mar,Apr
district,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
ABIM,3.0,1.0,5.0,6.0,20.0,33.0,28.0,44.0
ADJUMANI,13.0,4.0,1.0,6.0,16.0,38.0,20.0,19.0
AGAGO,31.0,4.0,2.0,7.0,3.0,4.0,6.0,5.0
ALEBTONG,,,,,8.0,,,
AMOLATAR,,,,,1.0,2.0,,0.0


# Look at reporting rates

In [140]:
# NOTE(review): same selection as the earlier cell that first builds sam_adm; this cell looks redundant.
sam_adm = new_sam.loc[:,('2020',sam_target,['Jan','Feb','Mar','Apr'])]

In [141]:
# Share of non-missing values per column: count() counts non-NA cells, so despite
# the `null_sum` name these are reporting rates, not null counts.
# NOTE(review): 135 is hard-coded -- presumably the total number of districts;
# confirm and move it to a named constant.
null_sum1= mat_death.count()/135
null_sum2= sam_adm.count()/135
null_sum=pd.concat([null_sum1,null_sum2])
null_sum

2020  105-MA13. Maternal deaths                                           Jan    0.666667
                                                                          Feb    0.614815
                                                                          Mar    0.688889
                                                                          Apr    0.637037
      108-NA01a1. Total number of SAM admissions in ITC                   Jan    0.451852
                                                                          Feb    0.474074
                                                                          Mar    0.459259
                                                                          Apr    0.459259
      105-NA04b1. SAM admissions(<10) in OTC this month - New using MUAC  Jan    0.659259
                                                                          Feb    0.696296
                                                                          Mar    0.644444
          

# Look at data variations

 Calculate deltas

In [142]:
# Inner merge on the district index: only districts present in both sam_adm and mat_death are kept.
data_quality =pd.merge(sam_adm,mat_death,how='inner',left_index=True, right_index=True)

In [155]:
# NOTE(review): scratch check unrelated to the data; can be removed from the final notebook.
np.mean([1,2,3])

2.0

In [167]:
# Displays the whole merged frame.
# NOTE(review): the saved output already shows 'min' columns, which are only
# added by the statistics loop in a later cell -- the cells appear to have been
# run out of order. Consider data_quality.head() here.
data_quality

Unnamed: 0_level_0,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020
Unnamed: 0_level_1,108-NA01a1. Total number of SAM admissions in ITC,108-NA01a1. Total number of SAM admissions in ITC,108-NA01a1. Total number of SAM admissions in ITC,108-NA01a1. Total number of SAM admissions in ITC,105-NA04b1. SAM admissions(<10) in OTC this month - New using MUAC,105-NA04b1. SAM admissions(<10) in OTC this month - New using MUAC,105-NA04b1. SAM admissions(<10) in OTC this month - New using MUAC,105-NA04b1. SAM admissions(<10) in OTC this month - New using MUAC,105-MA13. Maternal deaths,105-MA13. Maternal deaths,105-MA13. Maternal deaths,105-MA13. Maternal deaths,105-MA13. Maternal deaths,105-NA04b1. SAM admissions(<10) in OTC this month - New using MUAC,108-NA01a1. Total number of SAM admissions in ITC
Unnamed: 0_level_2,Jan,Feb,Mar,Apr,Jan,Feb,Mar,Apr,Jan,Feb,Mar,Apr,min,min,min
district,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3
ABIM,3.0,1.0,5.0,6.0,20.0,33.0,28.0,44.0,,,0.0,0.0,0.0,20.0,1.0
ADJUMANI,13.0,4.0,1.0,6.0,16.0,38.0,20.0,19.0,1.0,0.0,,,0.0,16.0,1.0
AGAGO,31.0,4.0,2.0,7.0,3.0,4.0,6.0,5.0,0.0,0.0,0.0,1.0,0.0,3.0,2.0
ALEBTONG,,,,,8.0,,,,,,0.0,,0.0,8.0,
AMOLATAR,,,,,1.0,2.0,,0.0,0.0,,,,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SOROTI,5.0,8.0,11.0,5.0,1.0,,,0.0,3.0,2.0,2.0,0.0,0.0,0.0,5.0
TORORO,18.0,17.0,11.0,3.0,1.0,1.0,7.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,3.0
WAKISO,4.0,3.0,,,14.0,21.0,17.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0
YUMBE,20.0,18.0,29.0,13.0,30.0,30.0,31.0,46.0,,,2.0,0.0,0.0,30.0,13.0


In [181]:
# Per-district Jan-Apr 2020 summary statistics (min, max, mean, std and
# std/mean) for every indicator in data_quality.

months = ['Jan', 'Feb', 'Mar', 'Apr']

# Read the indicator labels from the columns actually present:
# MultiIndex.levels may still list labels that no column uses any more, and
# selecting such a label with .loc would raise a KeyError. Sorting keeps the
# order that .levels would give.
indicators = sorted(set(data_quality.columns.get_level_values(1)))

for indicator in indicators:
    # Select the four monthly columns once; the statistic columns added below
    # carry other level-2 labels, so they never enter this selection.
    monthly = data_quality.loc[:, ('2020', indicator, months)]

    data_quality[('2020', indicator, 'min')] = monthly.min(axis=1)
    data_quality[('2020', indicator, 'max')] = monthly.max(axis=1)
    data_quality[('2020', indicator, 'mean')] = monthly.mean(axis=1)
    # std skips NaN and is NaN when fewer than two months are reported
    data_quality[('2020', indicator, 'std')] = monthly.std(axis=1)
    # Coefficient of variation; NaN or inf where the mean is 0
    data_quality[('2020', indicator, 'std_norm')] = (
        data_quality[('2020', indicator, 'std')] / data_quality[('2020', indicator, 'mean')]
    )
        

In [182]:
# Spot check: standard deviation of each indicator for the PAKWACH row
data_quality.loc['PAKWACH',('2020',indicators,'std')]

2020  105-MA13. Maternal deaths                                           std    0.57735
      105-NA04b1. SAM admissions(<10) in OTC this month - New using MUAC  std        NaN
      108-NA01a1. Total number of SAM admissions in ITC                   std        NaN
Name: PAKWACH, dtype: float64

## Get that to csv

In [183]:
# Export a copy whose 3-level column labels are flattened into single strings
# joined with '_', then write it as CSV to output_path.
data_quality_export=data_quality.copy()
data_quality_export.columns=data_quality.columns.map("_".join)
data_quality_export.to_csv(output_path+'data_quality_map.csv')

Note that I am not looking at the old data here, because I could not find the corresponding indicators except for two of them.