In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Version string for this notebook.
__version__='0.1.0'

In [2]:
from pandas import DataFrame, read_excel
import re
import random

In [3]:
def clean_text(text):
    """Normalise one free-text value before it is exported.

    Steps, in order:
      1. A value that is exactly ``'nan'`` (ignoring surrounding whitespace)
         becomes the empty string. This is the text a missing cell turns
         into once the column has been cast to ``str``.
      2. A line break immediately followed by digits is removed together
         with those digits.
      3. Every remaining run of line breaks is replaced by a single space so
         the words on either side stay separate.
      4. Every ``/`` character is removed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Only treat 'nan' as a placeholder when it is the whole value; a blanket
    # substring replace would corrupt words such as "maintenance".
    if text.strip() == 'nan':
        return ''
    text = re.sub(r'\n\d+', '', text)  # Drop line breaks followed by digits
    text = re.sub(r'\n+', ' ', text)   # Replace line breaks with a space
    text = text.replace('/', '')       # Strip forward slashes
    return text

In [4]:
class DataGenerator:
    """Generate random placeholder values for an indicator, driven by its unit."""

    # Unit label -> [minimum value, maximum planned value].
    unit_rules = {
        '-': [0, 0],
        '%': [20, 100],
        'volunteer hours / club / month': [1, 100],
        'people': [1, 100],
        'Training materials': [1, 20],
        'Hours of training': [1, 40],
        'communes': [1, 12],
        'events/month': [1, 5],
        'water systems': [1, 3],
        'Days': [7, 21],
        'water points': [1, 200],
        'communities': [1, 200],
        'toilet blocks': [1, 150],
        'animators': [1, 100],
        'Hours': [1, 100],
        'schools': [1, 200],
        'healthcare facilities': [1, 200],
        'USD': [500, 20000],
        'trainings': [1, 100]
    }

    @staticmethod
    def generate_values(unit, rng=None):
        """Return a ``(baseline, planned, current)`` triple for ``unit``.

        Rules:
          * ``'-'`` yields ``('-', '-', '-')``.
          * ``'%'`` always plans for the rule's maximum; the current value
            may reach the planned value.
          * Any other unit draws the planned value at random. When the rule
            spans 100 or more, the planned value is at least ``min + 100``.
            The current value stays below the planned value unless the
            baseline is already ``planned - 1``.
          * If the planned value equals the rule's minimum, all three values
            are that number.

        Otherwise ``min <= baseline < current <= planned <= max`` holds.

        Args:
            unit: A key of ``unit_rules``.
            rng: Optional object exposing ``randint(a, b)`` (for example a
                seeded ``random.Random``). Defaults to the module-level
                ``random`` generator.

        Returns:
            A 3-tuple ``(baseline_value, planned_value, current_value)``.

        Raises:
            KeyError: If ``unit`` has no entry in ``unit_rules``.
        """
        if rng is None:
            rng = random

        try:
            min_val, max_planned_value = DataGenerator.unit_rules[unit]
        except KeyError:
            known_units = ', '.join(repr(name) for name in DataGenerator.unit_rules)
            raise KeyError(
                'no value rule for unit {!r}; known units: {}'.format(unit, known_units)
            ) from None

        if unit == '-':
            return '-', '-', '-'

        is_percent = unit == '%'

        # Planned value first, so the order of random draws stays stable.
        if is_percent:
            planned_value = max_planned_value
        elif max_planned_value - min_val >= 100:
            planned_value = rng.randint(min_val + 100, max_planned_value)
        else:
            planned_value = rng.randint(min_val, max_planned_value)

        # No room below the planned value: everything collapses onto it.
        if min_val >= planned_value:
            return planned_value, planned_value, planned_value

        baseline_value = rng.randint(min_val, planned_value - 1)

        if is_percent:
            current_value = rng.randint(baseline_value + 1, planned_value)
        elif baseline_value + 1 < planned_value:
            current_value = rng.randint(baseline_value + 1, planned_value - 1)
        else:
            # baseline is planned - 1, so the only value above it is planned.
            current_value = baseline_value + 1

        return baseline_value, planned_value, current_value

## Get the data sheet

In [5]:
# Read the "data" worksheet of the indicators workbook, then list its columns.
data = read_excel(
    'hanwash_indicators.xlsx',
    sheet_name='data',
    parse_dates=True,
)
list(data.columns)

['Type',
 'Code',
 'Description',
 'Code_index',
 'Indicator',
 'Definitions',
 'Units',
 'Status',
 'Program',
 '(Dis)Aggregation',
 'Source_link',
 'Enumerator',
 'Frequency_time',
 'Visualization_link',
 'Source',
 'Validator',
 'Frequency',
 'Visualization',
 'Notes']

In [6]:

def fix_outcome(df):
    """Choose the ID value for a single row.

    Args:
        df: One row (anything indexable by the keys ``'Category'``,
            ``'Category_code'`` and ``'index'``).

    Returns:
        ``df['Category_code']`` when the category is "Intermediate outcome"
        or "Ultimate outcome"; otherwise ``df['index']``.
    """
    outcome_categories = ("Intermediate outcome", "Ultimate outcome")
    id_field = 'Category_code' if df['Category'] in outcome_categories else 'index'
    return df[id_field]



## Parse to the desired format


In [7]:
# Start from an empty frame; the following cell adds its columns one by one.
indicator_parse = DataFrame()

In [8]:
# Output column -> source column in ``data``. "ID" and "index" both start out
# as copies of Code_index.
column_sources = {
    "ID": "Code_index",
    "Indicator_group": "Program",
    "Category": "Type",
    "Category_code": "Code",
    "Category_code_description": "Description",
    "Indicator": "Indicator",
    "Definitions": "Definitions",
    "Units": "Units",
    "index": "Code_index",
}
for target_column, source_column in column_sources.items():
    indicator_parse[target_column] = data[source_column]



In [9]:
# Definitions: cast to text, drop double quotes, then run clean_text on each value.
indicator_parse['Definitions'] = (
    indicator_parse['Definitions']
    .astype(str)
    .str.replace('"', '')
    .apply(clean_text)
)
# Units: any label containing a percent sign collapses to plain '%'.
# NOTE(review): the membership test assumes every Units value is a string - confirm.
indicator_parse['Units'] = indicator_parse['Units'].apply(
    lambda unit: '%' if '%' in unit else unit
)

In [10]:
# Trim leading/trailing whitespace from every string cell; other values pass
# through untouched. DataFrame.applymap is deprecated since pandas 2.1 in favour
# of DataFrame.map, so use the new name when this pandas version provides it.
_elementwise = indicator_parse.map if hasattr(DataFrame, 'map') else indicator_parse.applymap
indicator_parse = _elementwise(lambda x: x.strip() if isinstance(x, str) else x)

In [11]:
# Seed the random generator first so that Restart & Run All reproduces the same
# placeholder values (and therefore the same exported workbook).
VALUE_SEED = 42
random.seed(VALUE_SEED)

# Apply the generator row by row; the returned triple is expanded into three columns.
indicator_parse[['Baseline_value', 'Planned_value', 'Current_value']] = indicator_parse.apply(
    lambda row: DataGenerator.generate_values(row['Units']),
    axis=1,
    result_type='expand'
)


In [12]:
# Recompute ID per row: fix_outcome picks Category_code for intermediate/ultimate
# outcome rows and the 'index' column for every other row.
row_ids = indicator_parse.apply(fix_outcome, axis=1, result_type='expand')
indicator_parse['ID'] = row_ids

In [13]:
# The helper 'index' column has served its purpose once ID is set; remove it.
indicator_parse = indicator_parse.drop(columns=['index'])

In [14]:
# Show every remaining missing value as a dash.
indicator_parse = indicator_parse.fillna("-")

In [15]:
# Preview the first 15 rows of the finished table.
indicator_parse.head(15)

Unnamed: 0,ID,Indicator_group,Category,Category_code,Category_code_description,Indicator,Definitions,Units,Baseline_value,Planned_value,Current_value
0,1,Mattson,Ultimate outcome,1,"Increased access to safe, sustainable and affo...",-,-,-,-,-,-
1,1000,Mattson,Intermediate outcome,1000,Strengthened demand for and management of WASH...,-,-,-,-,-,-
2,1100a,Mattson,Immediate outcome,1100,Increased civil society engagement in decision...,% of commune WASH events with Rotarian partici...,-,%,54,100,86
3,1100b,Mattson,Immediate outcome,1100,Increased civil society engagement in decision...,% of commune WASH events led by trained HANWAS...,…led by Rotarians… means the WASH activities w...,%,34,100,98
4,1110a,Mattson,Outputs,1110,Support civil society engagement in the WASH s...,# of volunteer hours per month spent by Rotari...,,volunteer hours / club / month,3,18,15
5,1110b,Mattson,Outputs,1110,Support civil society engagement in the WASH s...,Number of people reached by the HANWASH Ambass...,Number of people reached…: Number of people pa...,people,36,49,37
6,1120a,Mattson,Outputs,1120,Train HANWASH Ambassadors & Coordinators and t...,# of ambassadors who have completed training,-,people,23,86,65
7,1120b,Mattson,Outputs,1120,Train HANWASH Ambassadors & Coordinators and t...,# of coordinators who have completed training,-,people,33,47,43
8,1120c,Mattson,Outputs,1120,Train HANWASH Ambassadors & Coordinators and t...,# of training materials created in alignment w...,-,Training materials,5,15,6
9,1120d,Mattson,Outputs,1120,Train HANWASH Ambassadors & Coordinators and t...,# of hours of training received by the HANWAS...,Number of hours and number of people attending,Hours of training,5,32,8


In [17]:
# Write the full table to an Excel workbook, leaving out the row index.
indicator_parse.to_excel("Hanwash_new_Indicators.xlsx",index=False)

In [20]:
# Export only the rows whose Indicator is not the "-" placeholder.
has_indicator = indicator_parse['Indicator'] != "-"
indicator_parse.loc[has_indicator].to_excel("indicator_review.xlsx", index=False)