In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('../_data/tb-rsqa-cleaned.csv')


In [3]:
df['fy_year'] = df['fy'].str.extract(r'(\d{4})').astype(int)
df['fy_year'].unique()

array([2020, 2021, 2022, 2023])

In [4]:
df.columns

Index(['province', 'district', 'hf_tb_type', 'hf_name', 'hf_type', 'fy',
       'summary_group', 'no', 'normes', 'item_name', 'num', 'den',
       'genexpert_or_microscopy', 'fy_year'],
      dtype='object')

In [5]:
df.loc[:, 'province'] = df['province'].str.lower()


In [6]:
df['province'].unique()

array(['north ', 'east ', 'south ', 'west ', 'sourthern', 'east', 'sud',
       'nord', 'west', 'kigali city'], dtype=object)

In [7]:
df.loc[(df['province'] == 'sourthern'), 'province'] = 'south'
df.loc[(df['province'] == 'south '), 'province'] = 'south'
df.loc[(df['province'] == 'sud'), 'province'] = 'south'
df.loc[(df['province'] == 'north '), 'province'] = 'north'
df.loc[(df['province'] == 'nord'), 'province'] = 'north'
df.loc[(df['province'] == 'east '), 'province'] = 'east'
df.loc[(df['province'] == 'kigali city'), 'province'] = 'kigali'
df.loc[(df['province'] == 'west '), 'province'] = 'west'


In [8]:
summary_item_scores = (
    df.groupby(
        ['province', 'summary_group', "genexpert_or_microscopy", 'item_name', 'hf_name', 'hf_tb_type', 'hf_type', 'fy']
    )
    .agg({'num': 'sum', 'den': 'sum'})
    .reset_index()
)

In [9]:
# Define mapping of summary_group to section
section_mapping = {
    "Is active case finding conducted in TB high risk groups?[Questions 2, 8 and 9]": "screening",
    "Are TB diagnostics continuously functional (microscopy and Expert) and their results available timely (microscopy, expert and culture)?[Questions 3, 4, 5, 6 and 7]": "diagnosis",
    "Are TB patients early initiated on TB treatment and on ART (if indicated), and their bacteriological control performed according to guidelines?[Questions 16, 17, 18 and 19]": "treatment",
    "Does the BMI monitored for TB Patients and nutritional support provided to the eligible patients?[Questions New_70 and New_71]": "treatment"
}

summary_item_scores["section"] = summary_item_scores["summary_group"].map(section_mapping)

In [10]:
# Compute the average score as (sum of num / sum of den) * 100
summary_item_scores['avg_score'] = (summary_item_scores['num'] / summary_item_scores['den']) * 100

In [11]:
filtered_summary = summary_item_scores[summary_item_scores["section"].isin(["screening", "treatment", "diagnosis"])]

In [12]:
filtered_summary.columns

Index(['province', 'summary_group', 'genexpert_or_microscopy', 'item_name',
       'hf_name', 'hf_tb_type', 'hf_type', 'fy', 'num', 'den', 'section',
       'avg_score'],
      dtype='object')

In [13]:
filtered_summary['hf_type'].unique()

array(['health center', 'hospital (dh,ph,rh)'], dtype=object)

In [14]:
microscopy_only_df = filtered_summary[(filtered_summary['hf_tb_type'] == 'CDT') & (filtered_summary['hf_type'] == "health center")]
microscopy_only_df.shape

(1970, 12)

In [15]:
filtered_summary[(filtered_summary['hf_tb_type'] == 'CT') & (filtered_summary['hf_type'] == "health center")][['hf_tb_type', 'hf_type', 'hf_name']]

Unnamed: 0,hf_tb_type,hf_type,hf_name
357,CT,health center,cyabayaga cs
358,CT,health center,cyabayaga cs
360,CT,health center,cyondo cs
362,CT,health center,gahini cs
363,CT,health center,gahini cs
...,...,...,...
22761,CT,health center,nyakarenzo cs
22763,CT,health center,rubavu prison
22765,CT,health center,rufungo cs
22766,CT,health center,rugabano cs


In [16]:
filtered_summary[
    (filtered_summary['item_name'] == "Pour les sites Xpert, sur les 10 derniers patients présumés TB éligibles pour Xpert et dont leurs échantillons ont été examinés par Xpert, combien ont  les résultats Xpert disponibles endéans 3 jours de réception de l'échantillon?  ")
     |
    (filtered_summary['item_name'] == "Pour les sites Xpert, durant le mois passé, combien de jours la machine Xpert était opérationnelle ?") & (filtered_summary['genexpert_or_microscopy'] == "no") ].shape

(559, 12)

In [17]:
# Define the items to exclude
items_to_exclude = [
    "Sur 10 nouveaux prisonniers entrÃ©s au cours de la pÃ©riode evaluÃ©e, combien ont beneficiÃ© du screening TB Ã¡ l'entrÃ©e",
    "Sur 10 nouveaux prisonniers sortants au cours de la pÃ©riode evaluÃ©e, combien ont beneficiÃ© du screening TB Ã¡ la sortie",
    "Sur les 10 derniers patients hospitalises ( 5 pediatrie,5Medecine interne), combien ont beneficie  du screening de la TB et prise en charge selon le protocole? ",
]

items_to_exclude_on_diagnostic = [
    "Pour les sites Xpert, sur les 10 derniers patients présumés TB éligibles pour Xpert et dont leurs échantillons ont été examinés par Xpert, combien ont  les résultats Xpert disponibles endéans 3 jours de réception de l'échantillon?  ",
    "Pour les sites Xpert, durant le mois passé, combien de jours la machine Xpert était opérationnelle ?"
]

# Filter out rows where item_name is in items_to_exclude
filtered_summary = filtered_summary[~filtered_summary["item_name"].isin(items_to_exclude)]

-- investigation block ---

In [18]:
df.columns

Index(['province', 'district', 'hf_tb_type', 'hf_name', 'hf_type', 'fy',
       'summary_group', 'no', 'normes', 'item_name', 'num', 'den',
       'genexpert_or_microscopy', 'fy_year'],
      dtype='object')

In [19]:
filtered_summary[filtered_summary['section'] == 'diagnosis'][['hf_tb_type', 'hf_type', 'hf_name']]

Unnamed: 0,hf_tb_type,hf_type,hf_name
356,CDT,health center,nyagatare dh
357,CT,health center,cyabayaga cs
358,CT,health center,cyabayaga cs
359,CDT,health center,cyarubare cs
360,CT,health center,cyondo cs
...,...,...,...
20661,CDT,"hospital (dh,ph,rh)",mibilizi dh
20662,CDT,"hospital (dh,ph,rh)",mugonero dh
20663,CDT,"hospital (dh,ph,rh)",mugonero dh
20664,CDT,"hospital (dh,ph,rh)",murunda dh


-- investigation block ---

In [20]:
filtered_summary.columns

Index(['province', 'summary_group', 'genexpert_or_microscopy', 'item_name',
       'hf_name', 'hf_tb_type', 'hf_type', 'fy', 'num', 'den', 'section',
       'avg_score'],
      dtype='object')

In [21]:
filtered_summary[filtered_summary['item_name'] == "Sur les 10 derniers patients hospitalises ( 5 pediatrie,5Medecine interne), combien ont beneficie  du screening de la TB et prise en charge selon le protocole? "]

Unnamed: 0,province,summary_group,genexpert_or_microscopy,item_name,hf_name,hf_tb_type,hf_type,fy,num,den,section,avg_score


In [22]:
filtered_summary[filtered_summary['section'] == 'diagnosis']['item_name'].unique()

array(['Durant le mois passé, combien de jours le microscope optique ou LED était opérationnel ?',
       'Pour les sites Xpert, durant le mois passé, combien de jours la machine Xpert était opérationnelle ?',
       "Pour les sites Xpert, sur les 10 derniers patients présumés TB éligibles pour Xpert et dont leurs échantillons ont été examinés par Xpert, combien ont  les résultats Xpert disponibles endéans 3 jours de réception de l'échantillon?  ",
       "Sur les 10 derniers cas presumes TB, combien ont beneficies des examens d'investigations (un ou plusieurs examens selon le cas).",
       'Sur les 10 derniers patients hospitalises( 5 pediatrie,5Medecine interne), combien ont beneficie  du screening de la TB et prise en charge selon le protocole? ',
       'Sur les échantillons reçus   il y a 3 jours, vérifier si les résultats sont mentionnés dans le registre de lab',
       'Vérifier  pour les 5 derniers cas éligibles à la Culture\xa0 et DST si les résultats sont disponibles avec bo

In [23]:
# Define valid cases
valid_df = filtered_summary[
    (filtered_summary["num"] <= filtered_summary["den"]) &  # num must not be greater than den
    ~(filtered_summary["num"].isna() & filtered_summary["den"] > 0) &  # Exclude cases where num is NaN but den > 0
    ~((filtered_summary["num"] == 0) & (filtered_summary["den"] == 0)) &  # Exclude cases where both num and den are 0
    ~(filtered_summary["den"].isna())  # Exclude cases where den is NaN (new fix)
]

invalid_df = filtered_summary[
    (filtered_summary["num"] > filtered_summary["den"]) |  # Invalid if num > den
    ((filtered_summary["num"].isna()) & (filtered_summary["den"] > 0)) |  # Invalid if num is NaN but den > 0
    ((filtered_summary["num"] == 0) & (filtered_summary["den"] == 0)) |  # Invalid if both num and den are 0
    (filtered_summary["den"].isna()) |  # Invalid if den is NaN
    ((filtered_summary["num"].isna()) & (filtered_summary["den"] == 0))  # NEW FIX: Invalid if num is NaN and den = 0
]



In [24]:
print("valid size: ", valid_df.shape)
print("invalid size: ", invalid_df.shape)

valid size:  (6000, 12)
invalid size:  (699, 12)


In [None]:
stop

In [25]:
valid_df.columns

Index(['province', 'summary_group', 'genexpert_or_microscopy', 'item_name',
       'hf_name', 'hf_tb_type', 'hf_type', 'fy', 'num', 'den', 'section',
       'avg_score'],
      dtype='object')

In [26]:
# First calculate the overall_score as you were doing
fy_summary = valid_df.groupby(["province", "genexpert_or_microscopy", "hf_tb_type", 'section', 'hf_type', 'fy']).apply(
    lambda x: (x["num"].sum() / x["den"].sum()) * 100
).reset_index(name="overall_score")

# Now calculate the hf_count by counting unique health facilities in each group
hf_counts = valid_df.groupby(["province", "genexpert_or_microscopy", "hf_tb_type", 'section', 'hf_type', 'fy'])["hf_name"].nunique().reset_index(name="hf_count")

# Merge the two dataframes to get all the columns you need
result_df = pd.merge(fy_summary, hf_counts, on=["province", "genexpert_or_microscopy", "hf_tb_type", 'section', 'hf_type', 'fy'], how="left")

In [27]:
result_df['province'].unique()

array(['east', 'kigali', 'north', 'south', 'west'], dtype=object)

`Overall score and count by provice`

In [28]:
# First calculate the overall_score as you were doing
fy_summary = valid_df.groupby(["province", 'fy']).apply(
    lambda x: (x["num"].sum() / x["den"].sum()) * 100
).reset_index(name="overall_score")

# Now calculate the hf_count by counting unique health facilities in each group
hf_counts = valid_df.groupby(["province", 'fy'])["hf_name"].nunique().reset_index(name="hf_count")

# Merge the two dataframes to get all the columns you need
result_df = pd.merge(fy_summary, hf_counts, on=["province", 'fy'], how="left")
result_df 

Unnamed: 0,province,fy,overall_score,hf_count
0,east,2020-2021,73.58169,34
1,east,2021-2022,49.876758,41
2,east,2023-2024,49.062934,43
3,kigali,2023-2024,56.275862,17
4,north,2020-2021,63.286945,25
5,north,2023-2024,53.051714,34
6,south,2020-2021,68.149211,36
7,south,2021-2022,60.967796,49
8,south,2023-2024,65.794907,9
9,west,2022-2023,44.524561,39


In [29]:
# First calculate the overall_score as you were doing
fy_summary = valid_df.groupby(["province", 'fy']).apply(
    lambda x: (x["num"].sum() / x["den"].sum()) * 100
).reset_index(name="overall_score")

# Now calculate the hf_count by counting unique health facilities in each group
hf_counts = valid_df.groupby(["province", 'fy'])["hf_name"].nunique().reset_index(name="hf_count")

# Merge the two dataframes to get all the columns you need
result_df = pd.merge(fy_summary, hf_counts, on=["province", 'fy'], how="left")
result_df 

Unnamed: 0,province,fy,overall_score,hf_count
0,east,2020-2021,73.58169,34
1,east,2021-2022,49.876758,41
2,east,2023-2024,49.062934,43
3,kigali,2023-2024,56.275862,17
4,north,2020-2021,63.286945,25
5,north,2023-2024,53.051714,34
6,south,2020-2021,68.149211,36
7,south,2021-2022,60.967796,49
8,south,2023-2024,65.794907,9
9,west,2022-2023,44.524561,39


In [30]:
# result_df.to_csv('../output/group_by_province.csv', index=False)

`Overall score and count by 'Health facility type'`

In [31]:
# First calculate the overall_score as you were doing
fy_summary = valid_df.groupby(["hf_type", 'fy']).apply(
    lambda x: (x["num"].sum() / x["den"].sum()) * 100
).reset_index(name="overall_score")

# Now calculate the hf_count by counting unique health facilities in each group
hf_counts = valid_df.groupby(["hf_type", 'fy'])["hf_name"].nunique().reset_index(name="hf_count")

# Merge the two dataframes to get all the columns you need
result_df = pd.merge(fy_summary, hf_counts, on=["hf_type", 'fy'], how="left")
result_df 

Unnamed: 0,hf_type,fy,overall_score,hf_count
0,health center,2020-2021,64.8971,74
1,health center,2021-2022,54.702987,69
2,health center,2022-2023,40.765172,31
3,health center,2023-2024,44.139361,69
4,"hospital (dh,ph,rh)",2020-2021,81.016334,21
5,"hospital (dh,ph,rh)",2021-2022,66.345062,21
6,"hospital (dh,ph,rh)",2022-2023,59.289415,8
7,"hospital (dh,ph,rh)",2023-2024,67.406334,42


In [32]:
# result_df.to_csv('../output/group_by_hf_type.csv', index=False)

`Overall score and count by 'Tb facility type'`

In [33]:
# First calculate the overall_score as you were doing
fy_summary = valid_df.groupby(["hf_tb_type", 'fy']).apply(
    lambda x: (x["num"].sum() / x["den"].sum()) * 100
).reset_index(name="overall_score")

# Now calculate the hf_count by counting unique health facilities in each group
hf_counts = valid_df.groupby(["hf_tb_type", 'fy'])["hf_name"].nunique().reset_index(name="hf_count")

# Merge the two dataframes to get all the columns you need
result_df = pd.merge(fy_summary, hf_counts, on=["hf_tb_type", 'fy'], how="left")
result_df 

Unnamed: 0,hf_tb_type,fy,overall_score,hf_count
0,CDT,2020-2021,81.470981,49
1,CDT,2021-2022,60.677659,50
2,CDT,2022-2023,51.752691,21
3,CDT,2023-2024,62.769719,71
4,CT,2020-2021,54.547166,46
5,CT,2021-2022,52.615454,40
6,CT,2022-2023,35.89321,18
7,CT,2023-2024,37.23137,41


In [34]:
# result_df.to_csv('../output/group_by_hf_tb_type.csv', index=False)

`Overall score and count by 'Tb facility type'`

In [35]:
# First calculate the overall_score as you were doing
fy_summary = valid_df.groupby(["genexpert_or_microscopy", 'fy']).apply(
    lambda x: (x["num"].sum() / x["den"].sum()) * 100
).reset_index(name="overall_score")

# Now calculate the hf_count by counting unique health facilities in each group
hf_counts = valid_df.groupby(["genexpert_or_microscopy", 'fy'])["hf_name"].nunique().reset_index(name="hf_count")

# Merge the two dataframes to get all the columns you need
result_df = pd.merge(fy_summary, hf_counts, on=["genexpert_or_microscopy", 'fy'], how="left")
result_df 

Unnamed: 0,genexpert_or_microscopy,fy,overall_score,hf_count
0,no,2020-2021,63.169926,69
1,no,2021-2022,54.328026,64
2,no,2022-2023,39.679567,28
3,no,2023-2024,41.644811,61
4,yes,2020-2021,81.794349,26
5,yes,2021-2022,65.310786,26
6,yes,2022-2023,57.104268,11
7,yes,2023-2024,66.879289,50


In [36]:
# result_df.to_csv('../output/group_by_genexpert_or_microscopy.csv', index=False)