In [185]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

1. Load data

In [186]:
# Load the raw RSQA export, decoding it as ISO-8859-1 (Latin-1) instead of the
# UTF-8 default, and keep an independent copy of the parsed frame.
df = pd.read_csv(
    "../_data/tb-rsqa.csv",
    encoding="ISO-8859-1",
).copy()

2. Extract relevant columns

In [187]:
# Columns carried forward into the analysis. Note the trailing space in
# "Normes " -- it is part of the column label being selected.
relevant_columns = [
    "Province", "District", "HF TB Type", "HF Name", "HF Type", "FY",
    "Summary group", "NO", "Normes ", "Element à verifier", "Num", "Den",
]
new_df = df.loc[:, relevant_columns]


3. Data cleaning

checking & handling duplicates

In [188]:
# Flag rows that repeat an earlier row in every column, report how many there
# are, then keep only the first occurrence of each.
duplicate_mask = new_df.duplicated()
duplicates = new_df.loc[duplicate_mask]
print("No duplicates: ", len(duplicates))
new_df = new_df.drop_duplicates()

No duplicates:  176


checking & handling missing values by replacement

In [189]:
# Per-column count of missing cells, printed before any imputation.
missing_per_column = new_df.isna().sum()
print("No. missing values: ", missing_per_column)

# Where the item text is missing, fall back to the value held in "Normes ".
fallback_text = new_df["Normes "]
new_df["Element à verifier"] = new_df["Element à verifier"].fillna(fallback_text)

No. missing values:  Province                 0
District                 0
HF TB Type               0
HF Name                  0
HF Type                  0
FY                       0
Summary group            0
NO                       0
Normes                9711
Element à verifier    1089
Num                   4613
Den                   2201
dtype: int64


renaming columns

In [190]:
# Map the raw export headers to short snake_case names used by every later cell.
column_renames = {
    'Province': 'province',
    'District': 'district',
    'HF TB Type': 'hf_tb_type',
    'HF Name': 'hf_name',
    'HF Type': 'hf_type',
    'FY': 'fy',
    'Summary group': 'summary_group',
    'NO': 'no',
    'Normes ': 'normes',
    'Element à verifier': 'item_name',
    'Num': 'num',
    'Den': 'den',
}
new_df = new_df.rename(columns=column_renames)

clean `province` values

In [191]:
# Distinct raw province labels, in order of first appearance.
pd.unique(new_df['province'])

array(['North ', 'East ', 'South ', 'West ', 'SOURTHERN', 'East', 'EAST',
       'SUD', 'NORD', 'WEST', 'Kigali city'], dtype=object)

In [192]:
# Normalise province labels. The raw values mix casing, trailing whitespace,
# French names and a misspelling ('North ', 'NORD', 'SUD', 'SOURTHERN', ...).
# Stripping and lower-casing first collapses every casing/whitespace variant,
# so only the genuinely different spellings need an explicit alias -- and any
# new casing variant in a future export is handled without adding a line.
province_aliases = {
    'nord': 'north',
    'sud': 'south',
    'sourthern': 'south',
    'kigali city': 'kigali',
}
new_df['province'] = (
    new_df['province']
    .str.strip()
    .str.lower()
    .replace(province_aliases)
)
new_df.province.unique()

array(['north', 'east', 'south', 'west', 'kigali'], dtype=object)

turn `num` & `den` to floats

In [193]:
# Coerce both count columns to numeric dtype; values that cannot be parsed
# become NaN instead of raising.
count_columns = ["num", "den"]
new_df[count_columns] = new_df[count_columns].apply(pd.to_numeric, errors="coerce")

clean fiscal year

In [194]:
# Remove the literal "FY " text (plain substring match, not a regex) from the
# fiscal-year labels.
fiscal_year_labels = new_df["fy"]
new_df["fy"] = fiscal_year_labels.str.replace(pat="FY ", repl="", regex=False)

remove `prison` & `teaching hospitals` samples

In [195]:
# Drop facilities that are out of scope: two facility types, plus two
# facilities identified by their exact name.
excluded_hf_types = ['Prison', 'Teaching Hospital']
excluded_hf_names = ["Butare Chu Hnr (huye)", "kanombe RH"]

filtered_df = new_df[~new_df['hf_type'].isin(excluded_hf_types)]
# .copy() gives an independent frame, so the .loc assignments below modify
# new_df itself rather than a slice of the pre-filter data.
new_df = filtered_df[~filtered_df['hf_name'].isin(excluded_hf_names)].copy()

# Manual correction of one facility's type.
new_df.loc[new_df['hf_name'] == 'gwinkwavu DH', 'hf_type'] = 'Hospital (DH,PH,RH)'
# Any 'CT' facility still typed as a hospital is re-typed as a health centre.
# Order matters: this runs after the correction above, so a 'CT' row for
# 'gwinkwavu DH' would be switched back to 'Health Center' here.
new_df.loc[(new_df['hf_tb_type'] == 'CT') & (new_df['hf_type'] == 'Hospital (DH,PH,RH)'), 'hf_type'] = 'Health Center'

In [196]:
# Load the GeneXpert site list; its "ddd" column is renamed to hf_type so it
# lines up with the main frame.
gen_df = pd.read_csv("../_data/tb-rsqa_GXP.csv").rename(columns={"ddd": "hf_type"})

# Lower-case the text columns used for matching, on both frames.
gen_df.loc[:, 'hf_type'] = gen_df['hf_type'].str.lower()
gen_df.loc[:, 'Genexpert.site'] = gen_df['Genexpert.site'].str.lower()
new_df.loc[:, 'hf_type'] = new_df['hf_type'].str.lower()

create new `section` variable: classification of item_name

In [197]:
# Summary-group texts listed under the section they belong to. The strings must
# match the `summary_group` values exactly, so they are kept verbatim.
summary_groups_by_section = {
    "screening": [
        "Is active case finding conducted in TB high risk groups?[Questions 2, 8 and 9]",
    ],
    "diagnosis": [
        "Are TB diagnostics continuously functional (microscopy and Expert) and their results available timely (microscopy, expert and culture)?[Questions 3, 4, 5, 6 and 7]",
    ],
    "treatment": [
        "Are TB patients early initiated on TB treatment and on ART (if indicated), and their bacteriological control performed according to guidelines?[Questions 16, 17, 18 and 19]",
        "Does the BMI monitored for TB Patients and nutritional support provided to the eligible patients?[Questions New_70 and New_71]",
    ],
}

# Invert to the summary_group -> section lookup used by .map(); groups that are
# not listed map to NaN.
section_mapping = {
    summary_group: section
    for section, summary_groups in summary_groups_by_section.items()
    for summary_group in summary_groups
}
new_df['section'] = new_df['summary_group'].map(section_mapping)

In [198]:
# Row-level percentage score: num / den * 100. The result is NaN when either
# count is missing.
score_ratio = new_df['num'].div(new_df['den'])
new_df['avg_score'] = score_ratio.mul(100)

In [199]:
# Keep only rows assigned to one of the three sections of interest; rows whose
# summary group was not mapped (section is NaN) are dropped here.
sections_of_interest = ["screening", "treatment", "diagnosis"]
in_scope = new_df["section"].isin(sections_of_interest)
new_df = new_df[in_scope]


create a new `genexpert_or_microscopy` variable

In [200]:
# Lookup set of (site name, facility type) pairs, paired row by row from gen_df.
genexpert_sites = {
    (site_name, facility_type)
    for site_name, facility_type in zip(gen_df['Genexpert.site'], gen_df['hf_type'])
}

def check_facility(row):
    """Return 'yes' if the row's facility is a listed GeneXpert site, else 'no'.

    The lookup key is the pair (hf_name, hf_type), checked against the
    module-level `genexpert_sites` set. The site names in that set are
    lower-cased when the list is loaded, whereas `hf_name` is stored in mixed
    case, so the name is lower-cased here before the lookup; otherwise a
    facility such as "Kabgayi CS" would never match "kabgayi cs".

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'hf_name' and 'hf_type'.

    Returns
    -------
    str
        'yes' or 'no'.
    """
    facility_name = row['hf_name']
    # Non-string names (e.g. NaN) are left untouched; they simply do not match.
    if isinstance(facility_name, str):
        facility_name = facility_name.lower()
    return 'yes' if (facility_name, row['hf_type']) in genexpert_sites else 'no'

# Row-wise lookup against the GeneXpert site list.
site_flags = new_df.apply(check_facility, axis=1)
new_df['genexpert_or_microscopy'] = site_flags

# Every hospital is flagged 'yes' regardless of the lookup result.
is_hospital = new_df['hf_type'] == 'hospital (dh,ph,rh)'
new_df.loc[is_hospital, 'genexpert_or_microscopy'] = 'yes'

new_df['genexpert_or_microscopy'].value_counts()

genexpert_or_microscopy
no     4759
yes    2192
Name: count, dtype: int64

Ensure all `hf_name` that are `health centers` are 'CT' TB type

In [201]:
# Names taken from the first column of gen_df
# (presumably the GeneXpert site name -- confirm against the CSV layout).
hf_names_to_check = gen_df.iloc[:, 0].unique()

# Health centres from that list that are still typed 'CT'.
is_listed = new_df['hf_name'].isin(hf_names_to_check)
is_health_center = new_df['hf_type'] == "health center"
is_ct = new_df['hf_tb_type'] == "CT"
filtered_facilities = new_df.loc[is_listed & is_health_center & is_ct, 'hf_name'].unique()

In [202]:
# Facilities whose TB type is manually set to 'CDT'.
reclassified_as_cdt = ['nyamirama cs', 'byahi (rubavu) cs']
new_df.loc[new_df['hf_name'].isin(reclassified_as_cdt), 'hf_tb_type'] = 'CDT'

Data validation

In [203]:
# Re-check the cleaned frame: fully duplicated rows and per-column missing counts.
duplicates = new_df.loc[new_df.duplicated()]
print("No. duplicates: ", len(duplicates))
print("No. missing values: ", new_df.isna().sum())

No. duplicates:  0
No. missing values:  province                      0
district                      0
hf_tb_type                    0
hf_name                       0
hf_type                       0
fy                            0
summary_group                 0
no                            0
normes                     4169
item_name                     0
num                        1987
den                         547
section                       0
avg_score                  2121
genexpert_or_microscopy       0
dtype: int64


excluding `prison related` screening questions

In [204]:
# Item texts to drop. They are matched by exact string equality against
# `item_name`, so the literals (including the mis-decoded accents, the double
# space and the trailing space) are kept exactly as written.
items_to_exclude = [
    "Sur 10 nouveaux prisonniers entrÃ©s au cours de la pÃ©riode evaluÃ©e, combien ont beneficiÃ© du screening TB Ã¡ l'entrÃ©e",
    "Sur 10 nouveaux prisonniers sortants au cours de la pÃ©riode evaluÃ©e, combien ont beneficiÃ© du screening TB Ã¡ la sortie",
    "Sur les 10 derniers patients hospitalises ( 5 pediatrie,5Medecine interne), combien ont beneficie  du screening de la TB et prise en charge selon le protocole? ",
]
is_excluded_item = new_df["item_name"].isin(items_to_exclude)
new_df = new_df.loc[~is_excluded_item]

Split dataset into `valid` & `invalid` datasets

In [205]:
# A row is valid when both counts are present, num does not exceed den, and
# the pair is not 0/0.
#
# Every comparison is wrapped in its own parentheses: `&` binds tighter than
# `<=` / `>` / `==`, so an unparenthesised `a.isna() & b > 0` is evaluated as
# `(a.isna() & b) > 0`.
#
# Comparisons against NaN are False, so `num <= den` alone already rejects
# rows with a missing count; the notna() terms only make that rule explicit.
valid_mask = (
    new_df["num"].notna()
    & new_df["den"].notna()
    & (new_df["num"] <= new_df["den"])
    & ~((new_df["num"] == 0) & (new_df["den"] == 0))
)

valid_df = new_df[valid_mask]

# Everything that is not valid is invalid. Using the complement guarantees the
# two frames partition new_df with no row lost or counted twice.
invalid_df = new_df[~valid_mask]

new_df = valid_df

#### **Descriptive Analytics**

microscopy only health centers

In [206]:
# Health centres whose TB type is 'CDT'.
is_cdt = new_df['hf_tb_type'] == 'CDT'
is_health_center = new_df['hf_type'] == "health center"
microscopy_only_df = new_df[is_cdt & is_health_center]
microscopy_only_df.shape

(1372, 15)


`Overall score and count by province`

In [207]:
# Pooled score per (province, fiscal year): total num over total den, as a
# percentage. The sums are taken on the selected columns of the groupby
# instead of via groupby.apply with a lambda, which hands the grouping columns
# to the function (deprecated in recent pandas) and runs Python code per group.
group_keys = ["province", "fy"]
grouped = new_df.groupby(group_keys)

totals = grouped[["num", "den"]].sum()
fy_summary = (totals["num"] / totals["den"] * 100).reset_index(name="overall_score")

# Number of distinct facilities contributing to each group.
hf_counts = grouped["hf_name"].nunique().reset_index(name="hf_count")

# One row per group carrying both the score and the facility count.
result_df = pd.merge(fy_summary, hf_counts, on=group_keys, how="left")
result_df

Unnamed: 0,province,fy,overall_score,hf_count
0,east,2020-2021,75.606705,34
1,east,2021-2022,77.986851,41
2,east,2023-2024,71.373275,49
3,kigali,2023-2024,74.148114,17
4,north,2020-2021,82.863341,25
5,north,2023-2024,85.105871,39
6,south,2020-2021,81.451843,35
7,south,2021-2022,83.548983,49
8,south,2023-2024,87.867647,9
9,west,2022-2023,76.273803,39


`Overall score and count by 'Health facility type'`

In [209]:
# Pooled score per (facility type, fiscal year): total num over total den, as
# a percentage. The sums are taken on the selected columns of the groupby
# instead of via groupby.apply with a lambda, which hands the grouping columns
# to the function (deprecated in recent pandas) and runs Python code per group.
group_keys = ["hf_type", "fy"]
grouped = new_df.groupby(group_keys)

totals = grouped[["num", "den"]].sum()
fy_summary = (totals["num"] / totals["den"] * 100).reset_index(name="overall_score")

# Number of distinct facilities contributing to each group.
hf_counts = grouped["hf_name"].nunique().reset_index(name="hf_count")

# One row per group carrying both the score and the facility count.
result_df = pd.merge(fy_summary, hf_counts, on=group_keys, how="left")
result_df

Unnamed: 0,hf_type,fy,overall_score,hf_count
0,health center,2020-2021,77.611523,73
1,health center,2021-2022,80.297115,70
2,health center,2022-2023,74.227865,31
3,health center,2023-2024,71.538048,69
4,"hospital (dh,ph,rh)",2020-2021,84.226415,21
5,"hospital (dh,ph,rh)",2021-2022,87.295401,20
6,"hospital (dh,ph,rh)",2022-2023,82.407407,8
7,"hospital (dh,ph,rh)",2023-2024,84.740305,53


`Overall score and count by 'Tb facility type'`

In [210]:
# Pooled score per (TB facility type, fiscal year): total num over total den,
# as a percentage. The sums are taken on the selected columns of the groupby
# instead of via groupby.apply with a lambda, which hands the grouping columns
# to the function (deprecated in recent pandas) and runs Python code per group.
group_keys = ["hf_tb_type", "fy"]
grouped = new_df.groupby(group_keys)

totals = grouped[["num", "den"]].sum()
fy_summary = (totals["num"] / totals["den"] * 100).reset_index(name="overall_score")

# Number of distinct facilities contributing to each group.
hf_counts = grouped["hf_name"].nunique().reset_index(name="hf_count")

# One row per group carrying both the score and the facility count.
result_df = pd.merge(fy_summary, hf_counts, on=group_keys, how="left")
result_df

Unnamed: 0,hf_tb_type,fy,overall_score,hf_count
0,CDT,2020-2021,85.686722,49
1,CDT,2021-2022,82.747115,49
2,CDT,2022-2023,83.803795,20
3,CDT,2023-2024,83.89708,82
4,CT,2020-2021,70.658537,45
5,CT,2021-2022,80.354103,41
6,CT,2022-2023,66.84058,19
7,CT,2023-2024,63.463368,41


`Overall score and count by 'geneXpert_or_microscopy'`

In [212]:
# Pooled score per (GeneXpert flag, fiscal year): total num over total den, as
# a percentage. The sums are taken on the selected columns of the groupby
# instead of via groupby.apply with a lambda, which hands the grouping columns
# to the function (deprecated in recent pandas) and runs Python code per group.
group_keys = ["genexpert_or_microscopy", "fy"]
grouped = new_df.groupby(group_keys)

totals = grouped[["num", "den"]].sum()
fy_summary = (totals["num"] / totals["den"] * 100).reset_index(name="overall_score")

# Number of distinct facilities contributing to each group.
hf_counts = grouped["hf_name"].nunique().reset_index(name="hf_count")

# One row per group carrying both the score and the facility count.
result_df = pd.merge(fy_summary, hf_counts, on=group_keys, how="left")
result_df

Unnamed: 0,genexpert_or_microscopy,fy,overall_score,hf_count
0,no,2020-2021,77.611523,73
1,no,2021-2022,80.297115,70
2,no,2022-2023,74.227865,31
3,no,2023-2024,69.642857,61
4,yes,2020-2021,84.226415,21
5,yes,2021-2022,87.295401,20
6,yes,2022-2023,82.407407,8
7,yes,2023-2024,84.518726,61
