# Modelling Notebook Step by Step

## Imports and setup

In [1]:
import pandas as pd
import numpy as np

%load_ext autoreload
%autoreload 2

from base.baseprocess import DataProcessor
from util.constants import Constants
from util.mapping import CategoricalMapping

In [2]:
# change None to specific number to shorten the display list, 'None' to display the full list
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

## Load Data

In [3]:
# import data
data = pd.read_csv(Constants.HARMONIZER_FILE_PATH, low_memory=False)

In [4]:
data.head()

Unnamed: 0,country,sex,age,years_at_school,level_of_education,marital_status,work_status,ppl_in_household,earnings_per_year,currently_smoke_tobacco,age_started_smoking,length_time_smoking,number_tobacco,type_tobacco,age_stopped_smoking,smoke_home_workplace,consumed_alcohol,quit_drinking_for_health,number_alcoholic_drinks,number_daily_fruit_vegetables,salt_consumption,work_intensity,days_vigorous_exercise,days_moderate_exercise,time_walking_bicycling_minutes,time_sedentary,had_blood_pressure_measurement,taken_drugs_for_raised_bp,had_blood_sugar_measurement,taken_diabetes_drugs,had_cholesterol_measurement,taken_cholesterol_oral_treatment,had_heart_attack,taking_heart_disease_medication,reading1_systolic,reading1_diastolic,reading2_systolic,reading2_diastolic,reading3_systolic,reading3_diastolic,treated_for_raised_bp,are_you_pregnant,height,weight,waist_circumference,hip_circumference,reading1_bpm,reading2_bpm,reading3_bpm,fasting_blood_glucose,total_cholesterol,urinary_sodium,urinary_creatinine,triglycerides,hdl_cholesterol
0,ethiopia,,,0.0,1,2.0,8.0,2.0,,2.0,,,,,,2.0,1.0,,5.0,0.0,3.0,vigorous-intensity,2.0,7.0,120.0,120.0,2.0,,2.0,,2.0,,2.0,2.0,92.0,71.0,92.0,67.0,97.0,65.0,2.0,2.0,150.5,45.3,68.0,84.0,83.0,81.0,88.0,77.0,113.0,,,52.6,39.0
1,ethiopia,,,0.0,1,2.0,8.0,2.0,,2.0,,,,,,2.0,2.0,,,0.0,3.0,vigorous-intensity,4.0,5.0,30.0,60.0,2.0,,2.0,,2.0,,2.0,2.0,88.0,53.0,84.0,30.0,89.0,53.0,2.0,2.0,155.5,40.5,63.0,84.0,98.0,97.0,102.0,76.0,100.0,,,85.5,36.0
2,ethiopia,,,4.0,2,1.0,5.0,4.0,,2.0,,,,,,2.0,1.0,,3.0,0.0,3.0,moderate-intensity,,3.0,,180.0,2.0,,2.0,,2.0,,2.0,2.0,122.0,79.0,118.0,74.0,99.0,69.0,2.0,2.0,153.0,51.1,64.0,85.0,77.0,74.0,75.0,78.0,112.0,,,61.2,25.0
3,ethiopia,,,0.0,1,2.0,5.0,1.0,,2.0,,,,,,2.0,1.0,,4.0,0.0,3.0,moderate-intensity,,2.0,,150.0,2.0,,2.0,,2.0,,2.0,2.0,110.0,78.0,104.0,75.0,113.0,80.0,2.0,2.0,160.0,43.4,62.0,78.0,77.0,76.0,73.0,70.0,100.0,,,27.1,58.0
4,ethiopia,,,0.0,1,1.0,8.0,3.0,,2.0,,,,,,2.0,2.0,,,0.0,4.0,vigorous-intensity,7.0,7.0,30.0,120.0,2.0,,2.0,,2.0,,2.0,2.0,115.0,82.0,111.0,78.0,107.0,73.0,2.0,2.0,147.0,50.0,70.0,94.0,82.0,92.0,87.0,80.0,100.0,,,61.2,39.0


In [5]:
print(f"Data shape: {data.shape}")

Data shape: (218514, 55)


## Clean Data

In [6]:
# Create the DataProcessor instance and process the data
processor = DataProcessor(data, CategoricalMapping.get_mappings())

In [7]:
# process numeric variables
processed_data = processor.process_numeric_variables(data, Constants.PROCESSOR_NUMERIC_COLS)

In [8]:
processed_data.head()

Unnamed: 0,country,sex,age,years_at_school,level_of_education,marital_status,work_status,ppl_in_household,earnings_per_year,currently_smoke_tobacco,age_started_smoking,length_time_smoking,number_tobacco,type_tobacco,age_stopped_smoking,smoke_home_workplace,consumed_alcohol,quit_drinking_for_health,number_alcoholic_drinks,number_daily_fruit_vegetables,salt_consumption,work_intensity,days_vigorous_exercise,days_moderate_exercise,time_walking_bicycling_minutes,time_sedentary,had_blood_pressure_measurement,taken_drugs_for_raised_bp,had_blood_sugar_measurement,taken_diabetes_drugs,had_cholesterol_measurement,taken_cholesterol_oral_treatment,had_heart_attack,taking_heart_disease_medication,reading1_systolic,reading1_diastolic,reading2_systolic,reading2_diastolic,reading3_systolic,reading3_diastolic,treated_for_raised_bp,are_you_pregnant,height,weight,waist_circumference,hip_circumference,reading1_bpm,reading2_bpm,reading3_bpm,fasting_blood_glucose,total_cholesterol,urinary_sodium,urinary_creatinine,triglycerides,hdl_cholesterol
0,ethiopia,,,0.0,1,2.0,8.0,2.0,,2.0,,,,,,2.0,1.0,,5.0,0.0,3.0,vigorous-intensity,2.0,7.0,120.0,120.0,2.0,,2.0,,2.0,,2.0,2.0,92.0,71.0,92.0,67.0,97.0,65.0,2.0,2.0,150.5,45.3,68.0,84.0,83.0,81.0,88.0,77.0,113.0,,,52.6,39.0
2,ethiopia,,,4.0,2,1.0,5.0,4.0,,2.0,,,,,,2.0,1.0,,3.0,0.0,3.0,moderate-intensity,,3.0,,180.0,2.0,,2.0,,2.0,,2.0,2.0,122.0,79.0,118.0,74.0,99.0,69.0,2.0,2.0,153.0,51.1,64.0,85.0,77.0,74.0,75.0,78.0,112.0,,,61.2,25.0
3,ethiopia,,,0.0,1,2.0,5.0,1.0,,2.0,,,,,,2.0,1.0,,4.0,0.0,3.0,moderate-intensity,,2.0,,150.0,2.0,,2.0,,2.0,,2.0,2.0,110.0,78.0,104.0,75.0,113.0,80.0,2.0,2.0,160.0,43.4,62.0,78.0,77.0,76.0,73.0,70.0,100.0,,,27.1,58.0
4,ethiopia,,,0.0,1,1.0,8.0,3.0,,2.0,,,,,,2.0,2.0,,,0.0,4.0,vigorous-intensity,7.0,7.0,30.0,120.0,2.0,,2.0,,2.0,,2.0,2.0,115.0,82.0,111.0,78.0,107.0,73.0,2.0,2.0,147.0,50.0,70.0,94.0,82.0,92.0,87.0,80.0,100.0,,,61.2,39.0
5,ethiopia,,,4.0,2,1.0,7.0,4.0,,2.0,,,,,,2.0,1.0,,4.0,4.0,3.0,vigorous-intensity,5.0,3.0,90.0,60.0,2.0,,2.0,,2.0,,2.0,2.0,110.0,69.0,94.0,67.0,98.0,65.0,2.0,,158.5,42.6,65.0,81.0,71.0,78.0,83.0,73.0,100.0,,,78.2,25.0


In [9]:
print(f"Shape of data (numeric cleaning): {processed_data.shape}")

Shape of data (numeric cleaning): (190575, 55)


In [10]:
# process categorical variables
processed_data = processor.process_categorical_variables(processed_data, Constants.PROCESSOR_CATEGORICAL_COLS)

In [11]:
processed_data.head()

Unnamed: 0,country,sex,age,years_at_school,level_of_education,marital_status,work_status,ppl_in_household,earnings_per_year,currently_smoke_tobacco,age_started_smoking,length_time_smoking,number_tobacco,type_tobacco,age_stopped_smoking,smoke_home_workplace,consumed_alcohol,quit_drinking_for_health,number_alcoholic_drinks,number_daily_fruit_vegetables,salt_consumption,work_intensity,days_vigorous_exercise,days_moderate_exercise,time_walking_bicycling_minutes,time_sedentary,had_blood_pressure_measurement,taken_drugs_for_raised_bp,had_blood_sugar_measurement,taken_diabetes_drugs,had_cholesterol_measurement,taken_cholesterol_oral_treatment,had_heart_attack,taking_heart_disease_medication,reading1_systolic,reading1_diastolic,reading2_systolic,reading2_diastolic,reading3_systolic,reading3_diastolic,treated_for_raised_bp,are_you_pregnant,height,weight,waist_circumference,hip_circumference,reading1_bpm,reading2_bpm,reading3_bpm,fasting_blood_glucose,total_cholesterol,urinary_sodium,urinary_creatinine,triglycerides,hdl_cholesterol
0,ethiopia,missing,,0.0,no formal schooling,married,unemployed,2.0,,no,,,,missing,,no,yes,missing,5.0,0.0,normal,vigorous-intensity,2.0,7.0,120.0,120.0,no,missing,no,missing,no,missing,no,no,92.0,71.0,92.0,67.0,97.0,65.0,no,no,150.5,45.3,68.0,84.0,83.0,81.0,88.0,77.0,113.0,,,52.6,39.0
2,ethiopia,missing,,4.0,elementary school,not married,student,4.0,,no,,,,missing,,no,yes,missing,3.0,0.0,normal,moderate-intensity,,3.0,,180.0,no,missing,no,missing,no,missing,no,no,122.0,79.0,118.0,74.0,99.0,69.0,no,no,153.0,51.1,64.0,85.0,77.0,74.0,75.0,78.0,112.0,,,61.2,25.0
3,ethiopia,missing,,0.0,no formal schooling,married,student,1.0,,no,,,,missing,,no,yes,missing,4.0,0.0,normal,moderate-intensity,,2.0,,150.0,no,missing,no,missing,no,missing,no,no,110.0,78.0,104.0,75.0,113.0,80.0,no,no,160.0,43.4,62.0,78.0,77.0,76.0,73.0,70.0,100.0,,,27.1,58.0
4,ethiopia,missing,,0.0,no formal schooling,not married,unemployed,3.0,,no,,,,missing,,no,no,missing,,0.0,low,vigorous-intensity,7.0,7.0,30.0,120.0,no,missing,no,missing,no,missing,no,no,115.0,82.0,111.0,78.0,107.0,73.0,no,no,147.0,50.0,70.0,94.0,82.0,92.0,87.0,80.0,100.0,,,61.2,39.0
5,ethiopia,missing,,4.0,elementary school,not married,retired,4.0,,no,,,,missing,,no,yes,missing,4.0,4.0,normal,vigorous-intensity,5.0,3.0,90.0,60.0,no,missing,no,missing,no,missing,no,no,110.0,69.0,94.0,67.0,98.0,65.0,no,missing,158.5,42.6,65.0,81.0,71.0,78.0,83.0,73.0,100.0,,,78.2,25.0


In [12]:
print(f"Shape of data (categorical cleaning): {processed_data.shape}")

Shape of data (categorical cleaning): (190575, 55)


In [13]:
processed_data.to_csv("data/processed/processed.csv", index=False)

## Preprocess Columns and Create Targets

In [14]:
# preprocess columns and create targets
processed_data = processor.preproc_cols_and_create_targets(processed_data)

In [15]:
processed_data.head()

Unnamed: 0,country,sex,age,years_at_school,level_of_education,marital_status,work_status,ppl_in_household,earnings_per_year,currently_smoke_tobacco,age_started_smoking,length_time_smoking,number_tobacco,type_tobacco,age_stopped_smoking,smoke_home_workplace,consumed_alcohol,quit_drinking_for_health,number_alcoholic_drinks,number_daily_fruit_vegetables,salt_consumption,work_intensity,days_vigorous_exercise,days_moderate_exercise,time_walking_bicycling_minutes,time_sedentary,had_blood_pressure_measurement,taken_drugs_for_raised_bp,had_blood_sugar_measurement,taken_diabetes_drugs,had_cholesterol_measurement,taken_cholesterol_oral_treatment,had_heart_attack,taking_heart_disease_medication,treated_for_raised_bp,are_you_pregnant,height,weight,waist_circumference,hip_circumference,fasting_blood_glucose,total_cholesterol,urinary_sodium,urinary_creatinine,triglycerides,hdl_cholesterol,reading_bpm,blood_pressure
0,ethiopia,missing,,0.0,no formal schooling,married,unemployed,2.0,,no,,,,missing,,no,yes,missing,5.0,0.0,normal,vigorous-intensity,2.0,7.0,120.0,120.0,no,missing,no,missing,no,missing,no,no,no,no,150.5,45.3,68.0,84.0,77.0,113.0,,,52.6,39.0,84.0,normal
2,ethiopia,missing,,4.0,elementary school,not married,student,4.0,,no,,,,missing,,no,yes,missing,3.0,0.0,normal,moderate-intensity,,3.0,,180.0,no,missing,no,missing,no,missing,no,no,no,no,153.0,51.1,64.0,85.0,78.0,112.0,,,61.2,25.0,75.333333,normal
3,ethiopia,missing,,0.0,no formal schooling,married,student,1.0,,no,,,,missing,,no,yes,missing,4.0,0.0,normal,moderate-intensity,,2.0,,150.0,no,missing,no,missing,no,missing,no,no,no,no,160.0,43.4,62.0,78.0,70.0,100.0,,,27.1,58.0,75.333333,normal
4,ethiopia,missing,,0.0,no formal schooling,not married,unemployed,3.0,,no,,,,missing,,no,no,missing,,0.0,low,vigorous-intensity,7.0,7.0,30.0,120.0,no,missing,no,missing,no,missing,no,no,no,no,147.0,50.0,70.0,94.0,80.0,100.0,,,61.2,39.0,87.0,normal
5,ethiopia,missing,,4.0,elementary school,not married,retired,4.0,,no,,,,missing,,no,yes,missing,4.0,4.0,normal,vigorous-intensity,5.0,3.0,90.0,60.0,no,missing,no,missing,no,missing,no,no,no,missing,158.5,42.6,65.0,81.0,73.0,100.0,,,78.2,25.0,77.333333,normal


In [16]:
print(f"Preprocess columns and create targets: {processed_data.shape}")

Preprocess columns and create targets: (184674, 48)


In [17]:
# remove column with more than 80% missing value
processed_data = processor.remove_columns_with_missing_values(processed_data)

In [18]:
processed_data.head()

Unnamed: 0,country,sex,age,years_at_school,level_of_education,marital_status,work_status,ppl_in_household,currently_smoke_tobacco,type_tobacco,smoke_home_workplace,consumed_alcohol,quit_drinking_for_health,number_alcoholic_drinks,number_daily_fruit_vegetables,salt_consumption,work_intensity,days_vigorous_exercise,days_moderate_exercise,time_walking_bicycling_minutes,time_sedentary,had_blood_pressure_measurement,taken_drugs_for_raised_bp,had_blood_sugar_measurement,taken_diabetes_drugs,had_cholesterol_measurement,taken_cholesterol_oral_treatment,had_heart_attack,taking_heart_disease_medication,treated_for_raised_bp,are_you_pregnant,height,weight,waist_circumference,hip_circumference,fasting_blood_glucose,total_cholesterol,triglycerides,hdl_cholesterol,reading_bpm,blood_pressure
0,ethiopia,missing,,0.0,no formal schooling,married,unemployed,2.0,no,missing,no,yes,missing,5.0,0.0,normal,vigorous-intensity,2.0,7.0,120.0,120.0,no,missing,no,missing,no,missing,no,no,no,no,150.5,45.3,68.0,84.0,77.0,113.0,52.6,39.0,84.0,normal
2,ethiopia,missing,,4.0,elementary school,not married,student,4.0,no,missing,no,yes,missing,3.0,0.0,normal,moderate-intensity,,3.0,,180.0,no,missing,no,missing,no,missing,no,no,no,no,153.0,51.1,64.0,85.0,78.0,112.0,61.2,25.0,75.333333,normal
3,ethiopia,missing,,0.0,no formal schooling,married,student,1.0,no,missing,no,yes,missing,4.0,0.0,normal,moderate-intensity,,2.0,,150.0,no,missing,no,missing,no,missing,no,no,no,no,160.0,43.4,62.0,78.0,70.0,100.0,27.1,58.0,75.333333,normal
4,ethiopia,missing,,0.0,no formal schooling,not married,unemployed,3.0,no,missing,no,no,missing,,0.0,low,vigorous-intensity,7.0,7.0,30.0,120.0,no,missing,no,missing,no,missing,no,no,no,no,147.0,50.0,70.0,94.0,80.0,100.0,61.2,39.0,87.0,normal
5,ethiopia,missing,,4.0,elementary school,not married,retired,4.0,no,missing,no,yes,missing,4.0,4.0,normal,vigorous-intensity,5.0,3.0,90.0,60.0,no,missing,no,missing,no,missing,no,no,no,missing,158.5,42.6,65.0,81.0,73.0,100.0,78.2,25.0,77.333333,normal


In [19]:
print(f"Remove column with more than 80% missing value: {processed_data.shape}")

Remove column with more than 80% missing value: (184674, 41)


In [20]:
# Save the processed data for baseline characterstic calculation to the output file
processed_data.to_csv(Constants.BASELINE_FILE_PATH, index=False)
print(f"Baseline data saved to {Constants.BASELINE_FILE_PATH}")

Baseline data saved to data/processed/baseline.csv


In [21]:
# Save the processed data to the output file
processed_data.to_csv(Constants.PROCESSOR_FILE_PATH, index=False)
print(f"Processed data saved to {Constants.PROCESSOR_FILE_PATH}")

Processed data saved to data/processed/processed.csv


In [22]:
processed_data.head()

Unnamed: 0,country,sex,age,years_at_school,level_of_education,marital_status,work_status,ppl_in_household,currently_smoke_tobacco,type_tobacco,smoke_home_workplace,consumed_alcohol,quit_drinking_for_health,number_alcoholic_drinks,number_daily_fruit_vegetables,salt_consumption,work_intensity,days_vigorous_exercise,days_moderate_exercise,time_walking_bicycling_minutes,time_sedentary,had_blood_pressure_measurement,taken_drugs_for_raised_bp,had_blood_sugar_measurement,taken_diabetes_drugs,had_cholesterol_measurement,taken_cholesterol_oral_treatment,had_heart_attack,taking_heart_disease_medication,treated_for_raised_bp,are_you_pregnant,height,weight,waist_circumference,hip_circumference,fasting_blood_glucose,total_cholesterol,triglycerides,hdl_cholesterol,reading_bpm,blood_pressure
0,ethiopia,missing,,0.0,no formal schooling,married,unemployed,2.0,no,missing,no,yes,missing,5.0,0.0,normal,vigorous-intensity,2.0,7.0,120.0,120.0,no,missing,no,missing,no,missing,no,no,no,no,150.5,45.3,68.0,84.0,77.0,113.0,52.6,39.0,84.0,normal
2,ethiopia,missing,,4.0,elementary school,not married,student,4.0,no,missing,no,yes,missing,3.0,0.0,normal,moderate-intensity,,3.0,,180.0,no,missing,no,missing,no,missing,no,no,no,no,153.0,51.1,64.0,85.0,78.0,112.0,61.2,25.0,75.333333,normal
3,ethiopia,missing,,0.0,no formal schooling,married,student,1.0,no,missing,no,yes,missing,4.0,0.0,normal,moderate-intensity,,2.0,,150.0,no,missing,no,missing,no,missing,no,no,no,no,160.0,43.4,62.0,78.0,70.0,100.0,27.1,58.0,75.333333,normal
4,ethiopia,missing,,0.0,no formal schooling,not married,unemployed,3.0,no,missing,no,no,missing,,0.0,low,vigorous-intensity,7.0,7.0,30.0,120.0,no,missing,no,missing,no,missing,no,no,no,no,147.0,50.0,70.0,94.0,80.0,100.0,61.2,39.0,87.0,normal
5,ethiopia,missing,,4.0,elementary school,not married,retired,4.0,no,missing,no,yes,missing,4.0,4.0,normal,vigorous-intensity,5.0,3.0,90.0,60.0,no,missing,no,missing,no,missing,no,no,no,missing,158.5,42.6,65.0,81.0,73.0,100.0,78.2,25.0,77.333333,normal


## Class Balancing

In [23]:
def calculate_class_distribution(df, group_col, target_col):
    """
    Calculates the class distribution of a target column, grouped by another column.

    Parameters:
    - df: pandas DataFrame, the dataset containing the columns.
    - group_col: str, the name of the column to group by.
    - target_col: str, the name of the target variable column.

    Returns:
    - pandas DataFrame containing the class distribution for each category of the target column, 
      grouped by the specified group column.
    """

    # Calculate the class distribution grouped by the specified column
    class_distribution = df.groupby([group_col, target_col]).size().unstack(fill_value=0)

    # Reset the index and remove the index name for better formatting
    class_distribution.reset_index(inplace=True)
    class_distribution.columns.name = None

    return class_distribution


In [24]:
class_dist = calculate_class_distribution(processed_data, 'country', 'blood_pressure')

class_dist.head()

Unnamed: 0,country,high,normal
0,afghanistan,2015,1588
1,algeria,3114,2991
2,armenia,1285,538
3,azerbaijan,1601,894
4,bahamas,872,483


In [25]:
# Group by 'country' and get summary statistics for numeric variables
summary = processed_data.groupby('country').describe()

In [26]:
summary

Unnamed: 0_level_0,age,age,age,age,age,age,age,age,years_at_school,years_at_school,years_at_school,years_at_school,years_at_school,years_at_school,years_at_school,years_at_school,ppl_in_household,ppl_in_household,ppl_in_household,ppl_in_household,ppl_in_household,ppl_in_household,ppl_in_household,ppl_in_household,number_alcoholic_drinks,number_alcoholic_drinks,number_alcoholic_drinks,number_alcoholic_drinks,number_alcoholic_drinks,number_alcoholic_drinks,number_alcoholic_drinks,number_alcoholic_drinks,number_daily_fruit_vegetables,number_daily_fruit_vegetables,number_daily_fruit_vegetables,number_daily_fruit_vegetables,number_daily_fruit_vegetables,number_daily_fruit_vegetables,number_daily_fruit_vegetables,number_daily_fruit_vegetables,days_vigorous_exercise,days_vigorous_exercise,days_vigorous_exercise,days_vigorous_exercise,days_vigorous_exercise,days_vigorous_exercise,days_vigorous_exercise,days_vigorous_exercise,days_moderate_exercise,days_moderate_exercise,days_moderate_exercise,days_moderate_exercise,days_moderate_exercise,days_moderate_exercise,days_moderate_exercise,days_moderate_exercise,time_walking_bicycling_minutes,time_walking_bicycling_minutes,time_walking_bicycling_minutes,time_walking_bicycling_minutes,time_walking_bicycling_minutes,time_walking_bicycling_minutes,time_walking_bicycling_minutes,time_walking_bicycling_minutes,time_sedentary,time_sedentary,time_sedentary,time_sedentary,time_sedentary,time_sedentary,time_sedentary,time_sedentary,height,height,height,height,height,height,height,height,weight,weight,weight,weight,weight,weight,weight,weight,waist_circumference,waist_circumference,waist_circumference,waist_circumference,waist_circumference,waist_circumference,waist_circumference,waist_circumference,hip_circumference,hip_circumference,hip_circumference,hip_circumference,hip_circumference,hip_circumference,hip_circumference,hip_circumference,fasting_blood_glucose,fasting_blood_glucose,fasting_blood_glucose,fasting_blood_glucose,fasting_blood_glucose,fasting_blood_glucose,fasting_blood_glucose,fasting_blood_glucose,total_cholesterol,total_cholesterol,total_cholesterol,total_cholesterol,total_cholesterol,total_cholesterol,total_cholesterol,total_cholesterol,triglycerides,triglycerides,triglycerides,triglycerides,triglycerides,triglycerides,triglycerides,triglycerides,hdl_cholesterol,hdl_cholesterol,hdl_cholesterol,hdl_cholesterol,hdl_cholesterol,hdl_cholesterol,hdl_cholesterol,hdl_cholesterol,reading_bpm,reading_bpm,reading_bpm,reading_bpm,reading_bpm,reading_bpm,reading_bpm,reading_bpm
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2,Unnamed: 39_level_2,Unnamed: 40_level_2,Unnamed: 41_level_2,Unnamed: 42_level_2,Unnamed: 43_level_2,Unnamed: 44_level_2,Unnamed: 45_level_2,Unnamed: 46_level_2,Unnamed: 47_level_2,Unnamed: 48_level_2,Unnamed: 49_level_2,Unnamed: 50_level_2,Unnamed: 51_level_2,Unnamed: 52_level_2,Unnamed: 53_level_2,Unnamed: 54_level_2,Unnamed: 55_level_2,Unnamed: 56_level_2,Unnamed: 57_level_2,Unnamed: 58_level_2,Unnamed: 59_level_2,Unnamed: 60_level_2,Unnamed: 61_level_2,Unnamed: 62_level_2,Unnamed: 63_level_2,Unnamed: 64_level_2,Unnamed: 65_level_2,Unnamed: 66_level_2,Unnamed: 67_level_2,Unnamed: 68_level_2,Unnamed: 69_level_2,Unnamed: 70_level_2,Unnamed: 71_level_2,Unnamed: 72_level_2,Unnamed: 73_level_2,Unnamed: 74_level_2,Unnamed: 75_level_2,Unnamed: 76_level_2,Unnamed: 77_level_2,Unnamed: 78_level_2,Unnamed: 79_level_2,Unnamed: 80_level_2,Unnamed: 81_level_2,Unnamed: 82_level_2,Unnamed: 83_level_2,Unnamed: 84_level_2,Unnamed: 85_level_2,Unnamed: 86_level_2,Unnamed: 87_level_2,Unnamed: 88_level_2,Unnamed: 89_level_2,Unnamed: 90_level_2,Unnamed: 91_level_2,Unnamed: 92_level_2,Unnamed: 93_level_2,Unnamed: 94_level_2,Unnamed: 95_level_2,Unnamed: 96_level_2,Unnamed: 97_level_2,Unnamed: 98_level_2,Unnamed: 99_level_2,Unnamed: 100_level_2,Unnamed: 101_level_2,Unnamed: 102_level_2,Unnamed: 103_level_2,Unnamed: 104_level_2,Unnamed: 105_level_2,Unnamed: 106_level_2,Unnamed: 107_level_2,Unnamed: 108_level_2,Unnamed: 109_level_2,Unnamed: 110_level_2,Unnamed: 111_level_2,Unnamed: 112_level_2,Unnamed: 113_level_2,Unnamed: 114_level_2,Unnamed: 115_level_2,Unnamed: 116_level_2,Unnamed: 117_level_2,Unnamed: 118_level_2,Unnamed: 119_level_2,Unnamed: 120_level_2,Unnamed: 121_level_2,Unnamed: 122_level_2,Unnamed: 123_level_2,Unnamed: 124_level_2,Unnamed: 125_level_2,Unnamed: 126_level_2,Unnamed: 127_level_2,Unnamed: 128_level_2,Unnamed: 129_level_2,Unnamed: 130_level_2,Unnamed: 131_level_2,Unnamed: 132_level_2,Unnamed: 133_level_2,Unnamed: 134_level_2,Unnamed: 135_level_2,Unnamed: 136_level_2,Unnamed: 137_level_2,Unnamed: 138_level_2,Unnamed: 139_level_2,Unnamed: 140_level_2,Unnamed: 141_level_2,Unnamed: 142_level_2,Unnamed: 143_level_2,Unnamed: 144_level_2
afghanistan,3576.0,36.960291,14.490164,16.0,24.0,35.0,47.0,70.0,3603.0,4.197891,5.392323,0.0,0.0,0.0,9.0,24.0,3603.0,4.812934,2.669356,1.0,3.0,4.0,6.0,26.0,15.0,4.466667,1.884776,1.0,3.0,5.0,6.0,7.0,3603.0,15.530114,48.678493,0.0,7.0,11.0,16.0,1078.0,804.0,4.279851,1.869342,1.0,3.0,4.0,6.0,7.0,2070.0,4.491787,2.104262,1.0,3.0,5.0,7.0,7.0,2197.0,75.491124,90.480203,10.0,30.0,60.0,90.0,925.0,3603.0,401.845684,237.450172,0.0,180.0,360.0,600.0,960.0,3435.0,162.089042,10.26161,101.0,155.0,162.0,169.5,253.6,3435.0,66.026934,14.684592,23.1,55.5,65.0,75.0,166.3,3435.0,87.373051,17.317103,30.0,77.0,88.0,98.0,198.0,0.0,,,,,,,,3320.0,87.756325,28.349611,20.0,74.0,84.0,96.0,226.0,3377.0,150.411226,42.383609,77.0,116.0,144.0,177.0,393.0,0.0,,,,,,,,0.0,,,,,,,,0.0,,,,,,,
algeria,6105.0,40.989681,13.105607,18.0,31.0,40.0,51.0,69.0,6102.0,8.901835,5.426812,0.0,6.0,9.0,13.0,30.0,6102.0,3.581776,2.021548,1.0,2.0,3.0,5.0,19.0,167.0,4.562874,1.577496,1.0,4.0,5.0,6.0,6.0,6105.0,28.125471,62.738982,0.0,10.0,17.0,28.0,1078.0,1245.0,4.174297,2.139278,1.0,2.0,5.0,6.0,7.0,2762.0,4.401159,2.192437,1.0,2.0,5.0,7.0,7.0,4615.0,77.027519,68.978615,10.0,30.0,60.0,120.0,900.0,6095.0,187.03708,158.260186,10.0,70.0,120.0,240.0,960.0,5948.0,165.790249,9.890685,100.0,159.0,165.0,172.0,205.0,5947.0,73.768455,14.97327,25.0,63.0,72.8,83.0,170.0,5945.0,93.958503,20.729446,60.0,84.0,93.0,102.0,888.8,5945.0,104.362675,12.823812,61.0,96.0,104.0,113.0,164.0,5480.0,96.014964,24.272192,20.6,82.0,92.0,103.0,229.0,5692.0,166.314775,41.490147,53.0,137.0,162.0,190.0,777.0,5692.0,102.861402,61.433104,0.8,59.0,86.0,127.0,541.0,0.0,,,,,,,,6105.0,77.608845,11.266613,40.0,69.666667,77.0,84.333333,129.666667
armenia,1823.0,44.024136,14.646005,18.0,31.0,45.0,56.0,69.0,1823.0,11.847504,2.56203,0.0,10.0,12.0,14.0,25.0,1823.0,2.748217,1.242194,1.0,2.0,2.0,4.0,11.0,960.0,5.44375,1.022664,1.0,5.0,6.0,6.0,6.0,1823.0,34.172244,87.997517,0.0,12.0,21.0,35.0,1078.0,243.0,5.312757,2.110805,1.0,3.5,7.0,7.0,7.0,888.0,5.816441,1.955849,1.0,5.0,7.0,7.0,7.0,1216.0,135.902138,143.507134,10.0,30.0,60.0,180.0,960.0,1823.0,217.221613,169.998716,0.0,90.0,180.0,300.0,960.0,1797.0,162.090373,8.879035,110.0,156.0,161.0,168.0,190.0,1797.0,70.750195,15.95409,22.0,60.0,69.0,80.0,140.0,1797.0,92.052588,15.358131,42.0,81.0,91.0,102.0,183.0,1797.0,103.450139,13.392393,50.0,95.0,102.0,112.0,156.0,1478.0,4.858024,3.781046,1.1,3.69,4.4,5.1,77.0,1493.0,4.589424,3.459241,2.59,3.59,4.36,5.19,77.0,0.0,,,,,,,,1493.0,1.299282,3.416928,0.39,0.9,1.1,1.34,77.0,1823.0,76.993235,9.872168,36.333333,70.666667,76.0,82.666667,111.666667
azerbaijan,2495.0,44.83006,13.937798,18.0,33.0,46.0,57.0,69.0,2495.0,11.602806,2.690488,0.0,10.0,11.0,14.0,26.0,2495.0,2.675752,1.204223,1.0,2.0,2.0,3.0,18.0,451.0,5.226164,1.071791,1.0,5.0,6.0,6.0,7.0,2495.0,25.192786,21.337347,0.0,14.0,21.0,33.0,548.0,250.0,4.708,2.009699,1.0,3.0,5.0,7.0,7.0,1189.0,5.846089,1.650129,1.0,5.0,7.0,7.0,7.0,2057.0,97.415168,87.693758,10.0,40.0,60.0,120.0,900.0,2495.0,202.994389,122.610185,0.0,120.0,180.0,270.0,960.0,2473.0,165.129357,8.778807,115.0,159.0,165.0,171.0,198.0,2473.0,74.627335,14.749574,36.0,65.0,74.0,83.5,160.0,2473.0,92.222806,15.191446,32.0,82.0,92.0,101.0,165.0,2473.0,103.716619,14.591616,45.0,95.0,103.0,112.0,165.0,2405.0,5.31748,2.636624,1.1,4.3,4.8,5.5,77.0,2428.0,4.71759,3.491757,2.59,3.75,4.49,5.23,77.0,0.0,,,,,,,,2428.0,1.277061,3.456188,0.3,0.91,1.08,1.3,77.0,2495.0,77.139613,9.644739,41.333333,70.666667,77.0,82.333333,121.333333
bahamas,1355.0,41.720295,10.680571,24.0,33.0,41.0,50.0,64.0,1347.0,12.838901,2.895152,0.0,12.0,12.0,14.0,30.0,1355.0,2.101845,1.074735,1.0,1.0,2.0,3.0,10.0,675.0,3.755556,1.135263,1.0,3.0,4.0,5.0,5.0,1355.0,17.656827,27.573769,0.0,7.0,14.0,22.0,693.0,235.0,5.038298,1.234572,2.0,5.0,5.0,6.0,7.0,402.0,4.514925,1.644465,1.0,3.0,5.0,5.0,7.0,408.0,82.289216,92.337458,10.0,30.0,60.0,90.0,600.0,0.0,,,,,,,,1349.0,166.638992,9.738083,115.0,160.0,167.0,172.0,203.0,1350.0,84.303653,19.408387,27.863636,70.931818,81.818182,95.454545,166.954545,1340.0,79.09153,57.809753,27.0,41.475,86.05,97.0,888.0,0.0,,,,,,,,0.0,,,,,,,,0.0,,,,,,,,0.0,,,,,,,,0.0,,,,,,,,1355.0,83.511316,10.614451,51.0,77.0,83.666667,90.333333,121.666667
bangladesh,7435.0,38.493208,12.221657,18.0,29.0,37.0,47.0,69.0,7435.0,5.795831,4.773228,0.0,0.0,5.0,9.0,20.0,0.0,,,,,,,,277.0,5.465704,1.11141,1.0,5.0,6.0,6.0,6.0,7435.0,21.108098,36.018128,0.0,8.0,14.5,24.0,616.0,1937.0,4.529685,2.047795,1.0,3.0,5.0,7.0,7.0,5848.0,5.844733,1.812439,1.0,5.0,7.0,7.0,7.0,4586.0,68.58068,76.18804,10.0,30.0,45.0,90.0,960.0,7435.0,167.968393,143.129377,0.0,60.0,120.0,240.0,960.0,7299.0,156.611226,9.047358,100.199997,150.0,156.0,163.0,210.600006,7299.0,56.792711,11.707385,28.0,49.0,56.0,64.0,162.0,7299.0,80.096616,11.15673,30.0,72.0,79.0,88.0,168.0,7299.0,91.704776,9.057153,45.0,85.0,91.0,97.0,185.0,6402.0,95.809435,22.413323,0.0,84.0,92.0,101.0,229.0,6527.0,168.212655,44.452533,0.0,143.0,167.0,193.0,424.0,6527.0,139.582503,80.078026,0.0,84.0,121.0,176.0,540.0,6527.0,38.242225,10.551263,0.0,33.0,38.0,44.0,138.0,7435.0,77.679444,11.842708,33.0,69.333333,77.0,85.333333,131.333333
barbados,298.0,53.493289,14.107629,25.0,44.0,54.0,64.0,80.0,298.0,12.328859,3.948222,3.0,10.0,12.0,14.0,30.0,298.0,2.053691,1.248758,1.0,1.0,2.0,2.0,10.0,114.0,1.342105,1.103726,1.0,1.0,1.0,1.0,9.0,298.0,19.436242,59.606151,0.0,8.0,13.0,19.0,700.0,30.0,4.333333,1.953482,1.0,3.0,5.0,5.0,9.0,81.0,4.37037,1.833333,1.0,3.0,5.0,5.0,9.0,131.0,65.877863,90.372053,10.0,30.0,40.0,60.0,600.0,298.0,232.855705,167.994316,10.0,120.0,180.0,300.0,1080.0,298.0,165.765101,10.463117,117.0,158.5,165.5,173.0,193.0,298.0,76.891611,17.308285,40.0,65.125,75.0,85.4,159.0,298.0,103.813087,105.511823,55.0,84.0,91.0,99.0,999.9,0.0,,,,,,,,298.0,7.470302,14.463721,2.7,4.3,5.0,5.4,99.99,298.0,7.853523,16.313857,2.99,4.3825,4.94,5.725,99.99,0.0,,,,,,,,0.0,,,,,,,,0.0,,,,,,,
belarus,4895.0,46.400817,13.454821,17.0,36.0,48.0,57.0,71.0,4895.0,12.831665,2.522935,0.0,11.0,12.0,15.0,27.0,0.0,,,,,,,,3908.0,5.344678,0.865799,1.0,5.0,6.0,6.0,6.0,4895.0,28.013075,34.79981,0.0,13.0,21.0,35.0,1078.0,564.0,5.092199,1.359922,1.0,5.0,5.0,6.0,7.0,1749.0,5.133219,1.265625,1.0,5.0,5.0,6.0,7.0,4106.0,96.703117,97.970454,10.0,40.0,60.0,120.0,720.0,4895.0,292.545455,168.784997,0.0,180.0,240.0,365.5,960.0,4872.0,168.71053,8.920688,104.3,163.0,168.0,175.0,200.0,4872.0,78.272114,15.86613,22.0,67.0,77.0,88.2,167.0,4872.0,95.60897,65.060832,53.0,80.0,90.0,100.2,888.0,0.0,,,,,,,,4665.0,4.794049,1.402283,1.1,4.1,4.6,5.2,30.3,4709.0,4.869471,1.075868,2.59,4.13,4.79,5.55,10.36,0.0,,,,,,,,4709.0,1.383436,0.405466,0.3,1.1,1.31,1.61,2.59,4895.0,74.207354,9.796947,40.666667,68.0,73.333333,79.666667,129.0
benin,4449.0,37.04383,12.729092,18.0,27.0,35.0,46.0,69.0,4449.0,4.308384,5.291649,0.0,0.0,0.0,8.0,30.0,4449.0,2.06878,1.203352,1.0,1.0,2.0,2.0,15.0,1629.0,4.022713,1.739864,1.0,3.0,4.0,5.0,6.0,4449.0,16.425489,19.294816,0.0,7.0,12.0,21.0,693.0,1020.0,5.412745,1.634765,1.0,5.0,6.0,7.0,7.0,4448.0,3.223921,2.932605,0.0,0.0,4.0,6.0,7.0,3537.0,88.730845,89.683165,10.0,30.0,60.0,120.0,960.0,4446.0,222.795547,180.08085,10.0,120.0,180.0,300.0,960.0,4225.0,163.771124,8.3705,121.0,158.0,164.0,170.0,198.0,4232.0,63.018195,12.753648,30.0,55.0,61.0,69.25,167.0,4209.0,82.034688,10.857145,43.0,74.0,80.0,88.0,150.0,4203.0,93.780633,10.821512,50.0,87.0,92.0,99.0,155.0,4405.0,86.232917,22.447542,41.0,71.0,84.0,96.0,227.0,4396.0,152.435851,44.764795,74.0,118.0,144.0,178.0,450.0,0.0,,,,,,,,0.0,,,,,,,,4449.0,74.690792,11.733726,38.0,67.0,74.0,82.0,151.333333
bhutan,5209.0,39.653292,13.312516,15.0,30.0,38.0,49.0,69.0,5209.0,4.856018,5.673395,0.0,0.0,1.0,10.0,25.0,5209.0,3.553273,1.668095,1.0,2.0,3.0,5.0,18.0,2264.0,4.400618,1.627161,1.0,4.0,5.0,6.0,6.0,5209.0,23.049453,27.93589,0.0,12.0,21.0,27.0,952.0,2187.0,4.806127,2.173385,1.0,3.0,5.0,7.0,7.0,4649.0,5.686814,1.917848,1.0,4.0,7.0,7.0,7.0,3270.0,77.933333,90.675591,10.0,30.0,60.0,120.0,900.0,5209.0,140.680937,114.291445,0.0,60.0,120.0,180.0,960.0,5145.0,156.017901,8.290649,103.7,150.0,155.5,161.5,192.1,5145.0,61.95895,11.763451,24.0,53.5,60.9,69.0,161.3,5145.0,83.525851,11.177721,42.0,76.0,83.0,91.0,188.0,5145.0,94.470282,8.916634,58.0,89.0,94.0,100.0,179.0,5100.0,81.754902,16.152888,20.0,74.0,80.0,88.0,225.0,5100.0,144.196471,38.386882,77.0,115.0,138.0,167.0,400.0,5100.0,77.111373,3.302454,50.0,77.0,77.0,77.0,181.0,5100.0,42.814706,13.315517,15.0,34.0,40.0,49.0,100.0,5209.0,78.6252,12.184838,40.666667,70.333333,77.333333,85.666667,154.666667


In [27]:
# Add split column
# Correcting the method to add a 'split' column
processed_data['split'] = processed_data.groupby('country')['country'] \
    .transform(lambda x: np.random.choice(['TRAIN', 'TEST'], size=len(x), p=[0.8, 0.2]))


In [28]:
processed_data.head()

Unnamed: 0,country,sex,age,years_at_school,level_of_education,marital_status,work_status,ppl_in_household,currently_smoke_tobacco,type_tobacco,smoke_home_workplace,consumed_alcohol,quit_drinking_for_health,number_alcoholic_drinks,number_daily_fruit_vegetables,salt_consumption,work_intensity,days_vigorous_exercise,days_moderate_exercise,time_walking_bicycling_minutes,time_sedentary,had_blood_pressure_measurement,taken_drugs_for_raised_bp,had_blood_sugar_measurement,taken_diabetes_drugs,had_cholesterol_measurement,taken_cholesterol_oral_treatment,had_heart_attack,taking_heart_disease_medication,treated_for_raised_bp,are_you_pregnant,height,weight,waist_circumference,hip_circumference,fasting_blood_glucose,total_cholesterol,triglycerides,hdl_cholesterol,reading_bpm,blood_pressure,split
0,ethiopia,missing,,0.0,no formal schooling,married,unemployed,2.0,no,missing,no,yes,missing,5.0,0.0,normal,vigorous-intensity,2.0,7.0,120.0,120.0,no,missing,no,missing,no,missing,no,no,no,no,150.5,45.3,68.0,84.0,77.0,113.0,52.6,39.0,84.0,normal,TRAIN
2,ethiopia,missing,,4.0,elementary school,not married,student,4.0,no,missing,no,yes,missing,3.0,0.0,normal,moderate-intensity,,3.0,,180.0,no,missing,no,missing,no,missing,no,no,no,no,153.0,51.1,64.0,85.0,78.0,112.0,61.2,25.0,75.333333,normal,TRAIN
3,ethiopia,missing,,0.0,no formal schooling,married,student,1.0,no,missing,no,yes,missing,4.0,0.0,normal,moderate-intensity,,2.0,,150.0,no,missing,no,missing,no,missing,no,no,no,no,160.0,43.4,62.0,78.0,70.0,100.0,27.1,58.0,75.333333,normal,TRAIN
4,ethiopia,missing,,0.0,no formal schooling,not married,unemployed,3.0,no,missing,no,no,missing,,0.0,low,vigorous-intensity,7.0,7.0,30.0,120.0,no,missing,no,missing,no,missing,no,no,no,no,147.0,50.0,70.0,94.0,80.0,100.0,61.2,39.0,87.0,normal,TRAIN
5,ethiopia,missing,,4.0,elementary school,not married,retired,4.0,no,missing,no,yes,missing,4.0,4.0,normal,vigorous-intensity,5.0,3.0,90.0,60.0,no,missing,no,missing,no,missing,no,no,no,missing,158.5,42.6,65.0,81.0,73.0,100.0,78.2,25.0,77.333333,normal,TRAIN


In [29]:
split_counts = processed_data.groupby(['country', 'split']).size().unstack(fill_value=0)

In [30]:
split_counts

split,TEST,TRAIN
country,Unnamed: 1_level_1,Unnamed: 2_level_1
afghanistan,727,2876
algeria,1166,4939
armenia,348,1475
azerbaijan,482,2013
bahamas,242,1113
bangladesh,1471,5964
barbados,56,242
belarus,979,3916
benin,921,3528
bhutan,1088,4121


In [31]:
processed_data.columns

Index(['country', 'sex', 'age', 'years_at_school', 'level_of_education',
       'marital_status', 'work_status', 'ppl_in_household',
       'currently_smoke_tobacco', 'type_tobacco', 'smoke_home_workplace',
       'consumed_alcohol', 'quit_drinking_for_health',
       'number_alcoholic_drinks', 'number_daily_fruit_vegetables',
       'salt_consumption', 'work_intensity', 'days_vigorous_exercise',
       'days_moderate_exercise', 'time_walking_bicycling_minutes',
       'time_sedentary', 'had_blood_pressure_measurement',
       'taken_drugs_for_raised_bp', 'had_blood_sugar_measurement',
       'taken_diabetes_drugs', 'had_cholesterol_measurement',
       'taken_cholesterol_oral_treatment', 'had_heart_attack',
       'taking_heart_disease_medication', 'treated_for_raised_bp',
       'are_you_pregnant', 'height', 'weight', 'waist_circumference',
       'hip_circumference', 'fasting_blood_glucose', 'total_cholesterol',
       'triglycerides', 'hdl_cholesterol', 'reading_bpm', 'blood_pressu

In [32]:
# Counting the number of rows for each country
country_counts = processed_data['country'].value_counts()

# Calculating the total number of rows
total_rows = len(processed_data)

# Calculating the cumulative percentage of data represented by each country
country_cumulative_percent = country_counts.cumsum() / total_rows

# Selecting countries comprising 80% of the data
selected_countries = country_cumulative_percent[country_cumulative_percent <= 0.8].index.tolist()
selected_countries

['ethiopia',
 'bangladesh',
 'myanmar',
 'rwanda',
 'palestine',
 'eritrea',
 'algeria',
 'nepal',
 'mongolia',
 'bhutan',
 'madagascar',
 'tanzania',
 'belarus',
 'benin',
 'ecuador',
 'vanuatu',
 'comoros',
 'georgia',
 'moldova',
 'tonga',
 'afghanistan',
 'uganda',
 'togo',
 'botswana',
 'zambia',
 'malawi',
 'libya',
 'namibia',
 'gambia',
 'eswatini',
 'mozambique',
 'kyrgyzstan']

In [33]:
unique_countries = processed_data['country'].unique()
len(unique_countries)

57

In [34]:
len(selected_countries)

32

In [35]:
processed_data.shape

(184674, 42)

In [36]:
data = processed_data
# data = processed_data[processed_data['country'].isin(selected_countries)]

In [37]:
# class count
class_count = processed_data.groupby(['blood_pressure']).size()#.unstack(fill_value=0)
class_count

blood_pressure
high      105677
normal     78997
dtype: int64

In [38]:
# class count
class_count = processed_data.groupby(['country', 'split', 'blood_pressure']).size().unstack(fill_value=0)
class_count

Unnamed: 0_level_0,blood_pressure,high,normal
country,split,Unnamed: 2_level_1,Unnamed: 3_level_1
afghanistan,TEST,433,294
afghanistan,TRAIN,1582,1294
algeria,TEST,582,584
algeria,TRAIN,2532,2407
armenia,TEST,253,95
armenia,TRAIN,1032,443
azerbaijan,TEST,300,182
azerbaijan,TRAIN,1301,712
bahamas,TEST,155,87
bahamas,TRAIN,717,396


## Global Modelling

For the code below, the results weren't so great when we used `cross_validate`.

In [41]:
# from sklearn.model_selection import cross_validate
# from sklearn.calibration import LabelEncoder
# from sklearn.compose import ColumnTransformer
# from sklearn.discriminant_analysis import StandardScaler
# from sklearn.impute import SimpleImputer
# from sklearn.linear_model import LogisticRegression
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import OneHotEncoder
# from xgboost import XGBClassifier
# import pandas as pd
# import numpy as np

# col_to_drop = []
# X = data.drop(columns=['blood_pressure'] + col_to_drop).sample(frac=1, random_state=42)
# y = data['blood_pressure'].reindex(X.index)

# # Use the 'split' column to separate training and validation sets
# train_indices = data[data['split'] == 'TRAIN'].index
# valid_indices = data[data['split'] == 'TEST'].index
# X_train, y_train = X.loc[train_indices], y.loc[train_indices]
# X_valid, y_valid = X.loc[valid_indices], y.loc[valid_indices]
# print("Finished sorting columns")

# # Preprocessing steps
# categorical_features = X.columns[X.dtypes == 'object'].tolist()
# numeric_features = X.columns[X.dtypes != 'object'].tolist()

# numeric_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='median')),
#     ('scaler', StandardScaler())])

# categorical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
#     ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numeric_transformer, numeric_features),
#         ('cat', categorical_transformer, categorical_features)])

# # Encode labels
# label_encoder = LabelEncoder()
# y_train_encoded = label_encoder.fit_transform(y_train)
# y_valid_encoded = label_encoder.transform(y_valid)

# # Preprocess the data
# X_train_processed = preprocessor.fit_transform(X_train)
# X_valid_processed = preprocessor.transform(X_valid)
# print("Finished processor scaling")

# # Count the occurrences of each class
# unique, counts = np.unique(y_train_encoded, return_counts=True)
# class_counts = dict(zip(unique, counts))

# # Calculate class weights
# total_instances = len(y_train_encoded)
# class_weights = {cls: total_instances/count for cls, count in class_counts.items()}

# # Print the shapes of all the splits
# print("X_train shape:", X_train_processed.shape)
# print("X_test shape:", X_valid_processed.shape)
# print("y_train_encoded shape:", y_train_encoded.shape)
# print("y_test_encoded shape:", y_valid_encoded.shape)

# # Models to train
# models = {
#     'Logistic Regression': LogisticRegression(),
#     'KNN': KNeighborsClassifier(),
#     'Random Forest': RandomForestClassifier(),
#     'XGBoost': XGBClassifier()
# }

# # Results table
# results = pd.DataFrame(columns=['Model', 'Dataset', 'Accuracy', 'Accuracy Std', 'Precision', 'Precision Std',
#                                 'Recall', 'Recall Std', 'F1 Score', 'F1 Score Std'])

# # Train and evaluate models using cross-validation
# cv = 5  # Number of cross-validation folds
# scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']

# for name, model in models.items():
#     # Perform cross-validation
#     cv_scores = cross_validate(model, X_train_processed, y_train_encoded, cv=cv, scoring=scoring)
    
#     # Compute mean and standard deviation of metrics
#     accuracy_mean = np.mean(cv_scores['test_accuracy'])
#     accuracy_std = np.std(cv_scores['test_accuracy'])
#     precision_mean = np.mean(cv_scores['test_precision_macro'])
#     precision_std = np.std(cv_scores['test_precision_macro'])
#     recall_mean = np.mean(cv_scores['test_recall_macro'])
#     recall_std = np.std(cv_scores['test_recall_macro'])
#     f1_mean = np.mean(cv_scores['test_f1_macro'])
#     f1_std = np.std(cv_scores['test_f1_macro'])
    
#     # Add to results
#     new_row = pd.DataFrame({
#         'Model': name,
#         'Dataset': 'Global',
#         'Accuracy': accuracy_mean,
#         'Accuracy Std': accuracy_std,
#         'Precision': precision_mean,
#         'Precision Std': precision_std,
#         'Recall': recall_mean,
#         'Recall Std': recall_std,
#         'F1 Score': f1_mean,
#         'F1 Score Std': f1_std
#     }, index=[0])
#     results = pd.concat([results, new_row], ignore_index=True)

Finished sorting columns
Finished processor scaling
X_train shape: (147654, 154)
X_test shape: (37020, 154)
y_train_encoded shape: (147654,)
y_test_encoded shape: (37020,)


In [43]:
# from sklearn.model_selection import train_test_split
from sklearn.calibration import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier
import pandas as pd

col_to_drop = []
X = data.drop(columns=['blood_pressure'] + col_to_drop).sample(frac=1, random_state=42)
y = data['blood_pressure'].reindex(X.index)

# Use the 'split' column to separate training and validation sets
train_indices = data[data['split'] == 'TRAIN'].index
valid_indices = data[data['split'] == 'TEST'].index
X_train, y_train = X.loc[train_indices], y.loc[train_indices]
X_valid, y_valid = X.loc[valid_indices], y.loc[valid_indices]

print("Finished sorting columns")


# Preprocessing steps
categorical_features = X.columns[X.dtypes == 'object'].tolist()
numeric_features = X.columns[X.dtypes != 'object'].tolist()


numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])


categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])


preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])


# Encode labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_valid_encoded = label_encoder.transform(y_valid)

# Preprocess the data
X_train_processed = preprocessor.fit_transform(X_train)
X_valid_processed = preprocessor.transform(X_valid)

print("Finished processor scaling")

# Count the occurrences of each class
unique, counts = np.unique(y_train_encoded, return_counts=True)
class_counts = dict(zip(unique, counts))


# Calculate class weights
total_instances = len(y_train_encoded)
class_weights = {cls: total_instances/count for cls, count in class_counts.items()}

# Print the shapes of all the splits
print("X_train shape:", X_train_processed.shape)
print("X_test shape:", X_valid_processed.shape)
print("y_train_encoded shape:", y_train_encoded.shape)
print("y_test_encoded shape:", y_valid_encoded.shape)

# Models to train
models = {
    'Logistic Regression': LogisticRegression(),
    'KNN': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(),
    'XGBoost': XGBClassifier()
}

# Results table
results = pd.DataFrame(columns=['Model', 'Dataset', 'Accuracy', 'Precision', 
                                'Recall', 'F1 Score'])

# Train and evaluate models
for name, model in models.items():
    # Train the global model
    model.fit(X_train_processed, y_train_encoded)
    y_pred = model.predict(X_valid_processed)
    
    # Compute metrics
    accuracy = accuracy_score(y_valid_encoded, y_pred)
    precision = precision_score(y_valid_encoded, y_pred, average='macro')
    recall = recall_score(y_valid_encoded, y_pred, average='macro')
    f1 = f1_score(y_valid_encoded, y_pred, average='macro')
    
    # Add to results
    new_row = pd.DataFrame({
        'Model': name, 
        'Dataset': 'Global',
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall, 
        'F1 Score': f1
    }, index=[0])

    results = pd.concat([results, new_row], ignore_index=True)

Finished sorting columns
Finished processor scaling
X_train shape: (147654, 154)
X_test shape: (37020, 154)
y_train_encoded shape: (147654,)
y_test_encoded shape: (37020,)


In [41]:
results

Unnamed: 0,Model,Dataset,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,Global,0.682,0.67508,0.670566,0.671893
1,KNN,Global,0.632135,0.6239,0.62277,0.623198
2,Random Forest,Global,0.681374,0.675614,0.665052,0.666707
3,XGBoost,Global,0.685157,0.678514,0.672741,0.674266


In [43]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping

# One-hot encode the labels
y_train_one_hot = to_categorical(y_train_encoded, num_classes=2)
y_valid_one_hot = to_categorical(y_valid_encoded, num_classes=2)

# Early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Define the FCNN model
def create_fcnn_model(input_dim, num_classes):
    model = Sequential([
        Dense(128, input_dim=input_dim, activation='relu'),
        Dense(64, activation='relu'),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Create the FCNN model
fcnn_model = create_fcnn_model(X_train_processed.shape[1], 2)

# Fit the model to the training data
fcnn_model.fit(X_train_processed, y_train_one_hot, epochs=30, batch_size=32, 
               class_weight=class_weights, validation_data=(X_valid_processed, y_valid_one_hot),
               callbacks=[early_stopping])

# Evaluate the model on the validation set
y_pred_fcnn = fcnn_model.predict(X_valid_processed)

# Convert predictions from one-hot to label encoding for metric computation
y_pred_fcnn_labels = np.argmax(y_pred_fcnn, axis=1)

# Compute metrics for FCNN
accuracy_fcnn = accuracy_score(y_valid_encoded, y_pred_fcnn_labels)
precision_fcnn = precision_score(y_valid_encoded, y_pred_fcnn_labels, average='macro')
recall_fcnn = recall_score(y_valid_encoded, y_pred_fcnn_labels, average='macro')
f1_fcnn = f1_score(y_valid_encoded, y_pred_fcnn_labels, average='macro')

# Add FCNN results to the table
fcnn_row = pd.DataFrame({
    'Model': 'FCNN', 
    'Dataset': 'Global',
    'Accuracy': accuracy_fcnn,
    'Precision': precision_fcnn,
    'Recall': recall_fcnn, 
    'F1 Score': f1_fcnn
}, index=[0])

results = pd.concat([results, fcnn_row], ignore_index=True)

2024-01-12 02:07:20.936758: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-01-12 02:07:20.961515: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2256] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30


In [44]:
results

Unnamed: 0,Model,Dataset,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,Global,0.682,0.67508,0.670566,0.671893
1,KNN,Global,0.632135,0.6239,0.62277,0.623198
2,Random Forest,Global,0.681374,0.675614,0.665052,0.666707
3,XGBoost,Global,0.685157,0.678514,0.672741,0.674266
4,FCNN,Global,0.68249,0.676402,0.676859,0.676615


## Regional Modelling

In [50]:
from sklearn.calibration import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping

col_to_drop = []
X = data.drop(columns=['blood_pressure'] + col_to_drop).sample(frac=1, random_state=42)
y = data['blood_pressure'].reindex(X.index)

# Use the 'split' column to separate training and validation sets
train_indices = data[data['split'] == 'TRAIN'].index
valid_indices = data[data['split'] == 'TEST'].index
X_train, y_train = X.loc[train_indices], y.loc[train_indices]
X_valid, y_valid = X.loc[valid_indices], y.loc[valid_indices]
print("Finished sorting columns")

# Preprocessing steps
categorical_features = X.columns[X.dtypes == 'object'].tolist()
numeric_features = X.columns[X.dtypes != 'object'].tolist()

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Encode labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_valid_encoded = label_encoder.transform(y_valid)

# Preprocess the data
X_train_processed = preprocessor.fit_transform(X_train)
X_valid_processed = preprocessor.transform(X_valid)
print("Finished processor scaling")

# Count the occurrences of each class
unique, counts = np.unique(y_train_encoded, return_counts=True)
class_counts = dict(zip(unique, counts))

# Calculate class weights
total_instances = len(y_train_encoded)
class_weights = {cls: total_instances/count for cls, count in class_counts.items()}

# Print the shapes of all the splits
print("X_train shape:", X_train_processed.shape)
print("X_test shape:", X_valid_processed.shape)
print("y_train_encoded shape:", y_train_encoded.shape)
print("y_test_encoded shape:", y_valid_encoded.shape)

# Models to train
models = {
    'Logistic Regression': LogisticRegression(),
    'KNN': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(),
    'XGBoost': XGBClassifier()
}

# Region dictionary
regions = {
    'Sub-Saharan Africa': [
        'ethiopia', 'eswatini', 'chad', 'gambia', 'malawi', 'niger', 'togo', 'benin',
        'uganda', 'mozambique', 'zambia', 'lesotho', 'namibia', 'tanzania', 'liberia',
        'ghana', 'rwanda', 'guinea'
    ],
    'East Asia and Pacific': [
        'palau', 'micronesia', 'fiji', 'vanuatu', 'mongolia', 'myanmar', 'tonga',
        'samoa', 'niue', 'kiribati'
    ],
    'South Asia': [
        'bhutan', 'maldives', 'nepal', 'bangladesh', 'afghanistan'
    ],
    'Middle East and North Africa': [
        'qatar', 'libya', 'algeria'
    ],
    'Europe and Central Asia': [
        'georgia', 'belarus', 'moldova', 'azerbaijan', 'armenia', 'kyrgyzstan'
    ],
    'Latin America and the Caribbean': [
        'bahamas', 'barbados', 'guyana', 'ecuador', 'grenada'
    ]
}

# Define the FCNN model
def create_fcnn_model(input_dim, num_classes):
    model = Sequential([
        Dense(128, input_dim=input_dim, activation='relu'),
        Dense(64, activation='relu'),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Results table
results = pd.DataFrame(columns=['Region', 'Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])

# Train and evaluate models for each region
for region, countries in regions.items():
    # Filter data for the current region
    X_train_region = X_train[X_train['country'].isin(countries)]
    y_train_region = y_train[X_train['country'].isin(countries)]
    X_valid_region = X_valid[X_valid['country'].isin(countries)]
    y_valid_region = y_valid[X_valid['country'].isin(countries)]
    
    # Preprocess the data for the current region
    X_train_region_processed = preprocessor.fit_transform(X_train_region)
    X_valid_region_processed = preprocessor.transform(X_valid_region)
    
    # Encode labels for the current region
    y_train_region_encoded = label_encoder.fit_transform(y_train_region)
    y_valid_region_encoded = label_encoder.transform(y_valid_region)
    
    # One-hot encode the labels for the current region
    y_train_region_one_hot = to_categorical(y_train_region_encoded, num_classes=2)
    y_valid_region_one_hot = to_categorical(y_valid_region_encoded, num_classes=2)
    
    # Train and evaluate models for the current region
    for name, model in models.items():
        # Train the model for the current region
        model.fit(X_train_region_processed, y_train_region_encoded)
        y_pred_region = model.predict(X_valid_region_processed)
        
        # Compute metrics for the current region
        accuracy = accuracy_score(y_valid_region_encoded, y_pred_region)
        precision = precision_score(y_valid_region_encoded, y_pred_region, average='macro')
        recall = recall_score(y_valid_region_encoded, y_pred_region, average='macro')
        f1 = f1_score(y_valid_region_encoded, y_pred_region, average='macro')
        
        # Add to results
        new_row = pd.DataFrame({
            'Region': region,
            'Model': name,
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1 Score': f1
        }, index=[0])
        results = pd.concat([results, new_row], ignore_index=True)
    
    # Create the FCNN model for the current region
    fcnn_model = create_fcnn_model(X_train_region_processed.shape[1], 2)
    
    # Early stopping callback
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    
    # Fit the FCNN model to the training data for the current region
    fcnn_model.fit(X_train_region_processed, y_train_region_one_hot, epochs=30, batch_size=32, 
                   validation_data=(X_valid_region_processed, y_valid_region_one_hot),
                   callbacks=[early_stopping])
    
    # Evaluate the FCNN model on the validation set for the current region
    y_pred_fcnn_region = fcnn_model.predict(X_valid_region_processed)
    
    # Convert predictions from one-hot to label encoding for metric computation
    y_pred_fcnn_region_labels = np.argmax(y_pred_fcnn_region, axis=1)
    
    # Compute metrics for FCNN for the current region
    accuracy_fcnn = accuracy_score(y_valid_region_encoded, y_pred_fcnn_region_labels)
    precision_fcnn = precision_score(y_valid_region_encoded, y_pred_fcnn_region_labels, average='macro')
    recall_fcnn = recall_score(y_valid_region_encoded, y_pred_fcnn_region_labels, average='macro')
    f1_fcnn = f1_score(y_valid_region_encoded, y_pred_fcnn_region_labels, average='macro')
    
    # Add FCNN results for the current region to the table
    fcnn_row = pd.DataFrame({
        'Region': region,
        'Model': 'FCNN',
        'Accuracy': accuracy_fcnn,
        'Precision': precision_fcnn,
        'Recall': recall_fcnn,
        'F1 Score': f1_fcnn
    }, index=[0])
    
    results = pd.concat([results, fcnn_row], ignore_index=True)

2024-03-13 11:20:37.534121: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-13 11:20:37.646118: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-13 11:20:37.646157: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-13 11:20:37.662804: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-13 11:20:37.697388: I tensorflow/core/platform/cpu_feature_guar

Finished sorting columns
Finished processor scaling
X_train shape: (147654, 154)
X_test shape: (37020, 154)
y_train_encoded shape: (147654,)
y_test_encoded shape: (37020,)


2024-03-13 11:20:48.791381: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-03-13 11:20:48.868616: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2256] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30


In [51]:
# Print the results
results

Unnamed: 0,Region,Model,Accuracy,Precision,Recall,F1 Score
0,Sub-Saharan Africa,Logistic Regression,0.648861,0.644065,0.641447,0.642049
1,Sub-Saharan Africa,KNN,0.599936,0.594048,0.592844,0.593083
2,Sub-Saharan Africa,Random Forest,0.643888,0.639734,0.631392,0.631252
3,Sub-Saharan Africa,XGBoost,0.64437,0.639408,0.635315,0.63586
4,Sub-Saharan Africa,FCNN,0.649583,0.646307,0.647006,0.64655
5,East Asia and Pacific,Logistic Regression,0.700882,0.693892,0.688995,0.690592
6,East Asia and Pacific,KNN,0.646561,0.637632,0.636127,0.636703
7,East Asia and Pacific,Random Forest,0.710229,0.706572,0.692152,0.694847
8,East Asia and Pacific,XGBoost,0.700353,0.693469,0.687624,0.689394
9,East Asia and Pacific,FCNN,0.71164,0.707025,0.695255,0.697858


## Country Modelling

In [52]:
from sklearn.calibration import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping

col_to_drop = []
X = data.drop(columns=['blood_pressure'] + col_to_drop).sample(frac=1, random_state=42)
y = data['blood_pressure'].reindex(X.index)

# Use the 'split' column to separate training and validation sets
train_indices = data[data['split'] == 'TRAIN'].index
valid_indices = data[data['split'] == 'TEST'].index
X_train, y_train = X.loc[train_indices], y.loc[train_indices]
X_valid, y_valid = X.loc[valid_indices], y.loc[valid_indices]
print("Finished sorting columns")

# Preprocessing steps
categorical_features = X.columns[X.dtypes == 'object'].tolist()
numeric_features = X.columns[X.dtypes != 'object'].tolist()

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Encode labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_valid_encoded = label_encoder.transform(y_valid)

# Preprocess the data
X_train_processed = preprocessor.fit_transform(X_train)
X_valid_processed = preprocessor.transform(X_valid)
print("Finished processor scaling")

# Count the occurrences of each class
unique, counts = np.unique(y_train_encoded, return_counts=True)
class_counts = dict(zip(unique, counts))

# Calculate class weights
total_instances = len(y_train_encoded)
class_weights = {cls: total_instances/count for cls, count in class_counts.items()}

# Print the shapes of all the splits
print("X_train shape:", X_train_processed.shape)
print("X_test shape:", X_valid_processed.shape)
print("y_train_encoded shape:", y_train_encoded.shape)
print("y_test_encoded shape:", y_valid_encoded.shape)

# Models to train
models = {
    'Logistic Regression': LogisticRegression(),
    'KNN': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(),
    'XGBoost': XGBClassifier()
}

# Define the FCNN model
def create_fcnn_model(input_dim, num_classes):
    model = Sequential([
        Dense(128, input_dim=input_dim, activation='relu'),
        Dense(64, activation='relu'),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# List of unique countries
unique_countries = X_train['country'].unique()

# Results table
results = pd.DataFrame(columns=['Model', 'Dataset', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])

# Train and evaluate models for each country
for country in unique_countries:
    # Filter data for the current country
    X_train_country = X_train[X_train['country'] == country]
    y_train_country = y_train[X_train['country'] == country]
    X_valid_country = X_valid[X_valid['country'] == country]
    y_valid_country = y_valid[X_valid['country'] == country]
    
    # Preprocess the data for the current country
    X_train_country_processed = preprocessor.fit_transform(X_train_country)
    X_valid_country_processed = preprocessor.transform(X_valid_country)
    
    # Encode labels for the current country
    y_train_country_encoded = label_encoder.fit_transform(y_train_country)
    y_valid_country_encoded = label_encoder.transform(y_valid_country)
    
    # One-hot encode the labels for the current country
    y_train_country_one_hot = to_categorical(y_train_country_encoded, num_classes=2)
    y_valid_country_one_hot = to_categorical(y_valid_country_encoded, num_classes=2)
    
    # Train and evaluate models for the current country
    for name, model in models.items():
        # Train the model for the current country
        model.fit(X_train_country_processed, y_train_country_encoded)
        y_pred_country = model.predict(X_valid_country_processed)
        
        # Compute metrics for the current country
        accuracy = accuracy_score(y_valid_country_encoded, y_pred_country)
        precision = precision_score(y_valid_country_encoded, y_pred_country, average='macro')
        recall = recall_score(y_valid_country_encoded, y_pred_country, average='macro')
        f1 = f1_score(y_valid_country_encoded, y_pred_country, average='macro')
        
        # Add to results
        new_row = pd.DataFrame({
            'Model': name,
            'Dataset': country,
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1 Score': f1
        }, index=[0])
        results = pd.concat([results, new_row], ignore_index=True)
    
    # Create the FCNN model for the current country
    fcnn_model = create_fcnn_model(X_train_country_processed.shape[1], 2)
    
    # Early stopping callback
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    
    # Fit the FCNN model to the training data for the current country
    fcnn_model.fit(X_train_country_processed, y_train_country_one_hot, epochs=30, batch_size=32, 
                   validation_data=(X_valid_country_processed, y_valid_country_one_hot),
                   callbacks=[early_stopping])
    
    # Evaluate the FCNN model on the validation set for the current country
    y_pred_fcnn_country = fcnn_model.predict(X_valid_country_processed)
    
    # Convert predictions from one-hot to label encoding for metric computation
    y_pred_fcnn_country_labels = np.argmax(y_pred_fcnn_country, axis=1)
    
    # Compute metrics for FCNN for the current country
    accuracy_fcnn = accuracy_score(y_valid_country_encoded, y_pred_fcnn_country_labels)
    precision_fcnn = precision_score(y_valid_country_encoded, y_pred_fcnn_country_labels, average='macro')
    recall_fcnn = recall_score(y_valid_country_encoded, y_pred_fcnn_country_labels, average='macro')
    f1_fcnn = f1_score(y_valid_country_encoded, y_pred_fcnn_country_labels, average='macro')
    
    # Add FCNN results for the current country to the table
    fcnn_row = pd.DataFrame({
        'Model': 'FCNN',
        'Dataset': country,
        'Accuracy': accuracy_fcnn,
        'Precision': precision_fcnn,
        'Recall': recall_fcnn,
        'F1 Score': f1_fcnn
    }, index=[0])
    
    results = pd.concat([results, fcnn_row], ignore_index=True)

Finished sorting columns
Finished processor scaling
X_train shape: (147654, 154)
X_test shape: (37020, 154)
y_train_encoded shape: (147654,)
y_test_encoded shape: (37020,)
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 1/30
Ep

In [53]:
# Print the results
results

Unnamed: 0,Model,Dataset,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,ethiopia,0.625536,0.625842,0.623066,0.622245
1,KNN,ethiopia,0.589592,0.58878,0.588278,0.588184
2,Random Forest,ethiopia,0.624464,0.623993,0.622877,0.62274
3,XGBoost,ethiopia,0.598712,0.598074,0.597907,0.597934
4,FCNN,ethiopia,0.622854,0.623633,0.619993,0.618681
5,Logistic Regression,georgia,0.733831,0.705691,0.688705,0.69465
6,KNN,georgia,0.662935,0.622973,0.616784,0.619009
7,Random Forest,georgia,0.717662,0.690813,0.648612,0.655733
8,XGBoost,georgia,0.702736,0.668253,0.653978,0.658629
9,FCNN,georgia,0.725124,0.696775,0.668602,0.676023
