# Processor Notebook

## Imports and setup

In [1]:
import pandas as pd

%load_ext autoreload
%autoreload 2

from base.baseprocess import DataProcessor
from util.constants import Constants
from util.mapping import CategoricalMapping

In [2]:
# change None to specific number to shorten the display list, 'None' to display the full list
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

## Load Data

In [3]:
# import data
data = pd.read_csv(Constants.HARMONIZER_FILE_PATH, low_memory=False)

In [4]:
data.head()

Unnamed: 0,country,sex,age,years_at_school,level_of_education,marital_status,work_status,ppl_in_household,earnings_per_year,currently_smoke_tobacco,age_started_smoking,length_time_smoking,number_tobacco,type_tobacco,age_stopped_smoking,smoke_home_workplace,consumed_alcohol,quit_drinking_for_health,number_alcoholic_drinks,number_daily_fruit_vegetables,salt_consumption,work_intensity,days_vigorous_exercise,days_moderate_exercise,time_walking_bicycling_minutes,time_sedentary,had_blood_pressure_measurement,taken_drugs_for_raised_bp,had_blood_sugar_measurement,taken_diabetes_drugs,had_cholesterol_measurement,taken_cholesterol_oral_treatment,had_heart_attack,taking_heart_disease_medication,reading1_systolic,reading1_diastolic,reading2_systolic,reading2_diastolic,reading3_systolic,reading3_diastolic,treated_for_raised_bp,are_you_pregnant,height,weight,waist_circumference,hip_circumference,reading1_bpm,reading2_bpm,reading3_bpm,fasting_blood_glucose,total_cholesterol,urinary_sodium,urinary_creatinine,triglycerides,hdl_cholesterol
0,ethiopia,,,0.0,1,2.0,8.0,2.0,,2.0,,,,,,2.0,1.0,,5.0,0.0,3.0,vigorous-intensity,2.0,7.0,120.0,120.0,2.0,,2.0,,2.0,,2.0,2.0,92.0,71.0,92.0,67.0,97.0,65.0,2.0,2.0,150.5,45.3,68.0,84.0,83.0,81.0,88.0,77.0,113.0,,,52.6,39.0
1,ethiopia,,,0.0,1,2.0,8.0,2.0,,2.0,,,,,,2.0,2.0,,,0.0,3.0,vigorous-intensity,4.0,5.0,30.0,60.0,2.0,,2.0,,2.0,,2.0,2.0,88.0,53.0,84.0,30.0,89.0,53.0,2.0,2.0,155.5,40.5,63.0,84.0,98.0,97.0,102.0,76.0,100.0,,,85.5,36.0
2,ethiopia,,,4.0,2,1.0,5.0,4.0,,2.0,,,,,,2.0,1.0,,3.0,0.0,3.0,moderate-intensity,,3.0,,180.0,2.0,,2.0,,2.0,,2.0,2.0,122.0,79.0,118.0,74.0,99.0,69.0,2.0,2.0,153.0,51.1,64.0,85.0,77.0,74.0,75.0,78.0,112.0,,,61.2,25.0
3,ethiopia,,,0.0,1,2.0,5.0,1.0,,2.0,,,,,,2.0,1.0,,4.0,0.0,3.0,moderate-intensity,,2.0,,150.0,2.0,,2.0,,2.0,,2.0,2.0,110.0,78.0,104.0,75.0,113.0,80.0,2.0,2.0,160.0,43.4,62.0,78.0,77.0,76.0,73.0,70.0,100.0,,,27.1,58.0
4,ethiopia,,,0.0,1,1.0,8.0,3.0,,2.0,,,,,,2.0,2.0,,,0.0,4.0,vigorous-intensity,7.0,7.0,30.0,120.0,2.0,,2.0,,2.0,,2.0,2.0,115.0,82.0,111.0,78.0,107.0,73.0,2.0,2.0,147.0,50.0,70.0,94.0,82.0,92.0,87.0,80.0,100.0,,,61.2,39.0


In [5]:
print(f"Data shape: {data.shape}")

Data shape: (218514, 55)


## Clean Data

In [6]:
# Create the DataProcessor instance and process the data
processor = DataProcessor(data, CategoricalMapping.get_mappings())

In [7]:
# process numeric variables
processed_data = processor.process_numeric_variables(data, Constants.PROCESSOR_NUMERIC_COLS)

In [8]:
processed_data.head()

Unnamed: 0,country,sex,age,years_at_school,level_of_education,marital_status,work_status,ppl_in_household,earnings_per_year,currently_smoke_tobacco,age_started_smoking,length_time_smoking,number_tobacco,type_tobacco,age_stopped_smoking,smoke_home_workplace,consumed_alcohol,quit_drinking_for_health,number_alcoholic_drinks,number_daily_fruit_vegetables,salt_consumption,work_intensity,days_vigorous_exercise,days_moderate_exercise,time_walking_bicycling_minutes,time_sedentary,had_blood_pressure_measurement,taken_drugs_for_raised_bp,had_blood_sugar_measurement,taken_diabetes_drugs,had_cholesterol_measurement,taken_cholesterol_oral_treatment,had_heart_attack,taking_heart_disease_medication,reading1_systolic,reading1_diastolic,reading2_systolic,reading2_diastolic,reading3_systolic,reading3_diastolic,treated_for_raised_bp,are_you_pregnant,height,weight,waist_circumference,hip_circumference,reading1_bpm,reading2_bpm,reading3_bpm,fasting_blood_glucose,total_cholesterol,urinary_sodium,urinary_creatinine,triglycerides,hdl_cholesterol
0,ethiopia,,,0.0,1,2.0,8.0,2.0,,2.0,,,,,,2.0,1.0,,5.0,0.0,3.0,vigorous-intensity,2.0,7.0,120.0,120.0,2.0,,2.0,,2.0,,2.0,2.0,92.0,71.0,92.0,67.0,97.0,65.0,2.0,2.0,150.5,45.3,68.0,84.0,83.0,81.0,88.0,77.0,113.0,,,52.6,39.0
2,ethiopia,,,4.0,2,1.0,5.0,4.0,,2.0,,,,,,2.0,1.0,,3.0,0.0,3.0,moderate-intensity,,3.0,,180.0,2.0,,2.0,,2.0,,2.0,2.0,122.0,79.0,118.0,74.0,99.0,69.0,2.0,2.0,153.0,51.1,64.0,85.0,77.0,74.0,75.0,78.0,112.0,,,61.2,25.0
3,ethiopia,,,0.0,1,2.0,5.0,1.0,,2.0,,,,,,2.0,1.0,,4.0,0.0,3.0,moderate-intensity,,2.0,,150.0,2.0,,2.0,,2.0,,2.0,2.0,110.0,78.0,104.0,75.0,113.0,80.0,2.0,2.0,160.0,43.4,62.0,78.0,77.0,76.0,73.0,70.0,100.0,,,27.1,58.0
4,ethiopia,,,0.0,1,1.0,8.0,3.0,,2.0,,,,,,2.0,2.0,,,0.0,4.0,vigorous-intensity,7.0,7.0,30.0,120.0,2.0,,2.0,,2.0,,2.0,2.0,115.0,82.0,111.0,78.0,107.0,73.0,2.0,2.0,147.0,50.0,70.0,94.0,82.0,92.0,87.0,80.0,100.0,,,61.2,39.0
5,ethiopia,,,4.0,2,1.0,7.0,4.0,,2.0,,,,,,2.0,1.0,,4.0,4.0,3.0,vigorous-intensity,5.0,3.0,90.0,60.0,2.0,,2.0,,2.0,,2.0,2.0,110.0,69.0,94.0,67.0,98.0,65.0,2.0,,158.5,42.6,65.0,81.0,71.0,78.0,83.0,73.0,100.0,,,78.2,25.0


In [9]:
print(f"Shape of data (numeric cleaning): {processed_data.shape}")

Shape of data (numeric cleaning): (190575, 55)


In [10]:
# process categorical variables
processed_data = processor.process_categorical_variables(processed_data, Constants.PROCESSOR_CATEGORICAL_COLS)

In [11]:
processed_data.head()

Unnamed: 0,country,sex,age,years_at_school,level_of_education,marital_status,work_status,ppl_in_household,earnings_per_year,currently_smoke_tobacco,age_started_smoking,length_time_smoking,number_tobacco,type_tobacco,age_stopped_smoking,smoke_home_workplace,consumed_alcohol,quit_drinking_for_health,number_alcoholic_drinks,number_daily_fruit_vegetables,salt_consumption,work_intensity,days_vigorous_exercise,days_moderate_exercise,time_walking_bicycling_minutes,time_sedentary,had_blood_pressure_measurement,taken_drugs_for_raised_bp,had_blood_sugar_measurement,taken_diabetes_drugs,had_cholesterol_measurement,taken_cholesterol_oral_treatment,had_heart_attack,taking_heart_disease_medication,reading1_systolic,reading1_diastolic,reading2_systolic,reading2_diastolic,reading3_systolic,reading3_diastolic,treated_for_raised_bp,are_you_pregnant,height,weight,waist_circumference,hip_circumference,reading1_bpm,reading2_bpm,reading3_bpm,fasting_blood_glucose,total_cholesterol,urinary_sodium,urinary_creatinine,triglycerides,hdl_cholesterol
0,ethiopia,missing,,0.0,no formal schooling,married,unemployed,2.0,,no,,,,missing,,no,yes,missing,5.0,0.0,normal,vigorous-intensity,2.0,7.0,120.0,120.0,no,missing,no,missing,no,missing,no,no,92.0,71.0,92.0,67.0,97.0,65.0,no,no,150.5,45.3,68.0,84.0,83.0,81.0,88.0,77.0,113.0,,,52.6,39.0
2,ethiopia,missing,,4.0,elementary school,not married,student,4.0,,no,,,,missing,,no,yes,missing,3.0,0.0,normal,moderate-intensity,,3.0,,180.0,no,missing,no,missing,no,missing,no,no,122.0,79.0,118.0,74.0,99.0,69.0,no,no,153.0,51.1,64.0,85.0,77.0,74.0,75.0,78.0,112.0,,,61.2,25.0
3,ethiopia,missing,,0.0,no formal schooling,married,student,1.0,,no,,,,missing,,no,yes,missing,4.0,0.0,normal,moderate-intensity,,2.0,,150.0,no,missing,no,missing,no,missing,no,no,110.0,78.0,104.0,75.0,113.0,80.0,no,no,160.0,43.4,62.0,78.0,77.0,76.0,73.0,70.0,100.0,,,27.1,58.0
4,ethiopia,missing,,0.0,no formal schooling,not married,unemployed,3.0,,no,,,,missing,,no,no,missing,,0.0,low,vigorous-intensity,7.0,7.0,30.0,120.0,no,missing,no,missing,no,missing,no,no,115.0,82.0,111.0,78.0,107.0,73.0,no,no,147.0,50.0,70.0,94.0,82.0,92.0,87.0,80.0,100.0,,,61.2,39.0
5,ethiopia,missing,,4.0,elementary school,not married,retired,4.0,,no,,,,missing,,no,yes,missing,4.0,4.0,normal,vigorous-intensity,5.0,3.0,90.0,60.0,no,missing,no,missing,no,missing,no,no,110.0,69.0,94.0,67.0,98.0,65.0,no,missing,158.5,42.6,65.0,81.0,71.0,78.0,83.0,73.0,100.0,,,78.2,25.0


In [12]:
print(f"Shape of data (categorical cleaning): {processed_data.shape}")

Shape of data (categorical cleaning): (190575, 55)


## Preprocess Columns and Create Targets

In [13]:
# preprocess columns and create targets
processed_data = processor.preproc_cols_and_create_targets(processed_data)

In [14]:
processed_data.head()

Unnamed: 0,country,sex,age,years_at_school,level_of_education,marital_status,work_status,ppl_in_household,earnings_per_year,currently_smoke_tobacco,age_started_smoking,length_time_smoking,number_tobacco,type_tobacco,age_stopped_smoking,smoke_home_workplace,consumed_alcohol,quit_drinking_for_health,number_alcoholic_drinks,number_daily_fruit_vegetables,salt_consumption,work_intensity,days_vigorous_exercise,days_moderate_exercise,time_walking_bicycling_minutes,time_sedentary,had_blood_pressure_measurement,taken_drugs_for_raised_bp,had_blood_sugar_measurement,taken_diabetes_drugs,had_cholesterol_measurement,taken_cholesterol_oral_treatment,had_heart_attack,taking_heart_disease_medication,treated_for_raised_bp,are_you_pregnant,height,weight,waist_circumference,hip_circumference,fasting_blood_glucose,total_cholesterol,urinary_sodium,urinary_creatinine,triglycerides,hdl_cholesterol,reading_bpm,blood_pressure
0,ethiopia,missing,,0.0,no formal schooling,married,unemployed,2.0,,no,,,,missing,,no,yes,missing,5.0,0.0,normal,vigorous-intensity,2.0,7.0,120.0,120.0,no,missing,no,missing,no,missing,no,no,no,no,150.5,45.3,68.0,84.0,77.0,113.0,,,52.6,39.0,84.0,normal
2,ethiopia,missing,,4.0,elementary school,not married,student,4.0,,no,,,,missing,,no,yes,missing,3.0,0.0,normal,moderate-intensity,,3.0,,180.0,no,missing,no,missing,no,missing,no,no,no,no,153.0,51.1,64.0,85.0,78.0,112.0,,,61.2,25.0,75.333333,normal
3,ethiopia,missing,,0.0,no formal schooling,married,student,1.0,,no,,,,missing,,no,yes,missing,4.0,0.0,normal,moderate-intensity,,2.0,,150.0,no,missing,no,missing,no,missing,no,no,no,no,160.0,43.4,62.0,78.0,70.0,100.0,,,27.1,58.0,75.333333,normal
4,ethiopia,missing,,0.0,no formal schooling,not married,unemployed,3.0,,no,,,,missing,,no,no,missing,,0.0,low,vigorous-intensity,7.0,7.0,30.0,120.0,no,missing,no,missing,no,missing,no,no,no,no,147.0,50.0,70.0,94.0,80.0,100.0,,,61.2,39.0,87.0,normal
5,ethiopia,missing,,4.0,elementary school,not married,retired,4.0,,no,,,,missing,,no,yes,missing,4.0,4.0,normal,vigorous-intensity,5.0,3.0,90.0,60.0,no,missing,no,missing,no,missing,no,no,no,missing,158.5,42.6,65.0,81.0,73.0,100.0,,,78.2,25.0,77.333333,normal


In [15]:
print(f"Preprocess columns and create targets: {processed_data.shape}")

Preprocess columns and create targets: (184674, 48)


In [16]:
# remove column with more than 80% missing value
processed_data = processor.remove_columns_with_missing_values(processed_data)

In [17]:
processed_data.head()

Unnamed: 0,country,sex,age,years_at_school,level_of_education,marital_status,work_status,ppl_in_household,currently_smoke_tobacco,type_tobacco,smoke_home_workplace,consumed_alcohol,quit_drinking_for_health,number_alcoholic_drinks,number_daily_fruit_vegetables,salt_consumption,work_intensity,days_vigorous_exercise,days_moderate_exercise,time_walking_bicycling_minutes,time_sedentary,had_blood_pressure_measurement,taken_drugs_for_raised_bp,had_blood_sugar_measurement,taken_diabetes_drugs,had_cholesterol_measurement,taken_cholesterol_oral_treatment,had_heart_attack,taking_heart_disease_medication,treated_for_raised_bp,are_you_pregnant,height,weight,waist_circumference,hip_circumference,fasting_blood_glucose,total_cholesterol,triglycerides,hdl_cholesterol,reading_bpm,blood_pressure
0,ethiopia,missing,,0.0,no formal schooling,married,unemployed,2.0,no,missing,no,yes,missing,5.0,0.0,normal,vigorous-intensity,2.0,7.0,120.0,120.0,no,missing,no,missing,no,missing,no,no,no,no,150.5,45.3,68.0,84.0,77.0,113.0,52.6,39.0,84.0,normal
2,ethiopia,missing,,4.0,elementary school,not married,student,4.0,no,missing,no,yes,missing,3.0,0.0,normal,moderate-intensity,,3.0,,180.0,no,missing,no,missing,no,missing,no,no,no,no,153.0,51.1,64.0,85.0,78.0,112.0,61.2,25.0,75.333333,normal
3,ethiopia,missing,,0.0,no formal schooling,married,student,1.0,no,missing,no,yes,missing,4.0,0.0,normal,moderate-intensity,,2.0,,150.0,no,missing,no,missing,no,missing,no,no,no,no,160.0,43.4,62.0,78.0,70.0,100.0,27.1,58.0,75.333333,normal
4,ethiopia,missing,,0.0,no formal schooling,not married,unemployed,3.0,no,missing,no,no,missing,,0.0,low,vigorous-intensity,7.0,7.0,30.0,120.0,no,missing,no,missing,no,missing,no,no,no,no,147.0,50.0,70.0,94.0,80.0,100.0,61.2,39.0,87.0,normal
5,ethiopia,missing,,4.0,elementary school,not married,retired,4.0,no,missing,no,yes,missing,4.0,4.0,normal,vigorous-intensity,5.0,3.0,90.0,60.0,no,missing,no,missing,no,missing,no,no,no,missing,158.5,42.6,65.0,81.0,73.0,100.0,78.2,25.0,77.333333,normal


In [18]:
print(f"Remove column with more than 80% missing value: {processed_data.shape}")

Remove column with more than 80% missing value: (184674, 41)


In [19]:
# Save the processed data for baseline characterstic calculation to the output file
processed_data.to_csv(Constants.BASELINE_FILE_PATH, index=False)
print(f"Baseline data saved to {Constants.BASELINE_FILE_PATH}")

Baseline data saved to data/processed/baseline.csv


In [20]:
# Save the processed data to the output file
processed_data.to_csv(Constants.PROCESSOR_FILE_PATH, index=False)
print(f"Processed data saved to {Constants.PROCESSOR_FILE_PATH}")

Processed data saved to data/processed/processed.csv


In [21]:
processed_data.head()

Unnamed: 0,country,sex,age,years_at_school,level_of_education,marital_status,work_status,ppl_in_household,currently_smoke_tobacco,type_tobacco,smoke_home_workplace,consumed_alcohol,quit_drinking_for_health,number_alcoholic_drinks,number_daily_fruit_vegetables,salt_consumption,work_intensity,days_vigorous_exercise,days_moderate_exercise,time_walking_bicycling_minutes,time_sedentary,had_blood_pressure_measurement,taken_drugs_for_raised_bp,had_blood_sugar_measurement,taken_diabetes_drugs,had_cholesterol_measurement,taken_cholesterol_oral_treatment,had_heart_attack,taking_heart_disease_medication,treated_for_raised_bp,are_you_pregnant,height,weight,waist_circumference,hip_circumference,fasting_blood_glucose,total_cholesterol,triglycerides,hdl_cholesterol,reading_bpm,blood_pressure
0,ethiopia,missing,,0.0,no formal schooling,married,unemployed,2.0,no,missing,no,yes,missing,5.0,0.0,normal,vigorous-intensity,2.0,7.0,120.0,120.0,no,missing,no,missing,no,missing,no,no,no,no,150.5,45.3,68.0,84.0,77.0,113.0,52.6,39.0,84.0,normal
2,ethiopia,missing,,4.0,elementary school,not married,student,4.0,no,missing,no,yes,missing,3.0,0.0,normal,moderate-intensity,,3.0,,180.0,no,missing,no,missing,no,missing,no,no,no,no,153.0,51.1,64.0,85.0,78.0,112.0,61.2,25.0,75.333333,normal
3,ethiopia,missing,,0.0,no formal schooling,married,student,1.0,no,missing,no,yes,missing,4.0,0.0,normal,moderate-intensity,,2.0,,150.0,no,missing,no,missing,no,missing,no,no,no,no,160.0,43.4,62.0,78.0,70.0,100.0,27.1,58.0,75.333333,normal
4,ethiopia,missing,,0.0,no formal schooling,not married,unemployed,3.0,no,missing,no,no,missing,,0.0,low,vigorous-intensity,7.0,7.0,30.0,120.0,no,missing,no,missing,no,missing,no,no,no,no,147.0,50.0,70.0,94.0,80.0,100.0,61.2,39.0,87.0,normal
5,ethiopia,missing,,4.0,elementary school,not married,retired,4.0,no,missing,no,yes,missing,4.0,4.0,normal,vigorous-intensity,5.0,3.0,90.0,60.0,no,missing,no,missing,no,missing,no,no,no,missing,158.5,42.6,65.0,81.0,73.0,100.0,78.2,25.0,77.333333,normal


In [22]:
processed_data.tail()

Unnamed: 0,country,sex,age,years_at_school,level_of_education,marital_status,work_status,ppl_in_household,currently_smoke_tobacco,type_tobacco,smoke_home_workplace,consumed_alcohol,quit_drinking_for_health,number_alcoholic_drinks,number_daily_fruit_vegetables,salt_consumption,work_intensity,days_vigorous_exercise,days_moderate_exercise,time_walking_bicycling_minutes,time_sedentary,had_blood_pressure_measurement,taken_drugs_for_raised_bp,had_blood_sugar_measurement,taken_diabetes_drugs,had_cholesterol_measurement,taken_cholesterol_oral_treatment,had_heart_attack,taking_heart_disease_medication,treated_for_raised_bp,are_you_pregnant,height,weight,waist_circumference,hip_circumference,fasting_blood_glucose,total_cholesterol,triglycerides,hdl_cholesterol,reading_bpm,blood_pressure
218508,guinea,female,15.0,0.0,no formal schooling,married,homemaker,2.0,no,missing,yes,no,missing,,42.0,missing,missing,,,90.0,120.0,no,missing,no,missing,missing,missing,missing,missing,no,no,156.2,53.6,70.0,95.0,54.0,179.0,,,75.0,normal
218509,guinea,female,30.0,13.0,tertiary,married,employed,1.0,no,missing,no,no,missing,,28.0,missing,missing,,,20.0,390.0,yes,missing,no,missing,missing,missing,missing,missing,no,yes,158.5,79.9,104.0,108.6,67.0,150.0,,,89.666667,normal
218510,guinea,male,30.0,0.0,no formal schooling,married,employed,1.0,yes,missing,no,no,missing,,22.0,missing,vigorous-intensity,5.0,2.0,360.0,120.0,no,missing,no,missing,missing,missing,missing,missing,no,missing,165.8,76.9,81.0,93.0,91.0,188.0,,,100.666667,high
218512,guinea,female,38.0,0.0,no formal schooling,married,employed,3.0,no,missing,no,no,missing,,35.0,missing,vigorous-intensity,7.0,3.0,120.0,120.0,yes,missing,no,missing,missing,missing,missing,missing,no,no,157.0,62.5,95.0,99.0,68.0,150.0,,,85.0,high
218513,guinea,female,15.0,11.0,high school,not married,student,3.0,no,missing,no,no,missing,,12.0,missing,missing,,,30.0,180.0,no,missing,no,missing,missing,missing,missing,missing,no,no,164.0,43.0,69.0,81.0,61.0,175.0,,,99.0,normal


## Class Balancing

In [23]:
def calculate_class_distribution(df, group_col, target_col):
    """
    Calculates the class distribution of a target column, grouped by another column.

    Parameters:
    - df: pandas DataFrame, the dataset containing the columns.
    - group_col: str, the name of the column to group by.
    - target_col: str, the name of the target variable column.

    Returns:
    - pandas DataFrame containing the class distribution for each category of the target column, 
      grouped by the specified group column.
    """

    # Calculate the class distribution grouped by the specified column
    class_distribution = df.groupby([group_col, target_col]).size().unstack(fill_value=0)

    # Reset the index and remove the index name for better formatting
    class_distribution.reset_index(inplace=True)
    class_distribution.columns.name = None

    return class_distribution


In [24]:
class_dist = calculate_class_distribution(processed_data, 'country', 'blood_pressure')

class_dist.head()

Unnamed: 0,country,high,normal
0,afghanistan,2015,1588
1,algeria,3114,2991
2,armenia,1285,538
3,azerbaijan,1601,894
4,bahamas,872,483


## Preparation for Modelling

In [25]:
import pandas as pd

def train_test_split(df, target_col, test_size=0.2, random_state=42):
    """
    Splits the dataset into a train and test set, ensuring a balanced distribution of 
    the target classes per country in the test set.

    Parameters:
    - df: pandas DataFrame, the dataset to be split.
    - target_col: str, the name of the target variable column.
    - test_size: float, the proportion of the dataset to include in the test split.
    - random_state: int, controls the shuffling applied to the data before applying the split.

    Returns:
    - X_train: Training features.
    - X_test: Testing features.
    - y_train: Training target variable.
    - y_test: Testing target variable.
    """

    # Create empty DataFrames for training and test sets
    train_set = pd.DataFrame(columns=df.columns)
    test_set = pd.DataFrame(columns=df.columns)

    # Group data by country and target column and sample from each subgroup
    for (country, bp_class), group in df.groupby(['country', target_col]):
        # print(f"{country} - {bp_class}")
        test_subset = group.sample(frac=test_size, random_state=random_state)  # Sample for test set
        train_subset = group.drop(test_subset.index)  # Remaining data for training set

        # Append subsets to the respective training and test set DataFrames
        train_set = train_set.append(train_subset)
        test_set = test_set.append(test_subset)

    # Splitting the training and test sets into features and target
    X_train = train_set.drop(target_col, axis=1)
    y_train = train_set[target_col]
    X_test = test_set.drop(target_col, axis=1)
    y_test = test_set[target_col]

    return X_train, X_test, y_train, y_test

In [26]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

def balance_dataset_by_group(df, group_col, target_col):
    """
    Balances the dataset using SMOTE, ensuring equal representation of the target classes within each group.

    Parameters:
    - df: pandas DataFrame, the dataset to be balanced.
    - group_col: str, the name of the column used for grouping.
    - target_col: str, the name of the target variable column.

    Returns:
    - pandas DataFrame, a balanced dataset.
    """
    balanced_df = pd.DataFrame()
    smote = SMOTE(random_state=42)

    # Identify categorical and numerical columns
    categorical_cols = [col for col in df.select_dtypes(include=['object', 'category']).columns 
                        if col != target_col and col != group_col]
    numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

    # Define transformers for the ColumnTransformer
    transformers = [
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')), 
            ('scaler', StandardScaler())
        ]), numerical_cols)
    ]

    if categorical_cols:
        transformers.append(
            ('cat', OneHotEncoder(sparse=False, handle_unknown='ignore'), categorical_cols)
        )

    preprocessor = ColumnTransformer(transformers=transformers)

    # Separate features and target
    X = df.drop([group_col, target_col], axis=1)
    y = df[target_col]

    # Apply preprocessor to the entire dataset
    X_transformed = preprocessor.fit_transform(X)

    # Reconstruct the entire DataFrame with transformed features
    all_features = numerical_cols
    if categorical_cols:
        transformed_cat_features = preprocessor.named_transformers_['cat'].get_feature_names_out()
        all_features += transformed_cat_features.tolist()
    transformed_df = pd.DataFrame(X_transformed, columns=all_features)
    transformed_df[target_col] = y
    transformed_df[group_col] = df[group_col]

    # Now process each group
    for group in df[group_col].unique():
        group_df = transformed_df[transformed_df[group_col] == group]
        
        # Skip groups with no samples
        if group_df.empty:
            print(f"Skipping group {group} as it has no samples.")
            continue

        X_group = group_df.drop([group_col, target_col], axis=1)
        y_group = group_df[target_col]

        # Check if the group has enough samples for SMOTE
        if X_group.shape[0] < 2:
            print(f"Skipping group {group} as it does not have enough samples for SMOTE.")
            continue

        # Apply SMOTE
        X_group_balanced, y_group_balanced = smote.fit_resample(X_group, y_group)

        # Reconstruct the balanced group DataFrame
        balanced_group_df = pd.DataFrame(X_group_balanced, columns=all_features)
        balanced_group_df[target_col] = y_group_balanced
        balanced_group_df[group_col] = [group] * len(balanced_group_df)

        balanced_df = pd.concat([balanced_df, balanced_group_df], axis=0)


    return balanced_df

ModuleNotFoundError: No module named 'imblearn'

In [None]:
from sklearn.preprocessing import LabelEncoder

def prepare_and_process_data(df, target_col, group_col, test_size=0.2, random_state=42):
    """
    Prepares and processes the data for machine learning modeling, ensuring balanced class distribution
    and balanced distribution of the target classes per country in the test set.

    Parameters:
    - df: pandas DataFrame, the dataset to be processed.
    - target_col: str, the name of the target variable column.
    - group_col: str, the name of the column used for grouping.
    - test_size: float, the proportion of the dataset to include in the test split.
    - random_state: int, controls the shuffling applied to the data before applying the split.

    Returns:
    - dict: A dictionary containing transformed and split features, encoded target variables,
            LabelEncoder, and ColumnTransformer instances.
    """
    
    # Balance the dataset
    balanced_df = balance_dataset_by_group(df, group_col=group_col, 
                                           target_col=target_col)

    # Split the data into training and testing sets using custom train_test_split
    X_train, X_test, y_train, y_test = train_test_split(balanced_df, target_col, 
                                                        test_size, random_state)

    # Encode the target variable
    label_encoder = LabelEncoder()
    y_train_encoded = label_encoder.fit_transform(y_train)
    y_test_encoded = label_encoder.transform(y_test)

    return {
        'X_train': X_train,
        'X_test': X_test,
        'y_train_encoded': y_train_encoded,
        'y_test_encoded': y_test_encoded,
        'label_encoder': label_encoder
    }

In [None]:
def run_preprocess(df):
    data_dict = prepare_and_process_data(df, target_col='blood_pressure', group_col='country')

    # Accessing elements from the dictionary
    X_train = data_dict['X_train']
    X_test = data_dict['X_test']
    y_train_encoded = data_dict['y_train_encoded']
    y_test_encoded = data_dict['y_test_encoded']
    # label_encoder = data_dict['label_encoder']
    
    return X_train, X_test, y_train_encoded, y_test_encoded

## Recursive Feature Elimination (RFE)

In [None]:
# from sklearn.feature_selection import RFE
# from sklearn.ensemble import RandomForestClassifier
# import pandas as pd
# import numpy as np

# # Handling categorical variables
# # Replace 'object' with the appropriate type for categorical variables in your dataset
# categorical_cols = processed_data.select_dtypes(include=['object']).columns

# # Apply Label Encoding
# for col in categorical_cols:
#     processed_data[col] = LabelEncoder().fit_transform(processed_data[col])

# # Handling missing values
# # You can choose a different strategy based on your data
# imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# processed_data = pd.DataFrame(imputer.fit_transform(processed_data), columns=processed_data.columns)


# # Assuming processed_data is your DataFrame and is preprocessed
# # Splitting the data into features and target
# X = processed_data.drop('blood_pressure', axis=1)
# y = processed_data['blood_pressure']

# # Initialize the base classifier
# base_classifier = RandomForestClassifier()

# # Initialize RFE with the classifier, specifying the number of features to select
# # Here, I'm using 10 as an example. Adjust it based on your needs and computational resources
# rfe = RFE(estimator=base_classifier, n_features_to_select=10)

# # Fit RFE
# rfe.fit(X, y)

# # Get the ranking of the features and the most important features
# feature_ranking = rfe.ranking_
# important_features = X.columns[rfe.support_]

# # Print the ranking and important features
# print("Feature Ranking:", feature_ranking)
# print("Important Features:", important_features)

# # You can then use these important features for further modeling
# X_important = X[important_features]

## Train Global Model

In [None]:
# from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import pandas as pd

# Important features identified by RFE
important_features = ['age', 'number_daily_fruit_vegetables', 'time_sedentary', 
                      'height', 'weight', 'waist_circumference', 'hip_circumference',
                      'fasting_blood_glucose', 'total_cholesterol', 'reading_bpm',
                      'country', 'blood_pressure'] # blood_pressure is added as the target

X_train, X_test, y_train_encoded, y_test_encoded = run_preprocess(processed_data[important_features])

# Print the shapes of all the splits
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train_encoded shape:", y_train_encoded.shape)
print("y_test_encoded shape:", y_test_encoded.shape)

# Models to train
models = {
    'Logistic Regression': LogisticRegression(),
    'KNN': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(),
    'XGBoost': XGBClassifier(),
    'LightGBM': LGBMClassifier(),
    'CatBoost': CatBoostClassifier(verbose=0)  # 'verbose=0' to silence CatBoost training output
}

# Results table
results = pd.DataFrame(columns=['Model', 'Dataset', 'Accuracy', 'Precision', 
                                'Recall', 'F1 Score'])

# Drop country column
X_train = X_train.drop('country', axis=1)
X_test = X_test.drop('country', axis=1)

# Train and evaluate models
for name, model in models.items():    
    # Train the global model
    model.fit(X_train, y_train_encoded)
    y_pred = model.predict(X_test)
    
    # Compute metrics
    accuracy = accuracy_score(y_test_encoded, y_pred)
    precision = precision_score(y_test_encoded, y_pred, average='macro')
    recall = recall_score(y_test_encoded, y_pred, average='macro')
    f1 = f1_score(y_test_encoded, y_pred, average='macro')
    
    # Add to results
    results = results.append({
        'Model': name,
        'Dataset': 'Global',
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }, ignore_index=True)

Skipping group samoa as it has no samples.
Skipping group afghanistan as it has no samples.
Skipping group ghana as it has no samples.
Skipping group rwanda as it has no samples.
Skipping group niue as it has no samples.
Skipping group armenia as it has no samples.
Skipping group kiribati as it has no samples.
Skipping group algeria as it has no samples.
Skipping group kyrgyzstan as it has no samples.
Skipping group guinea as it has no samples.
X_train shape: (149880, 11)
X_test shape: (37474, 11)
y_train_encoded shape: (149880,)
y_test_encoded shape: (37474,)
[LightGBM] [Info] Number of positive: 74940, number of negative: 74940
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000788 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2544
[LightGBM] [Info] Number of data points in the train set: 149880, number of used features: 10
[LightGB

In [None]:
results

In [36]:
results

Unnamed: 0,Model,Dataset,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,Global,0.503869,0.503876,0.503869,0.503644
1,KNN,Global,0.580082,0.581045,0.580082,0.578832
2,Random Forest,Global,0.62494,0.628648,0.62494,0.622218
3,XGBoost,Global,0.608368,0.62643,0.608368,0.593863
4,LightGBM,Global,0.611544,0.656583,0.611544,0.581446
5,CatBoost,Global,0.613118,0.643919,0.613118,0.591249


In [66]:
results

Unnamed: 0,Model,Dataset,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,Global,0.654339,0.645444,0.637776,0.638896
1,KNN,Global,0.610099,0.600037,0.598527,0.598973
2,Random Forest,Global,0.662596,0.654206,0.647348,0.648672
3,XGBoost,Global,0.668526,0.660404,0.655723,0.656995
4,LightGBM,Global,0.669798,0.661756,0.6563,0.657667
5,CatBoost,Global,0.67321,0.665345,0.660071,0.66146


In [54]:
results

Unnamed: 0,Model,Dataset,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,Global,0.512649,0.512673,0.512649,0.512414
1,KNN,Global,0.588221,0.591543,0.588221,0.584452
2,Random Forest,Global,0.651492,0.664645,0.651492,0.644389
3,XGBoost,Global,0.622832,0.646222,0.622832,0.60712
4,LightGBM,Global,0.621871,0.670765,0.621871,0.592718
5,CatBoost,Global,0.625794,0.654926,0.625794,0.607335


In [31]:
results

Unnamed: 0,Model,Dataset,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,Global,0.512222,0.512244,0.512222,0.511997
1,KNN,Global,0.588221,0.591543,0.588221,0.584452
2,Random Forest,Global,0.652426,0.66635,0.652426,0.644997
3,XGBoost,Global,0.622832,0.646222,0.622832,0.60712


## Per Country Model

In [37]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Models to train
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),  # Increased max_iter for convergence
    'KNN': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(),
    'XGBoost': XGBClassifier(),
}

# Results table
results = pd.DataFrame(columns=['Model', 'Country', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])

# Important features identified by RFE
important_features = ['age', 'number_daily_fruit_vegetables', 'time_sedentary', 
                      'height', 'weight', 'waist_circumference', 'hip_circumference',
                      'fasting_blood_glucose', 'total_cholesterol', 'reading_bpm',
                      'country', 'blood_pressure'] # blood_pressure is added as the target

X_train, X_test, y_train_encoded, y_test_encoded = run_preprocess(processed_data[important_features])

# Run data preprocess
# X_train, X_test, y_train_encoded, y_test_encoded = run_preprocess(processed_data)

# List of unique countries
unique_countries = X_train['country'].unique()

# Train and evaluate models per country
for country in unique_countries:
    # Subset data for the current country
    X_train_country = X_train[X_train['country'] == country].drop('country', axis=1)
    y_train_country = y_train_encoded[X_train['country'] == country]
    X_test_country = X_test[X_test['country'] == country].drop('country', axis=1)
    y_test_country = y_test_encoded[X_test['country'] == country]

    if len(X_train_country) == 0 or len(X_test_country) == 0:
        continue  # Skip if no data for this country

    for name, model in models.items():
        # Train the model for the current country
        model.fit(X_train_country, y_train_country)
        y_pred_country = model.predict(X_test_country)
        
        # Compute metrics for the current country
        accuracy = accuracy_score(y_test_country, y_pred_country)
        precision = precision_score(y_test_country, y_pred_country, average='macro', zero_division=0)
        recall = recall_score(y_test_country, y_pred_country, average='macro', zero_division=0)
        f1 = f1_score(y_test_country, y_pred_country, average='macro', zero_division=0)
        
        # Add results to the table
        results = results.append({
            'Model': name,
            'Country': country,
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1 Score': f1
        }, ignore_index=True)

Skipping group samoa as it has no samples.
Skipping group afghanistan as it has no samples.
Skipping group ghana as it has no samples.
Skipping group rwanda as it has no samples.
Skipping group niue as it has no samples.
Skipping group armenia as it has no samples.
Skipping group kiribati as it has no samples.
Skipping group algeria as it has no samples.
Skipping group kyrgyzstan as it has no samples.
Skipping group guinea as it has no samples.


In [38]:
results

Unnamed: 0,Model,Country,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,azerbaijan,0.459375,0.459273,0.459375,0.459037
1,KNN,azerbaijan,0.623437,0.629456,0.623437,0.619009
2,Random Forest,azerbaijan,0.704688,0.706382,0.704688,0.70408
3,XGBoost,azerbaijan,0.632812,0.632918,0.632812,0.63274
4,Logistic Regression,bahamas,0.479885,0.479734,0.479885,0.478917
5,KNN,bahamas,0.646552,0.647649,0.646552,0.645894
6,Random Forest,bahamas,0.681034,0.681328,0.681034,0.680905
7,XGBoost,bahamas,0.649425,0.64992,0.649425,0.649136
8,Logistic Regression,bangladesh,0.516064,0.516351,0.516064,0.51393
9,KNN,bangladesh,0.484605,0.484605,0.484605,0.484601


In [46]:
results

Unnamed: 0,Model,Country,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,azerbaijan,0.498437,0.498432,0.498437,0.497995
1,KNN,azerbaijan,0.596875,0.61465,0.596875,0.58062
2,Random Forest,azerbaijan,0.707812,0.726097,0.707812,0.701783
3,XGBoost,azerbaijan,0.66875,0.674081,0.66875,0.666194
4,Logistic Regression,bahamas,0.577586,0.577712,0.577586,0.577415
5,KNN,bahamas,0.62931,0.655142,0.62931,0.61321
6,Random Forest,bahamas,0.735632,0.748779,0.735632,0.732093
7,XGBoost,bahamas,0.706897,0.711622,0.706897,0.705251
8,Logistic Regression,bangladesh,0.497323,0.497322,0.497323,0.497304
9,KNN,bangladesh,0.482597,0.482597,0.482597,0.482595


<div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>Model</th>
      <th>Country</th>
      <th>Accuracy</th>
      <th>Precision</th>
      <th>Recall</th>
      <th>F1 Score</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>Logistic Regression</td>
      <td>azerbaijan</td>
      <td>0.498437</td>
      <td>0.498432</td>
      <td>0.498437</td>
      <td>0.497995</td>
    </tr>
    <tr>
      <th>1</th>
      <td>KNN</td>
      <td>azerbaijan</td>
      <td>0.596875</td>
      <td>0.614650</td>
      <td>0.596875</td>
      <td>0.580620</td>
    </tr>
    <tr>
      <th>2</th>
      <td>Random Forest</td>
      <td>azerbaijan</td>
      <td>0.707812</td>
      <td>0.726097</td>
      <td>0.707812</td>
      <td>0.701783</td>
    </tr>
    <tr>
      <th>3</th>
      <td>XGBoost</td>
      <td>azerbaijan</td>
      <td>0.668750</td>
      <td>0.674081</td>
      <td>0.668750</td>
      <td>0.666194</td>
    </tr>
    <tr>
      <th>4</th>
      <td>Logistic Regression</td>
      <td>bahamas</td>
      <td>0.577586</td>
      <td>0.577712</td>
      <td>0.577586</td>
      <td>0.577415</td>
    </tr>
    <tr>
      <th>5</th>
      <td>KNN</td>
      <td>bahamas</td>
      <td>0.629310</td>
      <td>0.655142</td>
      <td>0.629310</td>
      <td>0.613210</td>
    </tr>
    <tr>
      <th>6</th>
      <td>Random Forest</td>
      <td>bahamas</td>
      <td>0.735632</td>
      <td>0.748779</td>
      <td>0.735632</td>
      <td>0.732093</td>
    </tr>
    <tr>
      <th>7</th>
      <td>XGBoost</td>
      <td>bahamas</td>
      <td>0.706897</td>
      <td>0.711622</td>
      <td>0.706897</td>
      <td>0.705251</td>
    </tr>
    <tr>
      <th>8</th>
      <td>Logistic Regression</td>
      <td>bangladesh</td>
      <td>0.497323</td>
      <td>0.497322</td>
      <td>0.497323</td>
      <td>0.497304</td>
    </tr>
    <tr>
      <th>9</th>
      <td>KNN</td>
      <td>bangladesh</td>
      <td>0.482597</td>
      <td>0.482597</td>
      <td>0.482597</td>
      <td>0.482595</td>
    </tr>
    <tr>
      <th>10</th>
      <td>Random Forest</td>
      <td>bangladesh</td>
      <td>0.492637</td>
      <td>0.492634</td>
      <td>0.492637</td>
      <td>0.492579</td>
    </tr>
    <tr>
      <th>11</th>
      <td>XGBoost</td>
      <td>bangladesh</td>
      <td>0.482597</td>
      <td>0.482597</td>
      <td>0.482597</td>
      <td>0.482595</td>
    </tr>
    <tr>
      <th>12</th>
      <td>Logistic Regression</td>
      <td>barbados</td>
      <td>0.578947</td>
      <td>0.584821</td>
      <td>0.578947</td>
      <td>0.571529</td>
    </tr>
    <tr>
      <th>13</th>
      <td>KNN</td>
      <td>barbados</td>
      <td>0.526316</td>
      <td>0.530449</td>
      <td>0.526316</td>
      <td>0.509677</td>
    </tr>
    <tr>
      <th>14</th>
      <td>Random Forest</td>
      <td>barbados</td>
      <td>0.710526</td>
      <td>0.710526</td>
      <td>0.710526</td>
      <td>0.710526</td>
    </tr>
    <tr>
      <th>15</th>
      <td>XGBoost</td>
      <td>barbados</td>
      <td>0.644737</td>
      <td>0.644837</td>
      <td>0.644737</td>
      <td>0.644675</td>
    </tr>
    <tr>
      <th>16</th>
      <td>Logistic Regression</td>
      <td>belarus</td>
      <td>0.588079</td>
      <td>0.588499</td>
      <td>0.588079</td>
      <td>0.587590</td>
    </tr>
    <tr>
      <th>17</th>
      <td>KNN</td>
      <td>belarus</td>
      <td>0.664901</td>
      <td>0.736788</td>
      <td>0.664901</td>
      <td>0.637378</td>
    </tr>
    <tr>
      <th>18</th>
      <td>Random Forest</td>
      <td>belarus</td>
      <td>0.862914</td>
      <td>0.882537</td>
      <td>0.862914</td>
      <td>0.861133</td>
    </tr>
    <tr>
      <th>19</th>
      <td>XGBoost</td>
      <td>belarus</td>
      <td>0.832450</td>
      <td>0.846688</td>
      <td>0.832450</td>
      <td>0.830712</td>
    </tr>
    <tr>
      <th>20</th>
      <td>Logistic Regression</td>
      <td>benin</td>
      <td>0.583183</td>
      <td>0.584827</td>
      <td>0.583183</td>
      <td>0.581152</td>
    </tr>
    <tr>
      <th>21</th>
      <td>KNN</td>
      <td>benin</td>
      <td>0.629295</td>
      <td>0.643867</td>
      <td>0.629295</td>
      <td>0.619663</td>
    </tr>
    <tr>
      <th>22</th>
      <td>Random Forest</td>
      <td>benin</td>
      <td>0.723327</td>
      <td>0.730877</td>
      <td>0.723327</td>
      <td>0.721047</td>
    </tr>
    <tr>
      <th>23</th>
      <td>XGBoost</td>
      <td>benin</td>
      <td>0.657324</td>
      <td>0.659667</td>
      <td>0.657324</td>
      <td>0.656062</td>
    </tr>
    <tr>
      <th>24</th>
      <td>Logistic Regression</td>
      <td>bhutan</td>
      <td>0.511346</td>
      <td>0.511347</td>
      <td>0.511346</td>
      <td>0.511342</td>
    </tr>
    <tr>
      <th>25</th>
      <td>KNN</td>
      <td>bhutan</td>
      <td>0.599849</td>
      <td>0.610940</td>
      <td>0.599849</td>
      <td>0.589591</td>
    </tr>
    <tr>
      <th>26</th>
      <td>Random Forest</td>
      <td>bhutan</td>
      <td>0.701210</td>
      <td>0.715642</td>
      <td>0.701210</td>
      <td>0.696126</td>
    </tr>
    <tr>
      <th>27</th>
      <td>XGBoost</td>
      <td>bhutan</td>
      <td>0.647504</td>
      <td>0.652884</td>
      <td>0.647504</td>
      <td>0.644375</td>
    </tr>
    <tr>
      <th>28</th>
      <td>Logistic Regression</td>
      <td>botswana</td>
      <td>0.533088</td>
      <td>0.533122</td>
      <td>0.533088</td>
      <td>0.532970</td>
    </tr>
    <tr>
      <th>29</th>
      <td>KNN</td>
      <td>botswana</td>
      <td>0.574755</td>
      <td>0.580500</td>
      <td>0.574755</td>
      <td>0.567029</td>
    </tr>
    <tr>
      <th>30</th>
      <td>Random Forest</td>
      <td>botswana</td>
      <td>0.661765</td>
      <td>0.672103</td>
      <td>0.661765</td>
      <td>0.656608</td>
    </tr>
    <tr>
      <th>31</th>
      <td>XGBoost</td>
      <td>botswana</td>
      <td>0.609069</td>
      <td>0.610074</td>
      <td>0.609069</td>
      <td>0.608174</td>
    </tr>
    <tr>
      <th>32</th>
      <td>Logistic Regression</td>
      <td>chad</td>
      <td>0.510363</td>
      <td>0.510365</td>
      <td>0.510363</td>
      <td>0.510333</td>
    </tr>
    <tr>
      <th>33</th>
      <td>KNN</td>
      <td>chad</td>
      <td>0.551813</td>
      <td>0.557564</td>
      <td>0.551813</td>
      <td>0.540334</td>
    </tr>
    <tr>
      <th>34</th>
      <td>Random Forest</td>
      <td>chad</td>
      <td>0.652850</td>
      <td>0.653658</td>
      <td>0.652850</td>
      <td>0.652392</td>
    </tr>
    <tr>
      <th>35</th>
      <td>XGBoost</td>
      <td>chad</td>
      <td>0.554404</td>
      <td>0.554692</td>
      <td>0.554404</td>
      <td>0.553817</td>
    </tr>
    <tr>
      <th>36</th>
      <td>Logistic Regression</td>
      <td>comoros</td>
      <td>0.515660</td>
      <td>0.515660</td>
      <td>0.515660</td>
      <td>0.515659</td>
    </tr>
    <tr>
      <th>37</th>
      <td>KNN</td>
      <td>comoros</td>
      <td>0.554810</td>
      <td>0.555209</td>
      <td>0.554810</td>
      <td>0.554004</td>
    </tr>
    <tr>
      <th>38</th>
      <td>Random Forest</td>
      <td>comoros</td>
      <td>0.541387</td>
      <td>0.541955</td>
      <td>0.541387</td>
      <td>0.539830</td>
    </tr>
    <tr>
      <th>39</th>
      <td>XGBoost</td>
      <td>comoros</td>
      <td>0.549217</td>
      <td>0.549348</td>
      <td>0.549217</td>
      <td>0.548918</td>
    </tr>
    <tr>
      <th>40</th>
      <td>Logistic Regression</td>
      <td>ecuador</td>
      <td>0.546642</td>
      <td>0.546720</td>
      <td>0.546642</td>
      <td>0.546451</td>
    </tr>
    <tr>
      <th>41</th>
      <td>KNN</td>
      <td>ecuador</td>
      <td>0.611007</td>
      <td>0.619011</td>
      <td>0.611007</td>
      <td>0.604356</td>
    </tr>
    <tr>
      <th>42</th>
      <td>Random Forest</td>
      <td>ecuador</td>
      <td>0.666978</td>
      <td>0.676581</td>
      <td>0.666978</td>
      <td>0.662387</td>
    </tr>
    <tr>
      <th>43</th>
      <td>XGBoost</td>
      <td>ecuador</td>
      <td>0.619403</td>
      <td>0.621241</td>
      <td>0.619403</td>
      <td>0.617955</td>
    </tr>
    <tr>
      <th>44</th>
      <td>Logistic Regression</td>
      <td>eritrea</td>
      <td>0.517532</td>
      <td>0.517759</td>
      <td>0.517532</td>
      <td>0.515988</td>
    </tr>
    <tr>
      <th>45</th>
      <td>KNN</td>
      <td>eritrea</td>
      <td>0.612338</td>
      <td>0.623661</td>
      <td>0.612338</td>
      <td>0.603256</td>
    </tr>
    <tr>
      <th>46</th>
      <td>Random Forest</td>
      <td>eritrea</td>
      <td>0.681169</td>
      <td>0.685451</td>
      <td>0.681169</td>
      <td>0.679318</td>
    </tr>
    <tr>
      <th>47</th>
      <td>XGBoost</td>
      <td>eritrea</td>
      <td>0.639610</td>
      <td>0.643642</td>
      <td>0.639610</td>
      <td>0.637064</td>
    </tr>
    <tr>
      <th>48</th>
      <td>Logistic Regression</td>
      <td>eswatini</td>
      <td>0.524768</td>
      <td>0.525062</td>
      <td>0.524768</td>
      <td>0.523369</td>
    </tr>
    <tr>
      <th>49</th>
      <td>KNN</td>
      <td>eswatini</td>
      <td>0.586687</td>
      <td>0.591911</td>
      <td>0.586687</td>
      <td>0.580731</td>
    </tr>
    <tr>
      <th>50</th>
      <td>Random Forest</td>
      <td>eswatini</td>
      <td>0.660991</td>
      <td>0.663131</td>
      <td>0.660991</td>
      <td>0.659875</td>
    </tr>
    <tr>
      <th>51</th>
      <td>XGBoost</td>
      <td>eswatini</td>
      <td>0.645511</td>
      <td>0.645915</td>
      <td>0.645511</td>
      <td>0.645265</td>
    </tr>
    <tr>
      <th>52</th>
      <td>Logistic Regression</td>
      <td>ethiopia</td>
      <td>0.540784</td>
      <td>0.541488</td>
      <td>0.540784</td>
      <td>0.538827</td>
    </tr>
    <tr>
      <th>53</th>
      <td>KNN</td>
      <td>ethiopia</td>
      <td>0.518538</td>
      <td>0.518632</td>
      <td>0.518538</td>
      <td>0.517931</td>
    </tr>
    <tr>
      <th>54</th>
      <td>Random Forest</td>
      <td>ethiopia</td>
      <td>0.557733</td>
      <td>0.557744</td>
      <td>0.557733</td>
      <td>0.557712</td>
    </tr>
    <tr>
      <th>55</th>
      <td>XGBoost</td>
      <td>ethiopia</td>
      <td>0.525953</td>
      <td>0.526048</td>
      <td>0.525953</td>
      <td>0.525521</td>
    </tr>
    <tr>
      <th>56</th>
      <td>Logistic Regression</td>
      <td>fiji</td>
      <td>0.580386</td>
      <td>0.581091</td>
      <td>0.580386</td>
      <td>0.579472</td>
    </tr>
    <tr>
      <th>57</th>
      <td>KNN</td>
      <td>fiji</td>
      <td>0.623794</td>
      <td>0.647194</td>
      <td>0.623794</td>
      <td>0.608224</td>
    </tr>
    <tr>
      <th>58</th>
      <td>Random Forest</td>
      <td>fiji</td>
      <td>0.744373</td>
      <td>0.745962</td>
      <td>0.744373</td>
      <td>0.743959</td>
    </tr>
    <tr>
      <th>59</th>
      <td>XGBoost</td>
      <td>fiji</td>
      <td>0.681672</td>
      <td>0.681740</td>
      <td>0.681672</td>
      <td>0.681642</td>
    </tr>
    <tr>
      <th>60</th>
      <td>Logistic Regression</td>
      <td>gabon</td>
      <td>0.603053</td>
      <td>0.608364</td>
      <td>0.603053</td>
      <td>0.598130</td>
    </tr>
    <tr>
      <th>61</th>
      <td>KNN</td>
      <td>gabon</td>
      <td>0.627863</td>
      <td>0.635709</td>
      <td>0.627863</td>
      <td>0.622404</td>
    </tr>
    <tr>
      <th>62</th>
      <td>Random Forest</td>
      <td>gabon</td>
      <td>0.719466</td>
      <td>0.725541</td>
      <td>0.719466</td>
      <td>0.717564</td>
    </tr>
    <tr>
      <th>63</th>
      <td>XGBoost</td>
      <td>gabon</td>
      <td>0.648855</td>
      <td>0.648994</td>
      <td>0.648855</td>
      <td>0.648773</td>
    </tr>
    <tr>
      <th>64</th>
      <td>Logistic Regression</td>
      <td>gambia</td>
      <td>0.509975</td>
      <td>0.509976</td>
      <td>0.509975</td>
      <td>0.509968</td>
    </tr>
    <tr>
      <th>65</th>
      <td>KNN</td>
      <td>gambia</td>
      <td>0.605985</td>
      <td>0.622182</td>
      <td>0.605985</td>
      <td>0.592480</td>
    </tr>
    <tr>
      <th>66</th>
      <td>Random Forest</td>
      <td>gambia</td>
      <td>0.729426</td>
      <td>0.739186</td>
      <td>0.729426</td>
      <td>0.726638</td>
    </tr>
    <tr>
      <th>67</th>
      <td>XGBoost</td>
      <td>gambia</td>
      <td>0.698254</td>
      <td>0.700453</td>
      <td>0.698254</td>
      <td>0.697425</td>
    </tr>
    <tr>
      <th>68</th>
      <td>Logistic Regression</td>
      <td>georgia</td>
      <td>0.551619</td>
      <td>0.551621</td>
      <td>0.551619</td>
      <td>0.551615</td>
    </tr>
    <tr>
      <th>69</th>
      <td>KNN</td>
      <td>georgia</td>
      <td>0.646761</td>
      <td>0.670105</td>
      <td>0.646761</td>
      <td>0.634212</td>
    </tr>
    <tr>
      <th>70</th>
      <td>Random Forest</td>
      <td>georgia</td>
      <td>0.753036</td>
      <td>0.775827</td>
      <td>0.753036</td>
      <td>0.747827</td>
    </tr>
    <tr>
      <th>71</th>
      <td>XGBoost</td>
      <td>georgia</td>
      <td>0.710526</td>
      <td>0.716196</td>
      <td>0.710526</td>
      <td>0.708616</td>
    </tr>
    <tr>
      <th>72</th>
      <td>Logistic Regression</td>
      <td>grenada</td>
      <td>0.564815</td>
      <td>0.565172</td>
      <td>0.564815</td>
      <td>0.564217</td>
    </tr>
    <tr>
      <th>73</th>
      <td>KNN</td>
      <td>grenada</td>
      <td>0.583333</td>
      <td>0.586293</td>
      <td>0.583333</td>
      <td>0.579730</td>
    </tr>
    <tr>
      <th>74</th>
      <td>Random Forest</td>
      <td>grenada</td>
      <td>0.638889</td>
      <td>0.638937</td>
      <td>0.638889</td>
      <td>0.638858</td>
    </tr>
    <tr>
      <th>75</th>
      <td>XGBoost</td>
      <td>grenada</td>
      <td>0.634259</td>
      <td>0.634271</td>
      <td>0.634259</td>
      <td>0.634251</td>
    </tr>
    <tr>
      <th>76</th>
      <td>Logistic Regression</td>
      <td>guyana</td>
      <td>0.530888</td>
      <td>0.530911</td>
      <td>0.530888</td>
      <td>0.530802</td>
    </tr>
    <tr>
      <th>77</th>
      <td>KNN</td>
      <td>guyana</td>
      <td>0.496139</td>
      <td>0.496090</td>
      <td>0.496139</td>
      <td>0.494555</td>
    </tr>
    <tr>
      <th>78</th>
      <td>Random Forest</td>
      <td>guyana</td>
      <td>0.542471</td>
      <td>0.542502</td>
      <td>0.542471</td>
      <td>0.542387</td>
    </tr>
    <tr>
      <th>79</th>
      <td>XGBoost</td>
      <td>guyana</td>
      <td>0.532819</td>
      <td>0.532819</td>
      <td>0.532819</td>
      <td>0.532819</td>
    </tr>
    <tr>
      <th>80</th>
      <td>Logistic Regression</td>
      <td>lesotho</td>
      <td>0.562500</td>
      <td>0.562557</td>
      <td>0.562500</td>
      <td>0.562400</td>
    </tr>
    <tr>
      <th>81</th>
      <td>KNN</td>
      <td>lesotho</td>
      <td>0.609914</td>
      <td>0.644099</td>
      <td>0.609914</td>
      <td>0.585319</td>
    </tr>
    <tr>
      <th>82</th>
      <td>Random Forest</td>
      <td>lesotho</td>
      <td>0.745690</td>
      <td>0.763300</td>
      <td>0.745690</td>
      <td>0.741365</td>
    </tr>
    <tr>
      <th>83</th>
      <td>XGBoost</td>
      <td>lesotho</td>
      <td>0.709052</td>
      <td>0.713369</td>
      <td>0.709052</td>
      <td>0.707573</td>
    </tr>
    <tr>
      <th>84</th>
      <td>Logistic Regression</td>
      <td>liberia</td>
      <td>0.603448</td>
      <td>0.606618</td>
      <td>0.603448</td>
      <td>0.600479</td>
    </tr>
    <tr>
      <th>85</th>
      <td>KNN</td>
      <td>liberia</td>
      <td>0.603448</td>
      <td>0.609848</td>
      <td>0.603448</td>
      <td>0.597587</td>
    </tr>
    <tr>
      <th>86</th>
      <td>Random Forest</td>
      <td>liberia</td>
      <td>0.637931</td>
      <td>0.646465</td>
      <td>0.637931</td>
      <td>0.632579</td>
    </tr>
    <tr>
      <th>87</th>
      <td>XGBoost</td>
      <td>liberia</td>
      <td>0.724138</td>
      <td>0.728485</td>
      <td>0.724138</td>
      <td>0.722820</td>
    </tr>
    <tr>
      <th>88</th>
      <td>Logistic Regression</td>
      <td>libya</td>
      <td>0.562648</td>
      <td>0.563008</td>
      <td>0.562648</td>
      <td>0.562021</td>
    </tr>
    <tr>
      <th>89</th>
      <td>KNN</td>
      <td>libya</td>
      <td>0.613475</td>
      <td>0.635642</td>
      <td>0.613475</td>
      <td>0.597011</td>
    </tr>
    <tr>
      <th>90</th>
      <td>Random Forest</td>
      <td>libya</td>
      <td>0.758865</td>
      <td>0.771140</td>
      <td>0.758865</td>
      <td>0.756105</td>
    </tr>
    <tr>
      <th>91</th>
      <td>XGBoost</td>
      <td>libya</td>
      <td>0.721040</td>
      <td>0.727635</td>
      <td>0.721040</td>
      <td>0.719005</td>
    </tr>
    <tr>
      <th>92</th>
      <td>Logistic Regression</td>
      <td>madagascar</td>
      <td>0.560790</td>
      <td>0.561089</td>
      <td>0.560790</td>
      <td>0.560253</td>
    </tr>
    <tr>
      <th>93</th>
      <td>KNN</td>
      <td>madagascar</td>
      <td>0.602584</td>
      <td>0.612684</td>
      <td>0.602584</td>
      <td>0.593474</td>
    </tr>
    <tr>
      <th>94</th>
      <td>Random Forest</td>
      <td>madagascar</td>
      <td>0.724924</td>
      <td>0.734060</td>
      <td>0.724924</td>
      <td>0.722213</td>
    </tr>
    <tr>
      <th>95</th>
      <td>XGBoost</td>
      <td>madagascar</td>
      <td>0.678571</td>
      <td>0.682211</td>
      <td>0.678571</td>
      <td>0.676958</td>
    </tr>
    <tr>
      <th>96</th>
      <td>Logistic Regression</td>
      <td>malawi</td>
      <td>0.548556</td>
      <td>0.548719</td>
      <td>0.548556</td>
      <td>0.548180</td>
    </tr>
    <tr>
      <th>97</th>
      <td>KNN</td>
      <td>malawi</td>
      <td>0.577428</td>
      <td>0.583156</td>
      <td>0.577428</td>
      <td>0.570023</td>
    </tr>
    <tr>
      <th>98</th>
      <td>Random Forest</td>
      <td>malawi</td>
      <td>0.646982</td>
      <td>0.647617</td>
      <td>0.646982</td>
      <td>0.646601</td>
    </tr>
    <tr>
      <th>99</th>
      <td>XGBoost</td>
      <td>malawi</td>
      <td>0.625984</td>
      <td>0.626027</td>
      <td>0.625984</td>
      <td>0.625953</td>
    </tr>
    <tr>
      <th>100</th>
      <td>Logistic Regression</td>
      <td>maldives</td>
      <td>0.523333</td>
      <td>0.523569</td>
      <td>0.523333</td>
      <td>0.522139</td>
    </tr>
    <tr>
      <th>101</th>
      <td>KNN</td>
      <td>maldives</td>
      <td>0.530000</td>
      <td>0.531165</td>
      <td>0.530000</td>
      <td>0.525567</td>
    </tr>
    <tr>
      <th>102</th>
      <td>Random Forest</td>
      <td>maldives</td>
      <td>0.550000</td>
      <td>0.550056</td>
      <td>0.550000</td>
      <td>0.549875</td>
    </tr>
    <tr>
      <th>103</th>
      <td>XGBoost</td>
      <td>maldives</td>
      <td>0.530000</td>
      <td>0.530303</td>
      <td>0.530000</td>
      <td>0.528822</td>
    </tr>
    <tr>
      <th>104</th>
      <td>Logistic Regression</td>
      <td>mali</td>
      <td>0.579167</td>
      <td>0.579216</td>
      <td>0.579167</td>
      <td>0.579101</td>
    </tr>
    <tr>
      <th>105</th>
      <td>KNN</td>
      <td>mali</td>
      <td>0.554167</td>
      <td>0.567294</td>
      <td>0.554167</td>
      <td>0.531310</td>
    </tr>
    <tr>
      <th>106</th>
      <td>Random Forest</td>
      <td>mali</td>
      <td>0.716667</td>
      <td>0.719656</td>
      <td>0.716667</td>
      <td>0.715699</td>
    </tr>
    <tr>
      <th>107</th>
      <td>XGBoost</td>
      <td>mali</td>
      <td>0.700000</td>
      <td>0.700893</td>
      <td>0.700000</td>
      <td>0.699666</td>
    </tr>
    <tr>
      <th>108</th>
      <td>Logistic Regression</td>
      <td>micronesia</td>
      <td>0.504274</td>
      <td>0.504319</td>
      <td>0.504274</td>
      <td>0.502966</td>
    </tr>
    <tr>
      <th>109</th>
      <td>KNN</td>
      <td>micronesia</td>
      <td>0.512821</td>
      <td>0.513384</td>
      <td>0.512821</td>
      <td>0.507641</td>
    </tr>
    <tr>
      <th>110</th>
      <td>Random Forest</td>
      <td>micronesia</td>
      <td>0.461538</td>
      <td>0.461358</td>
      <td>0.461538</td>
      <td>0.460908</td>
    </tr>
    <tr>
      <th>111</th>
      <td>XGBoost</td>
      <td>micronesia</td>
      <td>0.487179</td>
      <td>0.487146</td>
      <td>0.487179</td>
      <td>0.486842</td>
    </tr>
    <tr>
      <th>112</th>
      <td>Logistic Regression</td>
      <td>moldova</td>
      <td>0.572310</td>
      <td>0.572769</td>
      <td>0.572310</td>
      <td>0.571636</td>
    </tr>
    <tr>
      <th>113</th>
      <td>KNN</td>
      <td>moldova</td>
      <td>0.706349</td>
      <td>0.769820</td>
      <td>0.706349</td>
      <td>0.688001</td>
    </tr>
    <tr>
      <th>114</th>
      <td>Random Forest</td>
      <td>moldova</td>
      <td>0.876543</td>
      <td>0.884993</td>
      <td>0.876543</td>
      <td>0.875862</td>
    </tr>
    <tr>
      <th>115</th>
      <td>XGBoost</td>
      <td>moldova</td>
      <td>0.832451</td>
      <td>0.843570</td>
      <td>0.832451</td>
      <td>0.831085</td>
    </tr>
    <tr>
      <th>116</th>
      <td>Logistic Regression</td>
      <td>mongolia</td>
      <td>0.503717</td>
      <td>0.503718</td>
      <td>0.503717</td>
      <td>0.503690</td>
    </tr>
    <tr>
      <th>117</th>
      <td>KNN</td>
      <td>mongolia</td>
      <td>0.516729</td>
      <td>0.516762</td>
      <td>0.516729</td>
      <td>0.516488</td>
    </tr>
    <tr>
      <th>118</th>
      <td>Random Forest</td>
      <td>mongolia</td>
      <td>0.520446</td>
      <td>0.520460</td>
      <td>0.520446</td>
      <td>0.520365</td>
    </tr>
    <tr>
      <th>119</th>
      <td>XGBoost</td>
      <td>mongolia</td>
      <td>0.513941</td>
      <td>0.513954</td>
      <td>0.513941</td>
      <td>0.513819</td>
    </tr>
    <tr>
      <th>120</th>
      <td>Logistic Regression</td>
      <td>mozambique</td>
      <td>0.564220</td>
      <td>0.564394</td>
      <td>0.564220</td>
      <td>0.563926</td>
    </tr>
    <tr>
      <th>121</th>
      <td>KNN</td>
      <td>mozambique</td>
      <td>0.634557</td>
      <td>0.651376</td>
      <td>0.634557</td>
      <td>0.624115</td>
    </tr>
    <tr>
      <th>122</th>
      <td>Random Forest</td>
      <td>mozambique</td>
      <td>0.700306</td>
      <td>0.711744</td>
      <td>0.700306</td>
      <td>0.696203</td>
    </tr>
    <tr>
      <th>123</th>
      <td>XGBoost</td>
      <td>mozambique</td>
      <td>0.623853</td>
      <td>0.625373</td>
      <td>0.623853</td>
      <td>0.622710</td>
    </tr>
    <tr>
      <th>124</th>
      <td>Logistic Regression</td>
      <td>myanmar</td>
      <td>0.568780</td>
      <td>0.568998</td>
      <td>0.568780</td>
      <td>0.568439</td>
    </tr>
    <tr>
      <th>125</th>
      <td>KNN</td>
      <td>myanmar</td>
      <td>0.572967</td>
      <td>0.581049</td>
      <td>0.572967</td>
      <td>0.562048</td>
    </tr>
    <tr>
      <th>126</th>
      <td>Random Forest</td>
      <td>myanmar</td>
      <td>0.651316</td>
      <td>0.661199</td>
      <td>0.651316</td>
      <td>0.645888</td>
    </tr>
    <tr>
      <th>127</th>
      <td>XGBoost</td>
      <td>myanmar</td>
      <td>0.609450</td>
      <td>0.612654</td>
      <td>0.609450</td>
      <td>0.606652</td>
    </tr>
    <tr>
      <th>128</th>
      <td>Logistic Regression</td>
      <td>namibia</td>
      <td>0.541371</td>
      <td>0.541386</td>
      <td>0.541371</td>
      <td>0.541330</td>
    </tr>
    <tr>
      <th>129</th>
      <td>KNN</td>
      <td>namibia</td>
      <td>0.630024</td>
      <td>0.655423</td>
      <td>0.630024</td>
      <td>0.614264</td>
    </tr>
    <tr>
      <th>130</th>
      <td>Random Forest</td>
      <td>namibia</td>
      <td>0.750591</td>
      <td>0.785011</td>
      <td>0.750591</td>
      <td>0.742826</td>
    </tr>
    <tr>
      <th>131</th>
      <td>XGBoost</td>
      <td>namibia</td>
      <td>0.693853</td>
      <td>0.700145</td>
      <td>0.693853</td>
      <td>0.691428</td>
    </tr>
    <tr>
      <th>132</th>
      <td>Logistic Regression</td>
      <td>nauru</td>
      <td>0.571429</td>
      <td>0.571429</td>
      <td>0.571429</td>
      <td>0.571429</td>
    </tr>
    <tr>
      <th>133</th>
      <td>KNN</td>
      <td>nauru</td>
      <td>0.584821</td>
      <td>0.587335</td>
      <td>0.584821</td>
      <td>0.581813</td>
    </tr>
    <tr>
      <th>134</th>
      <td>Random Forest</td>
      <td>nauru</td>
      <td>0.602679</td>
      <td>0.602884</td>
      <td>0.602679</td>
      <td>0.602481</td>
    </tr>
    <tr>
      <th>135</th>
      <td>XGBoost</td>
      <td>nauru</td>
      <td>0.620536</td>
      <td>0.621008</td>
      <td>0.620536</td>
      <td>0.620165</td>
    </tr>
    <tr>
      <th>136</th>
      <td>Logistic Regression</td>
      <td>nepal</td>
      <td>0.537219</td>
      <td>0.537572</td>
      <td>0.537219</td>
      <td>0.536130</td>
    </tr>
    <tr>
      <th>137</th>
      <td>KNN</td>
      <td>nepal</td>
      <td>0.606742</td>
      <td>0.624669</td>
      <td>0.606742</td>
      <td>0.592076</td>
    </tr>
    <tr>
      <th>138</th>
      <td>Random Forest</td>
      <td>nepal</td>
      <td>0.733146</td>
      <td>0.752687</td>
      <td>0.733146</td>
      <td>0.727885</td>
    </tr>
    <tr>
      <th>139</th>
      <td>XGBoost</td>
      <td>nepal</td>
      <td>0.684691</td>
      <td>0.693879</td>
      <td>0.684691</td>
      <td>0.680910</td>
    </tr>
    <tr>
      <th>140</th>
      <td>Logistic Regression</td>
      <td>niger</td>
      <td>0.523891</td>
      <td>0.523891</td>
      <td>0.523891</td>
      <td>0.523889</td>
    </tr>
    <tr>
      <th>141</th>
      <td>KNN</td>
      <td>niger</td>
      <td>0.634812</td>
      <td>0.648179</td>
      <td>0.634812</td>
      <td>0.626387</td>
    </tr>
    <tr>
      <th>142</th>
      <td>Random Forest</td>
      <td>niger</td>
      <td>0.709898</td>
      <td>0.713489</td>
      <td>0.709898</td>
      <td>0.708673</td>
    </tr>
    <tr>
      <th>143</th>
      <td>XGBoost</td>
      <td>niger</td>
      <td>0.711604</td>
      <td>0.715830</td>
      <td>0.711604</td>
      <td>0.710185</td>
    </tr>
    <tr>
      <th>144</th>
      <td>Logistic Regression</td>
      <td>palau</td>
      <td>0.503521</td>
      <td>0.503535</td>
      <td>0.503521</td>
      <td>0.503022</td>
    </tr>
    <tr>
      <th>145</th>
      <td>KNN</td>
      <td>palau</td>
      <td>0.524648</td>
      <td>0.524659</td>
      <td>0.524648</td>
      <td>0.524595</td>
    </tr>
    <tr>
      <th>146</th>
      <td>Random Forest</td>
      <td>palau</td>
      <td>0.538732</td>
      <td>0.538750</td>
      <td>0.538732</td>
      <td>0.538681</td>
    </tr>
    <tr>
      <th>147</th>
      <td>XGBoost</td>
      <td>palau</td>
      <td>0.514085</td>
      <td>0.514431</td>
      <td>0.514085</td>
      <td>0.511151</td>
    </tr>
    <tr>
      <th>148</th>
      <td>Logistic Regression</td>
      <td>palestine</td>
      <td>0.543151</td>
      <td>0.543434</td>
      <td>0.543151</td>
      <td>0.542403</td>
    </tr>
    <tr>
      <th>149</th>
      <td>KNN</td>
      <td>palestine</td>
      <td>0.558219</td>
      <td>0.559754</td>
      <td>0.558219</td>
      <td>0.555364</td>
    </tr>
    <tr>
      <th>150</th>
      <td>Random Forest</td>
      <td>palestine</td>
      <td>0.619178</td>
      <td>0.623207</td>
      <td>0.619178</td>
      <td>0.616040</td>
    </tr>
    <tr>
      <th>151</th>
      <td>XGBoost</td>
      <td>palestine</td>
      <td>0.582877</td>
      <td>0.584304</td>
      <td>0.582877</td>
      <td>0.581103</td>
    </tr>
    <tr>
      <th>152</th>
      <td>Logistic Regression</td>
      <td>qatar</td>
      <td>0.469767</td>
      <td>0.469702</td>
      <td>0.469767</td>
      <td>0.469481</td>
    </tr>
    <tr>
      <th>153</th>
      <td>KNN</td>
      <td>qatar</td>
      <td>0.541860</td>
      <td>0.541869</td>
      <td>0.541860</td>
      <td>0.541838</td>
    </tr>
    <tr>
      <th>154</th>
      <td>Random Forest</td>
      <td>qatar</td>
      <td>0.520930</td>
      <td>0.520976</td>
      <td>0.520930</td>
      <td>0.520671</td>
    </tr>
    <tr>
      <th>155</th>
      <td>XGBoost</td>
      <td>qatar</td>
      <td>0.523256</td>
      <td>0.523341</td>
      <td>0.523256</td>
      <td>0.522820</td>
    </tr>
    <tr>
      <th>156</th>
      <td>Logistic Regression</td>
      <td>tanzania</td>
      <td>0.560995</td>
      <td>0.561474</td>
      <td>0.560995</td>
      <td>0.560138</td>
    </tr>
    <tr>
      <th>157</th>
      <td>KNN</td>
      <td>tanzania</td>
      <td>0.574639</td>
      <td>0.582838</td>
      <td>0.574639</td>
      <td>0.563847</td>
    </tr>
    <tr>
      <th>158</th>
      <td>Random Forest</td>
      <td>tanzania</td>
      <td>0.698234</td>
      <td>0.706689</td>
      <td>0.698234</td>
      <td>0.695117</td>
    </tr>
    <tr>
      <th>159</th>
      <td>XGBoost</td>
      <td>tanzania</td>
      <td>0.637239</td>
      <td>0.640577</td>
      <td>0.637239</td>
      <td>0.635073</td>
    </tr>
    <tr>
      <th>160</th>
      <td>Logistic Regression</td>
      <td>togo</td>
      <td>0.537760</td>
      <td>0.537767</td>
      <td>0.537760</td>
      <td>0.537741</td>
    </tr>
    <tr>
      <th>161</th>
      <td>KNN</td>
      <td>togo</td>
      <td>0.516927</td>
      <td>0.517459</td>
      <td>0.516927</td>
      <td>0.513222</td>
    </tr>
    <tr>
      <th>162</th>
      <td>Random Forest</td>
      <td>togo</td>
      <td>0.536458</td>
      <td>0.536459</td>
      <td>0.536458</td>
      <td>0.536455</td>
    </tr>
    <tr>
      <th>163</th>
      <td>XGBoost</td>
      <td>togo</td>
      <td>0.535156</td>
      <td>0.535523</td>
      <td>0.535156</td>
      <td>0.533954</td>
    </tr>
    <tr>
      <th>164</th>
      <td>Logistic Regression</td>
      <td>tokelau</td>
      <td>0.556452</td>
      <td>0.557181</td>
      <td>0.556452</td>
      <td>0.555034</td>
    </tr>
    <tr>
      <th>165</th>
      <td>KNN</td>
      <td>tokelau</td>
      <td>0.532258</td>
      <td>0.533120</td>
      <td>0.532258</td>
      <td>0.529196</td>
    </tr>
    <tr>
      <th>166</th>
      <td>Random Forest</td>
      <td>tokelau</td>
      <td>0.629032</td>
      <td>0.629572</td>
      <td>0.629032</td>
      <td>0.628646</td>
    </tr>
    <tr>
      <th>167</th>
      <td>XGBoost</td>
      <td>tokelau</td>
      <td>0.645161</td>
      <td>0.645312</td>
      <td>0.645161</td>
      <td>0.645069</td>
    </tr>
    <tr>
      <th>168</th>
      <td>Logistic Regression</td>
      <td>tonga</td>
      <td>0.511928</td>
      <td>0.512171</td>
      <td>0.511928</td>
      <td>0.509485</td>
    </tr>
    <tr>
      <th>169</th>
      <td>KNN</td>
      <td>tonga</td>
      <td>0.652087</td>
      <td>0.671700</td>
      <td>0.652087</td>
      <td>0.641860</td>
    </tr>
    <tr>
      <th>170</th>
      <td>Random Forest</td>
      <td>tonga</td>
      <td>0.773360</td>
      <td>0.776088</td>
      <td>0.773360</td>
      <td>0.772799</td>
    </tr>
    <tr>
      <th>171</th>
      <td>XGBoost</td>
      <td>tonga</td>
      <td>0.738569</td>
      <td>0.744920</td>
      <td>0.738569</td>
      <td>0.736863</td>
    </tr>
    <tr>
      <th>172</th>
      <td>Logistic Regression</td>
      <td>tuvalu</td>
      <td>0.574830</td>
      <td>0.575000</td>
      <td>0.574830</td>
      <td>0.574589</td>
    </tr>
    <tr>
      <th>173</th>
      <td>KNN</td>
      <td>tuvalu</td>
      <td>0.632653</td>
      <td>0.674521</td>
      <td>0.632653</td>
      <td>0.609216</td>
    </tr>
    <tr>
      <th>174</th>
      <td>Random Forest</td>
      <td>tuvalu</td>
      <td>0.840136</td>
      <td>0.852011</td>
      <td>0.840136</td>
      <td>0.838776</td>
    </tr>
    <tr>
      <th>175</th>
      <td>XGBoost</td>
      <td>tuvalu</td>
      <td>0.765306</td>
      <td>0.768098</td>
      <td>0.765306</td>
      <td>0.764694</td>
    </tr>
    <tr>
      <th>176</th>
      <td>Logistic Regression</td>
      <td>uganda</td>
      <td>0.516355</td>
      <td>0.516425</td>
      <td>0.516355</td>
      <td>0.515837</td>
    </tr>
    <tr>
      <th>177</th>
      <td>KNN</td>
      <td>uganda</td>
      <td>0.591121</td>
      <td>0.595740</td>
      <td>0.591121</td>
      <td>0.586131</td>
    </tr>
    <tr>
      <th>178</th>
      <td>Random Forest</td>
      <td>uganda</td>
      <td>0.663551</td>
      <td>0.668591</td>
      <td>0.663551</td>
      <td>0.661018</td>
    </tr>
    <tr>
      <th>179</th>
      <td>XGBoost</td>
      <td>uganda</td>
      <td>0.648364</td>
      <td>0.649607</td>
      <td>0.648364</td>
      <td>0.647633</td>
    </tr>
    <tr>
      <th>180</th>
      <td>Logistic Regression</td>
      <td>vanuatu</td>
      <td>0.552281</td>
      <td>0.552291</td>
      <td>0.552281</td>
      <td>0.552262</td>
    </tr>
    <tr>
      <th>181</th>
      <td>KNN</td>
      <td>vanuatu</td>
      <td>0.569392</td>
      <td>0.578947</td>
      <td>0.569392</td>
      <td>0.555955</td>
    </tr>
    <tr>
      <th>182</th>
      <td>Random Forest</td>
      <td>vanuatu</td>
      <td>0.689163</td>
      <td>0.695268</td>
      <td>0.689163</td>
      <td>0.686715</td>
    </tr>
    <tr>
      <th>183</th>
      <td>XGBoost</td>
      <td>vanuatu</td>
      <td>0.631179</td>
      <td>0.631452</td>
      <td>0.631179</td>
      <td>0.630987</td>
    </tr>
    <tr>
      <th>184</th>
      <td>Logistic Regression</td>
      <td>zambia</td>
      <td>0.530986</td>
      <td>0.530988</td>
      <td>0.530986</td>
      <td>0.530978</td>
    </tr>
    <tr>
      <th>185</th>
      <td>KNN</td>
      <td>zambia</td>
      <td>0.528169</td>
      <td>0.528629</td>
      <td>0.528169</td>
      <td>0.526266</td>
    </tr>
    <tr>
      <th>186</th>
      <td>Random Forest</td>
      <td>zambia</td>
      <td>0.594366</td>
      <td>0.594558</td>
      <td>0.594366</td>
      <td>0.594160</td>
    </tr>
    <tr>
      <th>187</th>
      <td>XGBoost</td>
      <td>zambia</td>
      <td>0.569014</td>
      <td>0.569034</td>
      <td>0.569014</td>
      <td>0.568983</td>
    </tr>
  </tbody>
</table>
</div>

## Per Region Model

{'Sub-Saharan Africa': 20,  
 'East Asia and Pacific': 9,  
 'South Asia': 3,  
 'Middle East and North Africa': 3,  
 'Europe and Central Asia': 4,  
 'Latin America and the Caribbean': 5}

In [49]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Models to train
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),  # Increased max_iter for convergence
    'KNN': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(),
    'XGBoost': XGBClassifier(),
    'LightGBM': LGBMClassifier(),
    'CatBoost': CatBoostClassifier(verbose=0)  # 'verbose=0' to silence CatBoost training output
}

# Results table
results = pd.DataFrame(columns=['Model', 'Region', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])

# Run data preprocess
X_train, X_test, y_train_encoded, y_test_encoded = run_preprocess(processed_data)

# Creating a dictionary based on the World Bank regional grouping using the provided country names
wb_regions = {
    "Sub-Saharan Africa": [
        "benin", "botswana", "chad", "comoros", "eswatini", "eritrea", "ethiopia", 
        "gabon", "gambia", "lesotho", "liberia", "madagascar", "malawi", 
        "mali", "mozambique", "namibia", "niger", "tanzania", "togo", 
        "uganda", "zambia"
    ],
    "East Asia and Pacific": [
        "fiji", "micronesia", "mongolia", "nauru", "palau", "tokelau",
        "tonga", "tuvalu", "vanuatu", "myanmar"
    ],
    "South Asia": [
        "bangladesh", "bhutan", "maldives", "nepal"
    ],
    "Middle East and North Africa": [
        "libya", "qatar", "palestine"
    ],
    "Europe and Central Asia": [
        "belarus", "georgia", "moldova", "azerbaijan"
    ],
    "Latin America and the Caribbean": [
        "bahamas", "barbados", "ecuador", "grenada", "guyana"
    ]
}

# Train and evaluate models per region
for region, countries in wb_regions.items():
    print(f"Training region: {region}")
    # Subset data for the current region
    X_train_region = X_train[X_train['country'].isin(countries)].drop('country', axis=1)
    y_train_region = y_train_encoded[X_train['country'].isin(countries)]
    X_test_region = X_test[X_test['country'].isin(countries)].drop('country', axis=1)
    y_test_region = y_test_encoded[X_test['country'].isin(countries)]

    if len(X_train_region) == 0 or len(X_test_region) == 0:
        print(region)
        continue  # Skip if no data for this region

    for name, model in models.items():
        # Train the model for the current region
        model.fit(X_train_region, y_train_region)
        y_pred_region = model.predict(X_test_region)

        # Compute metrics for the current region
        accuracy = accuracy_score(y_test_region, y_pred_region)
        precision = precision_score(y_test_region, y_pred_region, average='macro', zero_division=0)
        recall = recall_score(y_test_region, y_pred_region, average='macro', zero_division=0)
        f1 = f1_score(y_test_region, y_pred_region, average='macro', zero_division=0)

        # Add results to the table
        results = results.append({
            'Model': name,
            'Region': region,
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1 Score': f1
        }, ignore_index=True)

Skipping group samoa as it has no samples.
Skipping group afghanistan as it has no samples.
Skipping group ghana as it has no samples.
Skipping group rwanda as it has no samples.
Skipping group niue as it has no samples.
Skipping group armenia as it has no samples.
Skipping group kiribati as it has no samples.
Skipping group algeria as it has no samples.
Skipping group kyrgyzstan as it has no samples.
Skipping group guinea as it has no samples.
Training region: Sub-Saharan Africa
Training region: East Asia and Pacific
Training region: South Asia
Training region: Middle East and North Africa
Training region: Europe and Central Asia
Training region: Latin America and the Caribbean


In [50]:
results

Unnamed: 0,Model,Region,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,Sub-Saharan Africa,0.518997,0.519,0.518997,0.518977
1,KNN,Sub-Saharan Africa,0.579261,0.580649,0.579261,0.577444
2,Random Forest,Sub-Saharan Africa,0.64765,0.653472,0.64765,0.644277
3,XGBoost,Sub-Saharan Africa,0.604571,0.609467,0.604571,0.6001
4,Logistic Regression,East Asia and Pacific,0.530206,0.530933,0.530206,0.52743
5,KNN,East Asia and Pacific,0.577717,0.582384,0.577717,0.571651
6,Random Forest,East Asia and Pacific,0.643291,0.650564,0.643291,0.63893
7,XGBoost,East Asia and Pacific,0.618245,0.622779,0.618245,0.614688
8,Logistic Regression,South Asia,0.509031,0.509044,0.509031,0.50885
9,KNN,South Asia,0.563436,0.566104,0.563436,0.558987


In [45]:
print(['city', 'town'] + ['region', 'state'])

['city', 'town', 'region', 'state']
