# World Health Statistics
### Merged according to category similarity

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets, metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import numpy as np
import warnings
from itertools import product
from sklearn.exceptions import ConvergenceWarning
# Ignore every ConvergenceWarning (imported from sklearn.exceptions above)
# raised after this point, so the warnings do not clutter cell outputs.
warnings.simplefilter("ignore", ConvergenceWarning)

In [2]:
import os

# Folder that holds the World Health CSV files.
# Set the WORLD_HEALTH_DATA_DIR environment variable to point the notebook at
# a local copy of the data; the original author's absolute path is kept as the
# fallback so an existing setup keeps working unchanged.
directory = os.environ.get(
    "WORLD_HEALTH_DATA_DIR",
    r"C:\Users\Ahmad1\Documents\GitHub\ie423-2024-termproject-the-a-team\DataSets\WorldHealth",
)

# CSV filenames grouped by category; each group is later stacked into one
# DataFrame. The names must match the files on disk exactly (including their
# original spelling), so do not "correct" them here.
mortality_files = [
    "infantMortalityRate.csv",
    "under5MortalityRate.csv",
    "neonatalMortalityRate.csv",
    "mortalityRatePoisoning.csv",
    "crudeSuicideRates.csv",
    "30-70cancerChdEtc.csv",
]
life_expectancy_files = [
    "lifeExpectancyAtBirth.csv",
    "WHOregionLifeExpectancyAtBirth.csv",
    "HALeWHOregionLifeExpectancyAtBirth.csv",
    "HALElifeExpectancyAtBirth.csv",
    "%HaleInLifeExpectancy.csv",
]
health_services_files = [
    "basicDrinkingWaterServices.csv",
    "atLeastBasicSanitizationServices.csv",
    "safelySanitization.csv",
    "basicHandWashing.csv",
    "birthAttendedBySkilledPersonal.csv",
]
disease_incidence_files = [
    "interventionAgianstNTDs.csv",
    "incedenceOfMalaria.csv",
    "incedenceOfTuberculosis.csv",
    "hepatitusBsurfaceAntigen.csv",
    "newHivInfections.csv",
]
maternal_health_files = ["maternalMortalityRatio.csv"]

# Function to merge CSV files
def merge_csv_files(files, base_dir=None):
    """Stack the CSV files named in ``files`` into a single DataFrame.

    Each name is joined onto ``base_dir`` and read with ``pd.read_csv``.
    Names for which no path exists are reported with a printed message and
    skipped. The frames that were read are concatenated row-wise with a fresh
    0..n-1 index; ``pd.concat`` fills columns that a file lacks with NaN.

    Parameters
    ----------
    files : iterable of str
        CSV filenames, relative to ``base_dir``.
    base_dir : str or path-like, optional
        Folder to resolve the filenames against. When omitted, the
        notebook-level ``directory`` variable is used, which keeps existing
        one-argument calls working as before.

    Returns
    -------
    pandas.DataFrame or None
        The concatenated frame, or ``None`` when none of the files was found
        (including when ``files`` is empty).
    """
    if base_dir is None:
        # Fall back to the notebook's configured data folder.
        base_dir = directory

    frames = []
    for file in files:
        filepath = os.path.join(base_dir, file)
        if os.path.exists(filepath):
            frames.append(pd.read_csv(filepath))
        else:
            print(f"File '{file}' not found.")

    # pd.concat raises on an empty list, so signal "nothing loaded" with None.
    if not frames:
        return None
    return pd.concat(frames, ignore_index=True)

# Build one merged frame per category. The groups are processed in the order
# listed, so any "not found" messages appear in that same order.
(
    merged_mortality,
    merged_life_expectancy,
    merged_health_services,
    merged_disease_incidence,
    merged_maternal_health,
) = [
    merge_csv_files(file_group)
    for file_group in (
        mortality_files,
        life_expectancy_files,
        health_services_files,
        disease_incidence_files,
        maternal_health_files,
    )
]


In [3]:
# List all CSV files in the directory. os.listdir returns entries in arbitrary
# order, so sort them to get the same report order on every run.
csv_files = sorted(file for file in os.listdir(directory) if file.endswith('.csv'))

# Print a quick profile (schema, summary statistics, sample rows) of each file
for file in csv_files:
    # Construct the full path to the CSV file
    file_path = os.path.join(directory, file)

    # Load the CSV file into a pandas DataFrame
    df = pd.read_csv(file_path)

    # Display basic information about the DataFrame
    print(f"Information about {file}:")
    # DataFrame.info() writes its report itself and returns None; wrapping it
    # in print() appended a stray "None" line to the output.
    df.info()
    print("\nSummary statistics:")
    print(df.describe())
    print("\nSample data:")
    print(df.head())
    print("\n\n")


Information about %HaleInLifeExpectancy.csv:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72 entries, 0 to 71
Data columns (total 8 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Location                      72 non-null     object 
 1   Period                        72 non-null     int64  
 2   Indicator                     72 non-null     object 
 3   Dim1                          72 non-null     object 
 4   Hale Expectency               72 non-null     float64
 5   Life expectany                72 non-null     float64
 6   Unnamed: 6                    72 non-null     float64
 7   % OF HALE in life expectancy  72 non-null     int64  
dtypes: float64(3), int64(2), object(3)
memory usage: 4.6+ KB
None

Summary statistics:
            Period  Hale Expectency  Life expectany  Unnamed: 6  \
count    72.000000        72.000000       72.000000   72.000000   
mean   2011.000000        61.403333      

In [4]:
# Preview the first rows of the merged mortality data; nothing is printed when
# the merge produced no frame.
if merged_mortality is not None:
    print("Merged Mortality Data:", merged_mortality.head(), sep="\n")

Merged Mortality Data:
      Location  Period                                          Indicator  \
0  Afghanistan    2019  Infant mortality rate (probability of dying be...   
1  Afghanistan    2019  Infant mortality rate (probability of dying be...   
2  Afghanistan    2019  Infant mortality rate (probability of dying be...   
3  Afghanistan    2018  Infant mortality rate (probability of dying be...   
4  Afghanistan    2018  Infant mortality rate (probability of dying be...   

         Dim1        First Tooltip  
0  Both sexes  46.51 [37.71-55.82]  
1        Male    49.8 [40.2-59.95]  
2      Female  43.05 [34.82-51.96]  
3  Both sexes  48.04 [39.94-56.32]  
4        Male   51.35 [42.56-60.4]  


In [5]:
# Preview the first rows of the merged life-expectancy data; nothing is
# printed when the merge produced no frame.
if merged_life_expectancy is not None:
    print("Merged Life Expectancy Data:", merged_life_expectancy.head(), sep="\n")

Merged Life Expectancy Data:
      Location  Period                         Indicator        Dim1  \
0  Afghanistan    2019  Life expectancy at birth (years)  Both sexes   
1  Afghanistan    2019  Life expectancy at birth (years)        Male   
2  Afghanistan    2019  Life expectancy at birth (years)      Female   
3  Afghanistan    2015  Life expectancy at birth (years)  Both sexes   
4  Afghanistan    2015  Life expectancy at birth (years)        Male   

   First Tooltip  Hale Expectency  Life expectany  Unnamed: 6  \
0          63.21              NaN             NaN         NaN   
1          63.29              NaN             NaN         NaN   
2          63.16              NaN             NaN         NaN   
3          61.65              NaN             NaN         NaN   
4          61.04              NaN             NaN         NaN   

   % OF HALE in life expectancy  
0                           NaN  
1                           NaN  
2                           NaN  
3          

In [6]:
# Preview the first rows of the merged health-services data; nothing is
# printed when the merge produced no frame.
if merged_health_services is not None:
    print("Merged Health Services Data:", merged_health_services.head(), sep="\n")

Merged Health Services Data:
      Location  Period                                          Indicator  \
0  Afghanistan    2017  Population using at least basic drinking-water...   
1  Afghanistan    2016  Population using at least basic drinking-water...   
2  Afghanistan    2015  Population using at least basic drinking-water...   
3  Afghanistan    2014  Population using at least basic drinking-water...   
4  Afghanistan    2013  Population using at least basic drinking-water...   

   First Tooltip Dim1  
0          57.32  NaN  
1          54.84  NaN  
2          52.39  NaN  
3          49.96  NaN  
4          47.56  NaN  


In [7]:
# Preview the first rows of the merged disease-incidence data; nothing is
# printed when the merge produced no frame.
if merged_disease_incidence is not None:
    print("Merged Disease Incidence Data:", merged_disease_incidence.head(), sep="\n")

Merged Disease Incidence Data:
      Location                                          Indicator  Period  \
0  Afghanistan  Reported number of people requiring interventi...    2018   
1  Afghanistan  Reported number of people requiring interventi...    2017   
2  Afghanistan  Reported number of people requiring interventi...    2016   
3  Afghanistan  Reported number of people requiring interventi...    2015   
4  Afghanistan  Reported number of people requiring interventi...    2014   

  First Tooltip Dim1  
0      14834701  NaN  
1      14786722  NaN  
2      13275429  NaN  
3      13765392  NaN  
4      13384908  NaN  


In [8]:
# NOTE(review): this cell repeats the disease-incidence preview from In [7]
# verbatim — likely a copy-paste leftover; consider removing it.
if merged_disease_incidence is not None:
    print("Merged Disease Incidence Data:", merged_disease_incidence.head(), sep="\n")

Merged Disease Incidence Data:
      Location                                          Indicator  Period  \
0  Afghanistan  Reported number of people requiring interventi...    2018   
1  Afghanistan  Reported number of people requiring interventi...    2017   
2  Afghanistan  Reported number of people requiring interventi...    2016   
3  Afghanistan  Reported number of people requiring interventi...    2015   
4  Afghanistan  Reported number of people requiring interventi...    2014   

  First Tooltip Dim1  
0      14834701  NaN  
1      14786722  NaN  
2      13275429  NaN  
3      13765392  NaN  
4      13384908  NaN  


In [9]:
# Preview the first rows of the merged maternal-health data; nothing is
# printed when the merge produced no frame.
if merged_maternal_health is not None:
    print("Merged Maternal Health Data:", merged_maternal_health.head(), sep="\n")

Merged Maternal Health Data:
      Location  Period                                          Indicator  \
0  Afghanistan    2017  Maternal mortality ratio (per 100 000 live bir...   
1  Afghanistan    2016  Maternal mortality ratio (per 100 000 live bir...   
2  Afghanistan    2015  Maternal mortality ratio (per 100 000 live bir...   
3  Afghanistan    2014  Maternal mortality ratio (per 100 000 live bir...   
4  Afghanistan    2013  Maternal mortality ratio (per 100 000 live bir...   

    First Tooltip  
0  638 [427-1010]  
1  673 [457-1040]  
2  701 [501-1020]  
3  786 [592-1080]  
4  810 [617-1080]  
