# DATA PROCESSING
Note: The cell below cleans the data and merges all the datasets into a single Excel file called "merged_data.xlsx".

In [2]:
# DATA PROCESSING

import pandas as pd
from sklearn.impute import SimpleImputer

# Load datasets
# NOTE: Modify the paths before running.
# Every workbook is read into its own DataFrame. The names on the left are
# matched to the paths on the right purely by position, so keep both lists
# in the same order when editing them.
(
    deaths_air_pollution,
    hepb3_immunized,
    deaths_infectious_diseases,
    life_expectancy,
    deaths_smoking,
    under_fifteen_mortality,
    deaths_ozone_pollution,
    health_expenditure,
) = (
    pd.read_excel(workbook_path)
    for workbook_path in (
        "/content/air pollution deaths.xlsx",
        "/content/immunization of hepb3 of one year old children.xlsx",
        "/content/infectious diseases deaths.xlsx",
        "/content/life expectancy.xlsx",
        "/content/number of deaths from tobacco smoking.xlsx",
        "/content/number of youth deaths.xlsx",
        "/content/ozone pollution deaths.xlsx",
        "/content/total healthcare expenditure as share of gdp.xlsx",
    )
)

# Rename variables
# Replace each dataset's long, descriptive value-column header with the short
# name that the merge and training steps refer to.
#
# The long names must match the spreadsheet headers exactly (note that the
# under-fifteen header ends with a trailing space). errors="raise" makes a
# mismatch fail right here with a KeyError; with the pandas default the rename
# would be skipped silently and the short column would simply be missing later.
for _frame, _long_name, _short_name in (
    (deaths_air_pollution,
     "Deaths that are from all causes attributed to air pollution, in both sexes aged all ages",
     "deaths_air_poll"),
    (hepb3_immunized,
     "HepB3 (% of one-year-olds immunized)",
     "hepb3_imm"),
    (deaths_infectious_diseases,
     "Deaths from Infectious diseases",
     "deaths_infect_dis"),
    (life_expectancy,
     "Life expectancy at birth",
     "life_exp"),
    (deaths_smoking,
     "Deaths that are from all causes attributed to smoking",
     "deaths_smoking"),
    (under_fifteen_mortality,
     "Under fifteen mortality - Number of deaths ",
     "under15_mort"),
    (deaths_ozone_pollution,
     "Deaths that are from all causes attributed to ambient ozone pollution",
     "deaths_ozone_poll"),
    (health_expenditure,
     "Current health expenditure (CHE) as percentage of gross domestic product (GDP) (%)",
     "health_exp"),
):
    _frame.rename(columns={_long_name: _short_name}, inplace=True, errors="raise")

# Drop 'Code' column from each DataFrame
# The column is removed in place from all eight datasets before they are
# merged; a dataset without a 'Code' column raises a KeyError here.
for _dataset in (
    deaths_air_pollution,
    hepb3_immunized,
    deaths_infectious_diseases,
    life_expectancy,
    deaths_smoking,
    under_fifteen_mortality,
    deaths_ozone_pollution,
    health_expenditure,
):
    _dataset.drop(columns=['Code'], inplace=True)

# NOTE(review): common_years is not referenced anywhere else in the visible
# cells -- confirm whether a filter on this range was intended.
common_years = range(1990, 2015)
# Keep only the health-expenditure rows whose Year is 2002 or later.
health_expenditure = health_expenditure.loc[health_expenditure['Year'].ge(2002)]

# Merge datasets
# Start from the air-pollution table and inner-join every other dataset onto
# it in turn, matching rows on the "Entity" and "Year" columns. Because each
# join is an inner join, only Entity/Year pairs present in all eight datasets
# remain in the result.
merged_data = deaths_air_pollution
for _other in (
    hepb3_immunized,
    deaths_infectious_diseases,
    life_expectancy,
    deaths_smoking,
    under_fifteen_mortality,
    deaths_ozone_pollution,
    health_expenditure,
):
    merged_data = merged_data.merge(_other, on=["Entity", "Year"], how="inner")

# Renumber the rows from zero and write the result (without the index column)
# to an Excel workbook in the current working directory.
merged_data = merged_data.reset_index(drop=True)
merged_data.to_excel("merged_data.xlsx", index=False)

# TRAINING

Note: Make sure you have "merged_data.xlsx" (you can generate it by running the Data Processing cell) before running the cell below.

In [35]:
# TRAINING

import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pickle

# Load the dataset
# The Data Processing cell writes "merged_data.xlsx" relative to the current
# working directory, so it is read back from that same relative location.
# (The previous absolute "/content/..." path only matched when the working
# directory happened to be /content.)
dataset_path = "merged_data.xlsx"
df = pd.read_excel(dataset_path)

# Drop any rows with missing values
df.dropna(inplace=True)

# Simplify features
# Keep only the eight numeric indicator columns; any other columns in the
# workbook (e.g. Entity, Year) are not used for training.
df_simplified = df[['deaths_air_poll', 'hepb3_imm',
       'deaths_infect_dis', 'life_exp', 'deaths_smoking', 'under15_mort',
       'deaths_ozone_poll', 'health_exp']]

# Separate features and target variable
# health_exp is the regression target; the remaining seven columns are inputs.
X = df_simplified.drop(columns=['health_exp'])
y = df_simplified['health_exp']

# Split the data into training and validation sets
# 80 % of the rows go to training and 20 % to validation; random_state pins
# the shuffle so the split is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize the features
# The scaler's statistics are learned from the training rows only and then
# applied unchanged to both splits, so no validation data leaks into the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Seed the Python, NumPy and TensorFlow random generators before any weights
# are created, so that weight initialisation and batch shuffling -- and hence
# the trained model -- are repeatable from run to run (as far as the backend
# and hardware allow).
tf.keras.utils.set_random_seed(42)

# Define the neural network architecture
# Fully connected regression network: one input per scaled feature column,
# four ReLU hidden layers of decreasing width, and a single linear output.
# The input shape is declared with an explicit Input object, which is the
# form Keras recommends for Sequential models instead of passing input_shape
# to the first layer.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(X_train_scaled.shape[1],)),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1)  # Output layer
])

# Compile the model
# Adam optimiser with its default settings, minimising mean squared error.
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
# Runs 500 epochs with mini-batches of 64 rows, evaluating on the validation
# split after every epoch. verbose=0 suppresses the per-epoch log; the loss
# curves are available afterwards through `history`.
history = model.fit(
    x=X_train_scaled,
    y=y_train,
    validation_data=(X_val_scaled, y_val),
    epochs=500,
    batch_size=64,
    verbose=0,
)

# Save the trained model
# NOTE(review): Keras documents model.save(...) as its standard persistence
# mechanism; the pickle file is kept here because its name and format may be
# relied on elsewhere. Only unpickle files that come from a trusted source.
with open('Patel_model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Save the fitted scaler next to the model. The network was trained on
# standardized features, so new inputs must be passed through this same
# scaler before calling model.predict; without it the saved model cannot be
# applied to raw data.
with open('Patel_scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)