# Import modules and data

In [None]:
#standard imports
import pandas as pd
import numpy as np
#data visualization imports 
import matplotlib.pyplot as plt
import seaborn as sns
#sklearn imports
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score 
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
#tensorflow imports
from tensorflow.keras import Sequential
from tensorflow.keras import metrics
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
#location of the raw life expectancy csv, relative to the notebook
DATA_PATH = 'data/life_expectancy_data.csv'
#load the raw data into a dataframe
df = pd.read_csv(DATA_PATH)
#preview the first rows to confirm the load worked
df.head()

# Data cleaning

In [None]:
#column names, dtypes and non-null counts
df.info()
#total number of missing cells across every column
n_missing = df.isna().sum().sum()
#number of rows that exactly repeat an earlier row
n_duplicated = df.duplicated().sum()
#report both counts
print(f'Dataset contains {n_missing} missing data entries')
print(f'Dataset contains {n_duplicated} duplicated data entries')

In [None]:
#strip leading/trailing whitespace from every column name
df = df.rename(columns=str.strip)
#re-check the column names after the rename
df.info()

In [None]:
#fill helpers, one per supported column dtype
def _fill_with_mean(series):
    """Return the series with missing values replaced by its mean."""
    return series.fillna(series.mean())

def _fill_with_median(series):
    """Return the series with missing values replaced by its median."""
    return series.fillna(series.median())

def _fill_with_mode(series):
    """Return the series with missing values replaced by its most frequent value."""
    return series.fillna(series.mode().iloc[0])

#dispatch table: exact column dtype -> fill helper
#NOTE(review): an int64 column cannot hold NaN, so the median entry never actually fills anything
impute_methods = {np.dtype('float64'): _fill_with_mean,
                  np.dtype('int64'): _fill_with_median,
                  np.dtype('O'): _fill_with_mode}

#fill each column whose dtype has an entry in the table; other dtypes are left untouched
for name in df.columns:
    impute_func = impute_methods.get(df[name].dtype)
    if impute_func is not None:
        df[name] = impute_func(df[name])

#confirm how many missing cells remain after imputing
print(f'Data contains {df.isna().sum().sum()} missing data points')

In [None]:
#defining function to format column names
def format_column_name(df):
    """Return a copy of the DataFrame with lower-case, underscore-separated column names.

    Every column name is lower-cased and each space is replaced with an
    underscore. Other characters (e.g. hyphens) are kept as they are.
    The DataFrame passed in is not modified.

    Args:
        df: pandas DataFrame whose column names are strings.

    Returns:
        A new DataFrame holding the same data with formatted column names.
    """
    #work on a copy so the caller's DataFrame keeps its original column names
    formatted = df.copy()
    formatted.columns = formatted.columns.str.lower().str.replace(' ', '_')
    return formatted
#run the dataframe through the column name formatter
df = df.pipe(format_column_name)
#check the formatted column names
df.info()

# Exploratory data analysis

How has the average life expectancy changed since the turn of the century? As you can see from the visual below, it has risen consistently since the year 2000.

In [None]:
#mean life expectancy for each year in the data
avg_life_ex = df.groupby('year')['life_expectancy'].mean()
#switch to the fivethirtyeight style (it stays active until another style is selected)
plt.style.use('fivethirtyeight')
#draw the yearly averages as a line on an explicit axes
fig, ax = plt.subplots()
ax.plot(avg_life_ex.index, avg_life_ex.values)
#title and axis labels (title spelling corrected from 'Avergae')
ax.set(title='Average life expectancy by year', xlabel='Year', ylabel='Life expectancy')
#tilt the year labels
ax.tick_params(axis='x', labelrotation=70)
#show plot
plt.show()

A similar improvement can be seen for infant deaths and deaths under 5 years old. The two visuals below show a steady and consistent decline over the period covered by the data.

In [None]:
#mean infant deaths for each year in the data
avg_inf_ded = df.groupby('year')['infant_deaths'].mean()
#draw the yearly averages as a line on an explicit axes
fig, ax = plt.subplots()
ax.plot(avg_inf_ded.index, avg_inf_ded.values)
#title and axis labels
ax.set(title='Average infant deaths per 1000', xlabel='Year', ylabel='Infant deaths')
#tilt the year labels
ax.tick_params(axis='x', labelrotation=70)
#show plot
plt.show()

In [None]:
#mean under-five deaths for each year in the data
under5_ded = df.groupby('year')['under-five_deaths'].mean()
#draw the yearly averages as a line on an explicit axes
fig, ax = plt.subplots()
ax.plot(under5_ded.index, under5_ded.values)
#title and axis labels
ax.set(title='Average deaths under 5 y/o per 1000', xlabel='Year', ylabel='Amount of deaths')
#tilt the year labels
ax.tick_params(axis='x', labelrotation=70)
#show plot
plt.show()

<b>What about young adults?</b>
 
While deaths among those between 16 and 30 years old have gone down overall, there were large spikes in 2004 and 2008, and the rate starts to climb again in 2014.

In [None]:
#mean adult mortality for each year in the data
adultmor = df.groupby('year')['adult_mortality'].mean()
#draw the yearly averages as a line on an explicit axes
fig, ax = plt.subplots()
ax.plot(adultmor.index, adultmor.values)
#title and axis labels
#NOTE(review): the labels describe this column as deaths at ages 16-30 - confirm against the dataset's data dictionary
ax.set(title='Average young adult deaths by year', xlabel='Year', ylabel='Deaths age 16-30')
#tilt the year labels
ax.tick_params(axis='x', labelrotation=70)
#show plot
plt.show()

Does alcohol consumption play a role at all? Well despite the overall decrease in death and increase in life expectancy, alcohol consumption has remained steady for the majority of the decade. Around 2011 there was a pretty significant dip followed by a huge increase in 2014.

In [None]:
#mean alcohol consumption for each year in the data
alc_year = df.groupby('year')['alcohol'].mean()
#draw the yearly averages as a line on an explicit axes
fig, ax = plt.subplots()
ax.plot(alc_year.index, alc_year.values)
#title and axis labels (title spelling corrected from 'Avergage')
ax.set(title='Average alcohol consumption in liters', xlabel='Year', ylabel='Consumption')
#tilt the year labels
ax.tick_params(axis='x', labelrotation=70)
#show plot
plt.show()

How does the average BMI over the decade look in all of this? Despite the overall positives listed above, the average BMI has increased consistently since 2005.

In [None]:
#mean bmi for each year in the data
bmi_avg = df.groupby('year')['bmi'].mean()
#draw the yearly averages as a line on an explicit axes
fig, ax = plt.subplots()
ax.plot(bmi_avg.index, bmi_avg.values)
#title and axis labels
ax.set(title='Average BMI', xlabel='Year', ylabel='BMI')
#tilt the year labels
ax.tick_params(axis='x', labelrotation=70)
#show plot
plt.show()

Below we used a scatter plot with a regression to see if there was a correlation between life expectancy and GDP. There appears to be a weak relationship between life expectancy and GDP.

In [None]:
#shared data/column selection for the points and the fitted line
gdp_cols = dict(data=df, x='life_expectancy', y='gdp')
fig, ax = plt.subplots()
#raw points
sns.scatterplot(ax=ax, **gdp_cols)
#first-order fit drawn in red on top of the points, without a confidence band
sns.regplot(ax=ax, scatter=False, ci=None, order=1, truncate=True,
            line_kws={'color': 'red', 'label': 'Correlation Line'}, **gdp_cols)
#y axis runs from 0 up to the largest gdp value
ax.set_ylim(0, max(df['gdp']))
#title and axis labels
ax.set(title='Life expectancy over GDP', xlabel='Life expectancy', ylabel='GDP')
#show plot
plt.show()

Does schooling affect life expectancy? The regression line on the scatter plot shows a very strong relationship, with the data points clustered around the regression line (red) except for some probable outliers. We will explore this further in the statistical analysis.

In [None]:
#shared data/column selection for the points and the fitted line
schooling_cols = dict(data=df, x='life_expectancy', y='schooling')
fig, ax = plt.subplots()
#raw points
sns.scatterplot(ax=ax, **schooling_cols)
#first-order fit drawn in red on top of the points, without a confidence band
sns.regplot(ax=ax, scatter=False, ci=None, order=1, truncate=True,
            line_kws={'color': 'red', 'label': 'Correlation Line'}, **schooling_cols)
#title and axis labels
ax.set(title='Life expectancy over years in school', xlabel='Life expectancy', ylabel='Years in school')
#show plot
plt.show()

Does alcohol consumption impact life expectancy? The regression line on the scatter plot shows a possible relationship; however, since most of the data points aren't close to the line itself, more investigation is required before drawing a conclusion.

In [None]:
#shared data/column selection for the points and the fitted line
alcohol_cols = dict(data=df, x='life_expectancy', y='alcohol')
fig, ax = plt.subplots()
#raw points
sns.scatterplot(ax=ax, **alcohol_cols)
#first-order fit drawn in red on top of the points, without a confidence band
sns.regplot(ax=ax, scatter=False, ci=None, order=1, truncate=True,
            line_kws={'color': 'red', 'label': 'Correlation Line'}, **alcohol_cols)
#title and axis labels
ax.set(title='Life expectancy over alcohol consumption (in liters)',
       xlabel='Life expectancy', ylabel='Consumption in liters')
#show plot
plt.show()

The data points in the scatter plot below are distributed similarly to the previous plot. The polio vaccine might have a positive relationship with life expectancy, but it likely does not have a negative one.

In [None]:
#shared data/column selection for the points and the fitted line
polio_cols = dict(data=df, x='life_expectancy', y='polio')
fig, ax = plt.subplots()
#raw points
sns.scatterplot(ax=ax, **polio_cols)
#first-order fit drawn in red on top of the points, without a confidence band
sns.regplot(ax=ax, scatter=False, ci=None, order=1, truncate=True,
            line_kws={'color': 'red', 'label': 'Correlation Line'}, **polio_cols)
#y axis runs from 0 up to the largest polio value
ax.set_ylim(0, max(df['polio']))
#title and axis labels
ax.set(title='Life expectancy over Polio vaccine %', xlabel="Life expectancy", ylabel='% Vaccinated at 1')
#show plot
plt.show()

How about the Hepatitis B vaccine? There seems to be a weak or no relationship between the Hepatitis B vaccine and life expectancy. While the area above the regression line is densely populated, the points below it are widely distributed.

In [None]:
#shared data/column selection for the points and the fitted line
hepatitis_cols = dict(data=df, x='life_expectancy', y='hepatitis_b')
fig, ax = plt.subplots()
#raw points
sns.scatterplot(ax=ax, **hepatitis_cols)
#first-order fit drawn in red on top of the points, without a confidence band
sns.regplot(ax=ax, scatter=False, color='red', ci=None, order=1, truncate=True, **hepatitis_cols)
#title and axis labels
ax.set(title='Life expectancy over Hepatitis B Vaccine %', xlabel='Life expectancy', ylabel='% Vaccinated at 1')
#show plot
plt.show()


Do more densely populated areas have higher or lower life expectancies? The regression line on the scatter plot below is flat, so there appears to be no relationship between population and life expectancy.

In [None]:
#shared data/column selection for the points and the fitted line
population_cols = dict(data=df, x='life_expectancy', y='population')
fig, ax = plt.subplots()
#raw points
sns.scatterplot(ax=ax, **population_cols)
#first-order fit drawn in red on top of the points, without a confidence band
sns.regplot(ax=ax, scatter=False, color='red', ci=None, order=1, truncate=True, **population_cols)
#title and axis labels
ax.set(title='Life expectancy over population', xlabel='Life expectancy', ylabel='Population')
#show plot
plt.show()

It appears infant deaths also have no relationship to life expectancy, given how flat the regression line is on the scatter plot below.

In [None]:
#shared data/column selection for the points and the fitted line
infant_cols = dict(data=df, x='life_expectancy', y='infant_deaths')
fig, ax = plt.subplots()
#raw points
sns.scatterplot(ax=ax, **infant_cols)
#first-order fit drawn in red on top of the points, without a confidence band
sns.regplot(ax=ax, scatter=False, color='red', ci=None, order=1, truncate=True, **infant_cols)
#title and axis labels
ax.set(title='Life expectancy over infant deaths', xlabel='Life expectancy', ylabel='Infant deaths')
#show plot
plt.show()

Does the rate of adult mortality (ages 16-30) have an impact on life expectancy? Assuming all the data points on the lower part of the scatter plot are outliers then yes, but we will inspect this during statistical analysis. 

In [None]:
#shared data/column selection for the points and the fitted line
mortality_cols = dict(data=df, x='life_expectancy', y='adult_mortality')
fig, ax = plt.subplots()
#raw points
sns.scatterplot(ax=ax, **mortality_cols)
#fitted line in red, drawn with regplot's default settings
sns.regplot(ax=ax, scatter=False, color='red', **mortality_cols)
#title and axis labels (title spelling corrected from 'mortatilty')
#NOTE(review): the y label describes this column as deaths at ages 16-30 - confirm against the dataset's data dictionary
ax.set(title='Life expectancy over adult mortality', xlabel='Life expectancy', ylabel='Deaths age 16-30')
#show plot
plt.show()

It also appears that deaths under five have no relationship to life expectancy, given how flat the regression line is on the scatter plot below.

In [None]:
#shared data/column selection for the points and the fitted line
under_five_cols = dict(data=df, x='life_expectancy', y='under-five_deaths')
fig, ax = plt.subplots()
#raw points
sns.scatterplot(ax=ax, **under_five_cols)
#fitted line in red, drawn with regplot's default settings
sns.regplot(ax=ax, scatter=False, color='red', **under_five_cols)
#title and axis labels
ax.set(title='Life expectancy over deaths under age 5', xlabel='Life expectancy', ylabel='Deaths under age 5')
#show plot
plt.show()

Should countries with a life expectancy lower than 65 spend more money on health care? There seems to be no relationship, but we will revisit this in the statistical analysis to make sure.

In [None]:
#boolean mask marking the rows with a life expectancy below 65
under_65_mask = df['life_expectancy'].lt(65)
#subset of the data restricted to those rows
dead65 = df[under_65_mask]

In [None]:
#shared data/column selection for the points and the fitted line (rows with life expectancy below 65 only)
expenditure_cols = dict(data=dead65, x='life_expectancy', y='percentage_expenditure')
fig, ax = plt.subplots()
#raw points
sns.scatterplot(ax=ax, **expenditure_cols)
#fitted line in red, drawn with regplot's default settings
sns.regplot(ax=ax, scatter=False, color='red', **expenditure_cols)
#y axis runs from 0 to the largest value in the plotted subset
#(previously the limit came from the full df, which is not the data shown here)
ax.set_ylim(0, dead65['percentage_expenditure'].max())
#title and axis labels (title corrected from 'Life expenditure')
ax.set(title='Life expectancy over % spent on health', xlabel='Life expectancy', ylabel='% Spent')
#show plot
plt.show()

Does BMI play a factor in life expectancy? There appears to be a moderate relationship between life expectancy and BMI; we will investigate further during the statistical analysis.

In [None]:
#shared data/column selection for the points and the fitted line
bmi_cols = dict(data=df, x='life_expectancy', y='bmi')
fig, ax = plt.subplots()
#raw points
sns.scatterplot(ax=ax, **bmi_cols)
#fitted line in red, drawn with regplot's default settings
sns.regplot(ax=ax, scatter=False, color='red', **bmi_cols)
#title and axis labels
ax.set(title='Life expectancy over BMI', xlabel='Life expectancy', ylabel='BMI')
#show plot
plt.show()

Below we call the describe function to compute summary statistics that we can reference later.

In [None]:
#summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

# Predictive models 

## PCA

We will be using a principal component analysis (PCA) to reduce our dimensions of features for our neural network to perform better.

In [None]:
#name of the column we want to predict
target_col = 'life_expectancy'
#object columns that are left out of the feature matrix
object_cols = ['country', 'status']
#target vector
y = df[target_col]
#feature matrix: everything except the target and the object columns
X = df.drop(columns=[target_col, *object_cols])
#split into train and test sets with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
#learn the scaling parameters from the training features only
scaler = StandardScaler().fit(X_train)
#apply the same scaling to both splits
X_train_sca = scaler.transform(X_train)
X_test_sca = scaler.transform(X_test)
#fit a pca that keeps every component on the scaled training data
pca = PCA().fit(X_train_sca)
#display the fitted estimator
pca

In [None]:
#reset style to default
plt.style.use('default')
#number the components 1..k using the fitted pca itself instead of a hard-coded 19,
#so the plot keeps working if the number of feature columns changes
component_ids = range(1, len(pca.explained_variance_ratio_) + 1)
#proportion of variance explained by each component
fig, ax = plt.subplots()
ax.plot(component_ids, pca.explained_variance_ratio_, marker='.')
#one tick per component, in a small font so they all fit
ax.set_xticks(component_ids)
ax.tick_params(axis='x', labelsize=8)
#title and axis labels (title spelling corrected from 'compononents')
ax.set(title='Variance of principal components',
       xlabel='Principal component',
       ylabel='Proportion of explained variance')
#show plot
plt.show()

It appears our ideal number of principal components is 4, since that's where the explained-variance curve levels off.

In [None]:
#fit a 4-component pca on the scaled training data
pca4 = PCA(n_components=4).fit(X_train_sca)
#project both splits onto those 4 components
X_train_pro, X_test_pro = (pca4.transform(split) for split in (X_train_sca, X_test_sca))
#compare the number of feature columns after and before the projection
print(f'X train processed shape: {X_train_pro.shape[1]}')
print(f'X train unprocessed shape: {X_train.shape[1]}')

We have successfully transformed our features for the neural network.

## Neural network model A

In [None]:
#build the network in one go: two relu hidden layers (64 and 32 units) and one linear output unit
model = Sequential([
    Dense(64, input_dim=X_train_pro.shape[1], activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='linear'),
])
#metrics tracked alongside the loss
tracked_metrics = [metrics.MeanAbsoluteError(), metrics.RootMeanSquaredError()]
#compile for regression with mse loss and the adam optimizer
model.compile(loss='mse', optimizer='adam', metrics=tracked_metrics)
#train for 100 epochs without console output, using the test split as validation data
fit_options = dict(validation_data=(X_test_pro, y_test), epochs=100, verbose=0)
history = model.fit(X_train_pro, y_train, **fit_options)