## Happiness Metrics for Global Life Expectancy Predictive Analysis
Nicole Chang, Sourish Guntipally, Aaron Park, Brandon To

The data for this project comes from two Kaggle datasets: the [World Happiness Report](https://www.kaggle.com/unsdsn/world-happiness?select=2019.csv) and [Human Life Expectancy Around the World](https://www.kaggle.com/deepcontractor/human-life-expectancy-around-the-world).

We analyze the five years from 2015 through 2019.

In [216]:
import scipy as sp, numpy as np, pandas as pd
from sklearn.cross_decomposition import PLSRegression
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')

In [227]:
# Load the raw CSV files used by the rest of the notebook.
#   WHR = World Happiness Report
#   HLE = Human Life Expectancy Around the World

# HLE
dataHLE = pd.read_csv('Human_life_Expectancy.csv')

# WHR: the yearly files are read in chronological order and unpacked
# into one dataframe per year.
(
    dataWHR2015,
    dataWHR2016,
    dataWHR2017,
    dataWHR2018,
    dataWHR2019,
) = [
    pd.read_csv(whr_file)
    for whr_file in ('2015.csv', '2016.csv', '2017.csv', '2018.csv', '2019.csv')
]

In [228]:
# Work out which countries of the 2015 WHR table are present in every
# WHR table from 2015 through 2019.
#
# Names defined here:
#   labels             - list of five Series (2015..2019) holding the
#                        'Country' column of each year as strings
#   count              - float array, one entry per 2015 row: in how many
#                        of the later years (2016-2019) that country appears
#   recurringCountries - dict keyed by the countries that appear in all five
#                        years (value = number of years, i.e. 5), in 2015 order

# Labels are the names of the measurements (countries) for all 5 years
labels = [
    yearly['Country'].astype(str)
    for yearly in (dataWHR2015, dataWHR2016, dataWHR2017, dataWHR2018, dataWHR2019)
]

# One set of names per later year. Membership tests on a set replace the
# row-by-row scan, and a name listed twice in one year is still counted as a
# single year, so a duplicate can no longer hide a year in which the country
# is missing.
later_year_names = [set(year_labels) for year_labels in labels[1:]]

# For every 2015 row: number of later years that contain that country.
later_year_hits = [
    sum(name in seen for seen in later_year_names)
    for name in labels[0]
]

# Array to keep track of occurrences of the 2015 countries in the later years
count = np.array(later_year_hits, dtype=float)

# Keep only the countries that appear in every year. Building the dictionary
# directly (instead of filling it and popping entries by row position) also
# stays correct when the 2015 table lists a country more than once.
recurringCountries = {
    name: len(labels)
    for name, hits in zip(labels[0], later_year_hits)
    if hits == len(later_year_names)
}


In [229]:
# For each year, collect the countries to delete: every name in that year's
# labels that is not a key of the recurringCountries dictionary.
# labels[0] .. labels[4] correspond to 2015 .. 2019.

del_countries2015 = [name for name in labels[0] if name not in recurringCountries]

del_countries2016 = [name for name in labels[1] if name not in recurringCountries]

del_countries2017 = [name for name in labels[2] if name not in recurringCountries]

del_countries2018 = [name for name in labels[3] if name not in recurringCountries]

del_countries2019 = [name for name in labels[4] if name not in recurringCountries]
        

In [230]:
# feel free to rename these dataframes

# One dataframe per WHR year, indexed by country, with the rows listed in
# that year's del_countries list removed.
# Syria does not appear in HLE, which is why it is also dropped from every
# yearly dataframe.

data_with_index1 = (
    dataWHR2015.set_index('Country')
    .drop(del_countries2015)
    .drop('Syria')
)

data_with_index2 = (
    dataWHR2016.set_index('Country')
    .drop(del_countries2016)
    .drop('Syria')
)

data_with_index3 = (
    dataWHR2017.set_index('Country')
    .drop(del_countries2017)
    .drop('Syria')
)

data_with_index4 = (
    dataWHR2018.set_index('Country')
    .drop(del_countries2018)
    .drop('Syria')
)

data_with_index5 = (
    dataWHR2019.set_index('Country')
    .drop(del_countries2019)
    .drop('Syria')
)



In [233]:
# Configuring the HLE dataframe:
# keep only the rows whose 'Level' is 'National', index them by country,
# and remove the 'Syria' row.
dataHLE = dataHLE[dataHLE['Level'] == 'National']
data_with_index6 = dataHLE.set_index('Country').drop('Syria')

# One HLE column per analysed year.
(
    dataHLE2015,
    dataHLE2016,
    dataHLE2017,
    dataHLE2018,
    dataHLE2019,
) = (
    data_with_index6['2015'],
    data_with_index6['2016'],
    data_with_index6['2017'],
    data_with_index6['2018'],
    data_with_index6['2019'],
)

In [199]:
# if you want to drop columns, here's an example below
#data_with_index1 = data_with_index1.drop(['Region', 'Standard Error'], axis=1)

In [200]:
# To sort the countries alphabetically, follow the examples below.
# They are left commented out so that sort_values can be called only when needed.
# By default, the main dataframe should stay sorted by overall rank.
#data_with_index1.sort_values('Country') #sorting A-Z
#data_with_index1.sort_values('Country', ascending=False) #sorting Z-A