In [1]:
#import dependencies
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import scipy.stats as st
import seaborn as sns
from scipy import stats
from scipy.stats import linregress

# Notebook display limits: show up to 800 rows and never truncate columns.
pd.options.display.max_rows = 800

pd.options.display.max_columns = None

In [3]:
# Load the two input tables from the Resources folder.
Factbook_df = pd.read_csv(filepath_or_buffer='Resources/Factbookdata.csv')
Happy_df = pd.read_csv(filepath_or_buffer='Resources/cleanAndHappy.csv', index_col=0)



FileNotFoundError: [Errno 2] File b'Resources/cleanAndHappy.csv' does not exist: b'Resources/cleanAndHappy.csv'

In [None]:
# Preview the first rows of the Factbook table.
Factbook_df.head()

In [None]:
# Reformat country names before merging: swap "_" for " " so they match Happy_df.
# regex=False makes this a literal replacement regardless of which default
# the installed pandas version uses for `regex`.
Factbook_df['Country'] = Factbook_df['Country'].str.replace("_", " ", regex=False)
Factbook_df.head(15)

In [None]:
# Lower-case the country names so they line up with Factbook_df.
Happy_df['Country'] = Happy_df['Country'].str.lower()
Happy_df.head(n=5)

In [None]:
# Group by country and average over all years so each country has one row.
# numeric_only=True keeps any text columns out of the mean; pandas >= 2.0
# raises a TypeError on them instead of silently dropping them.
Happy_dfgb = Happy_df.groupby('Country')
Happydfgb = Happy_dfgb.mean(numeric_only=True).reset_index(drop=False)
Happydfgb

In [None]:
# Merge Factbook data with the annual-average happiness ratings (inner join on
# Country, so only countries present in both tables survive), then order the
# result by "Overall Rank".
merge_df = (
    pd.merge(Happydfgb, Factbook_df, on='Country', how='inner')
    .sort_values("Overall Rank")
)
merge_df.head()


In [None]:
# Heatmap of pairwise correlations.
# Numeric/bool columns are selected explicitly: pandas >= 2.0 no longer drops
# text columns such as Country inside corr() and raises instead.
heat = merge_df.select_dtypes(include=['number', 'bool']).corr()
plt.figure(figsize=(16, 6))
sns.heatmap(heat, cmap='coolwarm')

In [None]:
# Print merge_df's summary (columns, non-null counts, dtypes) to spot missing data.
merge_df.info()


In [None]:
# Drop the underweightchildren column ahead of the NaN-row cleanup so the
# correlations can be plotted.
# drop(..., errors="ignore") instead of `del` keeps this cell safe to re-run:
# a second run is a no-op rather than a KeyError.
merge_df = merge_df.drop(columns=["underweightchildren"], errors="ignore")


In [None]:
# Discard every row that still has at least one missing value.
merge_df.dropna(axis=0, how='any', inplace=True)
merge_df

In [None]:
# New heatmap, drawn from the current (cleaned) merge_df.
# corr() is given numeric/bool columns only, because pandas >= 2.0 raises on
# text columns such as Country rather than skipping them.
heat = merge_df.select_dtypes(include=['number', 'bool']).corr()
plt.figure(figsize=(16, 6))
sns.heatmap(heat, cmap='coolwarm')

In [None]:
# Correlation between Happiness Score and every other numeric variable,
# sorted by ascending value. The variable names end up in the 'index' column.
# Text columns are filtered out up front because pandas >= 2.0 makes corr()
# raise on them instead of ignoring them.
correlation_df = (
    merge_df.select_dtypes(include=['number', 'bool'])
    .corr()
    .reset_index()[['index', 'Happiness Score']]
    .sort_values(by='Happiness Score', ascending=True)
)
correlation_df

In [None]:
# List merge_df's column names to pick the ones to plot against Happiness Score.
merge_df.columns


In [None]:
# Variables to plot against Happiness Score, one scatter plot each.
columns = [
    'literacy',
    'schoolyears',
    'internetpercent',
    'electricpercent',
    'pppPc',
    'grosssavings',
    'unemployment',
    'belowpoverty',
    'cleanwateraccess',
    'sanitationaccess',
    'obesityrate',
    'netmigrationrate',
    'sexratioM2F',
    'lifeexpentancy',
    'fertilityrate',
    'dependencyratio',
    'median_age',
    'latitude',
    'longitude',
    'coastline',
    'borders',
    'coastpercent',
    'forestpercent',
    'elevations',
]

In [None]:
# Loop through the columns: scatter each one against Happiness Score, overlay
# the least-squares regression line, print the fit, and save one PNG per column.
os.makedirs('Images', exist_ok=True)  # savefig does not create a missing folder
y = merge_df['Happiness Score']  # identical for every plot, so read it once
for column in columns:
    x = merge_df[column]
    # Only slope, intercept and rvalue are used below.
    (slope, intercept, rvalue, pvalue, stderr) = linregress(x, y)
    regress_values = x * slope + intercept

    fig, ax = plt.subplots()
    ax.plot(x, regress_values, "r-")
    ax.scatter(x, y, facecolors="green", edgecolors="black")
    ax.set_xlabel(column)
    ax.set_ylabel("Happiness Score")
    ax.set_title(f"Happiness Score by {column}", fontweight='bold')

    print(f"Regression Line:  y = x * {slope} + {intercept}")
    print(f"R^2: {rvalue ** 2}")
    fig.savefig(f'Images/{column}.png')
    plt.show()
    plt.close(fig)  # release the figure so the loop does not pile up open figures

In [None]:
# Stand-alone regression plot of Happiness Score against schoolyears.
y = merge_df['Happiness Score']
x = merge_df['schoolyears']
(slope, intercept, rvalue, pvalue, stderr) = linregress(x, y)
regress_values = x * slope + intercept
plt.plot(x, regress_values, "r-")
plt.scatter(x, y, facecolors="green", edgecolors="black")
# Label the axis with the variable actually plotted. The previous f'{column}'
# picked up whatever value an earlier loop had left in `column`.
plt.xlabel('schoolyears')
plt.ylabel("Happiness Score")
plt.title("Happiness Score by schoolyears", fontweight='bold')

print(f"Regression Line:  y = x * {slope} + {intercept}")
print(f"R^2: {rvalue ** 2}")
plt.savefig('Images/schoolyears.png')

In [None]:
#Multi variable regression for modeling
from sklearn import linear_model
import statsmodels.api as sm

In [None]:
# Response (Happiness Score) and candidate predictors for the exploratory model.
y_model = merge_df.loc[:, ['Happiness Score']]
x_model = merge_df.loc[:, [
    'electricpercent',
    'pppPc',
    'grosssavings',
    'unemployment',
    'cleanwateraccess',
    'sexratioM2F',
    'dependencyratio',
    'median_age',
    'forestpercent',
]]


In [None]:
# Fit a sample OLS model (intercept column added first) to explore which
# variables to keep, and store its predictions for the training rows.
x_model = sm.add_constant(data=x_model)
model = sm.OLS(endog=y_model, exog=x_model).fit()
predictions = model.predict(exog=x_model)
model.summary()


In [None]:
# Outer-join the two tables into a new frame so that countries without
# Happiness scores stay visible as candidates to predict.
merge_df2 = Happydfgb.merge(Factbook_df, how='outer', on='Country')
merge_df2.head(n=5)

In [None]:
# Quick glance at the last 100 rows to find a test country.
merge_df2.tail(n=100)

In [None]:
# Barbados is the test country (a personal pick: honeymoon destination).
# Keep only its row, then only the predictor columns.
barbados = merge_df2[merge_df2['Country'] == 'barbados']
barbadospredict = barbados.loc[:, [
    'electricpercent',
    'pppPc',
    'grosssavings',
    'unemployment',
    'sexratioM2F',
    'dependencyratio',
    'median_age',
    'forestpercent',
]]
barbadospredict

In [None]:
# Frame holding only the variables kept for modeling.
final_df = merge_df.loc[:, [
    'electricpercent',
    'pppPc',
    'grosssavings',
    'unemployment',
    'sexratioM2F',
    'dependencyratio',
    'median_age',
    'forestpercent',
]]
final_df.head(n=5)

In [None]:
# Predictors and response for the final model.
x_model = final_df
y_model = merge_df.loc[:, ['Happiness Score']]


In [None]:
# Refit OLS on the reduced predictor set (intercept column added first) and
# keep the predictions for the training rows.
x_model = sm.add_constant(data=x_model)
model = sm.OLS(endog=y_model, exog=x_model).fit()
predictions = model.predict(exog=x_model)
model.summary()

In [None]:
# Predict the Happiness Score for Barbados with the fitted model.
# has_constant='add' forces the intercept column to be added.
# NOTE(review): presumably needed because a single-row frame looks constant in
# every column, so the default 'skip' would not add it; confirm.
# The result is stored under a new name so that re-running this cell does not
# stack a second constant column onto barbadospredict and break predict().
barbados_exog = sm.add_constant(barbadospredict, has_constant='add')
model.predict(barbados_exog)


In [None]:
# Predicted vs. actual Happiness Scores; the red diagonal is where the two
# values are equal.
residual = merge_df["Happiness Score"] - predictions
plt.scatter(
    x=merge_df["Happiness Score"],
    y=predictions,
    facecolor="green",
    edgecolor="black",
)
plt.title('Predicted vs Actual Happiness score', fontweight="bold", fontsize=16)
plt.xlabel('Actual', fontweight='bold')
plt.ylabel('Predicted', fontweight='bold')
plt.xlim(left=3, right=7.5)
plt.ylim(bottom=3, top=7.5)
plt.plot([3, 7.5], [3, 7.5], color="red")
plt.savefig('Images/Predicted_Actual.png')

In [None]:
# Residual plot of predictions against actual Happiness Scores, with a LOWESS
# trend line, saved to Images/Residuals.png.
# x and y are passed by keyword: seaborn >= 0.12 made them keyword-only, so the
# positional form raises a TypeError there.
# NOTE(review): residplot fits its own regression of y on x, so this is not a
# plot of the OLS model's residuals; confirm this is the intended diagnostic.
sns.residplot(x=merge_df["Happiness Score"], y=predictions, lowess=True, color="g")
plt.savefig('Images/Residuals.png')