In [1]:
import requests as r 
from bs4 import BeautifulSoup
import time
import pandas as pd
import numpy as np

#CLEANER LES 2 TABLES POUR GARDER COLONNES INTERESSANTES

## 1. Import datas & data cleaning

### Import data fertility from wikipedia

In [2]:
url='https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependencies_by_total_fertility_rate'
fertility=pd.read_html(url)
fert_tab=fertility[4][['Country','Fertility rate 2020 estimate (births/woman)']]

### Merge the 2 tables (fertility and happiness_202) on country

In [3]:
happy=pd.read_csv('C:/Users/auror/Project3_WorldHappiness/data/2020.CSV')
happy.rename(columns={'Country name':'Country'},inplace=True)
happy

FileNotFoundError: [Errno 2] File C:/Users/auror/Project3_WorldHappiness/data/2020.CSV does not exist: 'C:/Users/auror/Project3_WorldHappiness/data/2020.CSV'

In [None]:
merge1=pd.merge(happy,fert_tab,on='Country')
merge1

### Import and merge data about urbanization

In [None]:
url2='https://en.wikipedia.org/wiki/Urbanization_by_country'
urban=pd.read_html(url2)
urban_tab=urban[0][['Nation','Urban Population (%)']]
urban_tab.rename(columns={'Nation':'Country'},inplace=True)
urban_tab

In [None]:
merge2=pd.merge(merge1,urban_tab,on='Country')
merge2

### Import and merge data about sunshine_duration from wikipedia

In [None]:
url = 'https://en.wikipedia.org/wiki/List_of_cities_by_sunshine_duration'
html=r.get(url).content
soup=BeautifulSoup(html)

"""
1 table per continent, several rows per country(1row=1city), 
so concatenation of all the tables, group by country, with mean of sunshine duration 
"""

continent = ['africa', 'asia', 'europe','american','americas','oceania']
col=['Country', 'City', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec', 'Year', 'Ref.']
df_sunny=pd.DataFrame(columns=col)

for i in range(len(continent)):
    table=soup.select('table.sortable')[i]
    rows = table.find_all('tr')
    table =[[cell.strip() for cell in i.text.strip().split('\n') if cell!=''] for i in rows]
    df_sunny = pd.concat([df_sunny, pd.DataFrame(table[1:], columns=col)], ignore_index=True)
    
df_sunny['Year']=df_sunny['Year'].str.replace(',','')
df_sunny = df_sunny.astype({'Year': np.float64})
df_sunny = df_sunny.pivot_table(index=['Country'], values='Year', aggfunc='mean')



In [None]:
df_sunny=df_sunny.rename(columns = {'Year': 'Sunshine duration per Year'})

In [None]:
merge3=pd.merge(merge2,df_sunny,on='Country', how='left')
merge3
#not to remove too much rows, merge is on left

### Data cleaning & Final dataframe

In [None]:
df=merge3[['Country','Regional indicator','Ladder score','Logged GDP per capita','Social support','Healthy life expectancy','Freedom to make life choices','Generosity','Perceptions of corruption','Explained by: Log GDP per capita','Explained by: Social support','Explained by: Healthy life expectancy','Explained by: Freedom to make life choices','Explained by: Generosity','Explained by: Perceptions of corruption','Dystopia + residual','Fertility rate 2020 estimate (births/woman)','Urban Population (%)','Sunshine duration per Year']]

In [None]:
df

In [None]:
df.info()

In [None]:
df['Urban Population (%)'].unique()

In [None]:
#col 16 and 17 are object, to convert them one '[2]' had to be suppressed
df['Urban Population (%)']=df['Urban Population (%)'].str.replace("\[2\]", '')
df

In [None]:
#now we can convert on float all the number from the df
df = df.astype({'Fertility rate 2020 estimate (births/woman)': np.float64, 'Urban Population (%)': np.float64})
df

In [None]:
df['Sunshine duration per Year']= df['Sunshine duration per Year'].fillna('Not Applicable')

In [None]:
df.info()

In [None]:
df.to_csv('happi.csv', index=False)

## 2. Analyses on the dataframe

In [None]:
#to check all the correlation, add a column 'rank'
df['Rank']= df.index+1

In [None]:
df_corr = df.corr()
df_corr.applymap(lambda x: 'highly correlated' if (x>0.6) & (x!=1) else 'NA')

### cheking correlation between col and "explained by"

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns 
plt.scatter(df['Logged GDP per capita'],df['Explained by: Log GDP per capita'])

In [None]:
plt.scatter(df['Social support'],df['Explained by: Social support'])

In [None]:
plt.scatter(df['Healthy life expectancy'],df['Explained by: Healthy life expectancy'])

In [None]:
plt.scatter(df['Freedom to make life choices'],df['Explained by: Freedom to make life choices'])

In [None]:
plt.plot(df['Generosity'],df['Explained by: Generosity'])

In [None]:
plt.plot(df['Perceptions of corruption'], df['Explained by: Perceptions of corruption'])

In [None]:
fig, axes=plt.subplots(1,6)

axes[0].plot(df['Logged GDP per capita'],df['Explained by: Log GDP per capita'], label='GDP')
axes[1].plot(df['Social support'],df['Explained by: Social support'], label='Social support')
axes[2].plot(df['Healthy life expectancy'],df['Explained by: Healthy life expectancy'], label='Healthy life expectancy')
axes[3].plot(df['Freedom to make life choices'],df['Explained by: Freedom to make life choices'], label='Freedom')
axes[4].plot(df['Generosity'],df['Explained by: Generosity'], label='Generosity')
axes[5].plot(df['Perceptions of corruption'], df['Explained by: Perceptions of corruption'], label='corruption')


In [None]:
plt.scatter(df['Ladder score'], df['Perceptions of corruption'], label='Corruption/Ladder score')
plt.scatter(df['Ladder score'], df['Explained by: Log GDP per capita'], label='Log GDP/Ladder score')
plt.legend()

In [None]:
sns.regplot(x=df['Ladder score'], y=df['Fertility rate 2020 estimate (births/woman)'], fit_reg=False, scatter_kws={"color":"darkred","alpha":0.3,"s":20})

In [None]:
sns.regplot(x=df['Ladder score'], y=df['Urban Population (%)'], fit_reg=False, scatter_kws={"color":"darkblue","alpha":0.7,"s":70} )

In [None]:
sns.regplot(x=df['Ladder score'], y=df['Sunshine duration per Year'], fit_reg=False, scatter_kws={"color":"darkgreen","alpha":0.2,"s":200} )

In [None]:
df.columns