# Pre-processing data set

In [218]:
import pandas as pd
import numpy as np

# Load the World Happiness Report 2021 CSV (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='world-happiness-report-2021.csv')

## Feature selection
Drop the features that are not used in the analysis, keeping only the ladder score and the `Explained by: …` columns.

In [219]:
# Remove every column that is not needed downstream; what remains is the
# ladder score plus the 'Explained by: ...' columns.
df = df.drop(columns=[
    'Country name',
    'Standard error of ladder score',
    'upperwhisker',
    'lowerwhisker',
    'Ladder score in Dystopia',
    'Dystopia + residual',
    'Logged GDP per capita',
    'Social support',
    'Healthy life expectancy',
    'Freedom to make life choices',
    'Generosity',
    'Perceptions of corruption',
    'Regional indicator',
])

In [220]:
# display the frame after the drop: only the ladder score and the 'Explained by: ...' columns remain
df

Unnamed: 0,Ladder score,Explained by: Log GDP per capita,Explained by: Social support,Explained by: Healthy life expectancy,Explained by: Freedom to make life choices,Explained by: Generosity,Explained by: Perceptions of corruption
0,7.842,1.446,1.106,0.741,0.691,0.124,0.481
1,7.620,1.502,1.108,0.763,0.686,0.208,0.485
2,7.571,1.566,1.079,0.816,0.653,0.204,0.413
3,7.554,1.482,1.172,0.772,0.698,0.293,0.170
4,7.464,1.501,1.079,0.753,0.647,0.302,0.384
...,...,...,...,...,...,...,...
144,3.512,0.451,0.731,0.007,0.405,0.103,0.015
145,3.467,1.099,0.724,0.340,0.539,0.027,0.088
146,3.415,0.364,0.202,0.407,0.627,0.227,0.493
147,3.145,0.457,0.649,0.243,0.359,0.157,0.075


In [221]:
# check data types: every remaining column should be numeric (float64) before it is binned
df.dtypes

Ladder score                                  float64
Explained by: Log GDP per capita              float64
Explained by: Social support                  float64
Explained by: Healthy life expectancy         float64
Explained by: Freedom to make life choices    float64
Explained by: Generosity                      float64
Explained by: Perceptions of corruption       float64
dtype: object

In [222]:
# summary statistics; the min/max rows show the value range that the bin edges defined below must cover
df.describe()

Unnamed: 0,Ladder score,Explained by: Log GDP per capita,Explained by: Social support,Explained by: Healthy life expectancy,Explained by: Freedom to make life choices,Explained by: Generosity,Explained by: Perceptions of corruption
count,149.0,149.0,149.0,149.0,149.0,149.0,149.0
mean,5.532839,0.977161,0.793315,0.520161,0.498711,0.178047,0.135141
std,1.073924,0.40474,0.258871,0.213019,0.137888,0.09827,0.114361
min,2.523,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.852,0.666,0.647,0.357,0.409,0.105,0.06
50%,5.534,1.025,0.832,0.571,0.514,0.164,0.101
75%,6.255,1.323,0.996,0.665,0.603,0.239,0.174
max,7.842,1.751,1.172,0.897,0.716,0.541,0.547


In [223]:
# Convert the numeric 'Explained by: ...' columns into categorical intervals.

# Bin edges 0.0, 0.1, ..., 1.8 (18 bins of width 0.1).
# np.linspace states the number of edges explicitly. np.arange with a
# fractional step derives its length from a floating-point division, which
# the NumPy documentation warns is not numerically stable.
bins = np.linspace(0, 1.8, 19)

# Every column except the target is binned. 'Regional indicator' is already
# dropped earlier in the notebook; it stays in the exclusion list as a guard.
explained_cols = [
    col for col in df.columns
    if col not in ('Regional indicator', 'Ladder score')
]

# pd.cut maps each value to its (left, right] interval; include_lowest=True
# makes the first interval also contain the lowest edge (0.0). Values outside
# the edges become NaN.
# NOTE: the numeric columns are overwritten in place, so re-run the notebook
# from the load cell rather than re-running this cell on already-binned data.
for col in explained_cols:
    df[col] = pd.cut(df[col], bins=bins, include_lowest=True)


In [224]:
# Convert the ladder score into categorical intervals.

# Bin edges 0.0, 0.5, ..., 8.0 (16 bins of width 0.5).
bins = np.arange(0, 8.5, 0.5)

# Bin the single target column directly instead of scanning every column for
# it. pd.cut maps each value to its (left, right] interval; include_lowest=True
# makes the first interval also contain the lowest edge (0.0). Values outside
# the edges become NaN.
# NOTE: the numeric column is overwritten in place, so re-run the notebook
# from the load cell rather than re-running this cell on already-binned data.
df['Ladder score'] = pd.cut(df['Ladder score'], bins=bins, include_lowest=True)



In [225]:
# confirm that every column is now categorical after binning
df.dtypes

Ladder score                                  category
Explained by: Log GDP per capita              category
Explained by: Social support                  category
Explained by: Healthy life expectancy         category
Explained by: Freedom to make life choices    category
Explained by: Generosity                      category
Explained by: Perceptions of corruption       category
dtype: object

In [226]:
# display the binned frame: each cell now holds the interval its original value fell into
df

Unnamed: 0,Ladder score,Explained by: Log GDP per capita,Explained by: Social support,Explained by: Healthy life expectancy,Explained by: Freedom to make life choices,Explained by: Generosity,Explained by: Perceptions of corruption
0,"(7.5, 8.0]","(1.4, 1.5]","(1.1, 1.2]","(0.7, 0.8]","(0.6, 0.7]","(0.1, 0.2]","(0.4, 0.5]"
1,"(7.5, 8.0]","(1.5, 1.6]","(1.1, 1.2]","(0.7, 0.8]","(0.6, 0.7]","(0.2, 0.3]","(0.4, 0.5]"
2,"(7.5, 8.0]","(1.5, 1.6]","(1.0, 1.1]","(0.8, 0.9]","(0.6, 0.7]","(0.2, 0.3]","(0.4, 0.5]"
3,"(7.5, 8.0]","(1.4, 1.5]","(1.1, 1.2]","(0.7, 0.8]","(0.6, 0.7]","(0.2, 0.3]","(0.1, 0.2]"
4,"(7.0, 7.5]","(1.5, 1.6]","(1.0, 1.1]","(0.7, 0.8]","(0.6, 0.7]","(0.3, 0.4]","(0.3, 0.4]"
...,...,...,...,...,...,...,...
144,"(3.5, 4.0]","(0.4, 0.5]","(0.7, 0.8]","(-0.001, 0.1]","(0.4, 0.5]","(0.1, 0.2]","(-0.001, 0.1]"
145,"(3.0, 3.5]","(1.0, 1.1]","(0.7, 0.8]","(0.3, 0.4]","(0.5, 0.6]","(-0.001, 0.1]","(-0.001, 0.1]"
146,"(3.0, 3.5]","(0.3, 0.4]","(0.2, 0.3]","(0.4, 0.5]","(0.6, 0.7]","(0.2, 0.3]","(0.4, 0.5]"
147,"(3.0, 3.5]","(0.4, 0.5]","(0.6, 0.7]","(0.2, 0.3]","(0.3, 0.4]","(0.1, 0.2]","(-0.001, 0.1]"


In [227]:
# check for null values: pd.cut yields NaN for any value outside the bin edges,
# so a non-zero count here would mean the bins did not cover that column's range
df.isna().sum()

Ladder score                                  0
Explained by: Log GDP per capita              0
Explained by: Social support                  0
Explained by: Healthy life expectancy         0
Explained by: Freedom to make life choices    0
Explained by: Generosity                      0
Explained by: Perceptions of corruption       0
dtype: int64