In [None]:
# Imports and global display/warning configuration for the notebook.
import pandas as pd
# NOTE(review): `font_size` is not referenced anywhere in the visible notebook;
# this looks like an accidental IDE auto-import and adds a holoviews dependency — confirm and remove.
from holoviews.plotting.bokeh.styles import font_size
# NOTE(review): `dataclass_sleight_of_hand` is likewise unused in the visible notebook;
# it pulls in mypyc internals — confirm and remove.
from mypyc.primitives.misc_ops import dataclass_sleight_of_hand

# Remove pandas' display limits so printed frames show every column and row
# at unrestricted width. With max_rows=None, large frames are printed in full.
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)
pd.set_option("display.max_rows", None)
# NOTE(review): the alias `py` is never used in the visible notebook; numpy is
# imported again below under the conventional alias `np`.
import numpy as py
import matplotlib.pyplot as plt
import warnings
# Silences every warning category for the rest of the session, including
# pandas' SettingWithCopyWarning and library deprecation warnings.
warnings.filterwarnings("ignore")
import seaborn as sns

import numpy as np


In [None]:
# Load the happiness-score CSV into `data`.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run elsewhere until it is made relative or configurable.
path_to_file = "C:/Users/Intan/DataAnalytics_AllWomen/Capstone_Project/Capstone_Project/Dataset-Happiness_Score.csv"
data = pd.read_csv(filepath_or_buffer=path_to_file)

In [None]:
# Normalise the column headers: every space becomes an underscore so the names
# are easier to reference in later cells.
data.columns = [header.replace(" ", "_") for header in data.columns]

In [None]:
# Discard the two "Unnamed" columns, which the original author found to carry
# no useful information.
unused_columns = ["Unnamed:_0.1", "Unnamed:_0"]
data = data.drop(columns=unused_columns)

#==========================================================================
# (1) DATA CLEANING
#============================================================================

In [None]:
# Quick structural overview of the loaded frame.
print(data.shape)  # original author's recorded result: (312, 10)
# DataFrame.info() writes its report to stdout and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
data.info()  # original author's recorded dtypes: float64(7), int64(2), object(1)
print(data.head())  # first few rows as a sanity check

In [None]:
# Missing values per column: absolute counts, then percentage of all rows.
missing_counts = data.isnull().sum()
print(missing_counts)
print(missing_counts * 100 / len(data))

# Value counts for every column. The column names use underscores here because
# spaces were replaced with "_" when the headers were renamed; indexing with the
# spaced names (e.g. "Overall rank") raises KeyError.
value_count_columns = [
    "Overall_rank",
    "Country_or_region",
    "Score",
    "GDP_per_capita",
    "Social_support",
    "Healthy_life_expectancy",
    "Freedom_to_make_life_choices",
    "Generosity",
    "Perceptions_of_corruption",
    "year",  # original author's recorded result: 2018 -> 156 rows, 2019 -> 156 rows
]
for column_name in value_count_columns:
    print(data[column_name].value_counts())

In [None]:
# Split the combined frame into one frame per survey year.
is_2018 = data['year'] == 2018
is_2019 = data['year'] == 2019
data_2018 = data.loc[is_2018]
data_2019 = data.loc[is_2019]

In [None]:
# Percentage of missing values per column, printed for 2019 first and then 2018.
# Original author's recorded results — 2019: Healthy_life_expectancy 99%,
# perceptions of corruption 0.3%; 2018: Healthy_life_expectancy 96%.
for yearly_frame in (data_2019, data_2018):
    print(yearly_frame.isnull().sum() * 100 / len(yearly_frame))

# Number of fully duplicated rows, printed for 2018 first and then 2019.
# Original author's recorded result: 0 for both years.
for yearly_frame in (data_2018, data_2019):
    print(yearly_frame.duplicated().sum())

In [None]:
# Remove the 2018 rows that have no "Perceptions_of_corruption" value.
# Reassigning instead of using inplace=True avoids mutating a frame that was
# produced by boolean filtering (which raises SettingWithCopyWarning, hidden here
# by the global warnings filter) and keeps the cell safe to re-run.
data_2018 = data_2018.dropna(subset=["Perceptions_of_corruption"], axis=0)

In [None]:
# Remove the "Healthy_life_expectancy" column from both yearly frames.
dropped_column = "Healthy_life_expectancy"
data_2018 = data_2018.drop(columns=[dropped_column])
data_2019 = data_2019.drop(columns=[dropped_column])

#==============================================
# (2) UNIVARIATE ANALYSIS
#==============================================

# (2.1) OUTLIER DETECTION & HANDLING
Use the interquartile range (IQR) rule to set lower and upper outlier limits for all affected columns - 2018 & 2019

In [None]:
columns = ['Score', 'GDP_per_capita', 'Social_support', 'Freedom_to_make_life_choices', 'Generosity','Perceptions_of_corruption']


def remove_iqr_outliers(frame, cols, k=1.5):
    """Return a copy of ``frame`` with IQR outliers removed column by column.

    For each column in ``cols`` (in order) the quartiles are computed on the rows
    that survived the previous columns' filtering, and only rows strictly inside
    ``(q1 - k * iqr, q3 + k * iqr)`` are kept. Rows whose value is NaN in a
    filtered column are dropped as well, because NaN fails both comparisons.

    Parameters
    ----------
    frame : pandas.DataFrame
        Input data; it is not modified.
    cols : iterable of str
        Numeric columns to filter on, processed sequentially.
    k : float, default 1.5
        Multiplier applied to the IQR to obtain the lower and upper limits.

    Returns
    -------
    pandas.DataFrame
        The filtered rows, with the original index labels preserved.
    """
    filtered = frame.copy()
    for col in cols:
        q1 = filtered[col].quantile(0.25)
        q3 = filtered[col].quantile(0.75)
        iqr = q3 - q1
        lower_limit = q1 - k * iqr
        upper_limit = q3 + k * iqr
        # Strict inequalities: values exactly on a limit are treated as outliers.
        inside = (filtered[col] > lower_limit) & (filtered[col] < upper_limit)
        filtered = filtered[inside]
    return filtered


# Same procedure for both years; previously the loop was duplicated verbatim.
data_iqr_2018 = remove_iqr_outliers(data_2018, columns)
data_iqr_2019 = remove_iqr_outliers(data_2019, columns)

# (2.2) Histograms of the relevant columns to inspect their distributions

In [None]:
# Histograms for 2018: one panel per column on a 3x2 grid, each with a KDE curve
# and dashed vertical lines marking the mean (red) and median (green).

columns = ['Score', 'GDP_per_capita', 'Social_support', 'Freedom_to_make_life_choices', 'Generosity','Perceptions_of_corruption']
fig, axes = plt.subplots(3, 2, figsize=(20, 50))
for ax, col in zip(axes.flat, columns):
    values = data_iqr_2018[col]
    sns.histplot(values, bins=20, kde=True, color="skyblue", edgecolor="black", ax=ax)
    ax.set_title(f'Distribution of {col} 2018', fontsize=12, fontweight='bold')
    ax.set_ylabel("Frequency")
    ax.set_xlabel(col)

    mean = np.mean(values)
    median = np.median(values)
    ax.axvline(mean, color='red', linestyle='dashed', linewidth=1, label=f'Mean: {mean:.2f}')
    ax.axvline(median, color='green', linestyle='dashed', linewidth=1, label=f'Median: {median:.2f}')
    ax.legend(loc='upper right')

# Spacing between panels; applied once since it affects the whole figure.
fig.subplots_adjust(hspace=0.5, wspace=0.3)

plt.show()

In [None]:
# Histograms for 2019: one panel per column on a 3x2 grid, each with a KDE curve
# and dashed vertical lines marking the mean (red) and median (green).

columns = ['Score', 'GDP_per_capita', 'Social_support', 'Freedom_to_make_life_choices', 'Generosity','Perceptions_of_corruption']
fig, axes = plt.subplots(3, 2, figsize=(20, 50))
for ax, col in zip(axes.flat, columns):
    values = data_iqr_2019[col]
    sns.histplot(values, bins=20, kde=True, color="skyblue", edgecolor="black", ax=ax)
    ax.set_title(f'Distribution of {col} 2019', fontsize=12, fontweight='bold')
    ax.set_ylabel("Frequency")
    ax.set_xlabel(col)

    mean = np.mean(values)
    median = np.median(values)
    ax.axvline(mean, color='red', linestyle='dashed', linewidth=1, label=f'Mean: {mean:.2f}')
    ax.axvline(median, color='green', linestyle='dashed', linewidth=1, label=f'Median: {median:.2f}')
    ax.legend(loc='upper right')

# Spacing between panels; applied once since it affects the whole figure.
fig.subplots_adjust(hspace=0.5, wspace=0.3)

plt.show()

#==============================================
# (3) BIVARIATE ANALYSIS
#==============================================

# (3.1) Heatmap for above factors

In [None]:
# Restrict each outlier-filtered frame to the analysed columns ...
data_2018_heatmap = data_iqr_2018.loc[:, columns]
data_2019_heatmap = data_iqr_2019.loc[:, columns]
# ... and compute the pairwise correlation matrix for each year.
corr_2018, corr_2019 = data_2018_heatmap.corr(), data_2019_heatmap.corr()

In [None]:
# Annotated correlation heatmap for 2018.

fig, ax = plt.subplots(figsize=(30, 30))
sns.heatmap(corr_2018, annot=True, ax=ax)
ax.set_title("Correlation Heatmap of Happiness Parameters 2018", fontsize=16)
# Keep x labels horizontal and tilt y labels for readability.
plt.xticks(rotation=0)
plt.yticks(rotation=45)
plt.show()

In [None]:
# Annotated correlation heatmap for 2019.

fig, ax = plt.subplots(figsize=(30, 30))
sns.heatmap(corr_2019, annot=True, ax=ax)
ax.set_title("Correlation Heatmap of Happiness Parameters 2019", fontsize=16)
# Keep x labels horizontal and tilt y labels for readability.
plt.xticks(rotation=0)
plt.yticks(rotation=45)
plt.show()

# (3.2) SCATTERPLOT - GDP_per_capita vs Score

In [None]:
# GDP per capita against happiness score: points and a fitted regression line
# for each year, drawn on a shared axes.
yearly_series = [(data_iqr_2018, 'blue', '2018'), (data_iqr_2019, 'orange', '2019')]
fig, ax = plt.subplots(figsize=(10, 6))
# Draw all point clouds first, then the regression lines, so lines sit on top.
for frame, colour, year_label in yearly_series:
    ax.scatter(frame['GDP_per_capita'], frame['Score'], color=colour, alpha=0.7, label=year_label, s=60)
for frame, colour, year_label in yearly_series:
    sns.regplot(data=frame, x='GDP_per_capita', y='Score', scatter=False, color=colour, line_kws={"linewidth": 2}, ax=ax)
ax.set_title('GDP per Capita vs Score (2018 and 2019)', fontsize=14)
ax.set_xlabel('GDP per Capita', fontsize=12)
ax.set_ylabel('Happiness Score', fontsize=12)
ax.legend(title='Year')
ax.grid(True)
plt.show()

# (3.3) Scatterplot Social Support vs Score

In [None]:
# Social support against happiness score: points and a fitted regression line
# for each year, drawn on a shared axes.
yearly_series = [(data_iqr_2018, 'blue', '2018'), (data_iqr_2019, 'orange', '2019')]
fig, ax = plt.subplots(figsize=(10, 6))
# Draw all point clouds first, then the regression lines, so lines sit on top.
for frame, colour, year_label in yearly_series:
    ax.scatter(frame['Social_support'], frame['Score'], color=colour, alpha=0.7, label=year_label)
for frame, colour, year_label in yearly_series:
    sns.regplot(data=frame, x='Social_support', y='Score', scatter=False, color=colour, line_kws={"linewidth": 2}, ax=ax)
ax.set_title('Social support vs Score (2018 and 2019)', fontsize=14)
ax.set_xlabel('Social support', fontsize=12)
ax.set_ylabel('Happiness Score', fontsize=12)
ax.legend(title='Year')
ax.grid(True)
plt.show()

# (3.4) Scatterplot Freedom_to_make_life_choices vs Score

In [None]:
# Freedom to make life choices against happiness score: points and a fitted
# regression line for each year, drawn on a shared axes.
yearly_series = [(data_iqr_2018, 'blue', '2018'), (data_iqr_2019, 'orange', '2019')]
fig, ax = plt.subplots(figsize=(10, 6))
# Draw all point clouds first, then the regression lines, so lines sit on top.
for frame, colour, year_label in yearly_series:
    ax.scatter(frame['Freedom_to_make_life_choices'], frame['Score'], color=colour, alpha=0.7, label=year_label)
for frame, colour, year_label in yearly_series:
    sns.regplot(data=frame, x='Freedom_to_make_life_choices', y='Score', scatter=False, color=colour, line_kws={"linewidth": 2}, ax=ax)
ax.set_title('Freedom_to_make_life_choices vs Score (2018 and 2019)', fontsize=14)
ax.set_xlabel('Freedom_to_make_life_choices', fontsize=12)
ax.set_ylabel('Happiness Score', fontsize=12)
ax.legend(title='Year')
ax.grid(True)
plt.show()

# (3.5) Scatterplot GDP_per_capita vs Social_support

In [None]:
# GDP per capita against social support: points and a fitted regression line
# for each year, drawn on a shared axes.
yearly_series = [(data_iqr_2018, 'blue', '2018'), (data_iqr_2019, 'orange', '2019')]
fig, ax = plt.subplots(figsize=(10, 6))
# Draw all point clouds first, then the regression lines, so lines sit on top.
for frame, colour, year_label in yearly_series:
    ax.scatter(frame['GDP_per_capita'], frame['Social_support'], color=colour, alpha=0.7, label=year_label)
for frame, colour, year_label in yearly_series:
    sns.regplot(data=frame, x='GDP_per_capita', y='Social_support', scatter=False, color=colour, line_kws={"linewidth": 2}, ax=ax)
ax.set_title('GDP_per_capita vs Social_support (2018 and 2019)', fontsize=14)
ax.set_xlabel('GDP_per_capita', fontsize=12)
ax.set_ylabel('Social_support', fontsize=12)
ax.legend(title='Year')
ax.grid(True)
plt.show()