## Andrew Ingrassia
## World Happiness Report
## Geographic Visualization

***
## Contents

##### 1) Importing libraries & data

##### 2) Data wrangling
    - 2a. Locating discrepancies in country name formats
    - 2b. Fixing discrepancies
    
##### 3) Creating the choropleth map using Plotly

##### 4) Observations

##### 5) Exports

***
## 1) Importing libraries & data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import os
import plotly.express as px
import plotly.io as pio
import json

In [2]:
# Enables the display of matplotlib plots directly within the notebook interface

%matplotlib inline

In [3]:
# Defines the path to the folder that holds the WHR data files

path = r'C:\Users\ingra\OneDrive\Desktop\Data Analysis\World Happiness Report\WHR - Data'

In [4]:
# Builds the full path to the WHR CSV file, then reads that file into the 'df' dataframe

whr_csv_path = os.path.join(path, 'whr_df_whr2.csv')
df = pd.read_csv(whr_csv_path)

In [5]:
# Defines the path to the simplified version of the 'countries.geojson' file.
# The path is built from 'path' (the WHR data folder) so that the folder location is
# defined in one place only; this cell does not load the file itself.

geojson_file_path = os.path.join(path, 'simplified_geojson.geojson')

In [6]:
# Loads the country geometries from the GeoJSON file and assigns them to the 'country_geo' dictionary.
# The encoding is given explicitly: GeoJSON text is UTF-8, and relying on the platform default
# (e.g. cp1252 on Windows) garbles non-ASCII country names such as 'Curaçao' into 'CuraÃ§ao'.

with open(geojson_file_path, 'r', encoding='utf-8') as geo_file:
    country_geo = json.load(geo_file)

In [7]:
# Prints the top-level keys of the 'country_geo' dictionary
top_level_keys = country_geo.keys()
print(top_level_keys)

# Prints the first 5 entries of the 'features' list
first_features = country_geo['features'][:5]
print(first_features)

dict_keys(['type', 'crs', 'features'])
[{'type': 'Feature', 'properties': {'ADMIN': 'Aruba', 'ISO_A3': 'ABW'}, 'geometry': {'type': 'Polygon', 'coordinates': [[[-69.99693762899992, 12.577582098000036], [-69.92467200399994, 12.519232489000046], [-69.88019771999984, 12.453558661000045], [-69.88809160099993, 12.417669989000046], [-69.93053137899989, 12.425970770000035], [-69.94513912699992, 12.44037506700009], [-69.92467200399994, 12.447211005000014], [-70.05809485599988, 12.537176825000088], [-70.04873613199993, 12.583726304000024], [-70.06110592399997, 12.625392971000068], [-70.04873613199993, 12.632147528000104], [-69.99693762899992, 12.577582098000036]]]}}, {'type': 'Feature', 'properties': {'ADMIN': 'Afghanistan', 'ISO_A3': 'AFG'}, 'geometry': {'type': 'Polygon', 'coordinates': [[[71.04980228700009, 38.40866445000009], [71.11739506000004, 38.39863922100008], [71.2176990160001, 38.32582712800007], [71.3343843990001, 38.28066192700001], [71.35815555800013, 38.25125803700013], [71.35908

***
## 2) Data wrangling

### 2a. Locating discrepancies in country name formats

In [8]:
# Keeps only the columns used by the choropleth map, as an independent copy of that slice of 'df'
map_columns = ['country', 'happiness_score', 'year']
df_data = df[map_columns].copy()

# Previews the first rows of the resulting dataframe
df_data.head()

Unnamed: 0,country,happiness_score,year
0,Switzerland,7.587,2015
1,Iceland,7.561,2015
2,Denmark,7.527,2015
3,Norway,7.522,2015
4,Canada,7.427,2015


In [9]:
# Collects every distinct country name in the 'df' dataframe and sorts the names alphabetically
df_country_names = list(df['country'].unique())
df_country_names.sort()

# Prints the resulting list
print(df_country_names)

['Afghanistan', 'Albania', 'Algeria', 'Argentina', 'Armenia', 'Australia', 'Austria', 'Bahrain', 'Bangladesh', 'Belgium', 'Benin', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Bulgaria', 'Burkina Faso', 'Cambodia', 'Cameroon', 'Canada', 'Chad', 'Chile', 'China', 'Colombia', 'Congo (Brazzaville)', 'Costa Rica', 'Croatia', 'Cyprus', 'Czech Republic', 'Denmark', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Estonia', 'Ethiopia', 'Finland', 'France', 'Gabon', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Guatemala', 'Guinea', 'Honduras', 'Hong Kong', 'Hungary', 'Iceland', 'India', 'Indonesia', 'Iran', 'Iraq', 'Ireland', 'Israel', 'Italy', 'Ivory Coast', 'Jamaica', 'Japan', 'Jordan', 'Kazakhstan', 'Kenya', 'Kosovo', 'Kyrgyzstan', 'Latvia', 'Lebanon', 'Liberia', 'Lithuania', 'Luxembourg', 'Macedonia', 'Madagascar', 'Malawi', 'Malaysia', 'Mali', 'Malta', 'Mauritania', 'Mauritius', 'Mexico', 'Moldova', 'Mongolia', 'Montenegro', 'Morocco', 'Myanmar', 'Nepal', 'Netherlands

In [10]:
# Pulls the 'ADMIN' country name out of every feature in 'country_geo' and sorts the names alphabetically
geo_country_names = [geo_feature['properties']['ADMIN'] for geo_feature in country_geo['features']]
geo_country_names.sort()

# Prints the resulting list
print(geo_country_names)

['Afghanistan', 'Akrotiri Sovereign Base Area', 'Aland', 'Albania', 'Algeria', 'American Samoa', 'Andorra', 'Angola', 'Anguilla', 'Antarctica', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Aruba', 'Ashmore and Cartier Islands', 'Australia', 'Austria', 'Azerbaijan', 'Bahrain', 'Bajo Nuevo Bank (Petrel Is.)', 'Bangladesh', 'Barbados', 'Baykonur Cosmodrome', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bermuda', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'British Indian Ocean Territory', 'British Virgin Islands', 'Brunei', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon', 'Canada', 'Cape Verde', 'Cayman Islands', 'Central African Republic', 'Chad', 'Chile', 'China', 'Clipperton Island', 'Colombia', 'Comoros', 'Cook Islands', 'Coral Sea Islands', 'Costa Rica', 'Croatia', 'Cuba', 'CuraÃ§ao', 'Cyprus', 'Cyprus No Mans Area', 'Czech Republic', 'Democratic Republic of the Congo', 'Denmark', 'Dhekelia Sovereign Base Area', 'Djibouti', 'Dominica', 'Dominica

In [11]:
# Gathers the distinct country names found in df_data
unique_df_country_names = df_data['country'].unique()

# Gathers the 'ADMIN' country names found in country_geo
unique_geo_country_names = [entry['properties']['ADMIN'] for entry in country_geo['features']]

# Keeps the df_data names that have no exact match among the country_geo names
discrepancies = set(unique_df_country_names).difference(unique_geo_country_names)
if not discrepancies:
    print("Country names are consistent between df and country_geo.")
else:
    print("Discrepancies between df and country_geo:", sorted(discrepancies))

Discrepancies between df and country_geo: ['Congo (Brazzaville)', 'Hong Kong', 'Serbia', 'Tanzania', 'United States']


### 2b. Fixing discrepancies

In [12]:
# Maps each country name as it appears in the GeoJSON data (first item of each pair)
# to the name it should be renamed to (second item of each pair)
country_name_changes = dict([
    ('Democratic Republic of the Congo', 'Congo (Kinshasa)'),
    ('Republic of Congo', 'Congo (Brazzaville)'),
    ('Hong Kong S.A.R.', 'Hong Kong'),
    ('Northern Cyprus', 'North Cyprus'),
    ('Republic of Serbia', 'Serbia'),
    ('United Republic of Tanzania', 'Tanzania'),
    ('United States of America', 'United States'),
])

In [13]:
# Loops through the features in the 'country_geo' dictionary and update country names
for feature in country_geo['features']:
    properties = feature['properties']
    old_name = properties['ADMIN']
    
    if old_name in country_name_changes:
        new_name = country_name_changes[old_name]
        properties['ADMIN'] = new_name

In [14]:
# Gathers the distinct country names found in the 'df' dataframe
df_country_names = df['country'].unique()

# Gathers the 'ADMIN' country names found in the updated 'country_geo' dictionary
geo_country_names = [entry['properties']['ADMIN'] for entry in country_geo['features']]

# Keeps the df names that still have no exact match among the country_geo names
discrepancies = set(df_country_names).difference(geo_country_names)

if not discrepancies:
    print("Country names are consistent between df and updated country_geo.")
else:
    print("Discrepancies between df and updated country_geo:", discrepancies)

Country names are consistent between df and updated country_geo.


In [15]:
# Saves the modified country_geo dictionary as 'country_geo.json' in the Desktop folder
desktop_path = r'C:\Users\ingra\OneDrive\Desktop'
output_file_path = os.path.join(desktop_path, 'country_geo.json')

with open(output_file_path, mode='w') as out_handle:
    json.dump(country_geo, fp=out_handle)

***
## 3) Creating the choropleth map

In [None]:
# Builds an animated choropleth map of happiness scores, with one animation frame per year.
# Rows in 'df_data' are matched to GeoJSON features by country name ('country' <-> 'properties.ADMIN').
fig = px.choropleth(
    data_frame = df_data,                           # Dataframe containing WHR data
    geojson = country_geo,                          # Dictionary containing geometric data
    locations = 'country',                          # Column in 'df_data' containing country names
    featureidkey = 'properties.ADMIN',              # Key in GeoJSON to match with the locations
    color = 'happiness_score',                      # Column in 'df_data' with values to color
    color_continuous_scale = 'Blues',               # Sets the color scheme
    animation_frame = 'year',                       # Column in 'df_data' containing years (for animation)
    title = 'World Happiness Report 2015 - 2023',   # Sets the plot title
    hover_name = 'country',                         # Column in 'df_data' for hover information
)

# Updates the legend title
fig.update_layout(
    coloraxis_colorbar_title = 'Happiness Score',   # Changes the legend title
)


fig.show()

In [None]:
# Defines the output file path on my desktop (the folder path is already absolute, so it is used as-is)
desktop_path = r'C:\Users\ingra\OneDrive\Desktop'
file_path = os.path.join(desktop_path, 'whr_choropleth.html')

# Saves the interactive visualization as an HTML file
pio.write_html(fig, file_path)

***
## 4) Observations 

###### 1. Regions and countries with consistently high happiness scores relative to the rest of the world: the Western Hemisphere, Europe, Saudi Arabia

###### 2. Regions and countries with consistently low happiness scores relative to the rest of the world: Africa, India

***
## 5) Exports

In [16]:
# Exports the 'df_data' dataframe

# df_data.to_csv(r'C:\Users\ingra\OneDrive\Desktop\df_data_WHR3.csv', index=False)