In [1]:
import pandas as pd
!pip install pycountry_convert
import pycountry_convert as pc





## Data cleaning for the years 2015 to 2019

In [2]:
years = [2015,2016,2017,2018,2019]

params = ['rank','GDP','score','health','country']
param_dict = {"rank" : "Happiness_Rank", "GDP" : "Economy(GDP)", "score": "Happiness_Score",
              "health": "Health(Life Expectancy)", "country": "Country", "Year": "Year"}

cleaned_dfs = []

def renaming_fun(x):
  for param in params:
    if param.lower() in x.lower():
        return param_dict[param]

path = "Datasets/"
for year in years:
  new_path = path+str(year)+".csv"
  df = pd.read_csv(new_path)
  cols = list(df.columns)
  df = df.rename(columns=renaming_fun)
  df["Year"] = year
  list1 = list(param_dict.values())
  df1 = df[list1]
  cleaned_dfs.append(df1)

## Data cleaning for the years 2020 and 2021

In [3]:
def renaming_fun(x):
  for param in params:
    if param.lower() == x.lower():
        return param_dict[param]

new_year = [2020,2021]
param_dict = {"Explained by: Log GDP per capita" : "Economy(GDP)", "Ladder score": "Happiness_Score",
              "Explained by: Healthy life expectancy": "Health(Life Expectancy)", "Country name": "Country", "Year": "Year"}

for year in new_year:
  new_path = path+str(year)+".csv"
  df = pd.read_csv(new_path)
  df = df.rename(columns = param_dict)
  df['Happiness_Rank'] = range(1, 1+len(df))
  df["Year"] = year
  list1 = list(param_dict.values())+['Happiness_Rank']
  print(list1)
  df1 = df[list1]
  cleaned_dfs.append(df1)
fin_df = pd.concat(cleaned_dfs)
fin_df

['Economy(GDP)', 'Happiness_Score', 'Health(Life Expectancy)', 'Country', 'Year', 'Happiness_Rank']
['Economy(GDP)', 'Happiness_Score', 'Health(Life Expectancy)', 'Country', 'Year', 'Happiness_Rank']


Unnamed: 0,Happiness_Rank,Economy(GDP),Happiness_Score,Health(Life Expectancy),Country,Year
0,1,1.39651,7.587,0.94143,Switzerland,2015
1,2,1.30232,7.561,0.94784,Iceland,2015
2,3,1.32548,7.527,0.87464,Denmark,2015
3,4,1.45900,7.522,0.88521,Norway,2015
4,5,1.32629,7.427,0.90563,Canada,2015
...,...,...,...,...,...,...
144,145,0.45100,3.512,0.00700,Lesotho,2021
145,146,1.09900,3.467,0.34000,Botswana,2021
146,147,0.36400,3.415,0.40700,Rwanda,2021
147,148,0.45700,3.145,0.24300,Zimbabwe,2021


## Fetching relevant continents from country

In [4]:
def get_continent(country_name):
  try:
    country_code = pc.country_name_to_country_alpha2(country_name, cn_name_format="default")
    continent_code = pc.country_alpha2_to_continent_code(country_code)
    continents = {
        'NA': 'North America',
        'SA': 'South America', 
        'AS': 'Asia',
        'OC': 'Australia',
        'AF': 'Africa',
        'EU': 'Europe'
    }
    return(continents[continent_code])
  except:
    if country_name == "Congo (Brazzaville)" or country_name == "Congo (Kinshasa)":
      return "Africa"
    else:
      return(country_name)

In [5]:
countries = fin_df["Country"]
final_country_list = []
for country in countries:
  final_country_list.append(get_continent(country))
fin_df["Region"] = final_country_list
set(fin_df["Region"])

{'Africa',
 'Asia',
 'Australia',
 'Europe',
 'Hong Kong S.A.R. of China',
 'Hong Kong S.A.R., China',
 'Kosovo',
 'North America',
 'North Cyprus',
 'Palestinian Territories',
 'Somaliland Region',
 'Somaliland region',
 'South America',
 'Taiwan Province of China',
 'Trinidad & Tobago'}

## Manually labelling few country's continents which were not recognized by pycountry_convert

In [6]:
fin_df["Region"]=fin_df["Region"].str.replace("Hong Kong S.A.R., China","Asia")
fin_df["Region"]=fin_df["Region"].str.replace("Hong Kong S.A.R. of China","Asia")
fin_df["Region"]=fin_df["Region"].str.replace("Kosovo","Europe")
fin_df["Region"]=fin_df["Region"].str.replace("North Cyprus","Europe")
fin_df["Region"]=fin_df["Region"].str.replace("Palestinian Territories","Asia")
fin_df["Region"]=fin_df["Region"].str.replace("Somaliland Region","Africa")
fin_df["Region"]=fin_df["Region"].str.replace("Somaliland region","Africa")
fin_df["Region"]=fin_df["Region"].str.replace("Taiwan Province of China","Asia")
fin_df["Region"]=fin_df["Region"].str.replace("Trinidad & Tobago","South America")
print(set(fin_df["Region"]))

{'Europe', 'Africa', 'South America', 'North America', 'Asia', 'Australia'}


  fin_df["Region"]=fin_df["Region"].str.replace("Hong Kong S.A.R., China","Asia")
  fin_df["Region"]=fin_df["Region"].str.replace("Hong Kong S.A.R. of China","Asia")


## Exporting the final dataframe

In [7]:
fin_df.to_csv("Datasets/World_Happiness_Ranking_2015-2021.csv",index=False)