In [None]:
# --- Load the raw data ---

from pathlib import Path

import pandas as pd

# Read the three raw inputs: two CSV files and one Excel workbook
esg_df = pd.read_csv("sustainability-economic-performance/data/raw/esg-economic-data.csv")
gdp_df = pd.read_csv("sustainability-economic-performance/data/raw/gdp-inflation-fdi-data.csv")
class_df = pd.read_excel("sustainability-economic-performance/data/raw/country-classification.xlsx")

# Preview the first rows of each frame under its caption
# (captions after the first start with a newline to visually separate the previews)
for caption, raw_frame in (
    ("ESG + other indicators:", esg_df),
    ("\nGDP, Inflation, FDI:", gdp_df),
    ("\nCountry classification:", class_df),
):
    print(caption)
    display(raw_frame.head())

In [None]:
# --- Clean ESG dataset structure ---

# The series code is not needed for the project, so remove it up front
esg_clean = esg_df.drop(columns=["Series Code"])

# Year columns are identified as the columns whose header contains "["
year_cols = [col for col in esg_clean.columns if "[" in col]

# Wide -> long: one row per (country, series, year column), then
#   - rename "Series Name" to "Indicator"
#   - keep only the first 4 characters of the year header and store them as integers
esg_long = (
    esg_clean
    .melt(
        id_vars=["Country Name", "Country Code", "Series Name"],
        value_vars=year_cols,
        var_name="Year",
        value_name="Value",
    )
    .rename(columns={"Series Name": "Indicator"})
    .assign(Year=lambda frame: frame["Year"].str.slice(0, 4).astype(int))
)

# Preview the first 10 rows to check the reshape
esg_long.head(10)

In [None]:
# --- Clean GDP/Inflation/FDI dataset structure ---

# The series code is not needed for the project, so remove it up front
gdp_clean = gdp_df.drop(columns=["Series Code"])

# Year columns are identified as the columns whose header contains "["
year_cols_gdp = [col for col in gdp_clean.columns if "[" in col]

# Wide -> long: one row per (country, series, year column), then
#   - rename "Series Name" to "Indicator"
#   - keep only the first 4 characters of the year header and store them as integers
gdp_long = (
    gdp_clean
    .melt(
        id_vars=["Country Name", "Country Code", "Series Name"],
        value_vars=year_cols_gdp,
        var_name="Year",
        value_name="Value",
    )
    .rename(columns={"Series Name": "Indicator"})
    .assign(Year=lambda frame: frame["Year"].str.slice(0, 4).astype(int))
)

# Preview the first 10 rows to check the reshape
gdp_long.head(10)

In [None]:
# --- Add a Category column to the ESG dataset ---

def assign_category(indicator_name):
    """Map an indicator name to a category label via keyword matching.

    The name is converted with ``str()`` and lower-cased, then checked for
    keyword substrings. Categories are tried in the order Environmental,
    Social, Governance, Economic; the first category with a matching keyword
    is returned.

    Parameters
    ----------
    indicator_name : any
        Indicator name; non-string values (e.g. NaN or None) are converted
        with ``str()`` before matching.

    Returns
    -------
    str
        "Environmental", "Social", "Governance", "Economic", or "Other" when
        no keyword matches.
    """
    text = str(indicator_name).lower()

    # Order matters: an earlier category wins when several keywords match
    keyword_groups = (
        ("Environmental", ("co2", "fossil", "renewable", "methane", "nitrous")),
        ("Social", ("unemployment", "gini", "rights")),
        ("Governance", ("corruption", "political")),
        ("Economic", ("gdp", "expenditure")),
    )

    for category, keywords in keyword_groups:
        if any(keyword in text for keyword in keywords):
            return category

    return "Other"

# Derive a category for every ESG row from its indicator name
esg_categories = esg_long["Indicator"].apply(assign_category)
esg_long["Category"] = esg_categories

# Show each distinct (indicator, category) pair, ordered by indicator name
(
    esg_long[["Indicator", "Category"]]
    .drop_duplicates()
    .sort_values("Indicator")
)

In [None]:
# --- Add a Category column to the gdp-inflation-fdi dataset ---

def assign_economic_category(indicator_name):
    """Label an indicator name as "Economic" or "Other" via keyword matching.

    The name is converted with ``str()`` and lower-cased; it is "Economic" when
    it contains any of the keywords below, otherwise "Other".

    Parameters
    ----------
    indicator_name : any
        Indicator name; non-string values (e.g. NaN or None) are converted
        with ``str()`` before matching.

    Returns
    -------
    str
        "Economic" or "Other".
    """
    text = str(indicator_name).lower()

    economic_keywords = (
        "gdp",
        "inflation",
        "foreign direct investment",
        "research",
        "r&d",
    )

    is_economic = any(keyword in text for keyword in economic_keywords)
    return "Economic" if is_economic else "Other"

# Derive a category for every gdp-inflation-fdi row from its indicator name
gdp_categories = gdp_long["Indicator"].apply(assign_economic_category)
gdp_long["Category"] = gdp_categories

# Show each distinct (indicator, category) pair, ordered by indicator name
(
    gdp_long[["Indicator", "Category"]]
    .drop_duplicates()
    .sort_values("Indicator")
)

In [None]:
from IPython.display import display

# Remove rows with a missing (NaN) indicator name; the category functions turn
# NaN into the string "nan", which matches no keyword, so these are the rows
# that were labelled "Other"
esg_long = esg_long.dropna(subset=["Indicator"])
gdp_long = gdp_long.dropna(subset=["Indicator"])

# Show the indicator/category tables again to confirm the rows are gone
display(
    esg_long[["Indicator", "Category"]].drop_duplicates().sort_values("Indicator"),
    gdp_long[["Indicator", "Category"]].drop_duplicates().sort_values("Indicator"),
)

In [None]:
# --- Merge ESG and gdp-inflation-fdi datasets ---

# Shared column layout for both frames; "Source" records which dataset a row came from
common_cols = ["Country Name", "Country Code", "Indicator", "Year", "Value", "Category", "Source"]

# Label each frame with its origin and put its columns in the shared order
esg_long = esg_long.assign(Source="ESG")[common_cols]
gdp_long = gdp_long.assign(Source="Economic")[common_cols]

# Stack the two frames into one long dataframe with a fresh running index
all_long = pd.concat([esg_long, gdp_long], ignore_index=True)

# Preview the first 15 rows of the combined dataset
all_long.head(15)

In [None]:
# Order rows by country code, then year, then category, and renumber the index
sort_keys = ["Country Code", "Year", "Category"]
all_long = all_long.sort_values(by=sort_keys).reset_index(drop=True)

# Preview the first 20 rows
all_long.head(20)

In [None]:
# --- Prepare country-classification table ---

# Rename the classification headers so they match the merged long dataset
class_column_names = {
    "Economy": "Country Name",
    "Code": "Country Code",
    "Income group": "Income Group",
    "Lending category": "Lending Category",
}

# Columns to keep; the lending category is renamed above but deliberately not retained
class_columns_kept = ["Country Code", "Country Name", "Region", "Income Group"]

class_clean = class_df.rename(columns=class_column_names)[class_columns_kept]

# Preview the first 15 rows
class_clean.head(15)

In [None]:
# --- Merge all datasets into one unique dataset ---

# Left join keeps every row of the long dataset and attaches the classification
# columns by country code. Both frames carry a "Country Name" column, so the
# merge suffixes them: "_x" (from all_long) and "_y" (from class_clean).
panel_long = pd.merge(
    all_long,
    class_clean,
    how="left",
    on="Country Code",
    suffixes=("_x", "_y"),
)

# Preview the first 10 rows
panel_long.head(10)

In [None]:
# Collapse the suffixed country-name columns left by the merge:
# keep the "_x" column under the plain name and discard the "_y" column
has_suffixed_names = "Country Name_x" in panel_long.columns
if has_suffixed_names:
    panel_long = (
        panel_long
        .rename(columns={"Country Name_x": "Country Name"})
        .drop(columns=["Country Name_y"])
    )

# Preview the first 10 rows
panel_long.head(10)

In [None]:
# Make "Value" numeric; entries that cannot be parsed (such as "..") become NaN
numeric_values = pd.to_numeric(panel_long["Value"], errors="coerce")
panel_long["Value"] = numeric_values

# Preview the first 10 rows
panel_long.head(10)

In [None]:
# Save the full (unfiltered) merged dataset as CSV, without the index column
output_path = "sustainability-economic-performance/data/processed/panel_full_unfiltered.csv"

# to_csv does not create missing folders, so make sure the output directory
# exists first (e.g. on a fresh checkout where "processed" has not been created)
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

panel_long.to_csv(output_path, index=False)