In [21]:
"""
ana.py  –  Final Python stage for QTM-350 Economic-Development project
Reads the SQL-cleaned table `econ_cleaned` from economic_data.db
and produces plots + regression outputs for the Quarto report.
--------------------------------------------------------------------
Input schema  (econ_cleaned):
  Country Name | Year | GDP | Employment | GDP_Growth
    str          int    float  float        float
Country list already restricted in SQL step:
  China, Japan, South Korea, United States, United Kingdom, Canada
"""

# ============== 0.  SETUP ====================================================
import os
import sqlite3
from contextlib import closing
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pycountry                     # pip install pycountry
import seaborn as sns                # pip install seaborn
import statsmodels.api as sm

# ---- output folders ---------------------------------------------------------
OUT_FIG, OUT_DAT = Path("figures"), Path("processed")
# Create both output directories (including missing parents); an already
# existing directory is not an error.
for out_dir in (OUT_FIG, OUT_DAT):
    out_dir.mkdir(parents=True, exist_ok=True)

# Names of the numeric indicator columns used by the analysis steps.
INDICATORS = ["GDP", "Employment", "GDP_Growth"]

# ============== 1.  LOAD CLEAN PANEL =========================================
# Location of the SQLite database produced by the SQL stage.  The default is
# the original author's path; set the ECON_DB_PATH environment variable to run
# the notebook on another machine without editing this cell.
DB_PATH = os.environ.get(
    "ECON_DB_PATH",
    "/Users/cristianonie/Desktop/QTM350-GDP-Project/script/economic_data.db",
)

# sqlite3.connect() silently creates an empty database when the file does not
# exist, which would only surface later as a confusing "no such table" error.
if not Path(DB_PATH).is_file():
    raise FileNotFoundError(f"SQLite database not found: {DB_PATH}")

# A sqlite3 connection used directly as a context manager only commits or
# rolls back the transaction -- it does NOT close the connection.  closing()
# guarantees the file handle is released once the table has been read.
with closing(sqlite3.connect(DB_PATH)) as con:
    df = pd.read_sql_query("SELECT * FROM econ_cleaned", con)

# Fail fast: every later step assumes at least one row.
if df.empty:
    raise RuntimeError("econ_cleaned table is empty – check SQL pipeline.")

# ============== 2.  ENRICH  – add ISO-3 codes  ===============================
def name_to_iso3(name):
    """Translate a country name into its three-letter ISO (alpha-3) code.

    pycountry's lookup is tried first.  If it raises ``LookupError`` the name
    is checked against a small hand-maintained alias table instead.

    Returns the alpha-3 code, or ``None`` when neither source knows the name.
    """
    # consulted only when the pycountry lookup fails
    fallback_codes = {
        "South Korea": "KOR",
        "United States": "USA",
        "United Kingdom": "GBR",
    }
    try:
        country = pycountry.countries.lookup(name)
    except LookupError:
        return fallback_codes.get(name)
    return country.alpha_3

# Attach the ISO code to every row, then refuse to continue if any country
# name could not be resolved.
df["iso3"] = df["Country Name"].map(name_to_iso3)
iso_missing_mask = df["iso3"].isna()
if iso_missing_mask.any():
    missing = df.loc[iso_missing_mask, "Country Name"].unique()
    raise ValueError(f"ISO-3 lookup failed for: {missing}")

# ============== 3.  BASIC CLEANING ===========================================
# Coerce the indicator columns to numeric; anything unparseable becomes NaN.
df[INDICATORS] = df[INDICATORS].apply(pd.to_numeric, errors="coerce")

# forward-fill within each country to patch occasional NaNs
#
# Order rows chronologically per country first so that "forward" means
# "carry the previous year's value".  Only the indicator columns are filled:
# the earlier groupby(...).apply(lambda g: g.ffill()) form also operated on the
# grouping column itself, which recent pandas versions warn about and newer
# ones exclude from apply() altogether (dropping `iso3` from the result).
# Selecting the columns on the GroupBy and calling .ffill() keeps every other
# column untouched.  Leading NaNs (no earlier year available) stay NaN.
df = df.sort_values(["iso3", "Year"]).reset_index(drop=True)
df[INDICATORS] = df.groupby("iso3")[INDICATORS].ffill()


# ============== 5.  FIXED-EFFECTS REGRESSION ================================
# Only rows with both GDP and Employment can enter the regression.
panel = df.dropna(subset=["GDP", "Employment"]).copy()
if panel.empty:
    raise RuntimeError("No rows with both GDP and Employment – cannot fit regression.")

# np.log() yields -inf/NaN (with nothing more than a RuntimeWarning) for
# values <= 0, which would contaminate the country means computed below.
# Stop with a clear message instead of fitting on corrupted data.
nonpositive_gdp = panel.loc[panel["GDP"] <= 0, "iso3"].unique()
if len(nonpositive_gdp) > 0:
    raise ValueError(f"Non-positive GDP values, cannot take logs for: {nonpositive_gdp}")
panel["log_GDP"] = np.log(panel["GDP"])

# de-mean within each country
panel["empl_w"]   = panel["Employment"] - panel.groupby("iso3")["Employment"].transform("mean")
panel["loggdp_w"] = panel["log_GDP"]    - panel.groupby("iso3")["log_GDP"].transform("mean")

# OLS of de-meaned log GDP on de-meaned employment, with standard errors
# clustered by country.
# NOTE(review): the number of clusters equals the number of countries; with
# that few clusters cluster-robust standard errors can be unreliable -- confirm
# this is acceptable for the report.
X  = sm.add_constant(panel["empl_w"])
mod = sm.OLS(panel["loggdp_w"], X).fit(cov_type="cluster",
                                       cov_kwds={"groups": panel["iso3"]})

# Render the summary once and reuse it for both the notebook and the file.
summary_text = mod.summary().as_text()
print(summary_text)
with open(OUT_DAT / "employment_fe_regression.txt", "w", encoding="utf-8") as f:
    f.write(summary_text)

# ============== 6.  CORRELATION HEAT-MAP ====================================
# Pairwise Pearson correlations between the indicators, rounded to two
# decimals; the table is saved as CSV and drawn as a heat-map.
corr = df[INDICATORS].corr(method="pearson").round(2)
corr.to_csv(OUT_DAT / "indicator_corr.csv")

n_ind = len(corr)
fig, ax = plt.subplots()
heat = ax.imshow(corr, cmap="coolwarm", vmin=-1, vmax=1)
fig.colorbar(heat, ax=ax, label="Pearson r")
ax.set_xticks(range(n_ind))
ax.set_xticklabels(corr.columns, rotation=45, ha="right")
ax.set_yticks(range(n_ind))
ax.set_yticklabels(corr.index)
ax.set_title("Indicator correlation (1990-2023, pooled)")
fig.tight_layout()
fig.savefig(OUT_FIG / "indicator_corr_heatmap.png", dpi=300)
plt.close(fig)

# ============== 7.  EXPORT TIDY PANEL =======================================
# Write the cleaned panel (without the positional index) for downstream use.
panel_csv = OUT_DAT / "economic_dev_panel.csv"
df.to_csv(panel_csv, index=False)
print("🏁  Analysis complete – outputs saved to 'figures/' and 'processed/'.")


  .apply(lambda g: g.ffill())


                            OLS Regression Results                            
Dep. Variable:               loggdp_w   R-squared:                       0.471
Model:                            OLS   Adj. R-squared:                  0.470
Method:                 Least Squares   F-statistic:                     2.913
Date:                Sun, 27 Apr 2025   Prob (F-statistic):              0.163
Time:                        23:48:29   Log-Likelihood:                -48.139
No. Observations:                 495   AIC:                             100.3
Df Residuals:                     493   BIC:                             108.7
Df Model:                           1                                         
Covariance Type:              cluster                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const      -9.541e-17   3.41e-16     -0.280      0.7

In [22]:
# ============== 4.  DESCRIPTIVE PLOTS ========================================
def lineplot(ind_col, ylab, data=None):
    """Save a per-country time-series chart of one indicator.

    One line is drawn per ``iso3`` group with ``Year`` on the x-axis, and the
    figure is written to ``OUT_FIG / f"{ind_col}_trend.png"`` at 300 dpi.  The
    figure is closed after saving, so calling this renders nothing inline.

    Parameters
    ----------
    ind_col : str
        Column plotted on the y-axis; also used in the output file name.
    ylab : str
        Text used for both the figure title and the y-axis label.
    data : pandas.DataFrame, optional
        Frame containing ``iso3``, ``Year`` and ``ind_col``.  Defaults to the
        notebook-level ``df`` so existing two-argument calls keep working.

    Raises
    ------
    KeyError
        If ``ind_col`` is not a column of the frame.
    """
    frame = df if data is None else data
    # Check before a figure is created so a typo does not leave one open.
    if ind_col not in frame.columns:
        raise KeyError(f"{ind_col!r} is not a column of the panel")

    fig, ax = plt.subplots()
    try:
        for iso, country_rows in frame.groupby("iso3"):
            # Sort by year so the line never doubles back when the incoming
            # frame is not already in chronological order.
            country_rows = country_rows.sort_values("Year")
            ax.plot(country_rows["Year"], country_rows[ind_col], label=iso)
        ax.set_title(ylab)
        ax.set_xlabel("Year")
        ax.set_ylabel(ylab)
        ax.legend()
        fig.tight_layout()
        fig.savefig(OUT_FIG / f"{ind_col}_trend.png", dpi=300)
    finally:
        # Always release the figure, even if plotting or saving failed;
        # otherwise open figures accumulate in the kernel.
        plt.close(fig)

lineplot(ind_col="GDP", ylab="GDP per capita (constant 2015 US$)")


In [23]:
lineplot(ind_col="GDP_Growth", ylab="GDP growth rate (%)")


In [24]:
lineplot(ind_col="Employment", ylab="Employment-to-population 15+ (%)")


In [25]:
# ---------- Plot 1: Employment vs log-GDP scatter ----------------------------
# seaborn (`sns`) is imported with the other dependencies in the setup cell.
# Uses `panel` (rows with GDP, Employment and log_GDP) from the regression step.

fig, ax = plt.subplots(figsize=(6, 4))
# Pooled fit line only: the points are drawn per country just below, so
# letting regplot draw its own scatter would plot every observation twice.
sns.regplot(data=panel, x="Employment", y="log_GDP",
            scatter=False, line_kws={"lw": 1.5}, ci=None, ax=ax)
for iso, country_rows in panel.groupby("iso3"):
    ax.scatter(country_rows["Employment"], country_rows["log_GDP"], label=iso, s=18)
ax.set_xlabel("Employment-to-population ratio (%)")
ax.set_ylabel("log GDP per capita (2015 US$)")
ax.set_title("Employment vs log GDP per capita, 1990-2023")
ax.legend(markerscale=1, fontsize=7, frameon=False)
fig.tight_layout()
fig.savefig(OUT_FIG / "scatter_empl_loggdp.png", dpi=300)
plt.close(fig)

# ---------- Plot 2: Country-faceted GDP-growth -------------------------------
# One small panel per country, three per row; y-axes are independent so each
# country's own variation stays visible.
g = sns.FacetGrid(df, col="iso3", col_wrap=3, height=2.2, aspect=1.4,
                  sharey=False, sharex=True)
g.map_dataframe(sns.lineplot, x="Year", y="GDP_Growth")
g.set_axis_labels("", "GDP growth (%)")
g.set_titles("{col_name}")
# `figure` is the supported accessor; `fig` is a deprecated alias.
g.figure.suptitle("GDP-growth trajectories, 1990-2023", y=1.02, fontsize=12)
g.figure.tight_layout()
g.savefig(OUT_FIG / "facet_gdp_growth.png", dpi=300)
plt.close(g.figure)
