In [None]:
# 1. Setup

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import math

# Paths
# Project root: if the kernel was started inside a folder named "notebooks",
# step up one level; otherwise treat the current working directory as the root.
cwd = Path.cwd()
BASE = cwd.parent if cwd.name == "notebooks" else cwd

# Input data tree.
DATA = BASE / "data"
PROC = DATA / "processed"

# Output tree for reports and saved figures.
REPORTS = BASE / "reports"
FIGS = REPORTS / "figures"

# Create any missing directory (and its parents); existing ones are left alone.
for out_dir in (PROC, REPORTS, FIGS):
    out_dir.mkdir(parents=True, exist_ok=True)

print("Directories Ready.")

In [None]:
# 2. Load Processed Data
# Read the processed CSV from the PROC directory into `df` and print a short
# summary: file location, frame shape, span of the "Year" column and the
# number of distinct values in "Country".

file_path = PROC / "analysis_base_with_full_interpolation.csv"
df = pd.read_csv(file_path)

first_year, last_year = df["Year"].min(), df["Year"].max()
country_count = df["Country"].nunique()

print(f"Loaded: {file_path}")
print(f"Shape: {df.shape}")
print(f"Years: {first_year} - {last_year} | Countries: {country_count}")

# Last expression of the cell: rich preview of the first rows.
df.head()

In [None]:
# 3. Check
# Fraction of missing values in every column of `df`, largest first;
# printed as percentages rounded to two decimals.

na_rates = df.isnull().mean().sort_values(ascending=False)
na_percent = na_rates.mul(100).round(2)

print("NaN Rates by Column:")
print(na_percent)

In [None]:
# 4. Correlation Analysis
# Pairwise correlations between the four indicator columns, computed over all
# rows of `df` at once (no grouping), printed as a table and drawn as an
# annotated heatmap that is also written to the figures directory.

cols = ["fertility", "flfp", "urban_pop", "migration"]
corr = df.loc[:, cols].corr()

print("Correlation Matrix:")
print(corr)

# Explicit figure/axes handles instead of the implicit pyplot "current figure".
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Correlation Matrix (1960–2018)")
fig.tight_layout()

# Save Figure
out_path = FIGS / "correlation_matrix.png"
fig.savefig(out_path, dpi=300)
print(f"Saved: {out_path}")

plt.show()

In [None]:
# 5. Within-Country Correlation Analysis

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# For every country, correlate its own "flfp" series with its own "urban_pop"
# series (Series.corr, default method), using only rows where both are present.
#
# The value columns are selected *after* groupby so the frame handed to the
# lambda does not contain the grouping column. Calling .apply on a groupby that
# still carries "Country" raises a DeprecationWarning in pandas >= 2.2 and
# changes behaviour in pandas 3; this form gives the same result on old and
# new versions.
value_cols = ["flfp", "urban_pop"]
within_corrs = (
    df[["Country", *value_cols]]
      .dropna(subset=value_cols)
      .groupby("Country")[value_cols]
      .apply(lambda g: g["flfp"].corr(g["urban_pop"]))
      .rename("within_corr")
      .reset_index()
      # Countries whose correlation is undefined (NaN) are excluded.
      .dropna(subset=["within_corr"])
)

# Specific Outputs
avg_corr = within_corrs["within_corr"].mean()
med_corr = within_corrs["within_corr"].median()
n_countries = len(within_corrs)
print(f"Countries with Valid Within-Corr: {n_countries}")
print(f"Average Within-Country Correlation: {avg_corr:.3f}")
print(f"Median Within-Country Correlation: {med_corr:.3f}")

# Top/Bottom 10 Countries by Within-Country Correlation
print("\nTop 10 Countries by Within-Country Correlation:")
print(within_corrs.sort_values("within_corr", ascending=False).head(10).to_string(index=False))

print("\nBottom 10 Countries by Within-Country Correlation:")
print(within_corrs.sort_values("within_corr", ascending=True).head(10).to_string(index=False))

# Histogram of Within-Country Correlations
# Explicit figure/axes handles instead of the implicit pyplot "current figure".
fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(within_corrs["within_corr"], bins=20)
ax.set_title("Distribution of Within-Country FLFP–UrbanPop Correlations")
ax.set_xlabel("Correlation Coefficient (Within-Country)")
ax.set_ylabel("Frequency")
fig.tight_layout()

# Save Figure
out_path = FIGS / "within_country_correlation_distribution.png"
fig.savefig(out_path, dpi=300)
print(f"Saved: {out_path}")

plt.show()

Interpretation (Summary of Correlations)

1. Global (Pooled) Correlations

Fertility and Urban Population show a strong negative correlation.
→ Countries with higher urbanization tend to have lower fertility rates.

Fertility and Female Labour Force Participation are moderately negatively correlated,
suggesting that as more women join the workforce, fertility declines.

Migration has no clear global relationship with the other variables.

2. Within-Country Correlations (Over Time)

Within countries, the negative link between Fertility and Urbanization becomes even stronger.
→ As nations urbanize over time, fertility tends to fall.

Fertility and Female Labour Participation remain negatively related.

Urbanization and Female Labour Participation are positively related, suggesting that social and economic development often progress together.

3. Between-Country Correlations

Countries that are more urbanized and have higher female labour participation generally have lower fertility rates.

These patterns reflect long-term structural differences between developed and developing regions.

4. Summary

Overall, the data show a consistent pattern:
urbanization and female workforce participation are each linked to declining fertility — across countries as well as within them — while migration plays a minimal role.