In [None]:
# 1. Setup (Defining Paths)

from pathlib import Path
import pandas as pd
import numpy as np

# Project root: one level up when the kernel was started inside notebooks/,
# otherwise the current working directory itself.
BASE = Path.cwd()
if BASE.name == "notebooks":
    BASE = BASE.parent

# Data and report locations, all derived from the project root.
DATA = BASE / "data"
PROC = DATA / "processed"
REPORTS = BASE / "reports"
FIGS = REPORTS / "figures"

# Create the output folders up front (no error if they already exist).
for p in (PROC, REPORTS, FIGS):
    p.mkdir(parents=True, exist_ok=True)

# Show up to 100 columns when a DataFrame is displayed.
pd.set_option("display.max_columns", 100)

In [None]:
# 2. Loading Merged Data

# Read the merged panel from the processed-data folder.
merged_path = PROC / "merged_panel_1960_2018.csv"
df = pd.read_csv(merged_path)

# Strip surrounding whitespace from the header names so later column lookups match.
df.columns = df.columns.str.strip()

# Print the schema first and leave the preview as the cell's final expression:
# only the last expression of a cell is rendered, so a bare `df.head()` placed
# before `df.info()` is evaluated and then silently discarded.
df.info()
df.head()

In [None]:
# 3. Analysing Missing Data

# Year Interval and Countries
year_min = df["Year"].min()
year_max = df["Year"].max()
n_countries = df["Country"].nunique()
print(f"Years: {year_min}-{year_max} | Countries: {n_countries} | Rows: {len(df)}")

# NaN Rates per Feature (share of missing values, largest first)
na_rates = (
    df[['fertility', 'flfp', 'urban_pop', 'migration']]
    .isna()
    .mean()
    .sort_values(ascending=False)
)
print(na_rates)

# Detecting Completely Empty Rows
# Enumerate every (Country, Year) pair for 1960-2018 and left-join the observed data onto it,
# so pairs that are absent from df appear as rows whose features are all NaN.
full = pd.MultiIndex.from_product(
    [df["Country"].unique(), np.arange(1960, 2019)],
    names=["Country", "Year"],
).to_frame(index=False)
df_full = full.merge(df, on=["Country", "Year"], how="left")

# A row is "completely empty" when all four features are missing at once.
all_features_missing = df_full[['fertility', 'flfp', 'urban_pop', 'migration']].isna().all(axis=1)
missing_rows = df_full[all_features_missing]
print("Completely Empty Rows (All Features NaN):", len(missing_rows))

In [None]:
# 4. Interpolation of Missing Values (Particularly Female Labour Force Participation)

import numpy as np
import pandas as pd

# Locking Country Labels (the original note cites special characters in the names):
# keep the labels exactly as they appear in the data via a fixed category list.
orig_countries = pd.Index(df["Country"].unique())
df_safe = df.copy()
df_safe["Country"] = pd.Categorical(df_safe["Country"], categories=orig_countries, ordered=False)

# Full Grid: every (Country, Year) pair for 1960-2018 as a MultiIndex, applied with reindex
years = np.arange(1960, 2019)
full_index = pd.MultiIndex.from_product([orig_countries, years], names=["Country", "Year"])

# Expand the panel onto the full grid; pairs missing from the data become all-NaN rows.
# The frame stays indexed by (Country, Year) so the per-country interpolation can group on it.
df2 = df_safe.set_index(["Country", "Year"]).reindex(full_index).sort_index()

# Numerical Columns: coerce to numbers, turning unparseable entries into NaN
value_cols = ["fertility", "flfp", "urban_pop", "migration"]
df2[value_cols] = df2[value_cols].apply(pd.to_numeric, errors="coerce")

# Linear Interpolation within each Country, one column at a time.
# limit_direction="both" lets the fill run forwards and backwards from the observed values.
for col in value_cols:
    per_country = df2.groupby(level=0)[col]
    df2[col] = per_country.transform(
        lambda s: s.interpolate(method="linear", limit_direction="both")
    )

# Reset Index: back to a flat table with Country and Year as ordinary columns
df_interp = df2.reset_index()

# Check Country Labels: the set of countries must be unchanged by the reshaping
assert set(df_interp["Country"].unique()) == set(orig_countries), "Country Labels Changed Unexpectedly!"

# NaN Rates After Interpolation (largest first)
na_after = df_interp[value_cols].isna().mean().sort_values(ascending=False)
print("NaN rates after interpolation:\n", na_after)

# Save the Interpolated Data
out_path = PROC / "analysis_base_with_interpolation.csv"
df_interp.to_csv(out_path, index=False, encoding="utf-8-sig")
print("Saved:", out_path)

In [None]:
# 5. Check for Missing Values After Interpolation (Particularly for Countries that Had 0 Data about Female Labour Force Participation)

# Build the missing-value mask once, then derive both the count and the share from it.
nan_mask = df_interp[["fertility", "flfp", "urban_pop", "migration"]].isna()
na_counts = nan_mask.sum()
na_share = nan_mask.mean()

# Last expression of the cell: one row per feature with its NaN count and percentage.
pd.DataFrame({"NaN Count": na_counts, "NaN %": (na_share*100).round(2)})

In [None]:
# 6. Fill the Remaining NaNs (Hierarchical Median Strategy)

import numpy as np
import pandas as pd

# Work on a copy so the interpolated frame stays available for later comparison.
df_filled = df_interp.copy()
fill_cols = ["fertility", "flfp", "urban_pop", "migration"]

for col in fill_cols:
    # Both fallback values are computed from the column before anything is filled.
    year_median = df_filled.groupby("Year")[col].transform("median")
    global_median = df_filled[col].median()

    # First choice: the median of the same year; last resort: the overall median.
    df_filled[col] = df_filled[col].fillna(year_median).fillna(global_median)

# Check: share of NaNs left per column (largest first)
na_after = df_filled[fill_cols].isna().mean().sort_values(ascending=False)
print("NaN Rates After Global-Year Median Fill:\n", na_after)

# Save
out_path = PROC / "analysis_base_with_full_interpolation.csv"
df_filled.to_csv(out_path, index=False, encoding="utf-8-sig")
print("Saved:", out_path)

In [None]:
# 7. Build Pre-Interpolation Panel, then Flag Interpolated vs Median-Filled Rows

import numpy as np
import pandas as pd

# Lock Country Labels and Build Full Grid (No Interpolation)
orig_countries = pd.Index(df["Country"].unique())
years = np.arange(1960, 2019)
grid_index = pd.MultiIndex.from_product([orig_countries, years], names=["Country","Year"])

# Expand the raw data onto the full (Country, Year) grid without touching any values,
# so observations that are missing stay NaN in this panel.
pre_panel = df.copy()
pre_panel["Country"] = pd.Categorical(pre_panel["Country"], categories=orig_countries, ordered=False)
pre_panel = (
    pre_panel
    .set_index(["Country", "Year"])
    .reindex(grid_index)
    .sort_index()
    .reset_index()
)

# Masks:
# - pre_panel: Before Interpolation
# - df_interp: After Interpolation
# - df_filled: After Median Filling

# Check Alignment (Country,Year)
def _key(df_): return df_[["Country","Year"]].astype({"Country":"string","Year":"int64"})
# The flags below compare the three frames position by position, so their
# (Country, Year) row order must be identical.
assert _key(pre_panel).equals(_key(df_interp)), "pre_panel vs df_interp row order mismatch"
assert _key(pre_panel).equals(_key(df_filled)), "pre_panel vs df_filled row order mismatch"

# Flags for "Female Labour Force Participation"
# The interpolation step coerces flfp with pd.to_numeric(errors="coerce") before filling,
# so an unparseable raw entry is treated as missing there. Apply the same coercion here:
# otherwise such an entry would count as "present" in the raw panel and the value that
# replaced it would be labelled as original. For an already-numeric column this is a no-op.
pre_nan         = pd.to_numeric(pre_panel["flfp"], errors="coerce").isna()
post_interp     = df_interp["flfp"].notna()
post_interp_nan = df_interp["flfp"].isna()
post_filled     = df_filled["flfp"].notna()

# Rows that were Interpolated: missing before interpolation, present right after it
flfp_interpolated_flag = (pre_nan & post_interp).astype(int)

# Rows that were Median Filled: still missing after interpolation, present after the median fill
flfp_median_flag = (post_interp_nan & post_filled).astype(int)

# Add Flags to Final DataFrame (a copy, so df_filled itself is left unchanged)
df_flagged = df_filled.copy()
df_flagged["flfp_interpolated"]  = flfp_interpolated_flag
df_flagged["flfp_median_filled"] = flfp_median_flag

# Source Column
def _source_row(row):
    if row["flfp_median_filled"] == 1:
        return "median"
    if row["flfp_interpolated"] == 1:
        return "interpolated"
    return "original"

# Label every row by running the row-level classifier over the two flag columns.
flag_cols = ["flfp_interpolated","flfp_median_filled"]
df_flagged["flfp_fill_source"] = df_flagged[flag_cols].apply(_source_row, axis=1)

# Summary: number of flagged rows per flag, then rows per source label
print("Counts:")
print(df_flagged[flag_cols].sum())
print("\nDistribution of Source:")
print(df_flagged["flfp_fill_source"].value_counts())

In [None]:
# 8. Summary Report: Interpolated vs Median-Filled Rows by Country

import pandas as pd
import matplotlib.pyplot as plt

# Number of years in the 1960-2018 range; used as the denominator for per-country shares.
YEARS_PER_COUNTRY = 2018 - 1960 + 1  # 59

# Interpolated Rows Report by Country: count, share of the year range, largest first
interp_report = df_flagged.groupby("Country", as_index=False)["flfp_interpolated"].sum()
interp_report = interp_report.rename(columns={"flfp_interpolated":"interpolated_rows"})
interp_report["interpolated_share"] = interp_report["interpolated_rows"]/YEARS_PER_COUNTRY
interp_report = interp_report.sort_values("interpolated_rows", ascending=False)

# Median-Filled Rows Report by Country: same layout for the median-fill flag
median_report = df_flagged.groupby("Country", as_index=False)["flfp_median_filled"].sum()
median_report = median_report.rename(columns={"flfp_median_filled":"median_rows"})
median_report["median_share"] = median_report["median_rows"]/YEARS_PER_COUNTRY
median_report = median_report.sort_values("median_rows", ascending=False)

# General Summary: overall totals and their share of all rows
tot_rows   = len(df_flagged)
tot_interp = int(df_flagged["flfp_interpolated"].sum())
tot_median = int(df_flagged["flfp_median_filled"].sum())

print(f"Total Rows: {tot_rows:,}")
print(f"Interpolated Rows: {tot_interp:,}  ({tot_interp/tot_rows:.2%})")
print(f"Median-Filled Rows: {tot_median:,}  ({tot_median/tot_rows:.2%})\n")

# First 20 rows of the interpolation report, with the share rendered as a percentage string
top_interp = interp_report.head(20)[["Country","interpolated_rows","interpolated_share"]].copy()
top_interp["interpolated_share"] = (top_interp["interpolated_share"]*100).round(2).astype(str)+"%"
print("Top 20 Countries by Interpolated Rows:")
print(top_interp.to_string(index=False))

# First 8 rows of the median-fill report, formatted the same way
top_median = median_report.head(8)[["Country","median_rows","median_share"]].copy()
top_median["median_share"] = (top_median["median_share"]*100).round(2).astype(str)+"%"
print("\nTop 8 Countries by Median-Filled Rows:")
print(top_median.to_string(index=False))

# Save both per-country reports as CSV in the reports folder
interp_csv = REPORTS / "flfp_interpolated_by_country.csv"
median_csv = REPORTS / "flfp_median_by_country.csv"

for report, target in ((interp_report, interp_csv), (median_report, median_csv)):
    report.to_csv(target, index=False, encoding="utf-8-sig")

print("\nSaved:", interp_csv)
print("Saved:", median_csv)

# Graphics
def _plot_top_countries(report, rows_col, top_n, title, xlabel, target):
    """Draw, save and show a horizontal bar chart of the first ``top_n`` rows of ``report``.

    ``report`` needs a "Country" column and the numeric column named by ``rows_col``.
    The figure is written to ``target`` at 300 dpi before it is shown.
    """
    top = report.head(top_n)
    fig, ax = plt.subplots(figsize=(8,6))
    ax.barh(top["Country"], top[rows_col])
    # Flip the y-axis so the first row of the report is drawn at the top.
    ax.invert_yaxis()
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    fig.tight_layout()

    # Save Figure
    fig.savefig(target, dpi=300)
    print(f"Saved: {target}")
    plt.show()


file1 = FIGS / "flfp_interpolated_rows_top20.png"
_plot_top_countries(
    interp_report,
    "interpolated_rows",
    20,
    "FLFP – Interpolated Rows by Country (Top 20)",
    "Rows Interpolated (out of 59)",
    file1,
)

file2 = FIGS / "flfp_median_filled_rows_top8.png"
_plot_top_countries(
    median_report,
    "median_rows",
    8,
    "FLFP – Median-Filled Rows by Country (Top 8)",
    "Rows Median-Filled (out of 59)",
    file2,
)

