In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# ===================== Paths =====================
RAW_FILE = Path("../data_raw/Core_economic_indicators.csv")
OUT_FILE = Path("../data_clean/economic_clean.csv")

# ===================== Indicators to keep =====================
# Raw "Series Name" -> column name in the cleaned wide table.
ECONOMIC_SERIES = {
    "GDP (current US$)": "gdp_usd",
    "GDP per capita (current US$)": "gdp_per_capita_usd",
    "GDP growth (annual %)": "gdp_growth_pct",
    "Inflation, consumer prices (annual %)": "inflation_pct",
    "Trade (% of GDP)": "trade_pct_gdp",
    "Exports of goods and services (% of GDP)": "exports_pct_gdp",
    "Imports of goods and services (% of GDP)": "imports_pct_gdp",
}


def clean_economic_indicators(eco_raw: pd.DataFrame, max_fill_years: int = 2) -> pd.DataFrame:
    """Reshape a raw indicator export into one row per country and year.

    Steps: normalise missing markers and identifier column names, melt the
    year columns to long form, keep only the series listed in
    ``ECONOMIC_SERIES``, pivot to one column per indicator, rescale GDP to
    millions, and interpolate missing years within each country.

    Args:
        eco_raw: Frame with "Country Name", "Country Code", "Series Name",
            "Series Code" and one column per year (labelled like
            "2005 [YR2005]" or "2005"). ".." and "" mark missing values.
        max_fill_years: Maximum number of consecutive missing years that the
            interpolation fills in one run.

    Returns:
        Frame with columns ``country``, ``country_code``, ``year`` and one
        column per indicator found in the input.

    Raises:
        ValueError: If no year columns or none of the wanted series are found.
    """
    id_names = {
        "Country Name": "country_name",
        "Country Code": "country_code",
        "Series Name": "series_name",
        "Series Code": "series_code",
    }
    key_cols = ["country_name", "country_code", "year"]

    # ---- 1) Normalise missing markers and identifier names ----
    eco = eco_raw.replace({"..": np.nan, "": np.nan}).rename(columns=id_names)

    # ---- 2) Identify year columns ----
    year_cols = [c for c in eco.columns if ("[YR" in c) or c.strip().isdigit()]
    if not year_cols:
        raise ValueError("No year columns found in the raw economic data")

    # ---- 3) Melt to long and extract the numeric year ----
    eco_long = eco.melt(
        id_vars=list(id_names.values()),
        value_vars=year_cols,
        var_name="year_raw",
        value_name="value",
    )
    eco_long["year"] = (
        eco_long["year_raw"].str.extract(r"(\d{4})", expand=False).astype(int)
    )
    eco_long = eco_long.drop(columns=["year_raw"])

    # ---- 4) Keep the wanted indicators ----
    # .copy() makes the filtered rows an independent frame, so the column
    # assignments below do not write into a slice of the melted frame.
    eco_long = eco_long[eco_long["series_name"].isin(list(ECONOMIC_SERIES))].copy()
    if eco_long.empty:
        raise ValueError("None of the wanted series are present in the raw economic data")
    eco_long["indicator"] = eco_long["series_name"].map(ECONOMIC_SERIES)
    eco_long["value"] = pd.to_numeric(eco_long["value"], errors="coerce")

    # ---- 5) Pivot to wide: one column per indicator ----
    eco_wide = (
        eco_long.pivot_table(
            index=key_cols,
            columns="indicator",
            values="value",
            aggfunc="mean",
        )
        .rename_axis(columns=None)  # drop the leftover "indicator" axis label
        .reset_index()
    )

    # ---- 6) Unit conversions ----
    # Only total GDP is rescaled (to millions of USD). GDP per capita stays in
    # plain USD: dividing it by one million would turn e.g. 50,000 into 0.05.
    if "gdp_usd" in eco_wide.columns:
        eco_wide["gdp_usd"] = eco_wide["gdp_usd"] / 1_000_000

    # ---- 7) Missing values ----
    value_cols = [c for c in eco_wide.columns if c not in key_cols]
    eco_wide = eco_wide.sort_values(["country_name", "year"]).reset_index(drop=True)

    # Linear interpolation per country, filling at most `max_fill_years`
    # consecutive missing years per run. With pandas' default forward
    # direction this also fills the start of longer gaps and carries the last
    # observation forward; pass limit_area="inside" to restrict it to
    # interior gaps.
    eco_wide[value_cols] = (
        eco_wide.groupby("country_name", group_keys=False)[value_cols]
        .transform(lambda s: s.interpolate(limit=max_fill_years))
    )

    # No clamping to [0, 100]: GDP growth and inflation are legitimately
    # negative (recession, deflation) and inflation can exceed 100 %.

    # ---- 8) Final naming and de-duplication ----
    eco_clean = eco_wide.rename(columns={"country_name": "country"})
    return eco_clean.drop_duplicates(subset=["country", "year"])


# Inside a notebook kernel __name__ is "__main__", so this runs as usual; the
# guard only lets the cell be imported (e.g. by a test) without touching disk.
if __name__ == "__main__":
    OUT_FILE.parent.mkdir(parents=True, exist_ok=True)

    eco_raw = pd.read_csv(RAW_FILE)
    eco_clean = clean_economic_indicators(eco_raw)
    eco_clean.to_csv(OUT_FILE, index=False)

    print(f"Saved cleaned economic data to {OUT_FILE}")
    print("Shape:", eco_clean.shape)
    print("Missing values per column:\n", eco_clean.isna().sum())


Saved cleaned economic data to ..\data_clean\economic_clean.csv
Shape: (625, 10)
Missing values per column:
 indicator
country                0
country_code           0
year                   0
exports_pct_gdp       23
gdp_growth_pct         1
gdp_per_capita_usd     0
gdp_usd                0
imports_pct_gdp       23
inflation_pct         31
trade_pct_gdp         23
dtype: int64


In [None]:
import pandas as pd
import warnings

# Suppress warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# ===== Paths =====
raw_path = "../data_raw/Employment_Unemployment.csv"  # change path if needed
out_path = "../data_clean/employment_clean.csv"


def clean_employment_indicators(df_raw: pd.DataFrame) -> pd.DataFrame:
    """Turn a raw employment indicator export into a tidy long table.

    Steps: rename the identifier columns, drop rows that carry no
    country/indicator identifiers, melt the year columns to long form, fill
    missing values per (country, indicator) series, convert "%" indicators to
    fractions, de-duplicate and sort.

    Args:
        df_raw: Frame with "Country Name", "Country Code", "Series Name",
            "Series Code" and one column per year labelled like
            "2005 [YR2005]".

    Returns:
        Frame with columns ``country``, ``country_code``, ``indicator``,
        ``indicator_code``, ``year`` (int) and ``value``, sorted by country,
        year and indicator.
    """
    id_names = {
        "Country Name": "country",
        "Country Code": "country_code",
        "Series Name": "indicator",
        "Series Code": "indicator_code",
    }
    series_keys = ["country", "indicator"]

    # ===== 1. Rename key columns for consistency =====
    df = df_raw.rename(columns=id_names)

    # ===== 2. Drop rows without identifiers =====
    # Blank separator rows and footer notes have no country code or series
    # name; left in, they end up in the cleaned file as all-NaN rows for every
    # year.
    df = df.dropna(subset=["country", "country_code", "indicator"])

    # ===== 3. Detect year columns =====
    # Any "<year> [YR<year>]" label is accepted, not only years starting
    # with "20", so earlier years are not silently dropped.
    year_of = {
        c: int(c.split()[0])
        for c in df.columns
        if isinstance(c, str) and "[YR" in c and c.split()[0].isdigit()
    }
    year_cols = sorted(year_of, key=year_of.get)  # chronological order

    # ===== 4. Reshape to long format =====
    df_long = df.melt(
        id_vars=list(id_names.values()),
        value_vars=year_cols,
        var_name="year",
        value_name="value",
    )
    df_long["year"] = df_long["year"].map(year_of)
    # Anything that is not a number becomes NaN so the fills below operate on
    # a float column.
    df_long["value"] = pd.to_numeric(df_long["value"], errors="coerce")

    # ===== 5. Handle missing values =====
    # Per series, in year order: carry a value one year forward, then one
    # year backward, then linearly interpolate whatever is still missing.
    # limit_direction="both" also fills leading and trailing gaps; a series
    # with no observations at all stays NaN.
    df_long["value"] = df_long.groupby(series_keys)["value"].transform(
        lambda s: s.ffill(limit=1)
        .bfill(limit=1)
        .interpolate(method="linear", limit_direction="both")
    )

    # ===== 6. Convert % values to decimals =====
    # regex=False: "%" is matched as a literal character.
    percent_mask = df_long["indicator"].str.contains("%", regex=False, na=False)
    df_long.loc[percent_mask, "value"] = df_long.loc[percent_mask, "value"] / 100

    # ===== 7. Remove duplicates, 8. sort for neatness =====
    df_long = df_long.drop_duplicates(subset=["country", "year", "indicator"])
    return df_long.sort_values(["country", "year", "indicator"]).reset_index(drop=True)


# Inside a notebook kernel __name__ is "__main__", so this runs as usual; the
# guard only lets the cell be imported (e.g. by a test) without touching disk.
if __name__ == "__main__":
    # ===== Load =====
    df = pd.read_csv(raw_path, na_values=[".."])  # ".." → NaN

    # ===== Clean =====
    df_long = clean_employment_indicators(df)

    # ===== Save cleaned file =====
    df_long.to_csv(out_path, index=False)

    # ===== Summary =====
    print(f"Cleaned employment data saved to {out_path}")
    print("Shape:", df_long.shape)
    print("Missing values per column:\n", df_long.isna().sum())


✅ Cleaned employment data saved to ../data_clean/employment_clean.csv
Shape: (11950, 6)
Missing values per column:
 country            25
country_code       75
indicator          75
indicator_code     75
year                0
value             150
dtype: int64


In [None]:
import pandas as pd
import warnings

# Suppress warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# ===== Paths =====
raw_path = "../data_raw/Social_and_welfare.csv"  # path to uploaded file
out_path = "../data_clean/social_welfare_clean.csv"


def clean_social_welfare_indicators(df_raw: pd.DataFrame) -> pd.DataFrame:
    """Turn a raw social & welfare indicator export into a tidy long table.

    Steps: rename the identifier columns, drop rows that carry no
    country/indicator identifiers, melt the year columns to long form, fill
    missing values per (country, indicator) series, convert "%" indicators to
    fractions, de-duplicate and sort.

    Args:
        df_raw: Frame with "Country Name", "Country Code", "Series Name",
            "Series Code" and one column per year labelled like
            "2005 [YR2005]".

    Returns:
        Frame with columns ``country``, ``country_code``, ``indicator``,
        ``indicator_code``, ``year`` (int) and ``value``, sorted by country,
        year and indicator.
    """
    id_names = {
        "Country Name": "country",
        "Country Code": "country_code",
        "Series Name": "indicator",
        "Series Code": "indicator_code",
    }
    series_keys = ["country", "indicator"]

    # ===== 1. Rename key columns for consistency =====
    tidy = df_raw.rename(columns=id_names)

    # ===== 2. Drop rows without identifiers =====
    # Blank separator rows and footer notes have no country code or series
    # name; left in, they end up in the cleaned file as all-NaN rows for every
    # year.
    tidy = tidy.dropna(subset=["country", "country_code", "indicator"])

    # ===== 3. Detect year columns =====
    # Any "<year> [YR<year>]" label is accepted, not only years starting
    # with "20", so earlier years are not silently dropped.
    year_of = {
        label: int(label.split()[0])
        for label in tidy.columns
        if isinstance(label, str) and "[YR" in label and label.split()[0].isdigit()
    }
    year_cols = sorted(year_of, key=year_of.get)  # chronological order

    # ===== 4. Reshape to long format =====
    df_long = tidy.melt(
        id_vars=list(id_names.values()),
        value_vars=year_cols,
        var_name="year",
        value_name="value",
    )
    df_long["year"] = df_long["year"].map(year_of)
    # Anything that is not a number becomes NaN so the fills below operate on
    # a float column.
    df_long["value"] = pd.to_numeric(df_long["value"], errors="coerce")

    # ===== 5. Handle missing values =====
    # Per series, in year order: carry a value one year forward, then one
    # year backward, then linearly interpolate whatever is still missing.
    # limit_direction="both" also fills leading and trailing gaps; a series
    # with no observations at all stays NaN.
    df_long["value"] = df_long.groupby(series_keys)["value"].transform(
        lambda s: s.ffill(limit=1)
        .bfill(limit=1)
        .interpolate(method="linear", limit_direction="both")
    )

    # ===== 6. Convert percentages to decimals =====
    # regex=False: "%" is matched as a literal character.
    percent_mask = df_long["indicator"].str.contains("%", regex=False, na=False)
    df_long.loc[percent_mask, "value"] = df_long.loc[percent_mask, "value"] / 100

    # ===== 7. Remove duplicates, 8. sort for neatness =====
    df_long = df_long.drop_duplicates(subset=["country", "year", "indicator"])
    return df_long.sort_values(["country", "year", "indicator"]).reset_index(drop=True)


# Inside a notebook kernel __name__ is "__main__", so this runs as usual; the
# guard only lets the cell be imported (e.g. by a test) without touching disk.
if __name__ == "__main__":
    # ===== Load =====
    df = pd.read_csv(raw_path, na_values=["..", "NA", "NaN"])

    # ===== Clean =====
    df_long = clean_social_welfare_indicators(df)

    # ===== Save cleaned file =====
    df_long.to_csv(out_path, index=False)

    # ===== Summary =====
    print(f"Cleaned Social & Welfare data saved to {out_path}")
    print("Shape:", df_long.shape)
    print("Missing values per column:\n", df_long.isna().sum())


✅ Cleaned Social & Welfare data saved to ../data_clean/social_welfare_clean.csv
Shape: (5075, 6)
Missing values per column:
 country            25
country_code       75
indicator          75
indicator_code     75
year                0
value             175
dtype: int64
