In [None]:
import pandas as pd
import re
from pathlib import Path

# Build a tidy (Country, Year, HDI) table for 1990-2018 from the raw HDR export
# and save it as CSV in the processed-data folder.

# 1) Project folders and raw HDR file.
#    Paths are derived from the notebook's working directory ("..") exactly like the
#    other cells of this notebook, instead of a machine-specific absolute drive path.
BASE_DIR = Path("..").resolve()
RAW_DIR = BASE_DIR / "data" / "raw"
PROC_DIR = BASE_DIR / "data" / "processed"
PROC_DIR.mkdir(parents=True, exist_ok=True)

df = pd.read_csv(RAW_DIR / "HDR25.csv", encoding="latin1")

# 2) The first column of the file is used as the country identifier.
#    NOTE(review): assumes the first column really identifies the country - confirm
#    against the CSV header.
country_col = df.columns[0]

# 3) Keep only labels of the exact form "hdi_<year>" (case-insensitive), mapping each
#    label to its year string, e.g. "hdi_1990" -> "1990".
#    A bare "hdi_" prefix test combined with "contains a year" would also accept labels
#    such as "hdi_rank_2023" or "hdi_f_1990" if they are present, which would put several
#    different series into the single "HDI" column for the same country and year.
hdi_year_pattern = re.compile(r"hdi_((?:19|20)\d{2})", flags=re.IGNORECASE)
col_map = {}
for c in df.columns:
    # str() because column labels are not guaranteed to be strings
    year_match = hdi_year_pattern.fullmatch(str(c).strip())
    if year_match:
        col_map[c] = year_match.group(1)
hdi_cols = list(col_map)

# Fail loudly instead of melting nothing (or everything) when the layout is unexpected.
if not hdi_cols:
    raise ValueError("No columns of the form 'hdi_<year>' were found in the HDR file.")

# 4) Wide -> long, 5) derive the integer Year from the original column label,
# 6) rename the identifier column, 7) keep 1990-2018, 8) drop rows without an HDI value.
df_long = (
    df.melt(
        id_vars=[country_col],
        value_vars=hdi_cols,
        var_name="hdi_col",
        value_name="HDI",
    )
    .assign(Year=lambda frame: frame["hdi_col"].map(col_map).astype(int))
    .rename(columns={country_col: "Country"})
    .loc[lambda frame: frame["Year"].between(1990, 2018)]
    .dropna(subset=["HDI"])
)

# 9) Only the three columns that are needed, in a stable order.
df_hdi_long = df_long[["Country", "Year", "HDI"]].sort_values(["Country", "Year"])

# 10) Save.
out_path = PROC_DIR / "hdi_1990_2018_long.csv"
df_hdi_long.to_csv(out_path, index=False)

print("Saved:", out_path)
print("Rows:", len(df_hdi_long))
print(df_hdi_long.head(10))


In [None]:
# Write the long-format net-migration table to the processed-data folder.
# Relies on `PROC_DIR` and `df_mig_long`, which are created in other cells of this
# notebook; those cells must have been executed first.
out_path = PROC_DIR.joinpath("net_migration_1960_2018_long.csv")
df_mig_long.to_csv(out_path, index=False)  # index=False: the row index is not written
print(
    "Saved:", out_path.exists(),
    "| Rows:", len(df_mig_long),
    "| File:", out_path,
)


In [None]:
# Two row filters applied to `df_mig_long` (which must already exist in the kernel):
#   1. keep rows whose "Country Code" is exactly three uppercase A-Z letters
#      (missing codes count as a non-match because of na=False);
#   2. drop rows whose "NetMigration" value is missing.
df_mig_long = (
    df_mig_long
    .loc[lambda frame: frame["Country Code"].str.fullmatch(r"[A-Z]{3}", na=False)]
    .dropna(subset=["NetMigration"])
)


In [None]:
# Reshape the wide migration table (`df_mig`, one column per year) into long format:
# one row per (id columns, Year) with the value in "NetMigration".
# Requires `df_mig` and `id_cols` from the column-selection cell.
df_mig_long = (
    df_mig
    .melt(id_vars=id_cols, var_name="Year", value_name="NetMigration")
    .assign(
        # former column labels -> integer years
        Year=lambda frame: frame["Year"].astype(int),
        # anything that cannot be parsed as a number becomes NaN
        NetMigration=lambda frame: pd.to_numeric(frame["NetMigration"], errors="coerce"),
    )
)


In [None]:
# Reduce the raw migration frame (`df_mig_raw`, loaded in another cell) to the two
# identifier columns plus the year columns 1960..2018 that actually exist in it.
id_cols = ["Country Name", "Country Code"]
year_cols = list(map(str, range(1960, 2019)))  # "1960" ... "2018" (2019 is exclusive)
available_years = [label for label in year_cols if label in df_mig_raw.columns]

df_mig = df_mig_raw.loc[:, [*id_cols, *available_years]].copy()
df_mig.head()


In [None]:
# Load the raw net-migration CSV (`migration_csv` is defined in the paths cell).
# The first 4 lines of the file are skipped; per the original author's note they
# hold notes rather than data.
df_mig_raw = pd.read_csv(filepath_or_buffer=migration_csv, skiprows=4)

# Quick look: frame dimensions and the first eight column labels.
(df_mig_raw.shape, df_mig_raw.columns[:8])


In [None]:
from pathlib import Path
import pandas as pd

# Project folders, resolved from the parent of the current working directory.
BASE_DIR = Path("..").resolve()
DATA_DIR = BASE_DIR.joinpath("data")
RAW_DIR = DATA_DIR.joinpath("raw")
PROC_DIR = DATA_DIR.joinpath("processed")

# Make sure the output folder exists (no error if it is already there).
PROC_DIR.mkdir(parents=True, exist_ok=True)

# Raw net-migration file; print whether it is present so a wrong path is noticed early.
migration_csv = RAW_DIR.joinpath("SM.POP.NETM.csv")
print("Exists:", migration_csv.exists(), "|", migration_csv)


In [None]:
# Write the long-format urbanization table to the processed-data folder.
# Relies on `PROC_DIR` and on `df_long` as produced by the urbanization wide->long cell.
#
# `df_long` is also the name used by the HDI and fertility pipelines in this notebook,
# so depending on which cell ran last it may hold a different table. Check the schema
# before writing so the wrong data is never saved under the urbanization file name.
if "UrbanPercent" not in df_long.columns:
    raise ValueError(
        "df_long does not contain an 'UrbanPercent' column; "
        "run the urbanization wide->long cell before saving."
    )

out_path = PROC_DIR / "urbanization_1960_2018_long.csv"
df_long.to_csv(out_path, index=False)
print("Saved:", out_path.exists(), "| File:", out_path)


In [None]:
# Turn the country-level urbanization table (`countries`, built in another cell) into
# long format with columns "Country Name", "UN_Country_Code", "Year", "UrbanPercent".

# Year column labels for 1960..2018: prefer the integer label, otherwise fall back to
# the float label (e.g. 1960.0); years present under neither label are left out.
year_cols = [
    (y if y in countries.columns else float(y))
    for y in range(1960, 2019)
    if y in countries.columns or float(y) in countries.columns
]

id_cols = ["Country Name", "UN_Country_Code"]
wide = countries.loc[:, id_cols + year_cols].copy()

# Wide -> long, then fix dtypes and drop rows whose metric is missing.
df_long = (
    wide
    .melt(id_vars=id_cols, var_name="Year", value_name="UrbanPercent")
    .assign(
        Year=lambda frame: frame["Year"].astype(int),
        # unparseable values become NaN and are removed by the dropna below
        UrbanPercent=lambda frame: pd.to_numeric(frame["UrbanPercent"], errors="coerce"),
    )
    .dropna(subset=["UrbanPercent"])
)

(df_long.head(), df_long.shape)


In [None]:
# Keep rows that have a numeric UN country code
df_un = df_un[~df_un["UN_Country_Code"].isna()].copy()
df_un["UN_Country_Code"] = pd.to_numeric(df_un["UN_Country_Code"], errors="coerce")

# Keep only country-level rows (exclude regional aggregates)
countries = df_un[df_un["UN_Country_Code"].between(0, 899, inclusive="both")].copy()

print("Rows (all):", len(df_un), "| Countries-only:", len(countries))
countries[["Country Name", "UN_Country_Code"]].head(5)


In [None]:
import pandas as pd

# Read the "Data" sheet of the UN workbook (`urban_xls`, defined in the paths cell)
# without assuming where the header row is, then locate that row and build `df_un`.
raw = pd.read_excel(urban_xls, sheet_name="Data", header=None)

# The header row is the first row that has a cell equal to 'Index' and a cell
# containing 'Region, subregion, country or area'. Rows are converted to text lazily,
# so scanning stops at the first match.
row_texts = (raw.iloc[pos].astype(str).tolist() for pos in range(len(raw)))
hdr_idx = next(
    (
        pos
        for pos, cells in enumerate(row_texts)
        if "Index" in cells
        and any("Region, subregion, country or area" in cell for cell in cells)
    ),
    None,  # no matching row
)

print("Header row index:", hdr_idx)
assert hdr_idx is not None, "Header row not found. Check sheet_name or file."

# Everything below the header row is data; the header row supplies the column labels.
columns = raw.iloc[hdr_idx].tolist()
df_un = raw.iloc[hdr_idx + 1:].copy()
df_un.columns = columns

# Remove columns that contain no values at all.
df_un = df_un.dropna(axis=1, how="all")

# Give the key columns the names used by the rest of the notebook.
df_un = df_un.rename(
    columns={
        "Region, subregion, country or area": "Country Name",
        "Country\ncode": "UN_Country_Code",
        "Note": "Note",
    }
)

df_un.head(3)


In [None]:
from pathlib import Path
import pandas as pd

# Project folders, resolved from the parent of the current working directory.
BASE_DIR = Path("..").resolve()
DATA_DIR = BASE_DIR.joinpath("data")
RAW_DIR = DATA_DIR.joinpath("raw")
PROC_DIR = DATA_DIR.joinpath("processed")

# Make sure the output folder exists (no error if it is already there).
PROC_DIR.mkdir(parents=True, exist_ok=True)

# Raw UN urbanization workbook; print whether it is present so a wrong path is noticed early.
urban_xls = RAW_DIR.joinpath("POPDBWUPRev.20181F21.xls")
print("Exists:", urban_xls.exists(), "|", urban_xls)


In [None]:
# Write the long-format female labor-force table to the processed-data folder.
# Relies on `PROC_DIR` and `df_labor_long`, which are created in other cells of this
# notebook; those cells must have been executed first.
out_path = PROC_DIR.joinpath("female_labor_1960_2018_long.csv")
df_labor_long.to_csv(out_path, index=False)  # index=False: the row index is not written
print(
    "Saved:", out_path.exists(),
    "| Rows:", len(df_labor_long),
)



In [None]:
# Reshape the wide labor table (`df_labor`, one column per year) into long format:
# one row per (id columns, Year) with the value in "FemaleLaborParticipation".
# Requires `df_labor` and `id_cols` from the column-selection cell.
df_labor_long = (
    df_labor
    .melt(id_vars=id_cols, var_name="Year", value_name="FemaleLaborParticipation")
    .assign(
        # former column labels -> integer years
        Year=lambda frame: frame["Year"].astype(int),
        # anything that cannot be parsed as a number becomes NaN
        FemaleLaborParticipation=lambda frame: pd.to_numeric(
            frame["FemaleLaborParticipation"], errors="coerce"
        ),
    )
    # rows without a value for the metric are removed
    .dropna(subset=["FemaleLaborParticipation"])
)
df_labor_long.head()


In [None]:
# Reduce the raw labor frame (`df_labor_raw`, loaded in another cell) to the two
# identifier columns plus the year columns 1960..2018 that actually exist in it.
id_cols = ["Country Name", "Country Code"]
year_cols = list(map(str, range(1960, 2019)))  # "1960" ... "2018" (2019 is exclusive)

available_years = [label for label in year_cols if label in df_labor_raw.columns]
df_labor = df_labor_raw.loc[:, [*id_cols, *available_years]].copy()
df_labor.head()



In [None]:
# Load the raw female labor-force CSV (`labor_csv` is defined in the paths cell),
# skipping the first 4 lines of the file.
df_labor_raw = pd.read_csv(filepath_or_buffer=labor_csv, skiprows=4)

# Quick look: frame dimensions and the first ten column labels.
(df_labor_raw.shape, df_labor_raw.columns[:10])


In [None]:
from pathlib import Path
import pandas as pd

# Project folders, resolved from the parent of the current working directory.
BASE_DIR = Path("..").resolve()
DATA_DIR = BASE_DIR.joinpath("data")
RAW_DIR = DATA_DIR.joinpath("raw")
PROC_DIR = DATA_DIR.joinpath("processed")

# Raw female labor-force CSV; print whether it is present so a wrong path is noticed early.
labor_csv = RAW_DIR.joinpath("SL.TLF.CACT.FE.ZS.csv")

print(labor_csv.exists(), labor_csv)


In [None]:
from pathlib import Path

# ".." moves one folder up from the current working directory; everything else is
# derived from that resolved location.
BASE_DIR = Path("..").resolve()
DATA_DIR = BASE_DIR.joinpath("data")
RAW_DIR = DATA_DIR.joinpath("raw")
PROC_DIR = DATA_DIR.joinpath("processed")

# Make sure the output folder exists (no error if it is already there).
PROC_DIR.mkdir(parents=True, exist_ok=True)

print(
    "RAW_DIR exists:", RAW_DIR.exists(),
    "| Path:", RAW_DIR,
)


In [None]:
from pathlib import Path
import pandas as pd

# 1) Project folders, resolved from the parent of the current working directory.
BASE_DIR = Path("..").resolve()
DATA_DIR = BASE_DIR.joinpath("data")
RAW_DIR = DATA_DIR.joinpath("raw")
PROC_DIR = DATA_DIR.joinpath("processed")

# Make sure the output folder exists (no error if it is already there).
PROC_DIR.mkdir(parents=True, exist_ok=True)

# Raw fertility CSV; print whether it is present so a wrong path is noticed early.
fertility_csv = RAW_DIR.joinpath("SP.DYN.TFRT.IN.csv")

print(fertility_csv.exists(), fertility_csv)


In [None]:
# Load the raw fertility CSV (`fertility_csv` is defined in the paths cell),
# skipping the first 4 lines of the file, and show the frame dimensions.
df_raw = pd.read_csv(filepath_or_buffer=fertility_csv, skiprows=4)
df_raw.shape


In [None]:
# Preview the first five rows of the raw fertility frame.
df_raw.head(n=5)


In [None]:
# Reduce the raw fertility frame (`df_raw`) to what is needed:
# "Country Name", "Country Code" and the year columns 1960..2018.
id_cols = ["Country Name", "Country Code"]
year_cols = list(map(str, range(1960, 2019)))  # 1960-2018 inclusive

# Some of those years may not exist as columns in the file; select only the ones that do.
available_years = [label for label in year_cols if label in df_raw.columns]

df = df_raw.loc[:, [*id_cols, *available_years]].copy()
df.head()


In [None]:
# Reshape the wide fertility table (`df`, one column per year) into long format:
# one row per (id columns, Year) with the value in "FertilityRate".
# Requires `df` and `id_cols` from the fertility column-selection cell.
df_long = (
    df
    .melt(id_vars=id_cols, var_name="Year", value_name="FertilityRate")
    # Fix the dtypes: integer years, numeric metric (unparseable values become NaN).
    .assign(
        Year=lambda frame: frame["Year"].astype(int),
        FertilityRate=lambda frame: pd.to_numeric(frame["FertilityRate"], errors="coerce"),
    )
)

df_long.head()


In [None]:
# Drop rows without a fertility value, then write the long-format table to the
# processed-data folder. Relies on `df_long` from the fertility melt cell and on `PROC_DIR`.
df_long = df_long.dropna(subset=["FertilityRate"])

output_path = PROC_DIR.joinpath("fertility_1960_2018_long.csv")
df_long.to_csv(output_path, index=False)  # index=False: the row index is not written

print(
    "Saved:", output_path.exists(),
    "| Rows:", len(df_long),
)
