In [2]:
import pandas as pd
import re
from pathlib import Path

# Source table and destination file for the cleaned annual extract.
raw_path = "LFS2/14100064.csv"
out_path = "LFS2/LFS_wages_sex_age_immigrant_2015_2024_clean.csv"

# Create the LFS2/clean folder (and LFS2 itself when it is missing).
clean_dir = Path("LFS2/clean")
clean_dir.mkdir(parents=True, exist_ok=True)

# Read the raw table once and show its column names.
df = pd.read_csv(raw_path, low_memory=False)
column_names = df.columns.tolist()
print(column_names)




['REF_DATE', 'GEO', 'DGUID', 'Wages', 'Type of work', 'North American Industry Classification System (NAICS)', 'Gender', 'Age group', 'UOM', 'UOM_ID', 'SCALAR_FACTOR', 'SCALAR_ID', 'VECTOR', 'COORDINATE', 'VALUE', 'STATUS', 'SYMBOL', 'TERMINATED', 'DECIMALS']


In [None]:

# Load the raw table from disk
df = pd.read_csv(raw_path, low_memory=False)

# Identify key columns
date_col = "REF_DATE"
geo_col = "GEO"
# NOTE(review): the column list printed after the first load contains
# 'Gender' and 'Age group' but no 'Characteristics' column -- confirm this
# name against the file before it is used for lookups.
char_col = "Characteristics"
val_col = "VALUE"

# Convert to year.
# Cast to str first: when read_csv infers REF_DATE as integers (e.g. 2015),
# to_datetime interprets them as nanoseconds since the epoch, so every row
# would end up in 1970 and be removed by the year filter below.
df[date_col] = pd.to_datetime(df[date_col].astype(str), errors="coerce")
df["year"] = df[date_col].dt.year
# between() is inclusive on both ends; rows with an unparseable date are dropped.
# .copy() makes the filtered result an independent frame, so the column
# assignments that follow do not raise SettingWithCopyWarning.
df = df[df["year"].between(2015, 2024)].copy()

# Province map (GEO names not listed here are kept unchanged by replace)
province_map = {
    "Canada": "National",
    "Newfoundland and Labrador": "NL", "Prince Edward Island": "PE",
    "Nova Scotia": "NS", "New Brunswick": "NB", "Quebec": "QC",
    "Ontario": "ON", "Manitoba": "MB", "Saskatchewan": "SK",
    "Alberta": "AB", "British Columbia": "BC",
    "Yukon": "YT", "Northwest Territories": "NT", "Nunavut": "NU",
}
df["province"] = df[geo_col].replace(province_map)

# Ensure numeric wages (non-numeric entries become NaN)
df["wage_hourly"] = pd.to_numeric(df[val_col], errors="coerce")

# ---- Split Characteristics ----
def extract_sex(x):
    """Classify a label as "Men", "Women" or "Both".

    Matching is case-insensitive and on whole words, because "women"
    contains the substring "men" and a plain substring test would label
    every "Women" row as "Men".

    Parameters
    ----------
    x : object
        Label text. Non-string values (e.g. NaN or None from a missing
        cell) are accepted and yield None.

    Returns
    -------
    str or None
        "Men", "Women", "Both" (label mentions "sexes"), or None when no
        keyword is found.
    """
    if not isinstance(x, str):
        return None
    text = x.lower()
    # \b keeps "men" from matching inside "women", "employment", etc.
    if re.search(r"\bmen\b", text): return "Men"
    if re.search(r"\bwomen\b", text): return "Women"
    if "sexes" in text: return "Both"
    return None

def extract_age(x):
    """Map a label to an age-group code.

    Rules are tried in order and the first match wins:
    "15...24" -> "15-24", "25...54" -> "25-54", any "55" -> "55+",
    the phrase "15 years and over" -> "15+".

    Parameters
    ----------
    x : object
        Label text. Non-string values (e.g. NaN or None from a missing
        cell) are accepted and yield None instead of raising inside
        re.search.

    Returns
    -------
    str or None
        One of "15-24", "25-54", "55+", "15+", or None when nothing matches.
    """
    if not isinstance(x, str):
        return None
    for pattern, code in (
        (r"15.*24", "15-24"),
        (r"25.*54", "25-54"),
        (r"55", "55+"),
    ):
        if re.search(pattern, x):
            return code
    if "15 years and over" in x: return "15+"
    return None

def extract_immigrant(x):
    """Classify a label by immigrant status.

    Matching is a case-insensitive substring test: a label containing
    "immigrant" gives "Immigrants"; otherwise one containing
    "born in canada" gives "Born in Canada".

    Parameters
    ----------
    x : object
        Label text. Non-string values (e.g. NaN or None from a missing
        cell) are accepted and yield None instead of raising on .lower().

    Returns
    -------
    str or None
        "Immigrants", "Born in Canada", or None when neither phrase occurs.
    """
    if not isinstance(x, str):
        return None
    text = x.lower()
    if "immigrant" in text: return "Immigrants"
    if "born in canada" in text: return "Born in Canada"
    return None

# Build one text label per row for the extract_* parsers.
# Prefer the combined column named by `char_col`; when the table does not
# have it, fall back to the separate 'Gender' / 'Age group' columns that
# appear in the column list printed after the first load.
# NOTE(review): the values in those columns are not visible here -- confirm
# the extract_* patterns match them (e.g. how the gender total is labelled).
label_cols = [char_col] if char_col in df.columns else [
    c for c in ("Gender", "Age group") if c in df.columns
]
if not label_cols:
    raise KeyError(
        f"None of {char_col!r}, 'Gender', 'Age group' found in the table columns"
    )
# Missing cells become "" so the parsers always receive a string.
labels = df[label_cols[0]].fillna("").astype(str)
for extra_col in label_cols[1:]:
    labels = labels.str.cat(df[extra_col].fillna("").astype(str), sep=" ")

# assign() returns a new frame, so nothing is written into a filtered view.
df = df.assign(
    sex=labels.map(extract_sex),
    age_group=labels.map(extract_age),
    immigrant_status=labels.map(extract_immigrant),
)

# Keep only rows where at least one dimension was captured
df = df.dropna(subset=["sex", "age_group", "immigrant_status"], how="all")

# Aggregate to the annual mean per group.
# dropna=False: by default groupby discards every row with a missing key,
# which would throw away the partially-labelled rows kept just above.
# NOTE(review): the printed column list also has 'Wages', 'Type of work' and
# NAICS columns that are not part of the grouping -- confirm whether the rows
# should be filtered on them before averaging VALUE.
annual = (
    df.groupby(
        ["year", "province", "sex", "age_group", "immigrant_status"],
        dropna=False,
    )["wage_hourly"]
    .mean()
    .reset_index()
)

# Save
annual.to_csv(out_path, index=False)
print(f"[SAVED] {out_path}")
print(annual.head())
