# Column Discovery Notebook - ALL Data Sources

**Purpose**: Check that every data source identified in the SAS analysis is available in the bronze layer, and compare each file's columns with the columns the SAS code requires.

**Sources**: Based on `sas_entry_tables_analysis.md`
- **Monthly**: 7+ file patterns
- **Reference**: 20+ files
- **AZEC Legacy**: 8+ files

---

## Setup

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from azfr_fsspec_utils import fspath
import azfr_fsspec_abfs

# NOTE(review): presumably enables the Azure (abfs) filesystem integration
# used by the paths below; confirm against the azfr_fsspec_abfs package.
azfr_fsspec_abfs.use()

# getOrCreate() returns the existing session when there is one; otherwise a
# new session is built with adaptive query execution enabled.
spark = (
    SparkSession.builder
    .appName("Column_Discovery_Full")
    .config("spark.sql.adaptive.enabled", "true")
    .getOrCreate()
)

print(f"✓ Spark {spark.version} initialized")
print("✓ Azure connection configured")

In [None]:
# UPDATE WITH YOUR ACTUAL PATH
DATALAKE_BASE = "abfss://datamart@yourdatalake.dfs.core.windows.net/CONSTRUCTION"

# VISION is a YYYYMM period: the first four characters are the year, the
# next two the month. Both are kept as strings so the month stays zero-padded.
VISION = "202509"
YEAR, MONTH = VISION[:4], VISION[4:6]

# Monthly extracts live under bronze/<year>/<month>; reference files under bronze/ref.
MONTHLY_PATH = "/".join([DATALAKE_BASE, "bronze", YEAR, MONTH])
REF_PATH = "/".join([DATALAKE_BASE, "bronze", "ref"])

print("Monthly data path: " + MONTHLY_PATH)
print("Reference data path: " + REF_PATH)

## Helper Functions

In [None]:
def read_csv_discovery(file_path, file_description):
    """
    Read a pipe-delimited CSV without a predefined schema and print a summary.

    The file is loaded through the module-level ``spark`` session with a
    header row, Spark type inference and LATIN9 decoding. The record count,
    the column count and a short sample (first 3 rows, first 10 columns) are
    printed.

    Args:
        file_path: Path passed to ``spark.read.csv``. Callers in this
            notebook pass ``.csv`` and ``.csv.gz`` paths, some containing
            ``*`` wildcards.
        file_description: Human-readable label printed in the banner.

    Returns:
        The loaded DataFrame, or None if reading or counting the file raised
        an exception (the error is printed, not re-raised). A failure while
        displaying the sample does not discard the DataFrame.
    """
    banner = "=" * 80
    print(f"\n{banner}")
    print(f"Reading: {file_description}")
    print(f"Path: {file_path}")
    print(banner)

    try:
        df = spark.read.csv(
            file_path,
            sep="|",
            header=True,
            inferSchema=True,  # let Spark infer the column types
            encoding="LATIN9",
        )
        # Counting runs a job over the data, so it is kept inside the same
        # try block as the read itself.
        record_count = df.count()
    except Exception as e:
        # Deliberate best-effort: a missing or unreadable file is reported
        # and signalled with None so the notebook can carry on.
        print(f"\n✗ Error reading file: {e}")
        return None

    print("\n✓ Successfully read file")
    print(f"  Records: {record_count:,}")
    print(f"  Columns: {len(df.columns)}")

    print("\n  Sample Data (first 3 rows):")
    try:
        # Backtick-quote the names so headers containing dots or other
        # special characters are treated as plain column names. Slicing
        # already copes with files that have fewer than 10 columns.
        preview_columns = [
            "`" + name.replace("`", "``") + "`" for name in df.columns[:10]
        ]
        df.select(preview_columns).show(3, truncate=True)
    except Exception as e:
        # The file itself was read and counted; a preview problem must not
        # make it look unavailable to the caller.
        print(f"  ⚠ Could not display sample data: {e}")

    return df


def compare_with_sas_requirements(df, file_key, sas_columns):
    """
    Print a case-insensitive comparison of a file's columns with the SAS list.

    Three groups are reported: columns present in both, columns required by
    SAS but missing from the file, and columns found in the file that SAS
    does not use (at most 20 of these are listed). Present and extra columns
    are printed with the DataFrame's spelling; missing columns are printed
    as spelled in ``sas_columns``.

    Args:
        df: Object exposing a ``columns`` list of names (the DataFrame
            returned by ``read_csv_discovery``), or None.
        file_key: Label printed in the report header.
        sas_columns: Iterable of column names required by the SAS code.

    Returns:
        None. When ``df`` is None nothing is printed.
    """
    if df is None:
        return

    # Compare on lowercase keys but remember each side's original spelling
    # so the report shows names exactly as they appear at their source.
    actual_map = {c.lower(): c for c in df.columns}
    sas_map = {c.lower(): c for c in sas_columns}

    present = sorted(actual_map.keys() & sas_map.keys())
    missing_in_csv = sorted(sas_map.keys() - actual_map.keys())
    extra_in_csv = sorted(actual_map.keys() - sas_map.keys())

    rule = "─" * 80
    max_extras_shown = 20

    print(f"\n{rule}")
    print(f"SAS REQUIREMENTS COMPARISON for {file_key}")
    print(rule)

    print(f"✓ Present in CSV (needed by SAS): {len(present)}")
    if present:
        print(f"  {', '.join(actual_map[key] for key in present)}")

    if missing_in_csv:
        print(f"\n✗ Missing in CSV (needed by SAS): {len(missing_in_csv)}")
        print(f"  {', '.join(sas_map[key] for key in missing_in_csv)}")

    print(f"\n⚠ Extra in CSV (not needed by SAS): {len(extra_in_csv)}")
    extras = [actual_map[key] for key in extra_in_csv]
    if len(extras) > max_extras_shown:
        # Long lists are truncated to keep the report readable.
        print(f"  {', '.join(extras[:max_extras_shown])}...")
        print(f"  (and {len(extras) - max_extras_shown} more)")
    elif extras:
        print(f"  {', '.join(extras)}")
    else:
        print("  (none)")

    print(f"\n→ RECOMMENDATION: Define schema with {len(present)} columns needed by SAS")
    print(rule)

# Confirmation printed once the cell defining the helper functions has run.
print("✓ Helper functions defined")

---
# PART 1: MONTHLY DATA (IMS)
---

## 1.1 IPF_AZ - Agent & Courtage Portfolio (IPFE16/IPFE36)

In [None]:
# Column names verified against columns_per_datas.md. Matching is
# case-insensitive, so the mixed-case entry is harmless.
sas_columns_ipf = [
    "ACTPRIN", "CDACTPRO", "CDCASRES", "CDCIEORI", "CDFRACT", "CDGECENT", "CDGREV", "CDMOTRES", "CDNAF", "CDNATP",
    "CDPOLQPL", "CDPOLRVI", "CDPROD", "CDPRVB1", "CDPRVB10", "CDPRVB11", "CDPRVB12", "CDPRVB13", "CDPRVB14",
    "CDPRVB2", "CDPRVB3", "CDPRVB4", "CDPRVB5", "CDPRVB6", "CDPRVB7", "CDPRVB8", "CDPRVB9", "CDREG", "CDRI",
    "CDSITMGR", "CDSITP", "CDTPCOA", "CDTRE", "CDTYPLI1", "CDTYPLI2", "CDTYPLI3", "CEDIACTA", "CMARCH", "CSEGT",
    "CSSEGT", "CTDEFTRA", "CTPRVTRV", "DSTCSC", "DTCREPOL", "DTECHANN", "DTEFFAN", "DTEFSITT", "DTOUCHAN",
    "DTRCPPR", "DTRCPRE", "DTRECTRX", "DTRESILP", "DTTRAAN", "DTTRAAR", "DTTYPLI1", "DTTYPLI2", "DTTYPLI3",
    "FNCMACA", "LBCAPI1", "LBCAPI10", "LBCAPI11", "LBCAPI12", "LBCAPI13", "LBCAPI14", "LBCAPI2", "LBCAPI3",
    "LBCAPI4", "LBCAPI5", "LBCAPI6", "LBCAPI7", "LBCAPI8", "LBCAPI9", "LBNATTRV", "LBQLTSOU", "LIDIRISQ",
    "MTCAF", "MTCAPI1", "MTCAPI10", "MTCAPI11", "MTCAPI12", "MTCAPI13", "MTCAPI14", "MTCAPI2", "MTCAPI3",
    "MTCAPI4", "MTCAPI5", "MTCAPI6", "MTCAPI7", "MTCAPI8", "MTCAPI9", "MTPRPRTO", "MTSMPR", "NMACTA", "NMCLT",
    "NMRISQ", "NMSRISQ", "NOCLT", "NOINT", "NOPOL", "NOPOLLI1", "OPAPOFFR", "POSACTA", "POSRISQ", "PRCDCIE",
    "PRPRVC1", "PRPRVC10", "PRPRVC11", "PRPRVC12", "PRPRVC13", "PRPRVC14", "PRPRVC2", "PRPRVC3", "PRPRVC4",
    "PRPRVC5", "PRPRVC6", "PRPRVC7", "PRPRVC8", "PRPRVC9", "PTGST", "QUARISQ", "RESRISQ", "RUEACTA", "RUERISQ",
    "TXCEDE", "tydris1", "VILRISQ",
]

# The wildcard after the vision matches whatever suffix the extract name carries.
df_ipfe16 = read_csv_discovery(
    file_path=f"{MONTHLY_PATH}/IMS_INFP_IIA0P6_IPFE16_IPF_{VISION}*.csv.gz",
    file_description="IPFE16 - Agent Portfolio (PTF16)",
)
compare_with_sas_requirements(
    df=df_ipfe16,
    file_key="IPFE16",
    sas_columns=sas_columns_ipf,
)

In [None]:
# Courtage extract, checked against the sas_columns_ipf list.
df_ipfe36 = read_csv_discovery(
    file_path=f"{MONTHLY_PATH}/IMS_INFP_IIA0P6_IPFE36_IPF_{VISION}*.csv.gz",
    file_description="IPFE36 - Courtage Portfolio (PTF36)",
)
compare_with_sas_requirements(
    df=df_ipfe36,
    file_key="IPFE36",
    sas_columns=sas_columns_ipf,
)

## 1.2 IPFM99 - Product 01099 CA Amounts

In [None]:
# Columns expected in the IPFM99 (product 01099 CA) extracts.
sas_columns_ipfm99 = [
    "CDACPR1",
    "CDACPR2",
    "CDPROD",
    "MTCA",
    "MTCAENP",
    "MTCASST",
    "MTCAVNT",
    "NOINT",
    "NOPOL",
]

df_ipfm99_agt = read_csv_discovery(
    file_path=f"{MONTHLY_PATH}/IMS_INFP_IIA0P6_3SPEIPFM99_IPF_{VISION}*.csv.gz",
    file_description="3SPEIPFM99 - Agent Product 01099 CA",
)
compare_with_sas_requirements(
    df=df_ipfm99_agt,
    file_key="3SPEIPFM99",
    sas_columns=sas_columns_ipfm99,
)

In [None]:
# Courtage counterpart, checked against the sas_columns_ipfm99 list.
df_ipfm99_crt = read_csv_discovery(
    file_path=f"{MONTHLY_PATH}/IMS_INFP_IIA0P6_E1SPEIPFM99_IPF_{VISION}*.csv.gz",
    file_description="E1SPEIPFM99 - Courtage Product 01099 CA",
)
compare_with_sas_requirements(
    df=df_ipfm99_crt,
    file_key="E1SPEIPFM99",
    sas_columns=sas_columns_ipfm99,
)

## 1.3 IRD Risk Files (Q45, Q46, QAN)

In [None]:
# Columns expected in the Q45 risk file.
sas_columns_ird_q45 = [
    "CTDEFTRA",
    "CTPRVTRV",
    "DTOUCHAN",
    "DTRECTRX",
    "DTREFFIN",
    "LBDSTCSC",
    "LBNATTRV",
    "NOPOL",
]

df_q45 = read_csv_discovery(
    file_path=f"{MONTHLY_PATH}/ird_risk_q45_{VISION}.csv",
    file_description="IRD Risk Q45",
)
compare_with_sas_requirements(
    df=df_q45,
    file_key="IRD_Q45",
    sas_columns=sas_columns_ird_q45,
)

In [None]:
# Columns expected in the Q46 risk file.
sas_columns_ird_q46 = [
    "CTDEFTRA",
    "CTPRVTRV",
    "DTOUCHAN",
    "DTRECTRX",
    "DTREFFIN",
    "LBDSTCSC",
    "LBNATTRV",
    "NOPOL",
]

df_q46 = read_csv_discovery(
    file_path=f"{MONTHLY_PATH}/ird_risk_q46_{VISION}.csv",
    file_description="IRD Risk Q46",
)
compare_with_sas_requirements(
    df=df_q46,
    file_key="IRD_Q46",
    sas_columns=sas_columns_ird_q46,
)

In [None]:
# Columns expected in the QAN risk file (labelled legacy below).
sas_columns_ird_qan = ["CTDEFTRA", "CTPRVTRV", "DTOUCHAN", "LBNATTRV", "NOPOL"]

df_qan = read_csv_discovery(
    file_path=f"{MONTHLY_PATH}/ird_risk_qan_{VISION}.csv",
    file_description="IRD Risk QAN (legacy)",
)
compare_with_sas_requirements(
    df=df_qan,
    file_key="IRD_QAN",
    sas_columns=sas_columns_ird_qan,
)

## 1.4 NAF 2008 / IRD Suivi Engagements

In [None]:
# Columns expected in the IRD_SUIVI_ENGAGEMENTS file (NAF 2008 codes).
sas_columns_naf2008 = [
    "NOPOL",
    "CDPROD",
    "CDNAF08",
    "CDISIC",
]

df_naf2008 = read_csv_discovery(
    file_path=f"{MONTHLY_PATH}/ird_suivi_engagements_{VISION}.csv",
    file_description="IRD_SUIVI_ENGAGEMENTS (NAF 2008)",
)
compare_with_sas_requirements(
    df=df_naf2008,
    file_key="IRD_SUIVI_ENGAGEMENTS",
    sas_columns=sas_columns_naf2008,
)

---
# PART 2: AZEC MONTHLY DATA
---

## 2.1 POLIC_CU - AZEC Policy Data

In [None]:
# Columns expected in the POLIC_CU policy file.
sas_columns_polic_cu = [
    "POLICE", "DATFIN", "DATRESIL", "INTERMED", "POINGEST", "CODECOAS", "DATAFN", "ETATPOL",
    "DUREE", "FINPOL", "DATTERME", "DATEXPIR", "PRODUIT", "EFFETPOL", "PRIME", "PARTBRUT",
    "CPCUA", "NOMCLI", "MOTIFRES", "ORIGRES", "TYPCONTR", "RMPLCANT", "GESTSIT",
    "ECHEANMM", "ECHEANJJ", "INDREGUL", "CDNAF", "CDNAF03_CLI", "CDNAF08_W6",
]

df_polic_cu = read_csv_discovery(
    file_path=f"{MONTHLY_PATH}/POLIC_CU.csv",
    file_description="POLIC_CU - AZEC Policy Data (Monthly)",
)
if df_polic_cu is None:
    # Monthly copy could not be read: fall back to the reference folder,
    # where the file name is lowercase.
    df_polic_cu = read_csv_discovery(
        file_path=f"{REF_PATH}/polic_cu.csv",
        file_description="POLIC_CU - AZEC Policy Data (Ref)",
    )
compare_with_sas_requirements(
    df=df_polic_cu,
    file_key="POLIC_CU",
    sas_columns=sas_columns_polic_cu,
)

## 2.2 CAPITXCU - AZEC Capital Data

In [None]:
# Columns expected in the CAPITXCU capital file.
sas_columns_capitxcu = [
    "POLICE",
    "PRODUIT",
    "SMP_SRE",
    "BRCH_REA",
    "CAPX_100",
    "CAPX_CUA",
]

df_capitxcu = read_csv_discovery(
    file_path=f"{MONTHLY_PATH}/CAPITXCU.csv",
    file_description="CAPITXCU - AZEC Capital Data (Monthly)",
)
if df_capitxcu is None:
    # Monthly copy could not be read: fall back to the reference folder,
    # where the file name is lowercase.
    df_capitxcu = read_csv_discovery(
        file_path=f"{REF_PATH}/capitxcu.csv",
        file_description="CAPITXCU - AZEC Capital Data (Ref)",
    )
compare_with_sas_requirements(
    df=df_capitxcu,
    file_key="CAPITXCU",
    sas_columns=sas_columns_capitxcu,
)

---
# PART 3: REFERENCE DATA
---

## 3.1 PRDPFA1 / PRDPFA3 - Product Segmentation

In [None]:
# Segmentation columns expected in the PRDPFA files: codes (C*) and labels (L*).
sas_columns_prdpfa = ["CMARCH", "CPROD", "CSEG", "CSSSEG", "LMARCH", "LPROD", "LSEG", "LSSSEG"]

df_prdpfa1 = read_csv_discovery(
    file_path=f"{REF_PATH}/prdpfa1.csv",
    file_description="PRDPFA1 - Product Segmentation (Agent)",
)
compare_with_sas_requirements(
    df=df_prdpfa1,
    file_key="PRDPFA1",
    sas_columns=sas_columns_prdpfa,
)

In [None]:
# Courtage segmentation file, checked against the sas_columns_prdpfa list.
df_prdpfa3 = read_csv_discovery(
    file_path=f"{REF_PATH}/prdpfa3.csv",
    file_description="PRDPFA3 - Product Segmentation (Courtage)",
)
compare_with_sas_requirements(
    df=df_prdpfa3,
    file_key="PRDPFA3",
    sas_columns=sas_columns_prdpfa,
)

## 3.2 PRDCAP - Product Catalog

In [None]:
# Columns expected in the PRDCAP product catalog.
sas_columns_prdcap = [
    "CDPROD",
    "LBTPROD",
]

df_prdcap = read_csv_discovery(
    file_path=f"{REF_PATH}/prdcap.csv",
    file_description="PRDCAP - Product Catalog",
)
compare_with_sas_requirements(
    df=df_prdcap,
    file_key="PRDCAP",
    sas_columns=sas_columns_prdcap,
)

## 3.3 PTGST - Management Points

In [None]:
# Columns expected in the PTGST management-points reference.
sas_columns_ptgst = [
    "P_Num",
    "PTGST",
    "REGION",
]

df_ptgst = read_csv_discovery(
    file_path=f"{REF_PATH}/ptgst.csv",
    file_description="PTGST - Management Points Reference",
)
compare_with_sas_requirements(
    df=df_ptgst,
    file_key="PTGST",
    sas_columns=sas_columns_ptgst,
)

In [None]:
# Columns expected in the versioned PTGST file. The 202501 suffix is
# hard-coded in the file name and does not follow VISION.
sas_columns_ptgst_v = [
    "PTGST",
    "Upper_Mid",
]

df_ptgst_v = read_csv_discovery(
    file_path=f"{REF_PATH}/ptgst_202501.csv",
    file_description="PTGST_202501 - Management Points (Versioned)",
)
compare_with_sas_requirements(
    df=df_ptgst_v,
    file_key="PTGST_202501",
    sas_columns=sas_columns_ptgst_v,
)

## 3.4 INCENDCU - Fire Insurance / PE-RD Data

In [None]:
# Columns expected in the INCENDCU file.
sas_columns_incendcu = ["COD_NAF", "COD_TRE", "MT_BASDI", "MT_BASPE", "POLICE", "PRODUIT"]

df_incendcu = read_csv_discovery(
    file_path=f"{REF_PATH}/incendcu.csv",
    file_description="INCENDCU - Fire Insurance / NAF / PE-RD Data",
)
compare_with_sas_requirements(
    df=df_incendcu,
    file_key="INCENDCU",
    sas_columns=sas_columns_incendcu,
)

## 3.5 CONSTRU / CONSTRCU - Construction Site Data

In [None]:
# Columns expected in the construction-site file. The list is named after
# CONSTRCU while the file read here is constru.csv.
sas_columns_constrcu = [
    "DATFINCH", "DATOUVCH", "DATRECEP", "DEST_LOC", "FORMULE", "LDESTLOC", "LQUALITE",
    "LTYPMAR1", "MNT_GLOB", "NAT_CNT", "POLICE", "PRODUIT", "TYPMARC1",
]

df_constru = read_csv_discovery(
    file_path=f"{REF_PATH}/constru.csv",
    file_description="CONSTRU - Construction Site / Product Formulas",
)
compare_with_sas_requirements(
    df=df_constru,
    file_key="CONSTRU",
    sas_columns=sas_columns_constrcu,
)

## 3.6 LOB - Line of Business

In [None]:
# Columns expected in the LOB (line of business) reference.
sas_columns_lob = [
    "CDPROD", "CMARCH", "CPROD", "CSEG", "CSSSEG", "LMARCH", "LMARCH2",
    "LPROD", "LSEG", "LSEG2", "LSSSEG", "LSSSEG2", "PRODUIT", "SEGMENT",
]

df_lob = read_csv_discovery(
    file_path=f"{REF_PATH}/lob.csv",
    file_description="LOB - Line of Business / Product Classification",
)
compare_with_sas_requirements(
    df=df_lob,
    file_key="LOB",
    sas_columns=sas_columns_lob,
)

## 3.7 CPRODUIT - Product Reference

In [None]:
# Columns expected in the CPRODUIT product reference.
sas_columns_cproduit = [
    "cprod",
    "segment",
    "Segment_3",
    "Type_Produit_2",
]

df_cproduit = read_csv_discovery(
    file_path=f"{REF_PATH}/cproduit.csv",
    file_description="CPRODUIT - Product Reference Data",
)
compare_with_sas_requirements(
    df=df_cproduit,
    file_key="CPRODUIT",
    sas_columns=sas_columns_cproduit,
)

## 3.8 GARANTCU - Guarantee Data

In [None]:
# Columns expected in the GARANTCU guarantee reference.
sas_columns_garantcu = [
    "BRANCHE",
    "GARANTIE",
    "POLICE",
]

df_garantcu = read_csv_discovery(
    file_path=f"{REF_PATH}/garantcu.csv",
    file_description="GARANTCU - Guarantee Reference Data",
)
compare_with_sas_requirements(
    df=df_garantcu,
    file_key="GARANTCU",
    sas_columns=sas_columns_garantcu,
)

## 3.9 CATMIN - Category Minimum

In [None]:
# Columns expected in the import_catmin file.
sas_columns_catmin = [
    "PRODUIT",
    "GARANTIE",
    "CATMIN5",
]

df_catmin = read_csv_discovery(
    file_path=f"{REF_PATH}/import_catmin.csv",
    file_description="import_catmin - Category Minimum Classifications",
)
compare_with_sas_requirements(
    df=df_catmin,
    file_key="CATMIN",
    sas_columns=sas_columns_catmin,
)

---
# PART 4: AZEC LEGACY TABLES
---

In [None]:
# RCENTCU (RC Enterprise): expected columns, then availability check.
sas_columns_rcentcu = [
    "POLICE",
    "COD_NAF",
    "FORMULE",
    "FORMULE2",
    "FORMULE3",
    "FORMULE4",
]

df_rcentcu = read_csv_discovery(
    file_path=f"{REF_PATH}/rcentcu.csv",
    file_description="RCENTCU - RC Enterprise Data",
)
compare_with_sas_requirements(
    df=df_rcentcu,
    file_key="RCENTCU",
    sas_columns=sas_columns_rcentcu,
)

In [None]:
# RISTECCU (Professional Risk): expected columns, then availability check.
sas_columns_risteccu = [
    "POLICE",
    "PRODUIT",
    "COD_NAF",
    "FORMULE",
    "FORMULE2",
    "FORMULE3",
    "FORMULE4",
]

df_risteccu = read_csv_discovery(
    file_path=f"{REF_PATH}/risteccu.csv",
    file_description="RISTECCU - Professional Risk Data",
)
compare_with_sas_requirements(
    df=df_risteccu,
    file_key="RISTECCU",
    sas_columns=sas_columns_risteccu,
)

In [None]:
# MULPROCU (Multi-risk with Turnover): expected columns, then availability check.
sas_columns_mulprocu = [
    "POLICE",
    "CHIFFAFF",
]

df_mulprocu = read_csv_discovery(
    file_path=f"{REF_PATH}/mulprocu.csv",
    file_description="MULPROCU - Multi-risk Professional Data",
)
compare_with_sas_requirements(
    df=df_mulprocu,
    file_key="MULPROCU",
    sas_columns=sas_columns_mulprocu,
)

In [None]:
# MPACU: expected columns, then availability check.
sas_columns_mpacu = [
    "POLICE",
    "COD_NAF",
]

df_mpacu = read_csv_discovery(
    file_path=f"{REF_PATH}/mpacu.csv",
    file_description="MPACU - MPA Data",
)
compare_with_sas_requirements(
    df=df_mpacu,
    file_key="MPACU",
    sas_columns=sas_columns_mpacu,
)

---
# PART 5: ISIC MAPPING TABLES
---

In [None]:
# MAPPING_CDNAF2003_ISIC: expected columns, then availability check.
sas_columns_naf2003 = [
    "CDNAF_2003",
    "ISIC_CODE",
]

df_naf2003 = read_csv_discovery(
    file_path=f"{REF_PATH}/mapping_cdnaf2003_isic.csv",
    file_description="MAPPING_CDNAF2003_ISIC",
)
compare_with_sas_requirements(
    df=df_naf2003,
    file_key="MAPPING_CDNAF2003_ISIC",
    sas_columns=sas_columns_naf2003,
)

In [None]:
# MAPPING_CDNAF2008_ISIC: expected columns, then availability check.
sas_columns_naf2008_map = [
    "CDNAF_2008",
    "ISIC_CODE",
]

df_naf2008_map = read_csv_discovery(
    file_path=f"{REF_PATH}/mapping_cdnaf2008_isic.csv",
    file_description="MAPPING_CDNAF2008_ISIC",
)
compare_with_sas_requirements(
    df=df_naf2008_map,
    file_key="MAPPING_CDNAF2008_ISIC",
    sas_columns=sas_columns_naf2008_map,
)

In [None]:
# MAPPING_ISIC_CONST_ACT: expected columns, then availability check.
sas_columns_isic_act = [
    "ACTPRIN",
    "CDNAF08",
    "CDTRE",
    "CDNAF03",
    "CDISIC",
]

df_isic_act = read_csv_discovery(
    file_path=f"{REF_PATH}/mapping_isic_const_act.csv",
    file_description="MAPPING_ISIC_CONST_ACT",
)
compare_with_sas_requirements(
    df=df_isic_act,
    file_key="MAPPING_ISIC_CONST_ACT",
    sas_columns=sas_columns_isic_act,
)

In [None]:
# MAPPING_ISIC_CONST_CHT: expected columns, then availability check.
sas_columns_isic_cht = [
    "DESTI_ISIC",
    "CDNAF08",
    "CDTRE",
    "CDNAF03",
    "CDISIC",
]

df_isic_cht = read_csv_discovery(
    file_path=f"{REF_PATH}/mapping_isic_const_cht.csv",
    file_description="MAPPING_ISIC_CONST_CHT",
)
compare_with_sas_requirements(
    df=df_isic_cht,
    file_key="MAPPING_ISIC_CONST_CHT",
    sas_columns=sas_columns_isic_cht,
)

In [None]:
# TABLE_ISIC_TRE_NAF (Hazard grades): the ISIC code plus one hazard-grade
# column per suffix listed below.
sas_columns_isic_tre = [
    "ISIC_CODE",
    "HAZARD_GRADES_FIRE",
    "HAZARD_GRADES_BI",
    "HAZARD_GRADES_RCA",
    "HAZARD_GRADES_RCE",
    "HAZARD_GRADES_TRC",
    "HAZARD_GRADES_RCD",
    "HAZARD_GRADES_DO",
]

df_isic_tre = read_csv_discovery(
    file_path=f"{REF_PATH}/table_isic_tre_naf.csv",
    file_description="TABLE_ISIC_TRE_NAF",
)
compare_with_sas_requirements(
    df=df_isic_tre,
    file_key="TABLE_ISIC_TRE_NAF",
    sas_columns=sas_columns_isic_tre,
)

In [None]:
# ISIC_LG (Local to Global): expected columns, then availability check.
sas_columns_isic_lg = [
    "ISIC_Local",
    "ISIC_Global",
]

df_isic_lg = read_csv_discovery(
    file_path=f"{REF_PATH}/isic_lg.csv",
    file_description="ISIC_LG - Local to Global Mapping",
)
compare_with_sas_requirements(
    df=df_isic_lg,
    file_key="ISIC_LG",
    sas_columns=sas_columns_isic_lg,
)

---
# PART 6: CLIENT DATA & OTHER
---

In [None]:
# CLACENT1/3 (Client Data): column list used for the CLACENT client files.
sas_columns_clacent = [
    "NOCLT",
    "CDSIRET",
    "CDSIREN",
    "CDNAF",
]

df_clacent1 = read_csv_discovery(
    file_path=f"{REF_PATH}/clacent1.csv",
    file_description="CLACENT1 - Client Data Agent",
)
compare_with_sas_requirements(
    df=df_clacent1,
    file_key="CLACENT1",
    sas_columns=sas_columns_clacent,
)

In [None]:
# Courtage client file, checked against the sas_columns_clacent list.
df_clacent3 = read_csv_discovery(
    file_path=f"{REF_PATH}/clacent3.csv",
    file_description="CLACENT3 - Client Data Courtage",
)
compare_with_sas_requirements(
    df=df_clacent3,
    file_key="CLACENT3",
    sas_columns=sas_columns_clacent,
)

In [None]:
# BASECLI_INV (W6 Client Base): expected columns, then availability check.
sas_columns_basecli = [
    "NOCLT",
    "CDAPET",
]

df_basecli = read_csv_discovery(
    file_path=f"{REF_PATH}/basecli_inv.csv",
    file_description="BASECLI_INV - W6 Client Base",
)
compare_with_sas_requirements(
    df=df_basecli,
    file_key="BASECLI_INV",
    sas_columns=sas_columns_basecli,
)

In [None]:
# HISTO_NOTE_RISQUE (Euler Risk Rating): expected columns, then availability check.
sas_columns_histo = [
    "CDSIREN",
    "CDNOTE",
    "DTDEB_VALID",
    "DTFIN_VALID",
]

df_histo = read_csv_discovery(
    file_path=f"{REF_PATH}/histo_note_risque.csv",
    file_description="HISTO_NOTE_RISQUE - Euler Risk Rating",
)
compare_with_sas_requirements(
    df=df_histo,
    file_key="HISTO_NOTE_RISQUE",
    sas_columns=sas_columns_histo,
)

In [None]:
# DO_DEST (Construction Site Destination): expected columns, then availability check.
sas_columns_do_dest = [
    "NOPOL",
    "DESTINAT",
]

df_do_dest = read_csv_discovery(
    file_path=f"{REF_PATH}/do_dest.csv",
    file_description="DO_DEST - Construction Site Destination",
)
compare_with_sas_requirements(
    df=df_do_dest,
    file_key="DO_DEST",
    sas_columns=sas_columns_do_dest,
)

In [None]:
# TYPRD_2 (Activity to Product Type): expected columns, then availability check.
sas_columns_typrd = [
    "ACTIVITE",
    "TYPE_PRODUIT",
]

df_typrd = read_csv_discovery(
    file_path=f"{REF_PATH}/typrd_2.csv",
    file_description="TYPRD_2 - Activity to Product Type Mapping",
)
compare_with_sas_requirements(
    df=df_typrd,
    file_key="TYPRD_2",
    sas_columns=sas_columns_typrd,
)

In [None]:
# REF_MIG_AZEC (AZEC to IMS migration): a single expected column.
sas_columns_mig = [
    "NOPOL_AZEC",
]

df_mig = read_csv_discovery(
    file_path=f"{REF_PATH}/ref_mig_azec_vs_ims.csv",
    file_description="REF_MIG_AZEC_VS_IMS - AZEC to IMS Migration",
)
compare_with_sas_requirements(
    df=df_mig,
    file_key="REF_MIG_AZEC_VS_IMS",
    sas_columns=sas_columns_mig,
)

---
# PART 7: SPECIAL PRODUCT DATA (IPFSPE)
---

In [None]:
# IPFM0024 (Product Activity): expected columns, then availability check.
sas_columns_ipfm0024 = [
    "NOPOL",
    "NOINT",
    "CDPROD",
    "CDACTPRF01",
    "CDACTPRF02",
]

# The exact extract name is not pinned here, hence the wildcards around the table id.
df_ipfm0024 = read_csv_discovery(
    file_path=f"{MONTHLY_PATH}/IMS_*_IPFM0024_*_{VISION}*.csv.gz",
    file_description="IPFM0024 - Product Activity (Monthly)",
)
if df_ipfm0024 is None:
    # Monthly extract could not be read: fall back to the reference folder.
    df_ipfm0024 = read_csv_discovery(
        file_path=f"{REF_PATH}/ipfm0024.csv",
        file_description="IPFM0024 - Product Activity (Ref)",
    )
compare_with_sas_requirements(
    df=df_ipfm0024,
    file_key="IPFM0024",
    sas_columns=sas_columns_ipfm0024,
)

In [None]:
# IPFM63 (Professional Activities): expected columns, then availability check.
sas_columns_ipfm63 = [
    "NOPOL",
    "NOINT",
    "CDPROD",
    "ACTPRIN",
    "ACTSEC1",
    "CDNAF",
    "MTCA1",
]

# The exact extract name is not pinned here, hence the wildcards around the table id.
df_ipfm63 = read_csv_discovery(
    file_path=f"{MONTHLY_PATH}/IMS_*_IPFM63_*_{VISION}*.csv.gz",
    file_description="IPFM63 - Professional Activities (Monthly)",
)
if df_ipfm63 is None:
    # Monthly extract could not be read: fall back to the reference folder.
    df_ipfm63 = read_csv_discovery(
        file_path=f"{REF_PATH}/ipfm63.csv",
        file_description="IPFM63 - Professional Activities (Ref)",
    )
compare_with_sas_requirements(
    df=df_ipfm63,
    file_key="IPFM63",
    sas_columns=sas_columns_ipfm63,
)

---
# SUMMARY & RECOMMENDATIONS
---

In [None]:
print("=" * 80)
print("COLUMN DISCOVERY SUMMARY")
print("=" * 80)

print("\n✓ Files Successfully Read:")

# (label, DataFrame-or-None) for every source probed above, in notebook order.
files_read = [
    ("IPFE16", df_ipfe16),
    ("IPFE36", df_ipfe36),
    ("3SPEIPFM99", df_ipfm99_agt),
    ("E1SPEIPFM99", df_ipfm99_crt),
    ("IRD_Q45", df_q45),
    ("IRD_Q46", df_q46),
    ("IRD_QAN", df_qan),
    ("IRD_SUIVI_ENGAGEMENTS", df_naf2008),
    ("POLIC_CU", df_polic_cu),
    ("CAPITXCU", df_capitxcu),
    ("PRDPFA1", df_prdpfa1),
    ("PRDPFA3", df_prdpfa3),
    ("PRDCAP", df_prdcap),
    ("PTGST", df_ptgst),
    ("PTGST_202501", df_ptgst_v),
    ("INCENDCU", df_incendcu),
    ("CONSTRU", df_constru),
    ("LOB", df_lob),
    ("CPRODUIT", df_cproduit),
    ("GARANTCU", df_garantcu),
    ("CATMIN", df_catmin),
    ("RCENTCU", df_rcentcu),
    ("RISTECCU", df_risteccu),
    ("MULPROCU", df_mulprocu),
    ("MPACU", df_mpacu),
    ("MAPPING_CDNAF2003_ISIC", df_naf2003),
    ("MAPPING_CDNAF2008_ISIC", df_naf2008_map),
    ("MAPPING_ISIC_CONST_ACT", df_isic_act),
    ("MAPPING_ISIC_CONST_CHT", df_isic_cht),
    ("TABLE_ISIC_TRE_NAF", df_isic_tre),
    ("ISIC_LG", df_isic_lg),
    ("CLACENT1", df_clacent1),
    ("CLACENT3", df_clacent3),
    ("BASECLI_INV", df_basecli),
    ("HISTO_NOTE_RISQUE", df_histo),
    ("DO_DEST", df_do_dest),
    ("TYPRD_2", df_typrd),
    ("REF_MIG_AZEC_VS_IMS", df_mig),
    ("IPFM0024", df_ipfm0024),
    ("IPFM63", df_ipfm63),
]

# Split the sources into those that were read (with their column and record
# counts) and those for which the read helper returned None.
found, not_found = [], []
for name, df in files_read:
    if df is None:
        not_found.append(name)
        continue
    found.append((name, len(df.columns), df.count()))

print(f"\n{'Table':<30} {'Cols':>6} {'Records':>12}")
print("-" * 50)
for table_name, n_cols, n_rows in found:
    print(f"{table_name:<30} {n_cols:>6} {n_rows:>12,}")

if not_found:
    print(f"\n✗ NOT FOUND ({len(not_found)} tables):")
    for missing_name in not_found:
        print(f"  - {missing_name}")

print("\n" + "=" * 80)
print(f"TOTAL: {len(found)}/{len(files_read)} tables available in bronze")
print("=" * 80)

In [None]:
# Optional: Export column lists to CSV for offline analysis
import pandas as pd
from datetime import datetime

# Timestamped file name so repeated runs do not overwrite earlier exports.
output_path = f"column_discovery_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"

# One row per (file, column) with the Spark type inferred for that column.
column_summary = []
for name, df in files_read:
    if df is None:
        continue
    # df.dtypes already yields (column name, type string) pairs, so there is
    # no need to rebuild a dict for every column. The loop variable is
    # deliberately not called `col`, which would overwrite the `col` function
    # imported from pyspark.sql.functions at the top of the notebook.
    for column_name, data_type in df.dtypes:
        column_summary.append({
            "file_name": name,
            "column_name": column_name,
            "data_type": str(data_type),
        })

if column_summary:
    df_summary = pd.DataFrame(column_summary)
    df_summary.to_csv(output_path, index=False)
    print(f"\n✓ Column summary exported to: {output_path}")
    print(f"  Total columns documented: {len(column_summary)}")
else:
    print("\n⚠️  No data to export")