## 01. Library import

**Purpose**: Import core analytics/visualization stack and record environment versions for reproducibility.

In [2]:
import os
import sys
import json
import time
from pathlib import Path
from typing import Dict, Any, List


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


print("Python:", sys.version)
print("pandas:", pd.__version__)
print("numpy:", np.__version__)
print("matplotlib:", plt.matplotlib.__version__)
print("seaborn:", sns.__version__)


plt.rcParams.update({
"figure.figsize": (8, 4),
"axes.grid": True,
"axes.titlesize": 12,
})

Python: 3.10.11 (tags/v3.10.11:7d4cc5a, Apr  5 2023, 00:38:17) [MSC v.1929 64 bit (AMD64)]
pandas: 2.2.2
numpy: 1.26.4
matplotlib: 3.10.5
seaborn: 0.13.2


## 02. Project paths

**Purpose**: Define project folders and ensure directory structure exists (artifacts/, reports/, src/).

In [3]:
PROJECT_DIR = Path(".")
ARTIFACTS_DIR = PROJECT_DIR / "../artifacts"
REPORTS_DIR = PROJECT_DIR / "../reports"
REPORTS_FIGS = REPORTS_DIR / "../figs"
SRC_DIR = PROJECT_DIR / "../src"
DATA_DIR = PROJECT_DIR / "../data/train-data"
TRAIN_TARGET = DATA_DIR / "train_target.csv"


for p in [ARTIFACTS_DIR, REPORTS_DIR, REPORTS_FIGS, SRC_DIR]:
    p.mkdir(parents=True, exist_ok=True)


print("Project folders ready:", ARTIFACTS_DIR, REPORTS_DIR, REPORTS_FIGS, SRC_DIR)

Project folders ready: ..\artifacts ..\reports ..\reports\..\figs ..\src


## 03. Trial dataset detection

**Purpose**: Locate a trial/curated parquet produced in 01 and select the first matching candidate.

In [4]:
TRIAL_PARQUET_CANDIDATES = list(ARTIFACTS_DIR.glob("*trial*.parquet")) + \
                            list(ARTIFACTS_DIR.glob("*curated*.parquet"))
if not TRIAL_PARQUET_CANDIDATES:
    raise FileNotFoundError(
        "No trial parquet found in artifacts/. Expected file like '*trial*.parquet' or '*curated*.parquet'."
    )
TRIAL_PARQUET_PATH = TRIAL_PARQUET_CANDIDATES[0]
print(f"Using trial file: {TRIAL_PARQUET_PATH}")

Using trial file: ..\artifacts\features_trial.parquet


## 04. Import safe parquet reader

**Purpose**: Import read_parquet_safe from src/utils if available; otherwise provide a simple fallback.

In [5]:
PROJECT_ROOT = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()
SRC_DIR = PROJECT_ROOT / "src"
if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))

try:
    from utils import read_parquet_safe as READ_SAFE
    print("Imported utils.read_parquet_safe")
except Exception as e:
    print("Warning: cannot import utils.read_parquet_safe:", e)
    import pandas as pd
    def READ_SAFE(path, columns=None, sample_n=None, prefer_arrow_dtypes=True):
        df = pd.read_parquet(path, columns=columns)
        return df.head(sample_n) if sample_n else df

Imported utils.read_parquet_safe


## 06. Load trial

**Purpose**: Read trial dataframe and show basic shape; start timer for the rest of the pipeline.

In [6]:
start_time = time.time()
df_trial = READ_SAFE(TRIAL_PARQUET_PATH)
print("Trial shape:", df_trial.shape)

Trial shape: (1000, 40)


## 07. Generate features

**Purpose**: Use FeatureGenerator to create new features on the trial sample.

In [7]:
from utils.features import FeatureGenerator, FeatureConfig

fg = FeatureGenerator(FeatureConfig())
df_feat = fg.transform(df_trial, df_trial)
print("Features shape:", df_feat.shape)

Features shape: (1000, 40)


## 08. Merge labels for trial

**Purpose**: Attach labels from data/train_target.csv for EDA (correlations). Standardize to 'target'.

In [8]:
if TRAIN_TARGET.exists():
    y = pd.read_csv(TRAIN_TARGET)
    assert "id" in y.columns, "train_target.csv must have 'id'"
    # handle 'flag' vs 'target'
    if "target" not in y.columns:
        assert "flag" in y.columns, "train_target.csv must have either 'target' or 'flag'"
        y = y.rename(columns={"flag": "target"})
    # normalize dtypes and uniqueness
    y["id"] = y["id"].astype("string")
    y = y.drop_duplicates("id").set_index("id")
    y["target"] = y["target"].astype("int8")

    # merge into df_feat (keep all rows; labels may be missing)
    df_feat = df_feat.copy()
    df_feat["id"] = df_feat["id"].astype("string")
    before = len(df_feat)
    df_feat = df_feat.join(y, on="id", how="left")
    matched = int(df_feat["target"].notna().sum())
    print(f"Labels attached: {matched}/{before} rows")

    # optional: save labeled trial for convenience
    LABELED_TRIAL = ARTIFACTS_DIR / "features_trial_labeled.parquet"
    df_feat.to_parquet(LABELED_TRIAL, index=False)
    print("Saved:", LABELED_TRIAL)
else:
    print("train_target.csv not found → skipping trial merge (correlation block will be skipped).")

Labels attached: 1000/1000 rows
Saved: ..\artifacts\features_trial_labeled.parquet


## 08. Report (JSON)

**Purpose**: Save a machine-readable report with the list of new features and their justifications.

In [10]:
just = fg.get_justification()
report: Dict[str, Any] = {
"n_input_cols": int(df_trial.shape[1]),
"n_output_cols": int(df_feat.shape[1]),
"n_new_features": int(len(just)),
"new_features": just,
}
REPORT_JSON_PATH = REPORTS_DIR / "02_feature_engineering_report.json"
with open(REPORT_JSON_PATH, "w", encoding="utf-8") as f:
	json.dump(report, f, ensure_ascii=False, indent=2)
print(f"Report saved → {REPORT_JSON_PATH}")

Report saved → ..\reports\02_feature_engineering_report.json


## 09. Plots: distributions

**Purpose**: Produce histograms for key new numeric features and save to reports/figs/.

In [11]:
key_num_cols = [c for c in [
"debt_to_limit", "overdue_to_limit", "maxoverdue_to_limit",
"loan_term_ratio", "since_ratio", "till_close_gap",
"total_delays", "serious_delay_ratio",
"paym_good_count", "paym_bad_count", "paym_last_status", "paym_last_clean_streak",
] if c in df_feat.columns]


for col in key_num_cols:
    plt.figure()
    sns.histplot(df_feat[col].dropna(), bins=50, kde=False)
    plt.title(f"Distribution: {col}")
    fig_path = REPORTS_FIGS / f"hist_{col}.png"
    plt.tight_layout(); plt.savefig(fig_path, dpi=120)
    plt.close()
    
print(f"Saved {len(key_num_cols)} histograms → {REPORTS_FIGS}")

Saved 2 histograms → ..\reports\..\figs


## 10. Plots: correlations

**Purpose**: If trial contains 'target', save a correlation heatmap for new features vs target.

In [12]:
TARGET_COL = "target"
if TARGET_COL in df_feat.columns:
    corr_cols = [c for c in key_num_cols + [TARGET_COL] if pd.api.types.is_numeric_dtype(df_feat[c])]
    corr = df_feat[corr_cols].corr(numeric_only=True)
    plt.figure(figsize=(8, 6))
    sns.heatmap(corr, annot=False, cmap="viridis")
    plt.title("Correlation: new features vs target")
    fig_path = REPORTS_FIGS / "corr_new_features.png"
    plt.tight_layout(); plt.savefig(fig_path, dpi=120)
    plt.close()
    print("Correlation heatmap saved.")
else:
    print("'target' not found in trial → skipping correlation heatmap.")

Correlation heatmap saved.


## 11. Save artifacts

**Purspose**: Persist trial dataset with features to artifacts/ as parquet (+ optional CSV preview).

In [13]:
FEATURES_PARQUET = ARTIFACTS_DIR / "features_trial.parquet"
FEATURES_CSV = ARTIFACTS_DIR / "features_trial.csv"


df_feat.to_parquet(FEATURES_PARQUET, index=False)
print(f"Saved: {FEATURES_PARQUET}")


try:
    df_feat.head(100_000).to_csv(FEATURES_CSV, index=False)
    print(f"Saved CSV preview (≤100k rows): {FEATURES_CSV}")
except Exception as e:
    print("CSV save failed (non-critical):", e)

Saved: ..\artifacts\features_trial.parquet
Saved CSV preview (≤100k rows): ..\artifacts\features_trial.csv


## 12. Summary

**Purpose**: Print execution time and next-step guidance for notebook 03

In [14]:
elapsed = time.time() - start_time
print(f"Done in {elapsed:.1f} sec.")
print("Next: import the same FeatureGenerator in 03_final_dataset and stream over all parquet files.")

Done in 6.8 sec.
Next: import the same FeatureGenerator in 03_final_dataset and stream over all parquet files.
