
# Iteration 2 — 1.3.9 Feature Engineering Implementation
**Team 102D · AB Data Challenge**

This notebook covers:
- **1.3.9** Feature Engineering Implementation  
- **1.3.9.1** Definition & Computation of Candidate Features  
- **1.3.9.2** Evaluation of Temporal & Behavioural Patterns  
- **1.3.9.3** Documentation of Feature-Generation Process  

**Focus:** anomaly detection using Telelectura data only (per reading with per-meter temporal context).  
**Anomaly codes:** `32768` (faulty meter not yet replaced), `163840` (two consecutive 0-consumption periods).



## 🔧 Required installations (run once if needed)
If you want to save outputs to Parquet, install one of these engines:
```bash
pip install -U pyarrow
# or
pip install -U fastparquet
```


In [None]:

# Optional: install dependencies from inside the notebook by uncommenting
# whichever of the lines below you need, then re-running this cell.
# !pip install -U pyarrow
# !pip install -U fastparquet
# !pip install -U pandas numpy
parquet_hint = "If Parquet save fails, install pyarrow or fastparquet (see the cell above)."
print(parquet_hint)


In [None]:

# === Setup & paths ===
import os
import json
import numpy as np
import pandas as pd

# Repository root. Override when the notebook is not run from the repo root, e.g.
# PROJECT_ROOT = "C:/path/to/AB_DataChallenge.Team102D"
PROJECT_ROOT = "."

# Input locations
DATA_DIR = os.path.join(PROJECT_ROOT, "data")
DERIVED_DIR = os.path.join(DATA_DIR, "derived")
# Optional sample used only when the cleaned dataset is missing
FALLBACK_SAMPLE = os.path.join(DERIVED_DIR, "dataset_sample.csv")

# Output location for this iteration (created on first run)
RESULTS_DIR = os.path.join(PROJECT_ROOT, "results", "iteration_2")
os.makedirs(RESULTS_DIR, exist_ok=True)

# Cleaned dataset expected from step 1.3.8
CLEAN_CSV = os.path.join(RESULTS_DIR, "cleaned_dataset_v2.csv")

def safe_read_csv(path):
    """Read a CSV into a DataFrame, or return None when there is no such file.

    Parameters
    ----------
    path : str
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame or None
        The parsed frame, or None when `path` does not point to a regular file.
    """
    # isfile (not exists): a directory at `path` must be treated as "missing"
    # instead of being handed to read_csv, which would raise.
    if not os.path.isfile(path):
        return None
    return pd.read_csv(path, low_memory=False)

# Prefer the cleaned dataset produced by 1.3.8; fall back to the optional sample.
print("Looking for cleaned dataset:", CLEAN_CSV)
df = safe_read_csv(CLEAN_CSV)
if df is None:
    print("[warn] cleaned_dataset_v2.csv not found. Trying fallback:", FALLBACK_SAMPLE)
    df = safe_read_csv(FALLBACK_SAMPLE)

if df is None:
    # Raise an ordinary exception instead of SystemExit: SystemExit derives from
    # BaseException and is reported by the notebook as an exit request rather
    # than as a normal error with a traceback.
    raise FileNotFoundError("No input data available. Run 1.3.8 first or place a CSV under results/iteration_2/cleaned_dataset_v2.csv")

print("Loaded df:", df.shape)
display(df.head(3))



## Canonical Columns
We standardize these names if present:
- `polissa_id` (`POLIZA_SUMINISTRO`)
- `num_serie_contador` (`NUMEROSERIECONTADOR`)
- `consumption` (`CONSUMO_REAL`)
- `datetime` (`FECHA_HORA` → parsed to datetime)
- `codi_anomalia`
- `data_inici`, `data_fi` (period boundaries if applicable)
- `date`, `year`, `month`, `dayofweek`, `hour` (derived)


In [None]:

# Lookup from upper-cased source column name to canonical column name
CANONICAL_MAP = {
    "POLIZA_SUMINISTRO": "polissa_id",
    "NUMEROSERIECONTADOR": "num_serie_contador",
    "CONSUMO_REAL": "consumption",
    "FECHA_HORA": "datetime",
    "CODI_ANOMALIA": "codi_anomalia",
    "DATA_INICI": "data_inici",
    "DATA_FI": "data_fi",
    "DATE": "date",
    "YEAR": "year",
}

# Rename in place; the lookup ignores case and surrounding whitespace, and
# columns without a mapping keep their original name untouched.
df.columns = [CANONICAL_MAP.get(col.strip().upper(), col) for col in df.columns]

# Coerce time-like columns; unparseable values become NaT
if "datetime" in df.columns:
    df["datetime"] = pd.to_datetime(df["datetime"], errors="coerce")
if "date" in df.columns:
    df["date"] = pd.to_datetime(df["date"], errors="coerce").dt.date
for boundary in ("data_inici", "data_fi"):
    if boundary in df.columns:
        df[boundary] = pd.to_datetime(df[boundary], errors="coerce")

# Calendar parts: taken from `datetime` when available, otherwise from `date`
# (which carries no time of day, hence no `hour`).
if "datetime" in df.columns:
    stamps = df["datetime"].dt
    for part in ("year", "month", "dayofweek", "hour"):
        df[part] = getattr(stamps, part)
elif "date" in df.columns:
    days = pd.to_datetime(df["date"], errors="coerce").dt
    for part in ("year", "month", "dayofweek"):
        df[part] = getattr(days, part)

display(df.head(3))



## Labels & Flags
- **Label** `y_anom = 1` if `codi_anomalia ∈ {32768, 163840}`, else 0  
- **Flags**: zero/negative consumption (non-destructive)


In [None]:

def make_labels_and_flags(dfin: pd.DataFrame, anomaly_codes=(32768, 163840)) -> pd.DataFrame:
    """Return a copy of `dfin` with the anomaly label and consumption flags added.

    Parameters
    ----------
    dfin : pandas.DataFrame
        Input readings. Uses `codi_anomalia` and `consumption` when present.
    anomaly_codes : iterable of numbers, default (32768, 163840)
        Values of `codi_anomalia` that are labelled as anomalies.

    Returns
    -------
    pandas.DataFrame
        A copy of `dfin` (the input is not modified) with three extra columns:
        - `y_anom`: 1 where the code is in `anomaly_codes`, else 0; all NaN when
          `codi_anomalia` is absent.
        - `flag_zero_consumption`: 1.0 where consumption equals 0, else 0.0.
        - `flag_negative_consumption`: 1.0 where consumption is below 0, else 0.0.
          Both flags are all NaN when `consumption` is absent.
    """
    dfout = dfin.copy()

    # Label
    if "codi_anomalia" in dfout.columns:
        # Coerce first: when the CSV column is parsed as text ("32768"), a plain
        # isin() against integer codes matches nothing and every row gets label 0.
        codes = pd.to_numeric(dfout["codi_anomalia"], errors="coerce")
        dfout["y_anom"] = codes.isin(list(anomaly_codes)).astype(int)
    else:
        dfout["y_anom"] = np.nan  # label unavailable

    # Flags (missing or unparseable consumption compares False, i.e. flag 0.0)
    if "consumption" in dfout.columns:
        cons = pd.to_numeric(dfout["consumption"], errors="coerce")
        dfout["flag_zero_consumption"] = (cons == 0).astype(float)
        dfout["flag_negative_consumption"] = (cons < 0).astype(float)
    else:
        dfout["flag_zero_consumption"] = np.nan
        dfout["flag_negative_consumption"] = np.nan
    return dfout

df = make_labels_and_flags(df)

# make_labels_and_flags tolerates a missing `codi_anomalia` column, so the
# preview must not assume it exists (indexing an absent column raises KeyError).
preview_cols = [
    col
    for col in ("codi_anomalia", "y_anom", "flag_zero_consumption", "flag_negative_consumption")
    if col in df.columns
]
display(df[preview_cols].head(5))



## Feature Set (per reading, per meter context)
We compute:
- **Lags & deltas**: `cons_lag1`, `cons_lag2`, `delta1`, `delta2`
- **Rolling stats** on last N readings (excluding the current one): mean/std/min/max and zero/neg ratios (N ∈ {3, 12, 24})
- **Meter-level z-score** vs. the meter's overall mean/std (computed over all of its readings, not only earlier ones)
- **Period duration** in hours (if `data_inici` & `data_fi` exist)


In [None]:

ROLL_WINDOWS = [3, 12, 24]  # last-N readings


def _group_sort(dfin: pd.DataFrame):
    """Sort readings by meter (or policy) identifier and then by time.

    Parameters
    ----------
    dfin : pandas.DataFrame
        Readings containing `num_serie_contador` or `polissa_id`.

    Returns
    -------
    (pandas.DataFrame, str)
        A new, sorted frame (the input is left untouched) and the name of the
        identifier column used as grouping key. `num_serie_contador` is
        preferred over `polissa_id`; `datetime` is preferred over `date`.

    Raises
    ------
    ValueError
        If neither identifier column is present.
    """
    key = "num_serie_contador" if "num_serie_contador" in dfin.columns else (
          "polissa_id" if "polissa_id" in dfin.columns else None)
    if key is None:
        raise ValueError("No meter/policy identifier found. Expect one of ['num_serie_contador','polissa_id'].")

    sort_cols = [key]
    if "datetime" in dfin.columns:
        sort_cols.append("datetime")
    elif "date" in dfin.columns:
        sort_cols.append("date")
    # Stable sort: rows that tie on the sort columns (or all rows of a meter
    # when no time column exists) keep their original relative order, so the
    # lag/rolling features are reproducible.
    return dfin.sort_values(sort_cols, kind="mergesort"), key


def _rolling_per_meter(values: pd.Series, meter_ids: pd.Series, window: int,
                       min_periods: int, stat: str) -> pd.Series:
    """Rolling `stat` of `values` over `window` rows, restarted for every meter.

    The result is indexed like `values`, so it can be assigned straight back
    to the frame. Assumes the row index is unique.
    """
    rolled = values.groupby(meter_ids).rolling(window, min_periods=min_periods)
    # groupby().rolling() prepends the group key as an extra index level;
    # drop it so the result aligns with the original row labels.
    return getattr(rolled, stat)().reset_index(level=0, drop=True)


def make_features(dfin: pd.DataFrame) -> pd.DataFrame:
    """Compute per-reading features using each meter's own history.

    Parameters
    ----------
    dfin : pandas.DataFrame
        Readings with a `consumption` column and a meter/policy identifier
        (see `_group_sort`). Optional: `datetime`/`date` for ordering and
        `data_inici`/`data_fi` for the period length.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by identifier and time, with these columns added:
        - `cons_lag1`, `cons_lag2`: consumption 1 and 2 readings earlier.
        - `delta1`, `delta2`: current consumption minus the respective lag.
        - `roll{N}_mean|std|min|max|zero_ratio|neg_ratio` for N in
          ROLL_WINDOWS: statistics over the previous N readings of the same
          meter (the current reading is excluded).
        - `meter_mean`, `meter_std`, `cons_z_meter`: mean/std over ALL readings
          of the meter (including the current and later ones) and the
          resulting z-score; NaN where the std is 0.
        - `period_hours`: `data_fi - data_inici` in hours, when both exist.

    Raises
    ------
    ValueError
        If `consumption` or the identifier column is missing.
    """
    # sort_values returns a new frame, so no separate defensive copy is needed.
    dfout, key = _group_sort(dfin)

    if "consumption" not in dfout.columns:
        raise ValueError("Missing 'consumption'. Map CONSUMO_REAL -> consumption first.")
    dfout["consumption"] = pd.to_numeric(dfout["consumption"], errors="coerce")

    meter_ids = dfout[key]
    by_meter = dfout.groupby(key)["consumption"]

    # Lags/Deltas
    prev = by_meter.shift(1)
    dfout["cons_lag1"] = prev
    dfout["cons_lag2"] = by_meter.shift(2)
    dfout["delta1"] = dfout["consumption"] - dfout["cons_lag1"]
    dfout["delta2"] = dfout["consumption"] - dfout["cons_lag2"]

    # Rolling stats over the previous readings. `prev` is a plain Series, so
    # the window has to be re-grouped by meter; rolling over it directly would
    # mix the tail of one meter into the first rows of the next.
    # A missing previous reading compares False, i.e. counts as 0.0 in the ratios.
    prev_is_zero = (prev == 0).astype(float)
    prev_is_neg = (prev < 0).astype(float)
    for N in ROLL_WINDOWS:
        for stat, min_periods in (("mean", 1), ("std", 2), ("min", 1), ("max", 1)):
            dfout[f"roll{N}_{stat}"] = _rolling_per_meter(prev, meter_ids, N, min_periods, stat)
        dfout[f"roll{N}_zero_ratio"] = _rolling_per_meter(prev_is_zero, meter_ids, N, 1, "mean")
        dfout[f"roll{N}_neg_ratio"] = _rolling_per_meter(prev_is_neg, meter_ids, N, 1, "mean")

    # Meter-level normalization. transform() broadcasts the per-meter statistic
    # back onto the rows and, unlike a merge, stays clean if the columns
    # already exist (no *_x / *_y duplicates on a re-run).
    dfout["meter_mean"] = dfout.groupby(key)["consumption"].transform("mean")
    dfout["meter_std"] = dfout.groupby(key)["consumption"].transform("std")
    dfout["cons_z_meter"] = (dfout["consumption"] - dfout["meter_mean"]) / dfout["meter_std"].replace(0, np.nan)

    # Period duration
    if {"data_inici", "data_fi"}.issubset(dfout.columns):
        period_start = pd.to_datetime(dfout["data_inici"], errors="coerce")
        period_end = pd.to_datetime(dfout["data_fi"], errors="coerce")
        dfout["period_hours"] = (period_end - period_start).dt.total_seconds() / 3600.0

    return dfout

# Build the feature table and show a small preview plus its dimensions
feat_df = make_features(df)
feature_preview = feat_df.head(5)
display(feature_preview)
print("Feature shape:", feat_df.shape)



## 1.3.9.2 — Quick Temporal/Behavioural Checks
Numeric summaries and a **robust** correlation with the label (skips low-N & zero-variance).


In [None]:

# Settings for the robust correlation with the label
label_col = "y_anom"  # target column to correlate against
min_n = 50            # minimum number of complete (feature, label) pairs

# Collects the file names of the artefacts written by this cell
summaries = dict()

def safe_corr(x: pd.Series, y: pd.Series, min_n: int = 50):
    """Correlation between `x` and `y` that yields NaN instead of noise.

    Rows where either value is missing are discarded first. The result is NaN
    when fewer than `min_n` complete pairs remain, or when either side has
    zero sample standard deviation; otherwise it is `Series.corr` of the pairs.
    """
    pairs = pd.concat([x, y], axis=1).dropna()
    left = pairs.iloc[:, 0]
    right = pairs.iloc[:, 1]

    too_few = len(pairs) < min_n
    # `or` short-circuits, so the std checks only run when enough pairs exist
    if too_few or left.std(ddof=1) == 0 or right.std(ddof=1) == 0:
        return np.nan
    return left.corr(right)

# Correlate every numeric feature with the label; invalid results are skipped
corr_with_y = {}
if label_col in feat_df.columns:
    num_cols = [c for c in feat_df.columns if pd.api.types.is_numeric_dtype(feat_df[c])]
    for c in num_cols:
        if c == label_col:
            continue
        r = safe_corr(feat_df[c], feat_df[label_col], min_n=min_n)
        if not np.isnan(r):
            corr_with_y[c] = float(r)

if corr_with_y:
    corr_series = pd.Series(corr_with_y)
    # Rank by magnitude, keeping the sign in the saved values: a strongly
    # negative correlation is as informative as a positive one, and sorting by
    # signed value would push those features out of the top 25.
    magnitude_order = corr_series.abs().sort_values(ascending=False).index
    corr_series = corr_series.reindex(magnitude_order)
    corr_series.head(25).to_csv(os.path.join(RESULTS_DIR, "fe_corr_with_label_top25.csv"))
    summaries["corr_top25_path"] = "fe_corr_with_label_top25.csv"
    print(f"[ok] Saved {len(corr_with_y)} valid correlations.")
else:
    print("[info] No valid correlations (likely zero variance or insufficient pairs).")

# Numeric describe
num_desc = feat_df.describe(include="number").T
num_desc.to_csv(os.path.join(RESULTS_DIR, "fe_numeric_describe.csv"))
summaries["numeric_describe_path"] = "fe_numeric_describe.csv"

# Year coverage
if "year" in feat_df.columns:
    cov_year = feat_df.groupby("year").size().reset_index(name="n_records")
    cov_year.to_csv(os.path.join(RESULTS_DIR, "fe_coverage_year.csv"), index=False)
    summaries["coverage_year_path"] = "fe_coverage_year.csv"

# Index of the files written above (names are relative to RESULTS_DIR)
with open(os.path.join(RESULTS_DIR, "fe_quick_summaries.json"), "w") as f:
    json.dump(summaries, f, indent=2)

print("Wrote summaries:", summaries)



## 1.3.9.3 — Save Outputs & Document Features
CSV is always saved. Parquet is attempted (requires pyarrow/fastparquet). A feature dictionary is generated.


In [None]:

# Save features (CSV) - always written
fe_csv = os.path.join(RESULTS_DIR, "features_v2.csv")
feat_df.to_csv(fe_csv, index=False)
print("[saved]", fe_csv)

# Best-effort Parquet write: needs pyarrow or fastparquet, so any failure is
# reported as a warning and the notebook carries on with the CSV only.
# (pandas is already imported in the setup cell; no re-import needed here.)
fe_parquet = os.path.join(RESULTS_DIR, "features_v2.parquet")
try:
    feat_df.to_parquet(fe_parquet, index=False)
    print("[saved]", fe_parquet)
except Exception as e:
    print("[warn] Could not write parquet:", e)

# Feature dictionary: one entry per column that actually exists in feat_df
feature_dict = []

def add_desc(name, desc, dtype="numeric", grain="reading"):
    """Append one entry (name, description, dtype, grain) to `feature_dict`."""
    feature_dict.append({"name": name, "description": desc, "dtype": dtype, "grain": grain})

def _dtype_label(series):
    """Return "numeric" for numeric columns, otherwise the pandas dtype name."""
    if pd.api.types.is_numeric_dtype(series):
        return "numeric"
    return str(series.dtype)

core_fields = ["polissa_id","num_serie_contador","datetime","date","year","month","dayofweek","hour",
               "codi_anomalia","data_inici","data_fi","period_hours","y_anom","consumption",
               "flag_zero_consumption","flag_negative_consumption",
               "cons_lag1","cons_lag2","delta1","delta2","meter_mean","meter_std","cons_z_meter"]
for base in core_fields:
    if base in feat_df.columns:
        # Record the real dtype: ids and timestamps are not "numeric"
        add_desc(base, f"Core/derived field: {base}", dtype=_dtype_label(feat_df[base]))

# Use the same window list that generated the features, so the dictionary
# cannot drift out of sync when ROLL_WINDOWS is changed.
for N in ROLL_WINDOWS:
    for stat in ["mean","std","min","max","zero_ratio","neg_ratio"]:
        col = f"roll{N}_{stat}"
        if col in feat_df.columns:
            add_desc(col, f"Rolling {stat} over last {N} readings (excluding current).",
                     dtype=_dtype_label(feat_df[col]))

dict_path = os.path.join(RESULTS_DIR, "feature_dictionary_v2.json")
with open(dict_path, "w") as f:
    json.dump(feature_dict, f, indent=2, ensure_ascii=False)
print("[saved]", dict_path)



# Visual Analysis (Optional but Recommended)
Simple plots to aid **1.3.9.2 Evaluation** and reporting.  
Each chart uses Matplotlib (no seaborn) and appears in its own figure.


In [None]:

import matplotlib.pyplot as plt

# Histogram of all non-missing consumption values
if "consumption" not in feat_df.columns:
    print("[info] 'consumption' column not found; skipping histogram.")
else:
    fig, ax = plt.subplots()
    feat_df["consumption"].dropna().hist(bins=50, ax=ax)
    ax.set_title("Distribution of Consumption")
    ax.set_xlabel("Consumption")
    ax.set_ylabel("Count")
    plt.show()


In [None]:

# Bar chart: number of readings in each calendar year
if "year" not in feat_df.columns:
    print("[info] 'year' column not found; skipping records-per-year plot.")
else:
    counts = feat_df["year"].value_counts().sort_index()
    fig, ax = plt.subplots()
    counts.plot(kind="bar", ax=ax)
    ax.set_title("Records per Year")
    ax.set_xlabel("Year")
    ax.set_ylabel("Number of Records")
    plt.show()


In [None]:

# Time series of raw consumption and its 12-reading rolling mean for one
# example meter (the first non-missing identifier found in feat_df).
id_col = "num_serie_contador" if "num_serie_contador" in feat_df.columns else ("polissa_id" if "polissa_id" in feat_df.columns else None)
if id_col is not None and "datetime" in feat_df.columns and "roll12_mean" in feat_df.columns:
    meter_ids = feat_df[id_col].dropna().unique()
    if len(meter_ids) > 0:
        meter_id = meter_ids[0]
        sub = feat_df[feat_df[id_col] == meter_id].copy()
        sub = sub.sort_values("datetime")
        by_time = sub.set_index("datetime")

        for column, label in (("consumption", "Consumption"), ("roll12_mean", "Roll12 Mean")):
            # Create the axes explicitly and hand them to pandas: DataFrame.plot()
            # opens a figure of its own when no `ax` is given, which would leave
            # a preceding plt.figure() behind as an empty extra figure.
            fig, ax = plt.subplots()
            by_time[[column]].plot(ax=ax, legend=True)
            ax.set_title(f"{label} over time — {id_col}={meter_id}")
            ax.set_xlabel("Datetime")
            ax.set_ylabel(label)
            plt.show()
    else:
        print("[info] No meter ids available for time series plot.")
else:
    print("[info] Missing id/time/roll12_mean for time series plots; skipping.")


In [None]:

# Share of missing values per column, showing the 20 most incomplete ones
na_ratios = feat_df.isna().mean().sort_values(ascending=False).head(20)

fig, ax = plt.subplots()
na_ratios.plot(kind="bar", ax=ax)
ax.set_title("Top-20 Features by Missingness Ratio")
ax.set_xlabel("Feature")
ax.set_ylabel("NA Ratio")
# Slant the feature names so long labels stay readable
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
fig.tight_layout()
plt.show()


In [None]:

# Bar chart of label values (missing labels are counted as their own bar)
if "y_anom" not in feat_df.columns:
    print("[info] 'y_anom' not found; skipping label distribution plot.")
else:
    vc = feat_df["y_anom"].value_counts(dropna=False).sort_index()
    fig, ax = plt.subplots()
    vc.plot(kind="bar", ax=ax)
    ax.set_title("Label Distribution: y_anom")
    ax.set_xlabel("y_anom")
    ax.set_ylabel("Count")
    plt.show()



## Optional Diagnostics
Helps explain missing correlations (constant labels / zero-variance features).


In [None]:

# Label sanity: how many rows carry each label value (NaN included)
if "y_anom" in feat_df.columns:
    label_counts = feat_df["y_anom"].value_counts(dropna=False).to_dict()
    print("y_anom counts:", label_counts)

# Numeric columns, then those among them whose standard deviation is exactly 0
num_cols = [name for name, dtype in feat_df.dtypes.items() if pd.api.types.is_numeric_dtype(dtype)]
zero_var = []
for name in num_cols:
    if feat_df[name].std(skipna=True) == 0:
        zero_var.append(name)
print(f"Zero-variance numeric features: {len(zero_var)}")
if zero_var:
    print(zero_var[:20], "...")

# Share of missing values per numeric feature, 15 highest
na_burden = feat_df[num_cols].isna().mean().sort_values(ascending=False).head(15)
print("Top-15 NA ratios among numeric features:")
display(na_burden)
