# 01 – Iteration 3: Data Cleaning for Dataset 2 (Repte Consums Anòmals)

This notebook:

1. Loads **Dataset 2** from the Parquet file `data/Consum anomalies facturacio complet_anonymized.parquet`.
2. Saves a **raw CSV copy** in `data/derived/` for easier inspection and debugging.
3. Applies the Iteration 3 basic cleaning rules implemented in `iteration_3/src/cleaning.py` (`apply_basic_cleaning`).
4. Saves the **cleaned dataset** (CSV + Parquet) and a JSON summary of cleaning actions in:
   - `iteration_3/results/cleaned/`

The cleaned dataset will be the starting point for:
- `02_iter3_feature_engineering.ipynb`
- `03_iter3_feature_selection.ipynb`
- `04_iter3_dataset_preparation.ipynb`

## Imports & paths

In [None]:
# Standard imports
import os
import json
import sys

import pandas as pd

# Expose the sibling src/ directory (one level above the current working
# directory) on sys.path so its modules can be imported by bare name.
# The guard keeps re-running this cell from adding duplicate entries.
SRC_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir, "src"))
if sys.path.count(SRC_DIR) == 0:
    sys.path.append(SRC_DIR)

from cleaning import load_dataset2, apply_basic_cleaning, save_cleaned_outputs

# The project root is resolved two directory levels above the current
# working directory; the shared data folders hang off it.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
DATA_DIR = os.path.join(PROJECT_ROOT, "data")
DERIVED_DIR = os.path.join(DATA_DIR, "derived")

# Create data/derived/ (including missing parents); no error if it exists.
os.makedirs(DERIVED_DIR, exist_ok=True)

# Echo the resolved locations so a wrong working directory is easy to spot.
for _label, _path in (
    ("PROJECT_ROOT:", PROJECT_ROOT),
    ("DATA_DIR    :", DATA_DIR),
    ("DERIVED_DIR :", DERIVED_DIR),
):
    print(_label, _path)


## Define dataset paths

In [None]:
# File name of the Dataset 2 Parquet export, looked up directly under DATA_DIR
RAW_PARQUET_NAME = "Consum anomalies facturacio complet_anonymized.parquet"
RAW_PARQUET_PATH = os.path.join(DATA_DIR, RAW_PARQUET_NAME)

# Where the raw CSV mirror (for manual inspection) is written
RAW_CSV_DERIVED_PATH = os.path.join(DERIVED_DIR, "dataset2_raw_iter3_from_parquet.csv")

# Cleaned outputs go to results/cleaned/, one level above the working directory
CLEANED_DIR = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, "results", "cleaned")
)
os.makedirs(CLEANED_DIR, exist_ok=True)

# Echo the resolved locations before any data is touched
for _label, _path in (
    ("RAW_PARQUET_PATH      :", RAW_PARQUET_PATH),
    ("RAW_CSV_DERIVED_PATH  :", RAW_CSV_DERIVED_PATH),
    ("CLEANED_DIR           :", CLEANED_DIR),
):
    print(_label, _path)

# Fail fast with an explicit message when the input file is missing
if not os.path.exists(RAW_PARQUET_PATH):
    raise FileNotFoundError(f"Raw parquet file not found: {RAW_PARQUET_PATH}")


## Load raw parquet & basic inspection

In [None]:
# 1) Read the raw Dataset 2 Parquet file through the project loader
df_raw = load_dataset2(RAW_PARQUET_PATH)

# Report the size, then preview the first rows
print("Raw dataset loaded.\nShape:", df_raw.shape)
display(df_raw.head())


## Save raw CSV copy in data/derived/

In [None]:
# 2) Mirror the untouched raw data as CSV (index column omitted) so it can be
#    opened and inspected without Parquet tooling
df_raw.to_csv(RAW_CSV_DERIVED_PATH, index=False)
print("[ok] Saved raw CSV copy to: {}".format(RAW_CSV_DERIVED_PATH))


## Apply cleaning rules (using cleaning.py)

In [None]:
# 3) Run the Iteration 3 basic cleaning rules on the raw data.
#    apply_basic_cleaning returns the cleaned data together with a summary of
#    the changes it made (assumed to be a dict-like mapping -- TODO confirm).
df_clean, cleaning_changes = apply_basic_cleaning(df_raw)

print("Cleaned dataset created.")
print("Cleaned shape:", df_clean.shape)

# Sanity checks: preview the cleaned rows, then list each top-level summary entry
display(df_clean.head())
print("\nCleaning summary (first-level keys):")
for change_key, change_value in cleaning_changes.items():
    print(f"  - {change_key}: {change_value}")


## Save cleaned dataset + JSON summary

In [None]:
# 4) Write the cleaned dataset and the cleaning summary to CLEANED_DIR
BASE_NAME = "dataset2_cleaned_iter3"

save_cleaned_outputs(
    clean_df=df_clean, changes=cleaning_changes, out_dir=CLEANED_DIR, base_name=BASE_NAME
)

# Rebuild the output locations for display only.
# NOTE(review): these file names are assumed to match what save_cleaned_outputs
# writes -- confirm against src/cleaning.py.
clean_csv_path, clean_parquet_path, clean_json_path = (
    os.path.join(CLEANED_DIR, BASE_NAME + suffix)
    for suffix in (".csv", ".parquet", "_cleaning_changes.json")
)

print("\nFinal outputs:")
print("  Clean CSV   :", clean_csv_path)
print("  Clean Parquet:", clean_parquet_path)
print("  Summary JSON :", clean_json_path)



## Quick descriptive stats (for documentation)

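The cell below computes a few basic statistics useful for the Iteration 3 report:
- number of rows / columns after cleaning
- value counts (top 20, missing values included) of the first column whose name contains "anom", if any such column exists
- number of flagged rows for negative / zero consumption and for the `flag_anom_32768` / `flag_anom_163840` flags, for whichever of those columns are present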


In [None]:
print("Cleaned dataset shape:", df_clean.shape)

# Collect every column whose name mentions "anom" (case-insensitive)
anom_cols = list(filter(lambda name: "anom" in name.lower(), df_clean.columns))
print("Anomaly-related columns:", anom_cols)

# Show the distribution (NaN included, top 20 values) of the first match only
if len(anom_cols) > 0:
    print("\nValue counts for first anomaly column:")
    first_anomaly_counts = df_clean[anom_cols[0]].value_counts(dropna=False)
    display(first_anomaly_counts.head(20))

# Report the column sum for each known flag column that is present;
# presumably these are boolean flags, so the sum is the number of True rows.
for col in (
    "flag_negative_consumption",
    "flag_zero_consumption",
    "flag_anom_32768",
    "flag_anom_163840",
):
    if col not in df_clean.columns:
        continue
    print(f"{col}: {df_clean[col].sum()} rows True")
