# 03d — Data Cleaning Pipeline (10-Step)

**Objective**: RAW → SILVER (with partition_date + quality_score)

**Steps**:
1. Load RAW data + profile
2. Normalize + parse dates
3. Handle nulls + text cleaning
4. Quality scoring + filtering
5. Fingerprint dedup
6. Insert SILVER + partition_date
7. Log cleaning_audit

In [None]:
import pandas as pd, numpy as np
from datetime import datetime, timedelta
import hashlib, re
from sqlalchemy import create_engine, text
from dotenv import load_dotenv

# Pull settings from a local .env file (if one exists) into the environment.
load_dotenv()

# One SQLAlchemy engine per zone: RAW points at the source database and
# SILVER at the cleaned one. Both are SQLite files.
RAW, SILVER = (
    create_engine(db_url)
    for db_url in ('sqlite:///datasens.db', 'sqlite:///datasens_cleaned.db')
)

print(" Ready")

## Step 1-2: Load + Normalize

In [None]:
# Step 1-2: Load + Normalize
df = pd.read_sql("SELECT * FROM raw_data", RAW)

# Text columns: missing values become '' and surrounding whitespace is
# removed. Case is left untouched here.
for text_col in ('title', 'content'):
    df[text_col] = df[text_col].fillna('').str.strip()

# Dates: values that cannot be parsed become NaT instead of raising. The
# calendar date of each timestamp is stored separately as partition_date.
published = pd.to_datetime(df['published_at'], errors='coerce')
df['published_at'] = published
df['partition_date'] = published.dt.date

print(f" Loaded {len(df)} articles")

## Step 3-5: Quality Scoring + Dedup

In [None]:
def quality_score(row):
    """Score a row's completeness from its title and content lengths.

    Every row starts at 0.5. A title longer than 10 characters (after
    stripping whitespace) adds 0.2, and content longer than 50 characters
    adds 0.3, so the result lies between 0.5 and 1.0. Missing (NaN/None)
    values earn no bonus.

    Args:
        row: Mapping-like object with 'title' and 'content' entries.

    Returns:
        The score as a float, capped at 1.0.
    """
    def _longer_than(value, min_chars):
        # A field counts only if it is present and its stripped text
        # exceeds the minimum length.
        return pd.notna(value) and len(str(value).strip()) > min_chars

    total = 0.5  # baseline
    if _longer_than(row['title'], 10):
        total += 0.2
    if _longer_than(row['content'], 50):
        total += 0.3
    return min(total, 1.0)

def is_duplicate(row, seen):
    """Report whether a row's title/content pair has been seen before.

    The fingerprint is a SHA-256 digest of the lowercased title and content.
    Each field is length-prefixed before hashing so the boundary between the
    two is unambiguous: ('ab', 'c') and ('a', 'bc') get different
    fingerprints, whereas plain concatenation would treat them as the same
    article.

    Args:
        row: Mapping-like object with 'title' and 'content' entries.
        seen: Set of fingerprints encountered so far. A new fingerprint is
            added to it in place.

    Returns:
        True if the fingerprint was already in ``seen``, otherwise False.
    """
    digest = hashlib.sha256()
    for field in (row['title'], row['content']):
        encoded = str(field).lower().encode()
        # The length prefix keeps adjacent fields from bleeding into each other.
        digest.update(len(encoded).to_bytes(8, 'big'))
        digest.update(encoded)
    fp = digest.hexdigest()

    if fp in seen:
        return True
    seen.add(fp)
    return False

# Quality scoring
# Rows are scored in an explicit loop rather than df.apply(axis=1): on an
# empty frame apply() hands back a DataFrame instead of a Series, which
# cannot be assigned to a single column.
df['quality_score'] = pd.Series(
    [quality_score(row) for _, row in df.iterrows()],
    index=df.index,
    dtype=float,
)

# Dedup + filter
# `seen` collects fingerprints in row order, so the first occurrence of each
# title/content pair is kept and later repeats are flagged.
seen = set()
df['is_duplicate'] = pd.Series(
    [is_duplicate(row, seen) for _, row in df.iterrows()],
    index=df.index,
    dtype=bool,
)
# NOTE(review): quality_score() starts at 0.5, so the >= 0.5 cut-off below
# never drops a row — confirm the intended threshold.
df_clean = df[~df['is_duplicate'] & (df['quality_score'] >= 0.5)].copy()

print(f" Quality filtered: {len(df_clean)}/{len(df)} articles")
print(f"   - Min quality: {df_clean['quality_score'].min():.2f}")
print(f"   - Avg quality: {df_clean['quality_score'].mean():.2f}")

## Step 8-10: Summary & Validation

In [None]:
# Validation
# Per-source roll-up of the SILVER table: row count, rows scoring at least
# 0.8, mean score, and the number of distinct partition dates.
SUMMARY_SQL = """
    SELECT 
        s.name as source,
        COUNT(*) as total,
        COUNT(CASE WHEN quality_score >= 0.8 THEN 1 END) as high_quality,
        ROUND(AVG(quality_score), 3) as avg_quality,
        COUNT(DISTINCT partition_date) as date_range
    FROM raw_data_cleaned rc
    JOIN source s ON rc.source_id = s.source_id
    GROUP BY s.name
    ORDER BY total DESC
"""
summary = pd.read_sql(SUMMARY_SQL, SILVER)

print("\n SILVER Zone Statistics:")
print(summary.to_string(index=False))

# Overall row count of the cleaned table (the query returns a single row).
total = pd.read_sql("SELECT COUNT(*) as total FROM raw_data_cleaned", SILVER)
silver_rows = total['total'].iloc[0]
print(f"\n Total SILVER articles: {silver_rows}")
print(" Next: Run 04_crud_tests.ipynb")