# 04 — CRUD Validation Tests

**Objective**: Validate RAW + SILVER zones with CRUD operations

**Tests**: Create, Read, Schema validation, Data quality checks (Update and Delete are exercised in the CRUD SQLModel section below)

**Duration**: ~5 min

In [None]:
import pandas as pd
from datetime import datetime
from sqlalchemy import create_engine, text

# Both zones are SQLite files; the three-slash URL form is a path relative to
# the notebook's working directory.
RAW_DB_URL = 'sqlite:///datasens.db'
SILVER_DB_URL = 'sqlite:///datasens_cleaned.db'

RAW = create_engine(RAW_DB_URL)        # RAW zone: ingested articles
SILVER = create_engine(SILVER_DB_URL)  # SILVER zone: cleaned articles

print(" Setup")

In [None]:
# Test 1: CREATE (insert a test article into the RAW zone)
# The dict keys double as the bind-parameter names of the INSERT below, so the
# two must stay in sync: a key with no matching ":name" placeholder (or the
# reverse) makes the statement fail at execution time.
test_article = {
    'source_id': 1,
    'title': 'TEST: Article validation',
    'content': 'This is a test article to validate CRUD operations.',
    'url': 'https://example.com/test',
    'published_at': datetime.now(),
    'collected_at': datetime.now(),
    'fingerprint': 'test_fp_12345'
}

with RAW.connect() as conn:
    insert_result = conn.execute(text("""
        INSERT INTO raw_data (source_id, title, content, url, published_at, collected_at, fingerprint)
        VALUES (:source_id, :title, :content, :url, :published_at, :collected_at, :fingerprint)
    """), test_article)
    # Read the generated key from the INSERT's own cursor rather than issuing a
    # second query after the commit.
    test_id = insert_result.lastrowid
    conn.commit()

# NOTE(review): re-running this cell inserts the same fingerprint again; if
# raw_data enforces uniqueness on it the second run will fail — confirm.
print(f" CREATE: Inserted test article (ID={test_id})")

In [None]:
# Test 2: READ (row counts in both zones)
# Each query returns a one-row frame whose single 'cnt' cell is the row count.
raw_count_frame = pd.read_sql("SELECT COUNT(*) as cnt FROM raw_data", RAW)
silver_count_frame = pd.read_sql("SELECT COUNT(*) as cnt FROM raw_data_cleaned", SILVER)

raw_count = raw_count_frame.loc[0, 'cnt']
silver_count = silver_count_frame.loc[0, 'cnt']

print(f" READ: RAW zone: {raw_count} articles | SILVER zone: {silver_count} articles")

In [None]:
# Test 3: Schema Validation
# Counts the tables listed in sqlite_master for each zone, skipping names that
# match the 'sqlite_%' pattern. The same statement is run against both engines.
user_table_count_sql = "SELECT COUNT(*) as cnt FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'"

tables_raw = pd.read_sql(user_table_count_sql, RAW)['cnt'][0]
tables_silver = pd.read_sql(user_table_count_sql, SILVER)['cnt'][0]

print(f" SCHEMA: RAW {tables_raw} tables | SILVER {tables_silver} tables")

In [None]:
# Test 4: Data Quality Checks
# One aggregate row over the SILVER table: total rows, rows scoring >= 0.8,
# rows flagged as duplicates, and the rounded mean quality score.
quality_sql = """
    SELECT 
        COUNT(*) as total,
        COUNT(CASE WHEN quality_score >= 0.8 THEN 1 END) as high_quality,
        COUNT(CASE WHEN is_duplicate THEN 1 END) as duplicates,
        ROUND(AVG(quality_score), 3) as avg_quality
    FROM raw_data_cleaned
"""
quality_stats = pd.read_sql(quality_sql, SILVER)

print(" QUALITY CHECKS:")
# Iterate column by column so every value keeps its own dtype when printed.
for metric_name, metric_values in quality_stats.items():
    print(f"   {metric_name}: {metric_values[0]}")

In [None]:
# Test 5: Final Validation
# Summary banner only: this cell reports the figures computed by the earlier
# cells (raw_count, silver_count, quality_stats); it performs no checks itself.
avg_quality = quality_stats['avg_quality'][0]
# AVG() yields NULL when raw_data_cleaned is empty; formatting None with
# ':.1%' raises TypeError, so fall back to a placeholder in that case.
avg_quality_text = f"{avg_quality:.1%}" if pd.notna(avg_quality) else "n/a"

print("\n" + "="*60)
print(" ALL TESTS PASSED")
print("="*60)
print("\n E1 Pipeline Status:")
# NOTE(review): "10 sources" is a hard-coded figure, not a value read from the
# database — confirm it still matches the pipeline.
print(f"   RAW zone: {raw_count} articles (10 sources)")
print(f"   SILVER zone: {silver_count} cleaned articles")
print(f"   Quality: {avg_quality_text} average")
print(f"   Partitioned:  partition_date created")
print(f"\n Next: Run 05_snapshot_parquet.ipynb (Phase 5)")

# DataSens E1 — CRUD SQLModel (6 tables)

**Tables :**
- SOURCE
- RAW_DATA
- SYNC_LOG
- TOPIC
- DOCUMENT_TOPIC (association RAW_DATA ↔ TOPIC)
- MODEL_OUTPUT (sorties IA + confidence)

In [None]:
import os
from datetime import datetime
from typing import Optional, List
from sqlmodel import SQLModel, Field, Relationship, create_engine, Session, select
from sqlalchemy import UniqueConstraint

print(" Imports ready")

## Modèles SQLModel (6 tables avec Relationships)

In [None]:
class Source(SQLModel, table=True):
    """A data source, mapped to the ``source`` table.

    Parent side of two one-to-many relationships: the raw items collected
    from the source and the sync-log entries recorded for it.
    """
    __tablename__ = "source"
    # Primary key; None on construction so a value can be assigned on insert.
    source_id: Optional[int] = Field(default=None, primary_key=True)
    # Source name (declared max length 150, indexed).
    name: str = Field(max_length=150, index=True)
    # Optional URL of the source.
    url: Optional[str] = None
    # Optional frequency label (declared max length 50).
    frequency: Optional[str] = Field(default=None, max_length=50)
    # Active flag; defaults to True.
    active: bool = Field(default=True)
    # RawData rows whose ``source`` relationship points back here.
    raw_items: List["RawData"] = Relationship(back_populates="source")
    # SyncLog rows whose ``source`` relationship points back here.
    sync_logs: List["SyncLog"] = Relationship(back_populates="source")


class RawData(SQLModel, table=True):
    """A collected document, mapped to the ``raw_data`` table.

    Belongs to one Source and is the parent of its topic links
    (DocumentTopic) and model outputs (ModelOutput).
    """
    __tablename__ = "raw_data"
    # Primary key; None on construction so a value can be assigned on insert.
    raw_id: Optional[int] = Field(default=None, primary_key=True)
    # Required foreign key to source.source_id (indexed).
    source_id: int = Field(foreign_key="source.source_id", index=True)
    # Optional title.
    title: Optional[str] = None
    # Required document text.
    text: str
    # Defaults to datetime.utcnow() (a naive datetime) when not supplied; indexed.
    created_date: datetime = Field(default_factory=datetime.utcnow, index=True)
    # Owning Source (other side: Source.raw_items).
    source: Optional[Source] = Relationship(back_populates="raw_items")
    # Topic links for this document (other side: DocumentTopic.raw_data).
    topics: List["DocumentTopic"] = Relationship(back_populates="raw_data")
    # Model outputs for this document (other side: ModelOutput.raw_data).
    outputs: List["ModelOutput"] = Relationship(back_populates="raw_data")


class SyncLog(SQLModel, table=True):
    """One synchronisation record for a source, mapped to the ``sync_log`` table."""
    __tablename__ = "sync_log"
    # Primary key; None on construction so a value can be assigned on insert.
    log_id: Optional[int] = Field(default=None, primary_key=True)
    # Required foreign key to source.source_id (indexed).
    source_id: int = Field(foreign_key="source.source_id", index=True)
    # Defaults to datetime.utcnow() (a naive datetime) when not supplied; indexed.
    sync_date: datetime = Field(default_factory=datetime.utcnow, index=True)
    # Status label (declared max length 30).
    status: str = Field(max_length=30)
    # Number of records inserted; defaults to 0.
    records_inserted: int = Field(default=0)
    # Owning Source (other side: Source.sync_logs).
    source: Optional[Source] = Relationship(back_populates="sync_logs")


class Topic(SQLModel, table=True):
    """A topic label, mapped to the ``topic`` table; names are unique."""
    __tablename__ = "topic"
    # Unique constraint on the topic name, named uq_topic_name.
    __table_args__ = (UniqueConstraint("name", name="uq_topic_name"),)
    # Primary key; None on construction so a value can be assigned on insert.
    topic_id: Optional[int] = Field(default=None, primary_key=True)
    # Topic name (declared max length 120, indexed).
    name: str = Field(max_length=120, index=True)
    # Links to documents tagged with this topic (other side: DocumentTopic.topic).
    raw_links: List["DocumentTopic"] = Relationship(back_populates="topic")


class DocumentTopic(SQLModel, table=True):
    """Association table RAW_DATA <-> TOPIC (composite primary key)."""
    __tablename__ = "document_topic"
    # First half of the composite primary key; foreign key to raw_data.raw_id.
    raw_id: int = Field(foreign_key="raw_data.raw_id", primary_key=True)
    # Second half of the composite primary key; foreign key to topic.topic_id.
    topic_id: int = Field(foreign_key="topic.topic_id", primary_key=True)
    # Optional relevance score for this document/topic pair.
    relevance_score: Optional[float] = Field(default=None)
    # Linked document (other side: RawData.topics).
    raw_data: Optional[RawData] = Relationship(back_populates="topics")
    # Linked topic (other side: Topic.raw_links).
    topic: Optional[Topic] = Relationship(back_populates="raw_links")


class ModelOutput(SQLModel, table=True):
    """A model output attached to one RawData row, mapped to ``model_output``."""
    __tablename__ = "model_output"
    # Primary key; None on construction so a value can be assigned on insert.
    output_id: Optional[int] = Field(default=None, primary_key=True)
    # Required foreign key to raw_data.raw_id (indexed).
    raw_id: int = Field(foreign_key="raw_data.raw_id", index=True)
    # Task name (declared max length 50, indexed).
    task: str = Field(max_length=50, index=True)
    # Optional label (declared max length 120).
    label: Optional[str] = Field(default=None, max_length=120)
    # Optional float values; how score and confidence differ is not defined here.
    score: Optional[float] = Field(default=None)
    confidence: Optional[float] = Field(default=None)
    # Name of the model that produced the output (declared max length 150).
    model_name: str = Field(max_length=150)
    # Optional model version string (declared max length 50).
    model_version: Optional[str] = Field(default=None, max_length=50)
    # Defaults to datetime.utcnow() (a naive datetime) when not supplied; indexed.
    created_at: datetime = Field(default_factory=datetime.utcnow, index=True)
    # Document this output belongs to (other side: RawData.outputs).
    raw_data: Optional[RawData] = Relationship(back_populates="outputs")

print(" All 6 models defined")

## Engine + Création des tables

In [None]:
from pathlib import Path

# Resolve the database URL: an explicit DATASENS_DB_URL wins; otherwise fall
# back to a SQLite file under the user's home directory.
DB_URL = os.getenv("DATASENS_DB_URL")

if not DB_URL:
    DB_PATH = Path.home() / "datasens_project" / "datasens_e1_v1.sqlite"
    # SQLite creates the database file on demand but not its parent directory,
    # so make sure the folder exists before the first connection is opened.
    DB_PATH.parent.mkdir(parents=True, exist_ok=True)
    DB_URL = f"sqlite:///{DB_PATH}"

engine = create_engine(DB_URL, echo=False)
# Creates every table registered on SQLModel.metadata that does not exist yet.
SQLModel.metadata.create_all(engine)

print(f" Tables created on: {DB_URL}")

## Session Helper

In [None]:
def get_session() -> Session:
    """Open a new Session bound to the notebook-level ``engine``.

    The caller is responsible for closing it, typically with a ``with`` block.
    """
    return Session(bind=engine)

## CRUD SOURCE

In [None]:
def create_source(name: str, url: Optional[str] = None, frequency: Optional[str] = None, active: bool = True) -> Source:
    """Insert a new SOURCE row and return it with its generated key loaded."""
    new_source = Source(name=name, url=url, frequency=frequency, active=active)
    with get_session() as session:
        session.add(new_source)
        session.commit()
        # Reload the row so its attributes stay readable once the session closes.
        session.refresh(new_source)
    return new_source

def get_source(source_id: int) -> Optional[Source]:
    """Fetch one SOURCE row by primary key, or None when it does not exist."""
    with get_session() as session:
        found = session.get(Source, source_id)
    return found

def list_sources(active_only: bool = False) -> list[Source]:
    """Return all SOURCE rows, optionally restricted to active ones."""
    query = select(Source)
    if active_only:
        # "== True" is intentional: it builds a SQL comparison, not a Python bool.
        query = query.where(Source.active == True)
    with get_session() as session:
        return [row for row in session.exec(query)]

def update_source(source_id: int, **patch) -> Optional[Source]:
    """Apply ``patch`` to a SOURCE row and return the refreshed row.

    Returns None when no row has that primary key. Keys that are not
    attributes of the row, and keys whose value is None, are skipped — so a
    field cannot be reset to None through this function.
    """
    with get_session() as session:
        record = session.get(Source, source_id)
        if record is None:
            return None
        for field_name, new_value in patch.items():
            if not hasattr(record, field_name) or new_value is None:
                continue
            setattr(record, field_name, new_value)
        session.add(record)
        session.commit()
        session.refresh(record)
        return record

def delete_source(source_id: int) -> bool:
    """Delete a SOURCE row; return True if it existed, False otherwise.

    NOTE(review): the relationships declare no cascade, so deleting a source
    that still has raw items or sync logs may fail — confirm.
    """
    with get_session() as session:
        target = session.get(Source, source_id)
        if target is None:
            return False
        session.delete(target)
        session.commit()
    return True

print(" SOURCE CRUD defined")

## CRUD RAW_DATA

In [None]:
def create_raw_data(source_id: int, text: str, title: Optional[str] = None, created_date: Optional[datetime] = None) -> RawData:
    """Insert a RAW_DATA row and return it with its generated key loaded.

    ``created_date`` falls back to ``datetime.utcnow()`` when not given.
    """
    timestamp = created_date if created_date else datetime.utcnow()
    document = RawData(source_id=source_id, text=text, title=title, created_date=timestamp)
    with get_session() as session:
        session.add(document)
        session.commit()
        # Reload the row so its attributes stay readable once the session closes.
        session.refresh(document)
    return document

def get_raw_data(raw_id: int) -> Optional[RawData]:
    """Fetch one RAW_DATA row by primary key, or None when it does not exist."""
    with get_session() as session:
        found = session.get(RawData, raw_id)
    return found

def list_raw_data(limit: int = 50, source_id: Optional[int] = None) -> list[RawData]:
    """Return up to ``limit`` RAW_DATA rows, newest ``created_date`` first.

    When ``source_id`` is given, only rows of that source are returned.
    """
    query = select(RawData)
    if source_id is not None:
        query = query.where(RawData.source_id == source_id)
    query = query.order_by(RawData.created_date.desc()).limit(limit)
    with get_session() as session:
        return [row for row in session.exec(query)]

def update_raw_data(raw_id: int, **patch) -> Optional[RawData]:
    """Apply ``patch`` to a RAW_DATA row and return the refreshed row.

    Returns None when no row has that primary key. Keys that are not
    attributes of the row, and keys whose value is None, are skipped — so a
    field cannot be reset to None through this function.
    """
    with get_session() as session:
        record = session.get(RawData, raw_id)
        if record is None:
            return None
        for field_name, new_value in patch.items():
            if not hasattr(record, field_name) or new_value is None:
                continue
            setattr(record, field_name, new_value)
        session.add(record)
        session.commit()
        session.refresh(record)
        return record

def delete_raw_data(raw_id: int) -> bool:
    """Delete a RAW_DATA row; return True if it existed, False otherwise.

    NOTE(review): the relationships declare no cascade, so deleting a document
    that still has topic links or model outputs may fail — confirm.
    """
    with get_session() as session:
        target = session.get(RawData, raw_id)
        if target is None:
            return False
        session.delete(target)
        session.commit()
    return True

print(" RAW_DATA CRUD defined")

## CRUD SYNC_LOG

In [None]:
def create_sync_log(source_id: int, status: str, records_inserted: int = 0, sync_date: Optional[datetime] = None) -> SyncLog:
    """Insert a SYNC_LOG row and return it with its generated key loaded.

    ``sync_date`` falls back to ``datetime.utcnow()`` when not given.
    """
    timestamp = sync_date if sync_date else datetime.utcnow()
    entry = SyncLog(source_id=source_id, status=status, records_inserted=records_inserted, sync_date=timestamp)
    with get_session() as session:
        session.add(entry)
        session.commit()
        # Reload the row so its attributes stay readable once the session closes.
        session.refresh(entry)
    return entry

def list_sync_logs(source_id: Optional[int] = None, limit: int = 50) -> list[SyncLog]:
    """Return up to ``limit`` SYNC_LOG rows, newest ``sync_date`` first.

    When ``source_id`` is given, only logs of that source are returned.
    """
    query = select(SyncLog)
    if source_id is not None:
        query = query.where(SyncLog.source_id == source_id)
    query = query.order_by(SyncLog.sync_date.desc()).limit(limit)
    with get_session() as session:
        return [row for row in session.exec(query)]

def delete_sync_log(log_id: int) -> bool:
    """Delete a SYNC_LOG row; return True if it existed, False otherwise."""
    with get_session() as session:
        target = session.get(SyncLog, log_id)
        if target is None:
            return False
        session.delete(target)
        session.commit()
    return True

print(" SYNC_LOG CRUD defined")

## CRUD TOPIC + get_or_create

In [None]:
def create_topic(name: str) -> Topic:
    """Insert a TOPIC row and return it with its generated key loaded.

    The table has a unique constraint on ``name``; see ``get_or_create_topic``
    for a variant that reuses an existing row.
    """
    new_topic = Topic(name=name)
    with get_session() as session:
        session.add(new_topic)
        session.commit()
        # Reload the row so its attributes stay readable once the session closes.
        session.refresh(new_topic)
    return new_topic

def get_topic(topic_id: int) -> Optional[Topic]:
    """Fetch one TOPIC row by primary key, or None when it does not exist."""
    with get_session() as session:
        found = session.get(Topic, topic_id)
    return found

def get_or_create_topic(name: str) -> Topic:
    """Return the TOPIC row named ``name``, inserting it first if it is missing."""
    by_name = select(Topic).where(Topic.name == name)
    with get_session() as session:
        topic = session.exec(by_name).first()
        if topic is None:
            # No row with that name yet: create it and load its generated key.
            topic = Topic(name=name)
            session.add(topic)
            session.commit()
            session.refresh(topic)
        return topic

def list_topics(limit: int = 200) -> list[Topic]:
    """Return up to ``limit`` TOPIC rows ordered by name."""
    query = select(Topic).order_by(Topic.name).limit(limit)
    with get_session() as session:
        rows = session.exec(query)
        return list(rows)

def delete_topic(topic_id: int) -> bool:
    """Delete a TOPIC row; return True if it existed, False otherwise.

    NOTE(review): the relationships declare no cascade, so deleting a topic
    that is still linked to documents may fail — confirm.
    """
    with get_session() as session:
        target = session.get(Topic, topic_id)
        if target is None:
            return False
        session.delete(target)
        session.commit()
    return True

print(" TOPIC CRUD defined (with get_or_create)")

## CRUD DOCUMENT_TOPIC (liaison RAW_DATA ↔ TOPIC)

In [None]:
def tag_raw_data_with_topic(raw_id: int, topic_id: int, relevance_score: Optional[float] = None) -> DocumentTopic:
    """Link a document to a topic, creating or updating the association row.

    If the (raw_id, topic_id) link already exists, its relevance score is
    overwritten only when a non-None ``relevance_score`` is passed. Otherwise
    a new link is inserted. The refreshed link is returned in both cases.
    """
    with get_session() as session:
        link = session.get(DocumentTopic, (raw_id, topic_id))
        if link is None:
            link = DocumentTopic(raw_id=raw_id, topic_id=topic_id, relevance_score=relevance_score)
        elif relevance_score is not None:
            link.relevance_score = relevance_score
        # Same persistence steps whether the link is new or pre-existing.
        session.add(link)
        session.commit()
        session.refresh(link)
        return link

def list_topics_for_raw(raw_id: int) -> list[DocumentTopic]:
    """Return every DOCUMENT_TOPIC link attached to the given document."""
    links_query = select(DocumentTopic).where(DocumentTopic.raw_id == raw_id)
    with get_session() as session:
        return [link for link in session.exec(links_query)]

def untag_raw_data(raw_id: int, topic_id: int) -> bool:
    """Remove the link between a document and a topic.

    Returns True if the link existed and was deleted, False otherwise.
    """
    with get_session() as session:
        link = session.get(DocumentTopic, (raw_id, topic_id))
        if link is None:
            return False
        session.delete(link)
        session.commit()
    return True

print(" DOCUMENT_TOPIC CRUD defined (association M:N)")

## CRUD MODEL_OUTPUT (sortie IA + confidence)

In [None]:
def create_model_output(
    raw_id: int,
    task: str,
    label: Optional[str],
    score: Optional[float],
    confidence: Optional[float],
    model_name: str,
    model_version: Optional[str] = None,
    created_at: Optional[datetime] = None,
) -> ModelOutput:
    """Insert a MODEL_OUTPUT row and return it with its generated key loaded.

    ``created_at`` falls back to ``datetime.utcnow()`` when not given.
    """
    timestamp = created_at if created_at else datetime.utcnow()
    output = ModelOutput(
        raw_id=raw_id,
        task=task,
        label=label,
        score=score,
        confidence=confidence,
        model_name=model_name,
        model_version=model_version,
        created_at=timestamp,
    )
    with get_session() as session:
        session.add(output)
        session.commit()
        # Reload the row so its attributes stay readable once the session closes.
        session.refresh(output)
    return output

def list_model_outputs(raw_id: Optional[int] = None, task: Optional[str] = None, limit: int = 200) -> list[ModelOutput]:
    """Return up to ``limit`` MODEL_OUTPUT rows, newest ``created_at`` first.

    ``raw_id`` and ``task`` are optional filters; when both are given, rows
    must match both.
    """
    query = select(ModelOutput)
    if raw_id is not None:
        query = query.where(ModelOutput.raw_id == raw_id)
    if task is not None:
        query = query.where(ModelOutput.task == task)
    query = query.order_by(ModelOutput.created_at.desc()).limit(limit)
    with get_session() as session:
        return [row for row in session.exec(query)]

def delete_model_output(output_id: int) -> bool:
    """Delete a MODEL_OUTPUT row; return True if it existed, False otherwise."""
    with get_session() as session:
        target = session.get(ModelOutput, output_id)
        if target is None:
            return False
        session.delete(target)
        session.commit()
    return True

print(" MODEL_OUTPUT CRUD defined (task + confidence)")

## SMOKE TEST — Insert + CRUD Full Cycle

In [None]:
# Smoke test: runs one create/list cycle across all six tables and asserts the
# result of every step, so a broken CRUD helper fails the cell instead of
# printing a misleading success line. Each run inserts fresh rows.
print("\n" + "="*60)
print(" SMOKE TEST — Complete CRUD Cycle")
print("="*60)

# 1) Create SOURCE
src = create_source(
    name="rss_franceinfo",
    url="https://www.francetvinfo.fr/rss/",
    frequency="daily",
    active=True
)
assert src.source_id is not None, "SOURCE insert returned no primary key"
print(f"\n CREATE SOURCE: {src.name} (id={src.source_id})")

# 2) Create RAW_DATA
raw = create_raw_data(
    source_id=src.source_id,
    title="Test DataSens E1",
    text="Les Français semblent plus optimistes après un événement sportif majeur."
)
assert raw.raw_id is not None, "RAW_DATA insert returned no primary key"
assert raw.source_id == src.source_id, "RAW_DATA is not attached to the new source"
print(f" CREATE RAW_DATA: '{raw.title}' (id={raw.raw_id})")

# 3) Log sync
log = create_sync_log(source_id=src.source_id, status="OK", records_inserted=1)
assert log.log_id is not None, "SYNC_LOG insert returned no primary key"
print(f" CREATE SYNC_LOG: status={log.status}, records={log.records_inserted}")

# 4) Create topics + tag
t_sport = get_or_create_topic("sport")
t_societe = get_or_create_topic("societe")
tag_raw_data_with_topic(raw.raw_id, t_sport.topic_id, relevance_score=0.92)
tag_raw_data_with_topic(raw.raw_id, t_societe.topic_id, relevance_score=0.55)
tags = list_topics_for_raw(raw.raw_id)
# The document was just created, so exactly the two links above must exist.
assert len(tags) == 2, f"expected 2 topic links, found {len(tags)}"
print(f" TAGS: {len(tags)} topics linked (sport={t_sport.name}, societe={t_societe.name})")

# 5) Model output
out = create_model_output(
    raw_id=raw.raw_id,
    task="sentiment",
    label="positive",
    score=0.87,
    confidence=0.81,
    model_name="camembert-sentiment",
    model_version="1.0"
)
assert out.output_id is not None, "MODEL_OUTPUT insert returned no primary key"
print(f" CREATE MODEL_OUTPUT: task={out.task}, label={out.label}, confidence={out.confidence}")

# 6) List all
print(f"\n LIST OPERATIONS:")
print(f"   - SOURCES: {len(list_sources())}")
print(f"   - RAW: {len(list_raw_data(limit=100))}")
print(f"   - SYNC_LOGS: {len(list_sync_logs(limit=100))}")
print(f"   - TOPICS: {len(list_topics())}")
print(f"   - OUTPUTS: {len(list_model_outputs(limit=100))}")

print("\n" + "="*60)

## Update + Delete (Preuve Complète)

In [None]:
# Update + delete pass over the rows created by the smoke-test cell
# (relies on raw, src, t_societe and out from that cell).
section_rule = "=" * 60

print("\n" + section_rule)
print(" UPDATE + DELETE TESTS")
print(section_rule)

# UPDATE: change one field on the document, then one on the source.
new_title = "Test DataSens E1 (updated)"
raw_updated = update_raw_data(raw.raw_id, title=new_title)
print(f"\n UPDATE RAW_DATA: new title = '{raw_updated.title}'")

new_frequency = "hourly"
src_updated = update_source(src.source_id, frequency=new_frequency)
print(f" UPDATE SOURCE: new frequency = {src_updated.frequency}")

# DELETE: remove one topic link, then the model output. Both helpers call
# session.delete, i.e. the rows are physically removed.
ok_untag = untag_raw_data(raw.raw_id, t_societe.topic_id)
print(f" UNTAG (remove societe): {ok_untag}")

ok_del_out = delete_model_output(out.output_id)
print(f" DELETE MODEL_OUTPUT: {ok_del_out}")

print("\n" + section_rule)
print(" CRUD COMPLETE — All 6 tables tested!")
print(section_rule)