## Setup & Database Initialization

In [None]:
from datetime import datetime, timezone
from pathlib import Path

from sqlalchemy import (
    Boolean,
    CheckConstraint,
    Column,
    Date,
    DateTime,
    Float,
    ForeignKey,
    Index,
    Integer,
    String,
    Text,
    UniqueConstraint,
    create_engine,
    inspect,
)
from sqlalchemy.orm import Session, declarative_base

# Project data directory under the user's home; created on first run.
DATA_PATH = Path.home().joinpath("datasens_project")
DATA_PATH.mkdir(parents=True, exist_ok=True)

# SQLite file that backs the RAW zone.
RAW_DB_PATH = DATA_PATH.joinpath("datasens.db")

# as_posix() yields forward slashes, so the URL is also valid on Windows.
RAW_DATABASE_URL = "sqlite:///" + RAW_DB_PATH.as_posix()

# check_same_thread=False switches off the sqlite3 driver's check that a
# connection is only used by the thread that created it; it does not by itself
# make concurrent access safe.
# NOTE(review): SQLite enforces FOREIGN KEY constraints only when
# "PRAGMA foreign_keys=ON" is issued per connection; nothing in this cell
# does that - confirm whether enforcement is wanted.
raw_engine = create_engine(
    RAW_DATABASE_URL,
    connect_args={"check_same_thread": False},
    echo=False,
)

print("âœ… RAW engine initialized: " + RAW_DATABASE_URL)

# Shared declarative base that every table model in this file inherits from.
Base = declarative_base()

## Table Definitions (6 Core Tables)

In [None]:
# 1. SOURCE - Registry of news sources
class Source(Base):
    """Registry row describing one news source (table ``source``).

    ``created_at`` is filled in automatically on INSERT when the caller does
    not supply a value, so rows created from plain
    ``Source(name=..., source_type=..., url=...)`` calls no longer end up with
    a NULL creation time.
    """

    __tablename__ = "source"

    source_id = Column(Integer, primary_key=True, autoincrement=True)
    name = Column(String(100), unique=True, nullable=False)
    source_type = Column(String(50), nullable=False)  # RSS, API, WEB
    url = Column(String(500))
    sync_frequency = Column(String(50), default='DAILY')
    # No default: stays NULL unless a caller sets it.
    last_sync_date = Column(DateTime)
    retry_policy = Column(String(50), default='SKIP')
    active = Column(Boolean, default=True)
    # Python-side default evaluated by SQLAlchemy at INSERT time. The column
    # is a plain (timezone-less) DateTime, so the value is stored as naive UTC.
    created_at = Column(
        DateTime,
        default=lambda: datetime.now(timezone.utc).replace(tzinfo=None),
    )

# 2. RAW_DATA - Core fact table (all ingested records)
class RawData(Base):
    """One ingested record (table ``raw_data``), linked to its ``source`` row.

    ``collected_at`` defaults to the insert time when the caller does not
    provide it, so the indexed collection timestamp is never silently NULL.
    """

    __tablename__ = "raw_data"

    raw_data_id = Column(Integer, primary_key=True, autoincrement=True)
    source_id = Column(Integer, ForeignKey('source.source_id'), nullable=False, index=True)
    title = Column(String(500), nullable=False)
    content = Column(Text, nullable=False)
    url = Column(String(500))
    fingerprint = Column(String(64), unique=True)  # SHA256 for deduplication
    # Supplied by the caller; no default, so it stays NULL when not given.
    published_at = Column(DateTime)
    # Python-side default evaluated by SQLAlchemy at INSERT time. The column
    # is a plain (timezone-less) DateTime, so the value is stored as naive UTC.
    collected_at = Column(
        DateTime,
        index=True,
        default=lambda: datetime.now(timezone.utc).replace(tzinfo=None),
    )
    quality_score = Column(Float, default=0.5)

# 3. SYNC_LOG - Audit trail for ingestion
class SyncLog(Base):
    """Audit row for one ingestion run of a source (table ``sync_log``).

    ``sync_date`` defaults to the insert time when the caller does not provide
    it, so an audit entry always carries a timestamp.
    """

    __tablename__ = "sync_log"

    sync_log_id = Column(Integer, primary_key=True, autoincrement=True)
    source_id = Column(Integer, ForeignKey('source.source_id'), nullable=False, index=True)
    # Python-side default evaluated by SQLAlchemy at INSERT time. The column
    # is a plain (timezone-less) DateTime, so the value is stored as naive UTC.
    sync_date = Column(
        DateTime,
        index=True,
        default=lambda: datetime.now(timezone.utc).replace(tzinfo=None),
    )
    rows_synced = Column(Integer, default=0)
    status = Column(String(50), nullable=False)  # success, partial, failed
    error_message = Column(Text)

# 4. TOPIC - Semantic categories
class Topic(Base):
    """A named semantic category (table ``topic``) that records can be tagged with."""

    __tablename__ = "topic"

    topic_id = Column(Integer, autoincrement=True, primary_key=True)
    # Unique, mandatory label such as climate, politics, economy...
    name = Column(String(100), nullable=False, unique=True)
    keywords = Column(String(500))
    category = Column(String(50))
    # New topics are active unless the caller says otherwise.
    active = Column(Boolean, default=True)

# 5. DOCUMENT_TOPIC - M:N relationship (articles ↔ topics)
class DocumentTopic(Base):
    """Association row linking one ``raw_data`` record to one ``topic``.

    Besides the two foreign keys, each row stores a confidence score and the
    name of the tagger that produced the assignment.
    """

    __tablename__ = "document_topic"

    doc_topic_id = Column(Integer, autoincrement=True, primary_key=True)
    # NOTE(review): there is no uniqueness constraint on
    # (raw_data_id, topic_id), so the same pair can be stored more than once -
    # confirm that this is intended.
    raw_data_id = Column(
        Integer,
        ForeignKey("raw_data.raw_data_id"),
        index=True,
        nullable=False,
    )
    topic_id = Column(
        Integer,
        ForeignKey("topic.topic_id"),
        index=True,
        nullable=False,
    )
    confidence_score = Column(Float, default=0.5)
    tagger = Column(String(100))  # FlauBERT, Zero-shot, etc.

# 6. MODEL_OUTPUT - AI predictions (sentiment, topic detection)
class ModelOutput(Base):
    """One model prediction attached to a ``raw_data`` record (table ``model_output``).

    ``created_at`` defaults to the insert time when the caller does not
    provide it, so every stored prediction carries a timestamp.
    """

    __tablename__ = "model_output"

    output_id = Column(Integer, primary_key=True, autoincrement=True)
    raw_data_id = Column(Integer, ForeignKey('raw_data.raw_data_id'), nullable=False, index=True)
    model_name = Column(String(100))  # flaubert-base, zero-shot, etc.
    label = Column(String(100))  # sentiment, bias, etc.
    score = Column(Float, default=0.5)
    # Python-side default evaluated by SQLAlchemy at INSERT time. The column
    # is a plain (timezone-less) DateTime, so the value is stored as naive UTC.
    created_at = Column(
        DateTime,
        default=lambda: datetime.now(timezone.utc).replace(tzinfo=None),
    )

# Status message only: the six model classes above are now declared on Base.
# Declaring them does not touch the database; the tables are created by the
# Base.metadata.create_all(...) call in the next cell.
print("âœ… 6 core table models defined")

## Create Tables in Database

In [None]:
# Emit CREATE TABLE for every model registered on Base; create_all leaves
# tables that already exist untouched.
Base.metadata.create_all(bind=raw_engine)
print("âœ… All tables created in RAW database")

# Read the table list back from the database to confirm what is present.
raw_inspector = inspect(raw_engine)
raw_tables = raw_inspector.get_table_names()

print("\nðŸ“Š RAW zone tables ({}):".format(len(raw_tables)))
for table_name in sorted(raw_tables):
    print("   âœ“ " + table_name)

print("\nâœ… Schema ready for data ingestion")

## Insert 10 News Sources

In [None]:
def insert_sources():
    """Seed the ``source`` table with the ten default news sources.

    The function is safe to re-run (for example when the notebook cell is
    executed a second time): sources whose ``name`` is already stored are
    skipped, instead of violating the unique constraint on ``Source.name``
    and aborting the whole batch with an ``IntegrityError``.

    Progress is printed only after the commit has succeeded, so a printed
    check mark always corresponds to a row that is actually in the database.

    Returns:
        None. Results are reported on stdout.
    """
    sources_data = [
        {"name": "Le Monde RSS", "source_type": "RSS", "url": "https://www.lemonde.fr/rss"},
        {"name": "France Info", "source_type": "RSS", "url": "https://www.franceinfo.fr/rss"},
        {"name": "Ouest-France RSS", "source_type": "RSS", "url": "https://www.ouestfrance.fr/rss"},
        {"name": "20 Minutes RSS", "source_type": "RSS", "url": "https://www.20minutes.fr/rss"},
        # Name repaired from the mis-encoded "LibÃ©ration"; this string is
        # stored in the database, so the mojibake would have been persisted.
        {"name": "Libération RSS", "source_type": "RSS", "url": "https://www.liberation.fr/rss"},
        {"name": "The Guardian API", "source_type": "API", "url": "https://api.theguardian.com"},
        {"name": "BBC News RSS", "source_type": "RSS", "url": "https://www.bbc.com/news/rss.xml"},
        {"name": "Reuters RSS", "source_type": "RSS", "url": "https://reuters.com/rss"},
        {"name": "Euronews RSS", "source_type": "RSS", "url": "https://www.euronews.com/rss"},
        {"name": "Politico EU", "source_type": "RSS", "url": "https://www.politico.eu/rss"},
    ]

    with Session(raw_engine) as session:
        # Fetch the stored names once so the loop below needs no per-row query.
        existing_names = {name for (name,) in session.query(Source.name)}

        new_sources = [
            Source(**source_data)
            for source_data in sources_data
            if source_data["name"] not in existing_names
        ]
        session.add_all(new_sources)
        session.commit()

    # Report from the plain dicts: the ORM objects are expired by the commit
    # and the session is closed at this point.
    for source_data in sources_data:
        name = source_data["name"]
        if name in existing_names:
            print(f"  - {name} (already present, skipped)")
        else:
            print(f"  ✅ {name}")

    print(f"\n✅ {len(new_sources)} sources inserted successfully")
    skipped = len(sources_data) - len(new_sources)
    if skipped:
        print(f"   {skipped} sources were already present and left unchanged")

# Seed the source registry, then print a closing summary banner.
print("Inserting 10 news sources into RAW database...\n")
insert_sources()

# One print call with a newline separator emits the same lines as a series of
# individual prints.
print(
    "\n" + "=" * 70,
    "âœ… DATABASE INITIALIZATION COMPLETE",
    "=" * 70,
    f"ðŸ“Š RAW database: {RAW_DB_PATH}",
    "ðŸ“Œ Tables: SOURCE, RAW_DATA, SYNC_LOG, TOPIC, DOCUMENT_TOPIC, MODEL_OUTPUT",
    "ðŸ“° Sources configured: 10",
    "âœ… Ready for E1 ingestion pipeline",
    "=" * 70,
    sep="\n",
)