In [None]:
import sys
from datetime import datetime, timezone
from pathlib import Path

from sqlalchemy import create_engine, Column, Integer, String, Text, Float, Boolean, DateTime, ForeignKey, inspect
from sqlalchemy.orm import declarative_base, Session

print('Cell 1: Setup')

# All project artefacts live in a per-user folder, so the notebook does not
# depend on the kernel's current working directory.
DATA_PATH = Path.home() / 'datasens_project'
DATA_PATH.mkdir(parents=True, exist_ok=True)  # no error if the folder already exists

# SQLite database file and the matching SQLAlchemy URL.
# as_posix() keeps forward slashes in the URL on every platform.
RAW_DB_PATH = DATA_PATH / 'datasens.db'
RAW_DATABASE_URL = f'sqlite:///{RAW_DB_PATH.as_posix()}'

# check_same_thread=False is forwarded to the sqlite3 driver and lifts its
# "connection may only be used in the creating thread" restriction.
raw_engine = create_engine(RAW_DATABASE_URL, echo=False, connect_args={'check_same_thread': False})

# Fresh declarative base: every model declared on it shares one MetaData registry.
Base = declarative_base()

# The check mark was previously stored as mojibake (UTF-8 bytes read as cp1252).
print(f'✅ Engine created: {RAW_DATABASE_URL}')

In [None]:
# Progress banner for the model-definition cell.
print("Cell 2: Define 6 tables")

# 1. SOURCE
class Source(Base):
    """A feed or API that documents are collected from (``source`` table).

    ``created_at`` is filled with the current UTC time whenever a row is
    inserted without an explicit value, so seeded rows do not end up with a
    NULL creation timestamp. Callers may still pass ``created_at`` explicitly.
    """
    __tablename__ = 'source'
    source_id = Column(Integer, primary_key=True, autoincrement=True)
    # Human-readable name; required and unique, so it can serve as a natural key.
    name = Column(String(100), unique=True, nullable=False)
    # Required kind of source; the seed data in this notebook uses 'RSS' and 'API'.
    source_type = Column(String(50), nullable=False)
    url = Column(String(500))
    # Python-side default 'DAILY'; the set of allowed values is not constrained here.
    sync_frequency = Column(String(50), default='DAILY')
    # No default: stays NULL until a writer sets it.
    last_sync_date = Column(DateTime)
    # Python-side default 'SKIP'; the set of allowed values is not constrained here.
    retry_policy = Column(String(50), default='SKIP')
    active = Column(Boolean, default=True)
    # Python-side default evaluated at INSERT time (a callable, not a fixed value).
    created_at = Column(DateTime, default=lambda: datetime.now(timezone.utc))

# 2. RAW_DATA
class RawData(Base):
    """One collected document, stored in the ``raw_data`` table.

    Every row must reference a ``source`` row through ``source_id``.

    NOTE(review): SQLite only enforces foreign keys when
    ``PRAGMA foreign_keys=ON`` is set on the connection; nothing in this
    section of the notebook enables it -- confirm whether that is intended.
    """
    __tablename__ = 'raw_data'
    raw_data_id = Column(Integer, primary_key=True, autoincrement=True)
    # Owning source: required and indexed for per-source lookups.
    source_id = Column(Integer, ForeignKey('source.source_id'), nullable=False, index=True)
    title = Column(String(500), nullable=False)
    content = Column(Text, nullable=False)
    url = Column(String(500))
    # Unique when set; presumably a content hash used for de-duplication -- TODO confirm.
    fingerprint = Column(String(64), unique=True)
    published_at = Column(DateTime)
    # Indexed, but no default is declared: the writer must supply the value.
    collected_at = Column(DateTime, index=True)
    # Python-side default of 0.5 when the writer passes no score.
    quality_score = Column(Float, default=0.5)

# 3. SYNC_LOG
class SyncLog(Base):
    """Log entry attached to a source, stored in the ``sync_log`` table.

    Presumably one row is written per synchronisation run -- TODO confirm
    against the code that writes these rows.
    """
    __tablename__ = 'sync_log'
    sync_log_id = Column(Integer, primary_key=True, autoincrement=True)
    # Source the entry belongs to: required and indexed.
    source_id = Column(Integer, ForeignKey('source.source_id'), nullable=False, index=True)
    # Indexed, but no default is declared: the writer must supply the value.
    sync_date = Column(DateTime, index=True)
    rows_synced = Column(Integer, default=0)
    # Required; the set of allowed status values is not constrained here.
    status = Column(String(50), nullable=False)
    # Optional free-text detail; NULL when not provided.
    error_message = Column(Text)

# 4. TOPIC
class Topic(Base):
    """A topic that documents can be tagged with (``topic`` table)."""
    __tablename__ = 'topic'
    topic_id = Column(Integer, primary_key=True, autoincrement=True)
    # Required and unique topic name.
    name = Column(String(100), unique=True, nullable=False)
    # Single string of up to 500 characters; the keyword format (e.g. the
    # separator) is not defined here -- TODO confirm with the consumer.
    keywords = Column(String(500))
    category = Column(String(50))
    # Python-side default: new topics are active unless stated otherwise.
    active = Column(Boolean, default=True)

# 5. DOCUMENT_TOPIC
class DocumentTopic(Base):
    """Association between a ``raw_data`` row and a ``topic`` row.

    Stored in the ``document_topic`` table with its own surrogate key. No
    uniqueness constraint is declared on ``(raw_data_id, topic_id)``, so the
    same pair can be stored more than once.
    """
    __tablename__ = 'document_topic'
    doc_topic_id = Column(Integer, primary_key=True, autoincrement=True)
    # Both sides of the link are required and indexed.
    raw_data_id = Column(Integer, ForeignKey('raw_data.raw_data_id'), nullable=False, index=True)
    topic_id = Column(Integer, ForeignKey('topic.topic_id'), nullable=False, index=True)
    # Python-side default of 0.5 when the writer passes no score.
    confidence_score = Column(Float, default=0.5)
    # Optional label; presumably identifies what produced the tag -- TODO confirm.
    tagger = Column(String(100))

# 6. MODEL_OUTPUT
class ModelOutput(Base):
    """A model result attached to one ``raw_data`` row (``model_output`` table).

    ``created_at`` is filled with the current UTC time whenever a row is
    inserted without an explicit value, so writers that omit it do not store
    NULL. Callers may still pass ``created_at`` explicitly.
    """
    __tablename__ = 'model_output'
    output_id = Column(Integer, primary_key=True, autoincrement=True)
    # Document the output refers to: required and indexed.
    raw_data_id = Column(Integer, ForeignKey('raw_data.raw_data_id'), nullable=False, index=True)
    model_name = Column(String(100))
    label = Column(String(100))
    # Python-side default of 0.5 when the writer passes no score.
    score = Column(Float, default=0.5)
    # Python-side default evaluated at INSERT time (a callable, not a fixed value).
    created_at = Column(DateTime, default=lambda: datetime.now(timezone.utc))

# Report the number of tables actually registered on Base instead of a
# hard-coded figure; the check mark was previously stored as mojibake.
print(f'✅ {len(Base.metadata.tables)} table models defined')

In [None]:
print('Cell 3: Create tables')

# create_all() checks for each table first and only creates the missing ones,
# so re-running this cell leaves existing tables and their rows untouched.
Base.metadata.create_all(raw_engine)

# Read the table list back from the database rather than from the metadata,
# so the report reflects what is really on disk.
inspector = inspect(raw_engine)
tables = inspector.get_table_names()
# The check mark was previously stored as mojibake (UTF-8 bytes read as cp1252).
print(f'✅ {len(tables)} tables created')
for t in sorted(tables):
    print(f'  - {t}')

In [None]:
# Progress banner for the seeding cell.
print("Cell 4: Insert 10 sources")

def insert_sources():
    """Seed the ``source`` table with the ten default news feeds.

    The function is safe to call repeatedly: a source whose ``name`` is
    already stored is skipped, so re-running the cell does not violate the
    unique constraint on ``Source.name``.

    Returns:
        int: number of rows inserted by this call (0 when everything was
        already present).
    """
    sources = [
        {'name': 'Le Monde RSS', 'source_type': 'RSS', 'url': 'https://www.lemonde.fr/rss'},
        {'name': 'France Info', 'source_type': 'RSS', 'url': 'https://www.franceinfo.fr/rss'},
        {'name': 'Ouest-France RSS', 'source_type': 'RSS', 'url': 'https://www.ouestfrance.fr/rss'},
        {'name': '20 Minutes RSS', 'source_type': 'RSS', 'url': 'https://www.20minutes.fr/rss'},
        # Name repaired from mojibake so the persisted value is readable.
        {'name': 'Libération RSS', 'source_type': 'RSS', 'url': 'https://www.liberation.fr/rss'},
        {'name': 'The Guardian API', 'source_type': 'API', 'url': 'https://api.theguardian.com'},
        {'name': 'BBC News RSS', 'source_type': 'RSS', 'url': 'https://www.bbc.com/news/rss.xml'},
        {'name': 'Reuters RSS', 'source_type': 'RSS', 'url': 'https://reuters.com/rss'},
        {'name': 'Euronews RSS', 'source_type': 'RSS', 'url': 'https://www.euronews.com/rss'},
        {'name': 'Politico EU', 'source_type': 'RSS', 'url': 'https://www.politico.eu/rss'},
    ]
    inserted = 0
    with Session(raw_engine) as session:
        # Load the stored names once instead of issuing one lookup per source.
        existing = {name for (name,) in session.query(Source.name)}
        for s in sources:
            if s['name'] in existing:
                print(f'  - {s["name"]} (already present, skipped)')
                continue
            session.add(Source(**s))
            print(f'  ✅ {s["name"]}')
            inserted += 1
        # Single commit: either all new sources are stored or none.
        session.commit()
    return inserted

insert_sources()

# Report what is actually stored rather than hard-coded figures, so the
# summary stays truthful if the seed list or the set of models changes.
with Session(raw_engine) as session:
    source_count = session.query(Source).count()
table_count = len(Base.metadata.tables)

# The emoji below were previously stored as mojibake (UTF-8 bytes read as cp1252).
print(f'✅ {source_count} sources in database')
print('\n✅✅✅ DATABASE READY FOR E1 ✅✅✅')
print(f'📊 Database: {RAW_DB_PATH}')
print(f'📌 Tables: {table_count} core')
print(f'📰 Sources: {source_count}')