In [None]:
# Setup: Imports & Config
import os
import sys
import json
import sqlite3
from datetime import datetime, timedelta
from pathlib import Path
import pandas as pd
import numpy as np
import hashlib
from dotenv import load_dotenv
from sqlalchemy import create_engine, text

# Load environment variables from a local .env file (the OpenWeather cell
# later reads OPENWEATHER_API_KEY via os.getenv)
load_dotenv()

# Database engines
# SQLAlchemy URL of the SQLite file that receives the raw ingested rows;
# engine_raw is the shared engine used by the helper functions and every
# ingestion cell below.
RAW_DB = 'sqlite:///datasens.db'
engine_raw = create_engine(RAW_DB)

# Directories
# Root folder for raw input files (the Kaggle cell searches below it).
# parents/exist_ok make this safe to re-run on an existing tree.
DATA_RAW = Path('data/raw')
DATA_RAW.mkdir(parents=True, exist_ok=True)

print(" Setup complete. Ready to ingest sources.")

In [None]:
# UTILITY FUNCTIONS

def compute_fingerprint(title: str, content: str) -> str:
    """Return the SHA-256 hex digest used to deduplicate articles.

    Title and content are each stripped of surrounding whitespace,
    concatenated (title first, no separator) and lower-cased before hashing,
    so differences in letter case or outer whitespace do not produce a new
    fingerprint.

    Args:
        title: Article title; ``None`` is treated as an empty string.
        content: Article body; ``None`` is treated as an empty string.

    Returns:
        The 64-character hexadecimal SHA-256 digest.
    """
    # A missing field may arrive as None; hash it as "" instead of failing
    # with AttributeError on .strip().
    normalized = f"{(title or '').strip()}{(content or '').strip()}".lower()
    return hashlib.sha256(normalized.encode("utf-8")).hexdigest()

def store_raw_data(source_id: int, source_name: str, articles: list, date_str: str):
    """
    Store articles in the raw_data table, skipping already-known fingerprints.

    Each article is fingerprinted with compute_fingerprint(title, content).
    A row is inserted only when no raw_data row carries that fingerprint yet;
    rows inserted earlier in the same call count as existing. All inserts of
    one call are committed together after the loop.

    Args:
        source_id: ID of the source (written to raw_data.source_id)
        source_name: Name of the source (not used by this function)
        articles: List of dicts with keys: title, content, url, published_at.
            Missing or None text fields are stored as ''; a missing, None or
            NaT published_at is replaced by the current time.
        date_str: YYYY-MM-DD format (not used by this function)

    Returns:
        Number of rows actually inserted (duplicates are not counted).
    """
    lookup_sql = text("SELECT raw_data_id FROM raw_data WHERE fingerprint = :fp")
    insert_sql = text("""
        INSERT INTO raw_data 
        (source_id, title, content, url, published_at, collected_at, fingerprint)
        VALUES (:src_id, :title, :content, :url, :pub_at, :col_at, :fp)
    """)

    inserted = 0
    with engine_raw.connect() as conn:
        for article in articles:
            # `or ''` also covers keys that are present but set to None, so
            # the slicing below cannot fail on a None value.
            title = article.get('title') or ''
            content = article.get('content') or ''
            url = article.get('url') or ''

            # The fingerprint is computed on the full (untruncated) text.
            fingerprint = compute_fingerprint(title, content)

            # Check if already exists
            if conn.execute(lookup_sql, {"fp": fingerprint}).fetchone():
                continue

            # Normalise the publication date to a plain datetime: the value is
            # passed to the SQLite driver as an untyped bind parameter, and
            # pandas NaT/Timestamp objects are not accepted there.
            published_at = article.get('published_at')
            if published_at is None or published_at is pd.NaT:
                published_at = datetime.now()
            elif isinstance(published_at, pd.Timestamp):
                published_at = published_at.to_pydatetime()

            conn.execute(
                insert_sql,
                {
                    "src_id": source_id,
                    "title": title[:500],
                    "content": content[:5000],
                    "url": url[:1000],
                    "pub_at": published_at,
                    "col_at": datetime.now(),
                    "fp": fingerprint
                }
            )
            inserted += 1
        conn.commit()

    return inserted

def log_sync(source_id: int, source_name: str, row_count: int, status: str = "success",
             error_msg: 'str | None' = None):
    """Record one sync attempt in the sync_log table.

    Args:
        source_id: ID of the source the sync belongs to.
        source_name: Name of the source (not used by this function).
        row_count: Value written to sync_log.rows_synced.
        status: Value written to sync_log.status (default "success").
        error_msg: Optional error detail written to sync_log.error_msg.
            Defaults to None, which stores NULL exactly as before, so
            existing callers are unaffected.
    """
    with engine_raw.connect() as conn:
        conn.execute(
            text("""
                INSERT INTO sync_log (source_id, sync_date, rows_synced, status, error_msg)
                VALUES (:src_id, :sync_date, :rows, :status, :error_msg)
            """),
            {
                "src_id": source_id,
                "sync_date": datetime.now(),
                "rows": row_count,
                "status": status,
                "error_msg": error_msg
            }
        )
        conn.commit()

print(" Utility functions defined.")

In [None]:
#  SOURCE 1: RSS_FRENCH_NEWS (Feedparser)
import feedparser

print("\n" + "="*60)
print("  RSS_FRENCH_NEWS (Feedparser)")
print("="*60)

# Register source: look the row up by name and create it on the first run,
# then read back the generated source_id.
with engine_raw.connect() as conn:
    result = conn.execute(
        text("SELECT source_id FROM source WHERE name = 'rss_french_news'")
    ).fetchone()
    
    if not result:
        conn.execute(
            text("""
                INSERT INTO source (name, source_type, url, description)
                VALUES ('rss_french_news', 'RSS', 'https://www.france24.com/fr/en-direct/rss', 
                        'News from France24 RSS feed')
            """)
        )
        conn.commit()
        result = conn.execute(
            text("SELECT source_id FROM source WHERE name = 'rss_french_news'")
        ).fetchone()
    
    source_id = result[0]

# Fetch RSS
try:
    feed_url = "https://www.france24.com/fr/en-direct/rss"
    feed = feedparser.parse(feed_url)

    # feedparser flags download/parse problems through `bozo` instead of
    # raising. With no entries at all, treat the run as failed so it is not
    # logged as a "success" with 0 rows.
    if feed.get('bozo') and not feed.entries:
        raise RuntimeError(f"RSS feed could not be read: {feed.get('bozo_exception')}")

    articles = []
    for entry in feed.entries[:50]:  # Limit to 50 for speed
        # Guard against a missing *or* None published_parsed: slicing None
        # would raise and abort the whole batch.
        published = entry.get('published_parsed')
        articles.append({
            'title': entry.get('title', 'N/A'),
            'content': entry.get('summary', '')[:1000],
            'url': entry.get('link', ''),
            'published_at': datetime(*published[:6]) if published else datetime.now()
        })
    
    store_raw_data(source_id, 'rss_french_news', articles, datetime.now().strftime("%Y-%m-%d"))
    log_sync(source_id, 'rss_french_news', len(articles), 'success')
    
    print(f" Ingested {len(articles)} articles from rss_french_news")
except Exception as e:
    # Best effort: report the failure and record it in sync_log so the
    # remaining cells can still run.
    print(f" Error: {e}")
    log_sync(source_id, 'rss_french_news', 0, f'error: {str(e)}')

In [None]:
#  SOURCE 2: GDELT_EVENTS (API)
import requests

print("\n" + "="*60)
print("  GDELT_EVENTS (API)")
print("="*60)

# Register source: look the row up by name and create it on the first run,
# then read back the generated source_id.
with engine_raw.connect() as conn:
    result = conn.execute(
        text("SELECT source_id FROM source WHERE name = 'gdelt_events'")
    ).fetchone()
    
    if not result:
        conn.execute(
            text("""
                INSERT INTO source (name, source_type, url, description)
                VALUES ('gdelt_events', 'API', 'https://api.gdeltproject.org/api/v2/', 
                        'Global Event Data Lab - Event database')
            """)
        )
        conn.commit()
        result = conn.execute(
            text("SELECT source_id FROM source WHERE name = 'gdelt_events'")
        ).fetchone()
    
    source_id = result[0]

try:
    # GDELT API query
    # NOTE(review): the endpoint path and the parameter/response field names
    # used here ('keyword', 'orderby', 'description', 'dateadded') should be
    # checked against the GDELT API documentation — TODO confirm.
    url = "https://api.gdeltproject.org/api/v2/search/tv"
    params = {
        "keyword": "France OR French",
        "format": "json",
        "orderby": "date",
        "mode": "ArtList",
        "maxrecords": 100
    }
    
    response = requests.get(url, params=params, timeout=10)
    # Fail on HTTP errors so an error payload is not treated as an empty,
    # "successful" result.
    response.raise_for_status()
    data = response.json()
    
    articles = []
    if 'articles' in data:
        for article in data['articles'][:50]:
            # Unparseable or missing dates become NaT/None and fall back to
            # the collection time; valid ones are converted to a plain
            # datetime because the value is later bound to SQLite as-is.
            parsed = pd.to_datetime(article.get('dateadded'), errors='coerce')
            published_at = datetime.now() if pd.isna(parsed) else parsed.to_pydatetime()
            articles.append({
                'title': article.get('title', 'N/A'),
                'content': article.get('description', '')[:1000],
                'url': article.get('url', ''),
                'published_at': published_at
            })
    
    store_raw_data(source_id, 'gdelt_events', articles, datetime.now().strftime("%Y-%m-%d"))
    log_sync(source_id, 'gdelt_events', len(articles), 'success')
    
    print(f" Ingested {len(articles)} events from GDELT")
except Exception as e:
    # Best effort: report the failure and record it in sync_log so the
    # remaining cells can still run.
    print(f" Error: {e}")
    log_sync(source_id, 'gdelt_events', 0, f'error: {str(e)}')

In [None]:
#  SOURCE 3: OPENWEATHER_API (Requires key)

print("\n" + "="*60)
print("  OPENWEATHER_API")
print("="*60)

# Register source: look the row up by name and create it on the first run,
# then read back the generated source_id.
with engine_raw.connect() as conn:
    result = conn.execute(
        text("SELECT source_id FROM source WHERE name = 'openweather_api'")
    ).fetchone()
    
    if not result:
        conn.execute(
            text("""
                INSERT INTO source (name, source_type, url, description)
                VALUES ('openweather_api', 'API', 'https://openweathermap.org/api', 
                        'OpenWeather API - Weather data for French cities')
            """)
        )
        conn.commit()
        result = conn.execute(
            text("SELECT source_id FROM source WHERE name = 'openweather_api'")
        ).fetchone()
    
    source_id = result[0]

try:
    api_key = os.getenv('OPENWEATHER_API_KEY', '')
    
    if not api_key:
        print("  OPENWEATHER_API_KEY not in .env. Using MOCK data.")
        # MOCK: Simulate weather data for French cities
        # (5 cities x 5 days, random temperature and condition)
        cities = ['Paris', 'Lyon', 'Marseille', 'Toulouse', 'Nice']
        articles = []
        for city in cities:
            for day in range(5):
                date = datetime.now() - timedelta(days=day)
                articles.append({
                    'title': f"Weather {city} — {date.strftime('%Y-%m-%d')}",
                    'content': f"Temperature: {np.random.randint(0, 30)}°C, Condition: {'Sunny' if np.random.random() > 0.5 else 'Rainy'}",
                    'url': f"https://openweathermap.org/{city}",
                    'published_at': date
                })
    else:
        # Real API call: one current-weather request per (city, lat, lon)
        cities = [('Paris', 48.8566, 2.3522), ('Lyon', 45.7640, 4.8357)]
        articles = []
        failed_cities = []
        
        for city, lat, lon in cities:
            url = "https://api.openweathermap.org/data/2.5/weather"
            params = {"lat": lat, "lon": lon, "appid": api_key, "units": "metric"}
            response = requests.get(url, params=params, timeout=10)
            
            if response.status_code != 200:
                # Report the failure instead of skipping silently; other
                # cities are still attempted.
                print(f"  OpenWeather request for {city} failed: HTTP {response.status_code}")
                failed_cities.append(city)
                continue

            data = response.json()
            articles.append({
                'title': f"Weather {city} — {datetime.now().strftime('%Y-%m-%d')}",
                'content': f"Temp: {data['main']['temp']}°C, {data['weather'][0]['description']}",
                'url': f"https://openweathermap.org/{city}",
                'published_at': datetime.now()
            })

        # If every request failed (e.g. rejected API key), record the sync as
        # an error rather than a "success" with 0 rows.
        if failed_cities and not articles:
            raise RuntimeError(
                f"all OpenWeather requests failed ({', '.join(failed_cities)})"
            )
    
    store_raw_data(source_id, 'openweather_api', articles, datetime.now().strftime("%Y-%m-%d"))
    log_sync(source_id, 'openweather_api', len(articles), 'success')
    
    print(f" Ingested {len(articles)} weather records")
except Exception as e:
    # Best effort: report the failure and record it in sync_log so the
    # remaining cells can still run.
    print(f" Error: {e}")
    log_sync(source_id, 'openweather_api', 0, f'error: {str(e)}')

In [None]:
#  SOURCE 4: INSEE_API (Public)

print("\n" + "="*60, "  INSEE_API (Public data)", "="*60, sep="\n")

# Make sure the 'insee_api' source row exists and fetch its primary key
with engine_raw.connect() as conn:
    lookup_sql = text("SELECT source_id FROM source WHERE name = 'insee_api'")
    result = conn.execute(lookup_sql).fetchone()

    if not result:
        # First run: create the source row, then read back its generated id
        conn.execute(
            text("""
                INSERT INTO source (name, source_type, url, description)
                VALUES ('insee_api', 'API', 'https://www.insee.fr/api/', 
                        'INSEE - French statistical agency data')
            """)
        )
        conn.commit()
        result = conn.execute(lookup_sql).fetchone()

    source_id = result[0]

try:
    # Endpoint kept for reference only; no request is sent in this cell
    url = "https://api.insee.fr/catalogue/datasets"

    # Mock data stands in for the real API
    print("  INSEE API requires authentication. Using MOCK statistical data.")

    # One synthetic record per month of 2025
    articles = []
    for month in range(1, 13):
        employment = 90 + np.random.randint(-2, 3)
        inflation = 2.1 + np.random.random()
        record = {
            'title': f"INSEE Stats — 2025-{month:02d}",
            'content': f"Monthly economic indicators: Employment {employment}%, Inflation {inflation}%",
            'url': "https://www.insee.fr/",
            'published_at': datetime(2025, month, 1)
        }
        articles.append(record)

    today = datetime.now().strftime("%Y-%m-%d")
    store_raw_data(source_id, 'insee_api', articles, today)
    log_sync(source_id, 'insee_api', len(articles), 'success')

    print(f" Ingested {len(articles)} INSEE statistics")
except Exception as e:
    # Report the failure and record it in sync_log
    print(f" Error: {e}")
    log_sync(source_id, 'insee_api', 0, f'error: {str(e)}')

In [None]:
#  SOURCE 5: KAGGLE_FRENCH_OPINIONS (Manual CSV OR MOCK)

print("\n" + "="*60)
print("  KAGGLE_FRENCH_OPINIONS (CSV OR MOCK)")
print("="*60)

# Register source: look the row up by name and create it on the first run,
# then read back the generated source_id.
with engine_raw.connect() as conn:
    result = conn.execute(
        text("SELECT source_id FROM source WHERE name = 'kaggle_french_opinions'")
    ).fetchone()
    
    if not result:
        conn.execute(
            text("""
                INSERT INTO source (name, source_type, url, description)
                VALUES ('kaggle_french_opinions', 'Dataset', 'https://www.kaggle.com/', 
                        'Kaggle - French product reviews & opinions')
            """)
        )
        conn.commit()
        result = conn.execute(
            text("SELECT source_id FROM source WHERE name = 'kaggle_french_opinions'")
        ).fetchone()
    
    source_id = result[0]

try:
    kaggle_path = DATA_RAW / 'kaggle_french_opinions'
    # Any opinions*.csv below the folder qualifies; only the first match is used
    csv_files = list(kaggle_path.glob('**/opinions*.csv'))
    
    if csv_files:
        print(f" Found Kaggle CSV: {csv_files[0]}")
        df = pd.read_csv(csv_files[0], nrows=1000)  # Limit to 1000 rows

        def _cell_text(value, default):
            """Return str(value), or `default` for an empty CSV cell (NaN/None)."""
            # Without this check str(NaN) would store the literal text 'nan'.
            return default if pd.isna(value) else str(value)
        
        articles = []
        for _, row in df.iterrows():
            # errors='coerce' turns bad dates into NaT; NaT and a missing
            # column both fall back to the collection time. Valid values are
            # converted to a plain datetime because the value is later bound
            # to SQLite as-is.
            parsed = pd.to_datetime(row.get('date'), errors='coerce')
            published_at = datetime.now() if pd.isna(parsed) else parsed.to_pydatetime()

            articles.append({
                'title': _cell_text(row.get('title', row.get('review_title', 'Opinion')), 'Opinion')[:500],
                'content': _cell_text(row.get('review', row.get('content', '')), '')[:1000],
                'url': _cell_text(row.get('url', ''), '')[:1000],
                'published_at': published_at
            })
    else:
        print("  Kaggle CSV not found. Generating MOCK French opinions...")
        
        products = ['iPhone 14', 'Samsung TV', 'Nike Air Max', 'Dyson Vacuum', 'Nespresso Machine']
        articles = []
        
        # 500 synthetic reviews with a random product, rating and age
        for i in range(500):
            product = np.random.choice(products)
            rating = np.random.randint(1, 6)
            articles.append({
                'title': f"Opinion: {product} - {rating}/5 stars",
                'content': f"Avis utilisateur #{i}: Ce produit est {'excellent' if rating >= 4 else 'moyen'}. " + 
                          f"Qualité: {'très bonne' if rating >= 4 else 'acceptable'}, Service: {'rapide' if np.random.random() > 0.5 else 'lent'}",
                'url': f"https://kaggle.com/opinions/{i}",
                'published_at': datetime.now() - timedelta(days=np.random.randint(0, 365))
            })
    
    store_raw_data(source_id, 'kaggle_french_opinions', articles, datetime.now().strftime("%Y-%m-%d"))
    log_sync(source_id, 'kaggle_french_opinions', len(articles), 'success')
    
    print(f" Ingested {len(articles)} opinions from Kaggle")
except Exception as e:
    # Best effort: report the failure and record it in sync_log so the
    # remaining cells can still run.
    print(f" Error: {e}")
    log_sync(source_id, 'kaggle_french_opinions', 0, f'error: {str(e)}')

In [None]:
# SUMMARY

print("\n" + "="*60, " INGESTION SUMMARY (03a)", "="*60, sep="\n")

# Row counts of the three tables this notebook writes to; each COUNT(*)
# query returns exactly one single-column row, unpacked directly.
with engine_raw.connect() as conn:
    (sources,) = conn.execute(text("SELECT COUNT(*) FROM source")).fetchone()
    (articles,) = conn.execute(text("SELECT COUNT(*) FROM raw_data")).fetchone()
    (syncs,) = conn.execute(text("SELECT COUNT(*) FROM sync_log")).fetchone()

print(
    f" Sources registered: {sources}",
    f" Total articles ingested: {articles}",
    f" Sync operations logged: {syncs}",
    sep="\n",
)

print("\n Next step: Run 03b_ingest_sources_media.ipynb")