In [5]:
"""Ignite 2025 All-in-One Downloader + Fabric Transcript Analyzer

Combines the logic from:
- `Download_All_Slides.ipynb` (all sessions metadata + slides)
- `Download_Fabric_Videos.ipynb` (Fabric-session filtering + en-US VTT captions + Fabric OpenAI GPT analysis)

Intended to run inside a Microsoft Fabric notebook attached to a Lakehouse.

Notes:
- Parquet writing requires `pandas` + a parquet engine (e.g., `pyarrow`).
- Transcript analysis requires Fabric `synapse.ml.fabric` packages and a deployed model.
"""

# =============================================================================
# CONFIGURATION - Edit these values before running
# =============================================================================

# Lakehouse output path. run() creates the metadata/, slides/, english_captions/
# and analysis/ folders beneath this base directory.
LAKEHOUSE_FILES_PATH = "/lakehouse/default/Files/Ignite2025_All"

# Filter keywords: set to None or [] for ALL sessions, or list keywords to filter.
# run() matches each keyword as a case-insensitive substring of the session's
# title, description and products; a session is kept if ANY keyword matches.
# Examples:
#   FILTER_KEYWORDS = None                              # Download ALL sessions
#   FILTER_KEYWORDS = []                                # Download ALL sessions
#   FILTER_KEYWORDS = ["fabric", "lakehouse"]          # Only Fabric-related
#   FILTER_KEYWORDS = ["azure ai", "copilot"]          # Only AI-related
#   FILTER_KEYWORDS = ["power bi", "semantic model"]   # Only Power BI-related
FILTER_KEYWORDS = None  # <-- Change this to filter sessions

# Download options
MAX_WORKERS_SLIDES = 3      # Parallel workers for PPTX downloads
SKIP_SLIDES = False         # Set True to skip slide deck downloads
SKIP_CAPTIONS = False       # Set True to skip VTT caption downloads
SKIP_ANALYSIS = False       # Set True to skip transcript analysis

# Analysis options
MAX_SESSIONS_ANALYZE = None  # Set to a number (e.g., 5) to limit analysis for testing
DEPLOYMENT_NAME = "gpt-5"    # Fabric OpenAI deployment name
API_VERSION = "2024-08-01-preview"  # Fabric OpenAI API version
TRANSCRIPT_CHAR_LIMIT = 100_000  # Truncate transcripts longer than this (characters)

# Resume capability - automatically saves progress and resumes from failures
RESUME_FROM_EXISTING = True  # Skip sessions that were successfully analyzed (disable to reprocess all)
RETRY_FAILED = True  # Retry sessions that previously had errors
# NOTE(review): analyze_all_vtt_files loops `range(max_retries)`, so if this value
# is passed through unchanged it is the TOTAL number of attempts per session
# (first try included), not the number of extra retries - confirm against run().
MAX_RETRIES_PER_SESSION = 2  # Max attempts per session within one run
INITIAL_RETRY_DELAY = 2.0  # Initial delay between retries (seconds); doubled after each failed attempt
BASE_DELAY_BETWEEN_REQUESTS = 0.3  # Base delay after every session, success or failure (seconds)
SAVE_PROGRESS_EVERY = 10  # Save results to disk every N sessions (prevents data loss)

# =============================================================================
# END CONFIGURATION
# =============================================================================

from __future__ import annotations

import json
import os
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple

import requests


# Endpoint queried by fetch_all_sessions(), which expects the response body to
# decode to a JSON list of session records.
IGNITE_API = "https://api-v2.ignite.microsoft.com/api/session/all/en-US"


@dataclass(frozen=True)
class Paths:
    """Immutable bundle of output folders, all derived from one base directory.

    Each property appends a fixed folder name to ``base`` with a forward
    slash; nothing is created on disk by this class.
    """

    # Root output directory that every other folder hangs off.
    base: str

    @property
    def slides_dir(self) -> str:
        """Folder that receives the downloaded slide decks."""
        return "{}/slides".format(self.base)

    @property
    def metadata_dir(self) -> str:
        """Folder that receives the session metadata JSON."""
        return "{}/metadata".format(self.base)

    @property
    def captions_dir(self) -> str:
        """Folder that receives the en-US caption files."""
        return "{}/english_captions".format(self.base)

    @property
    def analysis_dir(self) -> str:
        """Folder that receives the transcript analysis outputs."""
        return "{}/analysis".format(self.base)


def _http_headers() -> Dict[str, str]:
    """Return a fresh header dict (JSON content type + browser-like User-Agent)."""
    browser_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    headers: Dict[str, str] = {}
    headers["Content-Type"] = "application/json"
    headers["User-Agent"] = browser_agent
    return headers


def fetch_all_sessions(timeout_s: int = 60) -> List[Dict[str, Any]]:
    """Download the full session catalog from ``IGNITE_API``.

    Args:
        timeout_s: Timeout handed to ``requests.get``.

    Returns:
        The decoded JSON body, which must be a list.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        ValueError: If the decoded body is not a list.
    """
    print("üì° Fetching Ignite 2025 sessions...")
    response = requests.get(IGNITE_API, headers=_http_headers(), timeout=timeout_s)
    response.raise_for_status()

    payload = response.json()
    if isinstance(payload, list):
        print(f"‚úÖ Retrieved {len(payload)} sessions")
        return payload

    raise ValueError(f"Unexpected Ignite API response type: {type(payload).__name__}")


def save_json(path: str, data: Any, *, ensure_ascii: bool = False) -> None:
    """Write ``data`` to ``path`` as indented (2-space) UTF-8 JSON.

    Missing parent directories are created first.

    Args:
        path: Destination file; may be a bare filename in the current directory.
        data: Any JSON-serializable object.
        ensure_ascii: Forwarded to ``json.dump``; ``False`` keeps non-ASCII
            characters readable in the output.

    Raises:
        OSError: If the directory or file cannot be created.
        TypeError: If ``data`` is not JSON-serializable.
    """
    parent = os.path.dirname(path)
    # dirname() is "" for a bare filename and os.makedirs("") raises
    # FileNotFoundError, so only create a directory when there is one to create.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=ensure_ascii)


# Anything outside ASCII letters, digits, space, underscore, hyphen and dot.
_INVALID_FILENAME_CHARS = re.compile(r"[^A-Za-z0-9 _\-\.]+")


def slugify_filename(text: str, *, max_len: int = 120) -> str:
    """Turn free text into a conservative, cross-platform filename slug.

    Disallowed characters are removed (not replaced), runs of spaces become a
    single hyphen, and the result is cut to ``max_len`` characters with any
    trailing hyphens/spaces trimmed. Empty or fully-stripped input yields
    ``"untitled"``.
    """
    kept = _INVALID_FILENAME_CHARS.sub("", (text or "").strip())
    # After the substitution the only whitespace left is the plain space, so
    # splitting on whitespace and re-joining collapses and trims in one step.
    hyphenated = "-".join(kept.split())
    clipped = hyphenated[:max_len].rstrip("- ")
    if clipped:
        return clipped
    return "untitled"


def make_session_basename(session_code: str, title: str) -> str:
    """Build the ``<code>__<title-slug>`` stem shared by a session's files.

    A blank or missing code becomes ``"unknown"``; the title is passed through
    ``slugify_filename``.
    """
    prefix = (session_code or "").strip()
    if not prefix:
        prefix = "unknown"
    return "__".join((prefix, slugify_filename(title)))


def safe_get(obj: Any, *keys: Any, default: Any = None) -> Any:
    """Walk nested dicts/lists along ``keys`` without raising on a miss.

    Args:
        obj: Root container (normally a dict or list).
        *keys: Dict keys and/or integer list indexes, applied in order.
            Negative list indexes are accepted when they are in range.
        default: Returned when a step cannot be taken or the final value
            is ``None``.

    Returns:
        The value found at the end of the path, or ``default``.
    """
    for key in keys:
        if isinstance(obj, dict):
            obj = obj.get(key, default)
        elif (
            isinstance(obj, list)
            and isinstance(key, int)
            # Bound both ends: the previous `len(obj) > key` check let an
            # out-of-range negative index through and raised IndexError.
            and -len(obj) <= key < len(obj)
        ):
            obj = obj[key]
        else:
            return default
    return obj if obj is not None else default


def extract_session_metadata(session: Dict[str, Any]) -> Dict[str, Any]:
    """Flatten one raw API session record into the metadata row used downstream.

    Every field is read through ``safe_get``, so a missing key falls back to an
    empty value rather than raising. ``extracted_at`` is stamped with the
    current local time.
    """

    def display_value(entry: Dict[str, Any]) -> Any:
        # Prefer the human-readable label; fall back to the logical value.
        return entry.get("displayValue", entry.get("logicalValue", ""))

    # Speakers arrive as one comma-separated string; title/company are not
    # provided there, so they are left blank in the structured form.
    speaker_names_str = safe_get(session, "speakerNames", default="") or ""
    speakers = [
        {"fullName": piece.strip(), "title": "", "company": ""}
        for piece in speaker_names_str.split(",")
        if piece.strip()
    ]

    # Products are taken from the tag objects; non-dict tag entries are ignored.
    raw_tags = safe_get(session, "tags", default=[]) or []
    products = [display_value(tag) for tag in raw_tags if isinstance(tag, dict)]

    # sessionType may be a plain string or a displayValue/logicalValue object.
    session_type = safe_get(session, "sessionType", default="")
    if isinstance(session_type, dict):
        session_type = display_value(session_type)

    # sessionLevel is a list; only its first entry is used, and only if a dict.
    level = ""
    levels = safe_get(session, "sessionLevel", default=[]) or []
    if isinstance(levels, list) and levels and isinstance(levels[0], dict):
        level = display_value(levels[0])

    # "Venue - Room" is split on " - "; a value without the separator is all venue.
    location = safe_get(session, "location", default="") or ""
    pieces = location.split(" - ")
    venue = pieces[0].strip()
    room = pieces[1].strip() if len(pieces) > 1 else ""

    slide_deck_url = safe_get(session, "slideDeck", default="")
    video_url = safe_get(session, "onDemand", default="")

    return {
        "session_id": safe_get(session, "sessionId", default=""),
        "session_code": safe_get(session, "sessionCode", default=""),
        "title": safe_get(session, "title", default=""),
        "description": safe_get(session, "description", default=""),
        "level": level,
        "session_type": session_type,
        "duration_minutes": safe_get(session, "durationInMinutes", default=0),
        "start_time": safe_get(session, "startDateTime", default=""),
        "end_time": safe_get(session, "endDateTime", default=""),
        "speakers": speakers,
        "speaker_names": speaker_names_str,
        "tags": products,
        "topics": safe_get(session, "topic", default=""),
        "learning_path": safe_get(session, "learningPath", default=[]),
        "slide_deck_url": slide_deck_url,
        "has_slides": bool(slide_deck_url),
        "video_url": video_url,
        "has_video": bool(video_url),
        "captions_url": safe_get(session, "captionFileLink", default=""),
        "location": location,
        "venue": venue,
        "room": room,
        "extracted_at": datetime.now().isoformat(),
    }


def save_metadata_outputs(paths: Paths, sessions_metadata: List[Dict[str, Any]]) -> str:
    """Write the structured session metadata to ``sessions_metadata.json``.

    The file goes into ``paths.metadata_dir``; the path written is returned.
    """
    destination = "{}/sessions_metadata.json".format(paths.metadata_dir)
    save_json(destination, sessions_metadata, ensure_ascii=False)
    print(f"üíæ Saved JSON metadata: {destination}")
    return destination


def download_file(url: str, filepath: str, session_code: str, timeout_s: int = 120) -> Dict[str, Any]:
    """Stream ``url`` to ``filepath`` and report the outcome as a status dict.

    The body is written to ``<filepath>.part`` and only renamed to
    ``filepath`` once the whole stream has been written. An interrupted
    download therefore never leaves a truncated file that a later run would
    mistake for a finished one (the "exists" shortcut below only checks that
    the path is present).

    Args:
        url: Source URL.
        filepath: Final destination path.
        session_code: Echoed back in the result so callers can correlate.
        timeout_s: Timeout handed to ``requests.get``.

    Returns:
        ``{"session_code", "status", "path"}`` with status ``"exists"`` or
        ``"downloaded"``, or ``{"session_code", "status": "failed", "error"}``.
        This function never raises; every failure is reported in the dict.
    """
    partial_path = f"{filepath}.part"
    try:
        if os.path.exists(filepath):
            return {"session_code": session_code, "status": "exists", "path": filepath}

        # The context manager releases the streamed connection even when the
        # transfer or the file write fails part-way.
        with requests.get(url, headers=_http_headers(), stream=True, timeout=timeout_s) as resp:
            resp.raise_for_status()

            parent = os.path.dirname(filepath)
            if parent:  # "" for a bare filename; makedirs("") would raise
                os.makedirs(parent, exist_ok=True)

            with open(partial_path, "wb") as f:
                for chunk in resp.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)

        # Publish the finished file in one step.
        os.replace(partial_path, filepath)
        return {"session_code": session_code, "status": "downloaded", "path": filepath}

    except Exception as e:
        # Best-effort by design: one bad deck must not abort the batch.
        try:
            os.remove(partial_path)
        except OSError:
            # Nothing to clean up (or it cannot be removed); the failure
            # reported below is the one that matters.
            pass
        return {"session_code": session_code, "status": "failed", "error": str(e)}


def download_all_slides(paths: Paths, sessions_metadata: List[Dict[str, Any]], max_workers: int) -> List[Dict[str, Any]]:
    """Download every available slide deck in parallel.

    Args:
        paths: Output locations; decks go to ``paths.slides_dir``.
        sessions_metadata: Metadata rows; only rows with ``has_slides`` and a
            non-empty ``slide_deck_url`` are downloaded.
        max_workers: Thread pool size.

    Returns:
        One ``download_file`` result dict per attempted deck, in completion order.
    """
    target_dir = paths.slides_dir
    os.makedirs(target_dir, exist_ok=True)

    candidates = [entry for entry in sessions_metadata if entry.get("has_slides")]
    print(f"üì• Downloading {len(candidates)} slide decks (workers={max_workers})")

    # (url, destination, session code) for every deck that has a URL.
    jobs: List[Tuple[str, str, str]] = []
    for entry in candidates:
        code = entry.get("session_code") or entry.get("session_id") or "unknown"
        basename = make_session_basename(code, entry.get("title") or "")
        deck_url = entry.get("slide_deck_url") or ""
        if deck_url:
            jobs.append((deck_url, f"{target_dir}/{basename}.pptx", code))

    outcomes: List[Dict[str, Any]] = []
    tally = {"downloaded": 0, "exists": 0, "failed": 0}

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = [pool.submit(download_file, *job) for job in jobs]
        for finished in as_completed(pending):
            outcome = finished.result()
            outcomes.append(outcome)

            status = outcome.get("status")
            if status in ("downloaded", "exists"):
                tally[status] += 1
            else:
                # Any other status is counted (and reported) as a failure.
                tally["failed"] += 1
                print(f"   ‚ùå Slide failed: {outcome.get('session_code')} - {outcome.get('error', 'Unknown error')}")

            if len(outcomes) % 50 == 0:
                print(f"   üìä Slides progress: {len(outcomes)}/{len(jobs)}")

    print("‚úÖ Slides complete")
    print(f"   Downloaded: {tally['downloaded']}")
    print(f"   Already existed: {tally['exists']}")
    print(f"   Failed: {tally['failed']}")

    return outcomes


def extract_video_metadata(session: Dict[str, Any]) -> Dict[str, Any]:
    """Pull the video-related fields out of a raw API session record.

    ``video_id`` is the ``/video-nc/<id>`` path segment of the ``onDemand``
    URL (lowercase hex digits and hyphens), or ``None`` when the URL is
    missing or has no such segment.
    """
    video_url = session.get("onDemand", "") or ""

    id_match = None
    if video_url:
        id_match = re.search(r"/video-nc/([a-f0-9-]+)", video_url)

    return {
        "session_code": session.get("sessionCode", "") or "",
        "title": session.get("title", "") or "",
        "description": session.get("description", "") or "",
        "video_url": video_url,
        "video_id": id_match.group(1) if id_match else None,
        "has_video": bool(video_url),
    }


def extract_medius_caption_urls(embed_url: str, timeout_s: int = 30) -> List[Dict[str, str]]:
    """Scrape caption track URLs out of a Medius embed page.

    Looks for ``"StreamUrl"`` values that point at ``.vtt`` files on
    ``mediusdl.event.microsoft.com`` and reads the language from a
    ``Caption_<xx-YY>.vtt`` file name (``"unknown"`` when absent).

    Returns:
        ``[{"url": ..., "language": ...}, ...]``; an empty list on any
        failure (best-effort: errors are swallowed, not raised).
    """
    try:
        page = requests.get(embed_url, headers=_http_headers(), timeout=timeout_s)
        page.raise_for_status()

        stream_urls = re.findall(
            r'"StreamUrl"\s*:\s*"(https://mediusdl\.event\.microsoft\.com[^"]+\.vtt[^"]*)"',
            page.text,
        )

        tracks: List[Dict[str, str]] = []
        for raw_url in stream_urls:
            # The page embeds the URL as a JSON string, so "&" shows up escaped.
            clean_url = raw_url.replace("\\u0026", "&")
            lang_hit = re.search(r"Caption_([a-z]{2}-[A-Z]{2})\.vtt", clean_url)
            tracks.append(
                {
                    "url": clean_url,
                    "language": lang_hit.group(1) if lang_hit else "unknown",
                }
            )
        return tracks

    except Exception:
        return []


def download_en_us_captions(paths: Paths, sessions_metadata: List[Dict[str, Any]]) -> Tuple[List[str], List[str]]:
    """Fetch the en-US VTT caption file for every session that has a video.

    Only rows with both ``has_video`` and ``video_id`` set are considered.
    A caption file already on disk counts as downloaded and is not fetched
    again.

    Returns:
        ``(downloaded_codes, error_codes)``. Rows without a session code are
        recorded in the error list as ``"(missing session_code)"``.
    """
    captions_dir = paths.captions_dir
    os.makedirs(captions_dir, exist_ok=True)

    candidates = [row for row in sessions_metadata if row.get("has_video") and row.get("video_id")]
    total = len(candidates)
    print(f"üì• Downloading en-US VTT captions for {total} sessions")

    downloaded: List[str] = []
    errors: List[str] = []

    def report(position: int) -> None:
        # Progress line on every 10th candidate.
        if position % 10 == 0:
            print(f"   [{position}/{total}] {len(downloaded)} downloaded, {len(errors)} errors...")

    for position, row in enumerate(candidates, 1):
        code = row.get("session_code") or ""
        if not code:
            errors.append("(missing session_code)")
            continue

        basename = make_session_basename(code, row.get("title") or "")
        vtt_path = f"{captions_dir}/{basename}_en-US.vtt"
        if os.path.exists(vtt_path):
            downloaded.append(code)
            report(position)
            continue

        embed_url = row.get("video_url") or ""
        if not embed_url:
            # This path deliberately emits no progress line.
            errors.append(code)
            continue

        en_us_track = None
        for track in extract_medius_caption_urls(embed_url):
            if track.get("language") == "en-US":
                en_us_track = track
                break

        if not en_us_track:
            errors.append(code)
            report(position)
            continue

        try:
            resp = requests.get(en_us_track["url"], headers=_http_headers(), timeout=30)
            looks_like_vtt = resp.status_code == 200 and "WEBVTT" in resp.text
            if looks_like_vtt:
                with open(vtt_path, "w", encoding="utf-8") as f:
                    f.write(resp.text)
                downloaded.append(code)
            else:
                errors.append(code)
        except Exception:
            errors.append(code)

        report(position)

    print(f"‚úÖ Captions complete: {len(downloaded)} downloaded, {len(errors)} errors")
    # Listing the codes is only useful when there are a handful of them.
    if errors and len(errors) <= 5:
        print(f"   Failed session codes: {', '.join(errors)}")

    return downloaded, errors


def load_existing_results(results_path: str) -> Tuple[set[str], set[str], List[Dict[str, Any]]]:
    """Load a previous analysis results file and classify its records.

    A record counts as failed when it carries an ``"error"`` key and as
    successful otherwise; records without a string ``session_code`` are kept
    in the full list but appear in neither set.

    The file is only trusted if its top level is a JSON list. Entries that
    are not dicts are dropped, because callers call ``.get`` on every
    returned record. A missing, unreadable, malformed or wrongly-shaped file
    yields three empty containers ("start fresh") rather than partial data.

    Args:
        results_path: Path of the JSON results file from an earlier run.

    Returns:
        (successful_codes, failed_codes, full_results)
    """
    successful: set[str] = set()
    failed: set[str] = set()
    full_results: List[Dict[str, Any]] = []

    if not os.path.exists(results_path):
        print(f"‚ÑπÔ∏è  No existing results file found: {results_path}")
        print("   Starting fresh analysis")
        return successful, failed, full_results

    try:
        with open(results_path, "r", encoding="utf-8") as f:
            loaded = json.load(f)
        if not isinstance(loaded, list):
            raise ValueError(f"expected a JSON list, found {type(loaded).__name__}")
    except (OSError, ValueError) as e:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
        print(f"‚ö†Ô∏è  Could not load existing results: {e}")
        print("   Starting fresh analysis")
        return successful, failed, full_results

    for record in loaded:
        if not isinstance(record, dict):
            continue
        full_results.append(record)
        code = record.get("session_code", "")
        if isinstance(code, str) and code:
            if "error" in record:
                failed.add(code)
            else:
                successful.add(code)

    print(f"üìä Loaded existing results from previous run:")
    print(f"   ‚úÖ {len(successful)} successful sessions")
    print(f"   ‚ùå {len(failed)} failed sessions")
    print(f"   üì¶ {len(full_results)} total records")

    return successful, failed, full_results


def parse_vtt_file(vtt_path: str) -> str:
    """Return the spoken text of a WebVTT file as one space-joined string.

    Only cue payload is kept: the lines that follow a timing line
    (``start --> end``) up to the next blank line. This excludes the
    ``WEBVTT`` header, ``NOTE``/``STYLE``/``REGION`` blocks and optional cue
    identifiers, none of which are transcript text. Inline markup such as
    ``<v Speaker>`` or ``<c>`` is stripped.

    Args:
        vtt_path: Path to a UTF-8 encoded ``.vtt`` file, with or without a
            byte-order mark.

    Raises:
        OSError: If the file cannot be read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # "utf-8-sig" reads plain UTF-8 and also drops a leading BOM, which would
    # otherwise glue itself to the first line of the file.
    with open(vtt_path, "r", encoding="utf-8-sig") as f:
        content = f.read()

    spoken: List[str] = []
    in_payload = False
    # Text mode has already normalized line endings to "\n".
    for line in content.split("\n"):
        stripped = line.strip()
        if not stripped:
            in_payload = False  # a blank line ends the current block
        elif "-->" in stripped:
            in_payload = True  # timing line: payload starts on the next line
        elif in_payload:
            clean_line = re.sub(r"<[^>]+>", "", stripped).strip()
            if clean_line:  # skip lines that consisted of markup only
                spoken.append(clean_line)

    return " ".join(spoken)


def get_openai_config(deployment_name: str, api_version: str) -> Tuple[str, Dict[str, str]]:
    """Build the chat-completions URL and auth headers for a Fabric deployment.

    The Fabric helper packages are imported lazily, so an ``ImportError`` is
    raised here when they are not installed.

    Returns:
        ``(service_url, headers)`` ready to pass to ``requests.post``.
    """
    from synapse.ml.fabric.service_discovery import get_fabric_env_config  # type: ignore
    from synapse.ml.fabric.token_utils import TokenUtils  # type: ignore

    env_config = get_fabric_env_config().fabric_env_config
    auth_header = TokenUtils().get_openai_auth_header()

    # The endpoint is concatenated without a separator, so it is presumably
    # expected to end with "/" - TODO confirm.
    openai_root = env_config.ml_workload_endpoint + "cognitive/openai/openai/"
    service_url = "{}deployments/{}/chat/completions?api-version={}".format(
        openai_root, deployment_name, api_version
    )

    return service_url, {
        "Authorization": auth_header,
        "Content-Type": "application/json",
    }


# System message sent with every transcript; it pins the JSON shape that
# analyze_vtt_with_openai() tries to parse out of the model's reply.
SYSTEM_PROMPT = """You are an expert analyst for Microsoft Ignite session transcripts.
Analyze and extract structured information.
Respond ONLY with valid JSON:
{
    "summary": "2-3 sentence summary",
    "key_topics": ["topic1", "topic2"],
    "microsoft_features_mentioned": ["feature1"],
    "new_announcements": ["announcement1"],
    "demos_described": ["demo1"],
    "best_practices": ["practice1"],
    "target_audience": "description",
    "technical_level": "beginner|intermediate|advanced",
    "key_quotes": ["quote1"],
    "action_items": ["action1"]
}"""


def analyze_vtt_with_openai(
    *,
    service_url: str,
    headers: Dict[str, str],
    session_code: str,
    session_title: str,
    vtt_path: str,
    transcript_char_limit: int,
    timeout_s: int = 120,
) -> Dict[str, Any]:
    """Send one transcript to the chat-completions endpoint and parse the reply.

    The transcript is cut to ``transcript_char_limit`` characters (with a
    ``[Truncated...]`` marker appended) before it is sent. The reply is parsed
    as JSON directly, then as the contents of a fenced code block; if neither
    works, the raw text is returned under ``raw_response`` together with an
    ``error`` key.

    Returns:
        The parsed analysis with ``session_code``, ``session_title`` and
        ``analyzed_at`` added.

    Raises:
        requests.HTTPError: If the service answers with an error status.
        Other exceptions from reading the file, the HTTP call or an
        unexpected response shape propagate to the caller.
    """
    transcript = parse_vtt_file(vtt_path)
    if len(transcript) > transcript_char_limit:
        transcript = transcript[:transcript_char_limit] + "\n\n[Truncated...]"

    user_message = f"Session: {session_title}\nCode: {session_code}\n\nTranscript:\n{transcript}"
    payload = {
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_message},
        ]
    }

    resp = requests.post(service_url, headers=headers, json=payload, timeout=timeout_s)
    resp.raise_for_status()
    content = resp.json()["choices"][0]["message"]["content"]

    try:
        result = json.loads(content)
    except json.JSONDecodeError:
        # Models sometimes wrap the JSON in a Markdown code fence.
        fenced = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", content, re.DOTALL)
        if fenced is None:
            result = {"raw_response": content, "error": "Failed to parse JSON"}
        else:
            result = json.loads(fenced.group(1))

    result["session_code"] = session_code
    result["session_title"] = session_title
    result["analyzed_at"] = datetime.now().isoformat()

    return result


def analyze_all_vtt_files(
    *,
    paths: Paths,
    sessions_metadata_path: str,
    deployment_name: str,
    api_version: str,
    max_sessions_analyze: Optional[int],
    transcript_char_limit: int,
    resume_from_existing: bool,
    retry_failed: bool,
    max_retries: int,
    initial_retry_delay: float,
    base_delay: float,
    save_progress_every: int,
) -> List[Dict[str, Any]]:
    """Analyze VTT files with smart resume and incremental progress saving.

    This function:
    1. Loads any existing results from previous runs
    2. Skips already-successful sessions
    3. Retries previously-failed sessions (if retry_failed=True)
    4. Saves progress incrementally every N sessions
    5. Merges new results with existing ones
    6. Automatically refreshes auth token every 30 minutes to prevent 401 errors

    Args:
        paths: Output locations; captions are read from ``paths.captions_dir``
            and results are written to ``paths.analysis_dir``.
        sessions_metadata_path: JSON file used to look up session titles; if
            it is missing or unreadable the session code is used as the title.
        deployment_name: Deployment name forwarded to ``get_openai_config``.
        api_version: API version forwarded to ``get_openai_config``.
        max_sessions_analyze: Optional cap on sessions processed in this run.
            Caption files are taken in sorted filename order, so the same cap
            selects the same sessions on every run.
        transcript_char_limit: Forwarded to ``analyze_vtt_with_openai``.
        resume_from_existing: Load previous results and skip finished sessions.
        retry_failed: With resume enabled, re-run sessions recorded with an error.
        max_retries: Total attempts per session (values below 1 are treated as 1).
        initial_retry_delay: Seconds before the second attempt; doubled after
            each further failure.
        base_delay: Seconds slept after every session.
        save_progress_every: Save the merged results after every N sessions
            (values below 1 are treated as 1).

    Returns:
        All result records: kept records from earlier runs followed by the
        records produced by this run.
    """
    # Token refresh configuration
    TOKEN_REFRESH_INTERVAL = 30 * 60  # 30 minutes in seconds

    # Guard the loop controls: 0 attempts would silently drop every session
    # (no result and no error recorded), and a save interval of 0 would raise
    # ZeroDivisionError in the modulo below.
    max_retries = max(1, max_retries)
    save_progress_every = max(1, save_progress_every)

    def get_fresh_credentials():
        """Get fresh OpenAI credentials and return (service_url, headers, timestamp)."""
        url, hdrs = get_openai_config(deployment_name, api_version)
        return url, hdrs, time.time()

    service_url, headers, token_obtained_at = get_fresh_credentials()
    print(f"üîë Authentication token obtained (auto-refresh every 30 minutes)")

    # Determine results file path
    results_json_path = f"{paths.analysis_dir}/sessions_analysis_full.json"

    # Load existing results (if resume is enabled)
    existing_results: List[Dict[str, Any]] = []
    successful_codes: set[str] = set()
    failed_codes: set[str] = set()

    if resume_from_existing:
        successful_codes, failed_codes, existing_results = load_existing_results(results_json_path)

    # Determine which sessions to skip
    skip_codes: set[str] = set()
    if resume_from_existing and not retry_failed:
        # Skip both successful and failed
        skip_codes = successful_codes | failed_codes
        print(f"‚è≠Ô∏è  Skipping {len(skip_codes)} already-processed sessions (successful + failed)")
    elif resume_from_existing and retry_failed:
        # Skip only successful, retry failed
        skip_codes = successful_codes
        print(f"‚è≠Ô∏è  Skipping {len(skip_codes)} successful sessions")
        print(f"üîÑ Will retry {len(failed_codes)} failed sessions")

    # Find VTT files to process. os.listdir returns entries in arbitrary
    # order, so sort them: otherwise max_sessions_analyze would pick a
    # different subset from run to run.
    vtt_files: Dict[str, str] = {}
    for filename in sorted(os.listdir(paths.captions_dir)):
        if not filename.endswith("_en-US.vtt"):
            continue
        base = filename[: -len("_en-US.vtt")]
        # Filenames are "<code>__<title-slug>_en-US.vtt"
        session_code = base.split("__", 1)[0]

        # Skip if in exclude set
        if session_code in skip_codes:
            continue

        vtt_files[session_code] = f"{paths.captions_dir}/{filename}"

    if max_sessions_analyze is not None:
        vtt_files = dict(list(vtt_files.items())[: max_sessions_analyze])

    if not vtt_files:
        print("‚úÖ No VTT files to process - all sessions already completed!")
        return existing_results

    # Load session titles from JSON
    session_titles: Dict[str, str] = {}
    if os.path.exists(sessions_metadata_path):
        try:
            with open(sessions_metadata_path, "r", encoding="utf-8") as f:
                sessions = json.load(f)
            for session in sessions:
                if isinstance(session, dict):
                    session_titles[session.get("session_code", "")] = session.get("title", "")
        except Exception as e:
            print(f"‚ö†Ô∏è Could not load sessions JSON: {e}")

    os.makedirs(paths.analysis_dir, exist_ok=True)

    print(f"üîÑ Analyzing {len(vtt_files)} VTT transcripts with OpenAI...")
    print(f"   Base delay between requests: {base_delay}s")
    print(f"   Max retries per session: {max_retries}")
    print()

    new_results: List[Dict[str, Any]] = []
    successful = 0
    failed = 0

    def merge_with_existing() -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
        """Return (kept_existing, merged): old records not redone this run, then new ones."""
        processed_codes = {r["session_code"] for r in new_results}
        kept = [r for r in existing_results if r.get("session_code") not in processed_codes]
        return kept, kept + new_results

    for i, (code, vtt_path) in enumerate(vtt_files.items(), 1):
        title = session_titles.get(code, code)
        is_retry = code in failed_codes
        retry_label = " (RETRY)" if is_retry else ""
        print(f"   [{i}/{len(vtt_files)}] {code}{retry_label}...", end=" ", flush=True)

        # Check if token needs refresh (every 30 minutes)
        time_since_token = time.time() - token_obtained_at
        if time_since_token >= TOKEN_REFRESH_INTERVAL:
            print(f"\nüîÑ Refreshing authentication token (last refresh: {int(time_since_token/60)} minutes ago)...")
            service_url, headers, token_obtained_at = get_fresh_credentials()
            print(f"‚úÖ Token refreshed! Continuing analysis...")
            # Re-print the progress prefix that the refresh messages interrupted
            print(f"   [{i}/{len(vtt_files)}] {code}{retry_label}...", end=" ", flush=True)

        # Try with exponential backoff
        last_error = None
        for attempt in range(max_retries):
            try:
                result = analyze_vtt_with_openai(
                    service_url=service_url,
                    headers=headers,
                    session_code=code,
                    session_title=title,
                    vtt_path=vtt_path,
                    transcript_char_limit=transcript_char_limit,
                )
                new_results.append(result)
                successful += 1
                level = result.get('technical_level', 'N/A')
                retry_info = f" (attempt {attempt + 1})" if attempt > 0 else ""
                print(f"‚úÖ {level}{retry_info}")
                last_error = None
                break  # Success!

            except Exception as e:
                last_error = e
                if attempt < max_retries - 1:
                    # Exponential backoff
                    delay = initial_retry_delay * (2 ** attempt)
                    print(f"‚ö†Ô∏è  Attempt {attempt + 1} failed, retrying in {delay}s...", end=" ", flush=True)
                    time.sleep(delay)
                else:
                    # Final failure
                    error_msg = str(e)[:60]
                    print(f"‚ùå Failed after {max_retries} attempts: {error_msg}")

        # If all retries failed, record the error so the next run can retry it
        if last_error is not None:
            new_results.append({
                "session_code": code,
                "session_title": title,
                "error": str(last_error),
                "attempts": max_retries,
                "analyzed_at": datetime.now().isoformat()
            })
            failed += 1

        # Delay between requests
        time.sleep(base_delay)

        # Save progress incrementally to prevent data loss
        if (i % save_progress_every == 0) or (i == len(vtt_files)):
            _, current_full_results = merge_with_existing()

            # Save to disk
            with open(results_json_path, "w", encoding="utf-8") as f:
                json.dump(current_full_results, f, indent=2, ensure_ascii=False)

            if i < len(vtt_files):
                print(f"   üíæ Progress saved ({len(current_full_results)} total sessions)")

    print()
    print(f"‚úÖ Analysis complete! {successful} successful, {failed} failed")
    print()

    # Same merge as the incremental saves; only the summary line differs
    kept_existing, final_results = merge_with_existing()
    if resume_from_existing and existing_results:
        print(f"üì¶ Final database: {len(kept_existing)} existing + {len(new_results)} new = {len(final_results)} total")
    else:
        print(f"üì¶ Final database: {len(final_results)} sessions")

    # Save final Parquet
    try:
        import pandas as pd

        # Flatten lists in results for parquet compatibility
        flattened: List[Dict[str, Any]] = []
        for r in final_results:
            flat = r.copy()
            for key, value in flat.items():
                if isinstance(value, list):
                    flat[key] = json.dumps(value)
            flattened.append(flat)

        df = pd.DataFrame(flattened)
        parquet_path = f"{paths.analysis_dir}/sessions_analysis.parquet"
        df.to_parquet(parquet_path, index=False)
        print(f"üíæ Saved Parquet: {parquet_path}")

    except ImportError:
        print("‚ö†Ô∏è  pandas not available - skipping Parquet output")
    except Exception as e:
        print(f"‚ö†Ô∏è  Failed to write analysis Parquet: {e}")

    # JSON already saved incrementally, just confirm final version is written
    with open(results_json_path, "w", encoding="utf-8") as f:
        json.dump(final_results, f, indent=2, ensure_ascii=False)
    print(f"üíæ Saved JSON: {results_json_path}")

    print()
    print(f"üéØ Analysis Summary:")
    print(f"   This run: {successful} successful, {failed} failed")
    print(f"   Total in database: {len(final_results)} sessions")

    return final_results


def write_download_report(paths: Paths, sessions_metadata: List[Dict[str, Any]], slide_results: List[Dict[str, Any]]) -> str:
    """Summarize the slide download run in ``download_report.json``.

    The report is written directly under ``paths.base``; the path written is
    returned.
    """

    def flagged(field: str) -> int:
        # Number of metadata rows where ``field`` is truthy.
        return len([row for row in sessions_metadata if row.get(field)])

    def with_status(wanted: str) -> List[Dict[str, Any]]:
        # Download results whose status equals ``wanted``.
        return [res for res in slide_results if res.get("status") == wanted]

    failures = with_status("failed")

    report = {
        "download_date": datetime.now().isoformat(),
        "total_sessions": len(sessions_metadata),
        "sessions_with_slides": flagged("has_slides"),
        "sessions_with_video": flagged("has_video"),
        "slides_downloaded": len(with_status("downloaded")),
        "slides_already_existed": len(with_status("exists")),
        "slides_failed": len(failures),
        "failed_sessions": failures,
        "output_directory": paths.base,
    }

    report_path = "{}/download_report.json".format(paths.base)
    save_json(report_path, report, ensure_ascii=False)
    return report_path


def run():
    """Main execution function - uses configuration variables defined at the top.

    Stages, in order:
      1. Create the output directories and fetch all sessions.
      2. Apply the optional FILTER_KEYWORDS filter (case-insensitive substring
         match against title, description and products).
      3. Extract and save metadata for ALL sessions.
      4. Download slide decks for the filtered sessions (unless SKIP_SLIDES)
         and write the download report.
      5. Save the filtered video metadata and download en-US VTT captions
         (unless SKIP_CAPTIONS).
      6. Run the transcript analysis (unless SKIP_ANALYSIS).

    Note: SKIP_CAPTIONS=True returns before stage 6, so the transcript
    analysis is skipped as well, regardless of SKIP_ANALYSIS.
    """

    paths = Paths(base=LAKEHOUSE_FILES_PATH)
    for d in [paths.metadata_dir, paths.slides_dir, paths.captions_dir, paths.analysis_dir]:
        os.makedirs(d, exist_ok=True)

    print(f"üìÅ Output directory: {LAKEHOUSE_FILES_PATH}")

    sessions = fetch_all_sessions()

    # Normalize FILTER_KEYWORDS once:
    # - a bare string (e.g. "fabric") is treated as a single keyword instead of
    #   being iterated character by character, which would match almost anything;
    # - blank entries are dropped because "" is a substring of every text;
    # - None or an empty collection means "no filter".
    configured_keywords = FILTER_KEYWORDS
    if isinstance(configured_keywords, str):
        configured_keywords = [configured_keywords]
    filter_keywords = [kw for kw in (configured_keywords or []) if str(kw).strip()]
    # Lower-case the keywords a single time rather than once per session.
    lowered_keywords = [str(kw).lower() for kw in filter_keywords]

    def session_matches_filter(session: Dict[str, Any]) -> bool:
        """Return True if session matches any filter keyword, or if no filter is set."""
        if not lowered_keywords:
            return True
        title = (session.get("title") or "").lower()
        description = (session.get("description") or "").lower()
        products = " ".join([str(p).lower() for p in (session.get("products") or [])])
        searchable = f"{title} {description} {products}"
        return any(kw in searchable for kw in lowered_keywords)

    # Apply filter to raw sessions for downstream processing
    if lowered_keywords:
        filtered_sessions = [s for s in sessions if session_matches_filter(s)]
        print(f"üîç Filter applied: {filter_keywords}")
        print(f"   Matched {len(filtered_sessions)} of {len(sessions)} sessions")
    else:
        filtered_sessions = sessions
        print("üîç No filter applied ‚Äî processing all sessions")

    # Extract metadata for ALL sessions (for full metadata file)
    all_sessions_metadata = [extract_session_metadata(s) for s in sessions]
    print("üîÑ Extracted structured metadata for all sessions")
    print(f"   Total sessions: {len(all_sessions_metadata)}")
    print(f"   Sessions with slides: {sum(1 for s in all_sessions_metadata if s.get('has_slides'))}")
    print(f"   Sessions with video: {sum(1 for s in all_sessions_metadata if s.get('has_video'))}")

    save_metadata_outputs(paths, all_sessions_metadata)

    # For downloads, use filtered sessions
    filtered_metadata = [extract_session_metadata(s) for s in filtered_sessions]

    slide_results: List[Dict[str, Any]] = []
    if not SKIP_SLIDES:
        slide_results = download_all_slides(paths, filtered_metadata, max_workers=MAX_WORKERS_SLIDES)
    else:
        print("‚è≠Ô∏è Skipping slide downloads (SKIP_SLIDES=True)")

    # The report counts sessions over the full catalogue, while the slide
    # results only cover the filtered sessions.
    report_path = write_download_report(paths, all_sessions_metadata, slide_results)
    print(f"üìä Saved download report: {report_path}")

    if SKIP_CAPTIONS:
        print("‚è≠Ô∏è Skipping VTT caption downloads (SKIP_CAPTIONS=True)")
        return

    # For VTT captions: apply the same filter
    filtered_video_metadata = [extract_video_metadata(s) for s in filtered_sessions]

    filtered_json_path = f"{paths.metadata_dir}/filtered_sessions.json"
    save_json(filtered_json_path, filtered_video_metadata, ensure_ascii=False)

    print(f"üéØ Filtered sessions for VTT download: {len(filtered_video_metadata)}")
    print(f"   With video: {sum(1 for s in filtered_video_metadata if s.get('has_video'))}")
    print(f"üíæ Saved filtered session metadata: {filtered_json_path}")

    download_en_us_captions(paths, filtered_video_metadata)

    if SKIP_ANALYSIS:
        print("‚è≠Ô∏è Skipping transcript analysis (SKIP_ANALYSIS=True)")
        return

    # Transcript analysis requires packages
    try:
        analyze_all_vtt_files(
            paths=paths,
            sessions_metadata_path=filtered_json_path,
            deployment_name=DEPLOYMENT_NAME,
            api_version=API_VERSION,
            max_sessions_analyze=MAX_SESSIONS_ANALYZE,
            transcript_char_limit=TRANSCRIPT_CHAR_LIMIT,
            resume_from_existing=RESUME_FROM_EXISTING,
            retry_failed=RETRY_FAILED,
            max_retries=MAX_RETRIES_PER_SESSION,
            initial_retry_delay=INITIAL_RETRY_DELAY,
            base_delay=BASE_DELAY_BETWEEN_REQUESTS,
            save_progress_every=SAVE_PROGRESS_EVERY,
        )

    except ModuleNotFoundError as e:
        print(
            "‚ùå Transcript analysis requires synapse.ml packages. "
            "Run this inside Microsoft Fabric, or set SKIP_ANALYSIS=True.\n"
            f"Details: {e}"
        )
    else:
        # Only report success when the analysis actually ran; printing this
        # right after the failure message above would be misleading.
        print("\n‚úÖ All done!")

# Run the script: the whole pipeline starts as soon as this cell/module is
# evaluated, using the configuration constants defined at the top of the file.
run()

üìÅ Output directory: /lakehouse/default/Files/Ignite2025_All
üì° Fetching Ignite 2025 sessions...
‚úÖ Retrieved 1090 sessions
üîç No filter applied ‚Äî processing all sessions
üîÑ Extracted structured metadata for all sessions
   Total sessions: 1090
   Sessions with slides: 289
   Sessions with video: 502
üíæ Saved JSON metadata: /lakehouse/default/Files/Ignite2025_All/metadata/sessions_metadata.json
üì• Downloading 289 slide decks (workers=3)
   üìä Slides progress: 50/289
   üìä Slides progress: 100/289
   üìä Slides progress: 150/289
   üìä Slides progress: 200/289
   üìä Slides progress: 250/289
‚úÖ Slides complete
   Downloaded: 0
   Already existed: 289
   Failed: 0
üìä Saved download report: /lakehouse/default/Files/Ignite2025_All/download_report.json
üéØ Filtered sessions for VTT download: 1090
   With video: 502
üíæ Saved filtered session metadata: /lakehouse/default/Files/Ignite2025_All/metadata/filtered_sessions.json
üì• Downloading en-US VTT captions for 50