# CFBD Data Ingestion: REST vs GraphQL

This notebook demonstrates how to fetch college football data using both REST and GraphQL APIs from CollegeFootballData.com.

**Key Topics:**
- Comparing REST and GraphQL fetch speeds
- Advanced GraphQL querying with nested data
- Field mapping (camelCase → snake_case)
- Configuration and authentication setup

**Requirements:**
- `CFBD_API_KEY` environment variable
- For GraphQL: Patreon Tier 3+ subscription


## Configuration

**Environment Variables:**
- `CFBD_API_KEY`: Your CollegeFootballData.com API key (required)
- GraphQL access requires a **Patreon Tier 3+ subscription**

**How to verify GraphQL access:**
1. Check your Patreon subscription tier at collegefootballdata.com
2. Ensure your API key has Tier 3+ permissions
3. GraphQL will automatically fall back to REST if unavailable

**Fallback behavior:**
- If GraphQL client initialization fails → uses REST API
- If GraphQL query fails → automatically falls back to REST
- All data is normalized to snake_case for ML model compatibility


In [None]:
import os
import sys
import time
from pathlib import Path
import pandas as pd

# Make the project root importable so the `src` package resolves.
# `__file__` only exists when this code runs as a script; in a notebook kernel
# it is absent, so the parent of the current working directory is used instead.
if '__file__' in globals():
    project_root = Path(__file__).parent.parent.parent
else:
    project_root = Path.cwd().parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

# The GraphQL client is optional: when its module cannot be imported, record
# that fact and carry on with REST only.
try:
    from src.data_sources.cfbd_graphql import CFBDGraphQLClient
except ImportError:
    CFBDGraphQLClient = None
    GQL_AVAILABLE = False
    print("‚ö†Ô∏è GraphQL client not available - will use REST API only")
else:
    GQL_AVAILABLE = True

# The REST client is mandatory; an ImportError here is left to propagate.
from cfbd import Configuration, ApiClient, GamesApi

# Read the API key, trying CFBD_API_KEY first and CFBD_API_TOKEN second.
api_key = os.environ.get("CFBD_API_KEY")
if not api_key:
    api_key = os.environ.get("CFBD_API_TOKEN")
if not api_key:
    raise ValueError("CFBD_API_KEY environment variable required")

# GraphQL state starts as "unavailable" and is upgraded only after the client
# has been constructed and a trivial query has returned a truthy result.
graphql_client = None
graphql_available = False

if not GQL_AVAILABLE or CFBDGraphQLClient is None:
    print("‚ö†Ô∏è GraphQL client not available - using REST API (FALLBACK METHOD)")
    print("   (GraphQL requires Patreon Tier 3+ access)")
else:
    try:
        graphql_client = CFBDGraphQLClient(api_key=api_key, host="production")
        # Smallest possible query, used purely as a connectivity probe.
        connectivity_test = graphql_client.query("query { __typename }", {})
        if not connectivity_test:
            print("‚ö†Ô∏è GraphQL connectivity test failed - using REST as fallback")
        else:
            graphql_available = True
            print("‚úÖ GraphQL client initialized (PRIMARY METHOD)")
            print("‚úÖ GraphQL connectivity test passed")
    except Exception as e:
        print(f"‚ö†Ô∏è GraphQL client initialization failed: {e}")
        print("‚ö†Ô∏è Falling back to REST API (FALLBACK METHOD)")
        # The flag may already have been set before the failure; reset both.
        graphql_available = False
        graphql_client = None

# REST client, always created regardless of the GraphQL outcome.
configuration = Configuration()
configuration.access_token = api_key
configuration.host = "https://api.collegefootballdata.com"
rest_client = ApiClient(configuration)
games_api = GamesApi(rest_client)

# Summarise which access method the rest of the notebook will treat as primary.
print("\nConfiguration:")
print("  Method: " + ("GraphQL (PRIMARY)" if graphql_available else "REST (FALLBACK)"))
print("  Season: 2025, Week: 12")


## Comparing Fetch Speeds: REST vs GraphQL

This section demonstrates the performance differences between REST and GraphQL APIs.

**When to use REST:**
- Simple data fetching needs
- No Patreon subscription
- Bulk historical data
- Standard use cases

**When to use GraphQL:**
- Need specific fields only (reduces payload size)
- Nested data relationships (recruiting + game data)
- Real-time subscriptions (WebSocket support)
- Patreon Tier 3+ access
- Advanced querying requirements


In [None]:
# Compare REST vs GraphQL fetch speeds
#
# Issues one request for the same season/week through each API, records how
# long each call took, builds a DataFrame from each response, and prints the
# relative difference. A timing of None means "that fetch did not produce a
# usable result"; the comparison is skipped for that side.
season = 2025
week = 12

# REST API fetch
print("Fetching via REST API...")
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() follows the wall clock, which can be adjusted
# while the request is in flight and skew the measurement.
rest_start = time.perf_counter()
try:
    rest_games = games_api.get_games(year=season, week=week)
    rest_time = time.perf_counter() - rest_start
    print(f"‚úÖ REST: Fetched {len(rest_games)} games in {rest_time:.3f}s")
    rest_df = pd.DataFrame([g.to_dict() for g in rest_games])
except Exception as e:
    print(f"‚ùå REST fetch failed: {e}")
    rest_time = None
    rest_df = pd.DataFrame()

# GraphQL API fetch (if available)
if graphql_available and graphql_client:
    print("\nFetching via GraphQL API...")
    gql_start = time.perf_counter()
    try:
        gql_result = graphql_client.get_scoreboard(season=season, week=week)
        gql_time = time.perf_counter() - gql_start
        if gql_result and "game" in gql_result:
            gql_games = gql_result["game"]
            print(f"‚úÖ GraphQL: Fetched {len(gql_games)} games in {gql_time:.3f}s")

            # Convert to DataFrame (with field mapping)
            gql_df = pd.json_normalize(gql_games)
            # Map camelCase to snake_case; columns not listed here keep
            # whatever name json_normalize produced.
            column_mapping = {
                "homeTeam": "home_team",
                "awayTeam": "away_team",
                "homePoints": "home_points",
                "awayPoints": "away_points",
                "seasonType": "season_type",
                "startDate": "start_date",
            }
            gql_df = gql_df.rename(columns=column_mapping)
        else:
            print("‚ö†Ô∏è GraphQL returned no data")
            # A response without games is not a meaningful timing sample.
            gql_time = None
            gql_df = pd.DataFrame()
    except Exception as e:
        print(f"‚ö†Ô∏è GraphQL fetch failed: {e} - falling back to REST")
        gql_time = None
        gql_df = pd.DataFrame()
else:
    print("\n‚ö†Ô∏è GraphQL not available - skipping comparison")
    gql_time = None
    gql_df = pd.DataFrame()

# Performance comparison
# Test against None explicitly: a measured duration of 0.0 is a valid sample
# and must not be mistaken for "no measurement".
if rest_time is not None and gql_time is not None:
    print("\nüìä Performance Comparison:")
    print(f"  REST:   {rest_time:.3f}s")
    print(f"  GraphQL: {gql_time:.3f}s")
    # The percentage is relative to the REST time, so it is only defined when
    # that time is non-zero.
    if rest_time > 0:
        if gql_time < rest_time:
            speedup = ((rest_time - gql_time) / rest_time) * 100
            print(f"  GraphQL is {speedup:.1f}% faster")
        else:
            slowdown = ((gql_time - rest_time) / rest_time) * 100
            print(f"  GraphQL is {slowdown:.1f}% slower")
elif rest_time is not None:
    print(f"\nüìä REST Performance: {rest_time:.3f}s")
elif gql_time is not None:
    print(f"\nüìä GraphQL Performance: {gql_time:.3f}s")


## Advanced Querying

GraphQL allows you to request specific fields and nested data that are difficult to get via REST. This section demonstrates advanced querying patterns.


In [None]:
# Example 1: request only a handful of fields per game (smaller payload).
# Runs only when the GraphQL client was initialised successfully; otherwise a
# notice is printed and nothing is fetched.
if not (graphql_available and graphql_client):
    print("‚ö†Ô∏è GraphQL not available for advanced querying")
else:
    print("Example 1: Fetching only essential fields...")
    # The selection set lists exactly the five fields that are reported below.
    minimal_query = """
    query MinimalScoreboard($season: Int!, $week: smallint) {
      game(
        where: {
          season: { _eq: $season }
          week: { _eq: $week }
        }
        limit: 5
      ) {
        id
        homeTeam
        awayTeam
        homePoints
        awayPoints
      }
    }
    """
    try:
        minimal_result = graphql_client.query(minimal_query, dict(season=2025, week=12))
        # Report only when the response actually carries a "game" entry.
        if minimal_result and "game" in minimal_result:
            print(f"‚úÖ Fetched {len(minimal_result['game'])} games with minimal fields")
            print("   Fields: id, homeTeam, awayTeam, homePoints, awayPoints")
            print("   Sample: {}".format(
                minimal_result['game'][0] if minimal_result['game'] else 'No games'
            ))
    except Exception as e:
        print(f"‚ö†Ô∏è Minimal query failed: {e}")


In [None]:
# Example 2: recruiting data for one team, including a nested position field.
# Skipped with a notice when the GraphQL client is not usable.
if not (graphql_available and graphql_client):
    print("‚ö†Ô∏è GraphQL not available for recruiting queries")
else:
    print("Example 2: Fetching recruiting data for a team...")
    try:
        recruiting_result = graphql_client.get_recruits(season=2025, team="Ohio State", limit=5)
        if not recruiting_result or "recruit" not in recruiting_result:
            print("‚ö†Ô∏è No recruiting data returned")
        else:
            recruits = recruiting_result["recruit"]
            print(f"‚úÖ Fetched {len(recruits)} recruits for Ohio State")
            if recruits:
                # Show the first recruit as a representative sample.
                sample = recruits[0]
                print(f"   Sample recruit: {sample.get('name', 'N/A')} - {sample.get('stars', 'N/A')} stars")
                # The position is only read when it arrives as a nested
                # mapping; any other shape is reported as N/A.
                if isinstance(sample.get('position'), dict):
                    print(f"   Position: {sample.get('position', {}).get('position', 'N/A')}")
                else:
                    print("   Position: N/A")
    except Exception as e:
        print(f"‚ö†Ô∏è Recruiting query failed: {e}")


In [None]:
# Example 3: one GraphQL document that selects both a game and recruits.
# A single request returns the team's home game for the given week together
# with three recruits ordered by rating, descending.
if not (graphql_available and graphql_client):
    print("‚ö†Ô∏è GraphQL not available for combined queries")
else:
    print("Example 3: Combining game and recruiting data in one query...")
    combined_query = """
    query GameWithRecruiting($season: Int!, $week: smallint, $team: String!) {
      game(
        where: {
          season: { _eq: $season }
          week: { _eq: $week }
          homeTeam: { _eq: $team }
        }
        limit: 1
      ) {
        id
        homeTeam
        awayTeam
        homePoints
        awayPoints
      }
      recruit(
        where: {
          year: { _eq: $season }
          college: { school: { _eq: $team } }
        }
        limit: 3
        order_by: { rating: desc }
      ) {
        name
        stars
        rating
        position {
          position
        }
      }
    }
    """
    try:
        combined_result = graphql_client.query(
            combined_query,
            dict(season=2025, week=12, team="Ohio State"),
        )
        if combined_result:
            # Either root field may be absent; treat a missing one as empty.
            games = combined_result.get("game", [])
            recruits = combined_result.get("recruit", [])
            print("‚úÖ Combined query results:")
            print(f"   Games: {len(games)}")
            print(f"   Top recruits: {len(recruits)}")
            if games:
                print(f"   Game: {games[0].get('homeTeam')} vs {games[0].get('awayTeam')}")
            if recruits:
                print(f"   Top recruit: {recruits[0].get('name')} ({recruits[0].get('stars')} stars)")
    except Exception as e:
        print(f"‚ö†Ô∏è Combined query failed: {e}")
