In [1]:
# Method 1: Directly reading Parquet file from URL using Polars
import polars as pl

def nfl_pbp_direct(season: int = 2024) -> pl.DataFrame:
    """
    NFL pbp data from NFLFastR - Direct method

    Reads one season's play-by-play Parquet file straight from the
    nflverse-data GitHub release, so calling this requires network access.

    Args:
        season: Season year inserted into the release file name. Defaults to
            2024, the season this notebook originally hard-coded.

    Returns:
        A Polars DataFrame with the contents of that season's Parquet file.
    """
    nfl_pbp_url = (
        "https://github.com/nflverse/nflverse-data/releases/download/pbp/"
        f"play_by_play_{season}.parquet"
    )
    # Read the Parquet file directly from URL
    return pl.read_parquet(nfl_pbp_url)

# Pull the play-by-play frame and report how many rows it holds
df_direct = nfl_pbp_direct()
row_count_direct = df_direct.shape[0]  # first entry of .shape is the row count (same value as .height)
print(f"Row count (Direct method): {row_count_direct}")


Row count (Direct method): 8193


In [None]:
import dlt
from dlt.sources.helpers import requests
import os
from datetime import datetime, timedelta
import polars as pl
import duckdb
import json

# Configure your Bloombet API key from the environment instead of embedding
# the secret in the notebook. API_KEY is None when BLOOMBET_API_KEY is unset.
# The key that used to be written here should be treated as leaked and rotated.
API_KEY = os.getenv("BLOOMBET_API_KEY")

# Starting datetime for the data retrieval: September 24th, 2024 at 12am
# NOTE(review): the other Bloombet cells in this notebook start on
# September 23rd — confirm which start date is intended.
start_time = datetime(2024, 9, 24, 0, 0, 0)

# Create an in-memory DuckDB database
con = duckdb.connect(database=':memory:')

# Example function to fetch data from the Bloombet API
def fetch_bloombet_data(sport, date):
    """
    Request one historical snapshot from the Bloombet API and return its JSON.

    Args:
        sport: Value sent as the ``sport`` query parameter.
        date: Object providing ``strftime``; sent as the ``date`` query
            parameter formatted as ``YYYY-MM-DD HH:MM:SS``.

    Returns:
        The decoded JSON body of the response.

    Raises:
        Whatever ``response.raise_for_status()`` raises for a failed request.
    """
    # Format the timestamp once; it is used in the logs and in the query
    stamp = date.strftime('%Y-%m-%d %H:%M:%S')
    print(f"Starting API call for {sport} at {stamp}")

    # The call goes through DLT's requests helper (imported as `requests`)
    query = {'api_key': API_KEY, 'sport': sport, 'date': stamp}
    response = requests.get('https://getbloombet.com/api/historical', params=query)

    # Turn an unsuccessful HTTP status into an exception
    response.raise_for_status()

    print(f"API call successful for {sport} at {stamp}")
    return response.json()

# Function to run the process and store data in DuckDB
def run_bloombet_source(sport, start_time):
    """
    Backfill hourly Bloombet snapshots into the module-level DuckDB connection.

    Starting at ``start_time`` and stepping one hour at a time up to the
    current local time, each snapshot is fetched with ``fetch_bloombet_data``,
    converted to a Polars DataFrame and appended to the ``bloombet_nfl_data``
    table on ``con``. An hour that fails is reported and skipped so one bad
    response does not abort the whole backfill. If at least one hour was
    stored, the table's row count and a five-row sample are printed.

    Args:
        sport: Sport identifier forwarded to ``fetch_bloombet_data``.
        start_time: ``datetime`` of the first snapshot to request.

    Returns:
        None. Results are written to DuckDB and summarised on stdout.
    """
    table_name = "bloombet_nfl_data"
    current_time = start_time
    table_ready = False

    while current_time <= datetime.now():  # Fetch data up to the current time
        stamp = current_time.strftime('%Y-%m-%d %H:%M:%S')
        print(f"Fetching data for {stamp}")

        try:
            # Fetch JSON data
            data = fetch_bloombet_data(sport, current_time)
        except Exception as e:
            print(f"Failed to fetch data for {stamp} due to {e}")
        else:
            # Storage problems are reported separately so they are not
            # mistaken for API failures.
            try:
                # `df` is referenced by name inside the SQL statements below
                df = pl.DataFrame(data)

                # Create an empty table with the DataFrame's schema the first
                # time data arrives. IF NOT EXISTS keeps a repeated call on
                # the same connection from failing on the existing table.
                if not table_ready:
                    con.execute(f"CREATE TABLE IF NOT EXISTS {table_name} AS SELECT * FROM df LIMIT 0")
                    table_ready = True

                # Insert the Polars DataFrame into DuckDB
                con.execute(f"INSERT INTO {table_name} SELECT * FROM df")
                print(f"Data inserted for {stamp}")
            except Exception as e:
                print(f"Failed to store data for {stamp} due to {e}")

        # Move to the next hour
        current_time += timedelta(hours=1)

    # Without a single stored hour the table was never created by this call,
    # so querying it would raise; report that instead.
    if not table_ready:
        print(f"No data was stored in {table_name}; nothing to summarise")
        return

    # Print total number of rows and a sample of 5 rows from the DuckDB table
    result = con.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0]
    print(f"Total number of rows in {table_name}: {result}")
    sample = con.execute(f"SELECT * FROM {table_name} LIMIT 5").fetchdf()
    print("Sample of 5 rows:")
    print(sample)

# Kick off the hourly NFL backfill, beginning at the configured start_time
run_bloombet_source(start_time=start_time, sport='nfl')


In [12]:
import dlt
from dlt.sources.helpers import requests
import os
from datetime import datetime, timedelta
import polars as pl
import json

# Configure your Bloombet API key from the environment instead of embedding
# the secret in the notebook. API_KEY is None when BLOOMBET_API_KEY is unset.
# The key that used to be written here should be treated as leaked and rotated.
API_KEY = os.getenv("BLOOMBET_API_KEY")

# Starting datetime for the data retrieval: September 23rd, 2024 at 12am
start_time = datetime(2024, 9, 23, 0, 0, 0)

# Example function to fetch data from the Bloombet API
def fetch_bloombet_data(sport, date):
    """
    Request one historical snapshot from the Bloombet API and return its JSON.

    Args:
        sport: Value sent as the ``sport`` query parameter.
        date: Object providing ``strftime``; sent as the ``date`` query
            parameter formatted as ``YYYY-MM-DD HH:MM:SS``.

    Returns:
        The decoded JSON body of the response.

    Raises:
        Whatever ``response.raise_for_status()`` raises for a failed request.
    """
    # Format the timestamp once; it is used in the logs and in the query
    stamp = date.strftime('%Y-%m-%d %H:%M:%S')
    print(f"Starting API call for {sport} at {stamp}")

    # The call goes through DLT's requests helper (imported as `requests`)
    query = {'api_key': API_KEY, 'sport': sport, 'date': stamp}
    response = requests.get('https://getbloombet.com/api/historical', params=query)

    # Turn an unsuccessful HTTP status into an exception
    response.raise_for_status()

    print(f"API call successful for {sport} at {stamp}")
    return response.json()

# Function to run the process and aggregate all results into one Polars DataFrame
def run_bloombet_source(sport, start_time):
    """
    Fetch hourly Bloombet snapshots and aggregate them into one Polars DataFrame.

    Starting at ``start_time`` and stepping one hour at a time up to the
    current local time, each snapshot is fetched with ``fetch_bloombet_data``,
    converted to a Polars DataFrame and concatenated onto the running
    aggregate. An hour that fails is reported and skipped. At the end the
    aggregate's row count and first five rows are printed.

    Args:
        sport: Sport identifier forwarded to ``fetch_bloombet_data``.
        start_time: ``datetime`` of the first snapshot to request.

    Returns:
        None. The aggregate is only summarised on stdout.
    """
    current_time = start_time

    # Initialize an empty Polars DataFrame for aggregation
    aggregated_df = pl.DataFrame()

    while current_time <= datetime.now():  # Fetch data up to the current time
        stamp = current_time.strftime('%Y-%m-%d %H:%M:%S')
        print(f"Fetching data for {stamp}")

        try:
            # Fetch JSON data
            data = fetch_bloombet_data(sport, current_time)
        except Exception as e:
            print(f"Failed to fetch data for {stamp} due to {e}")
        else:
            # Conversion/concat problems are reported separately so they are
            # not mistaken for API failures.
            try:
                # Convert the JSON object to a Polars DataFrame
                df = pl.DataFrame(data)

                # rechunk=False: making the result contiguous on every
                # iteration would re-copy all accumulated rows each hour;
                # that is done once after the loop instead.
                aggregated_df = pl.concat([aggregated_df, df], rechunk=False)

                print(f"Data for {stamp} added to aggregated dataframe")
            except Exception as e:
                print(f"Failed to aggregate data for {stamp} due to {e}")

        # Move to the next hour
        current_time += timedelta(hours=1)

    # Consolidate the accumulated chunks into contiguous memory a single time
    aggregated_df = aggregated_df.rechunk()

    # Print total number of rows and a sample of 5 rows from the aggregated DataFrame
    print(f"Total number of rows in aggregated DataFrame: {aggregated_df.height}")
    print("Sample of 5 rows:")
    print(aggregated_df.head(5))

# Kick off the hourly NFL aggregation, beginning at the configured start_time
run_bloombet_source(start_time=start_time, sport='nfl')


Fetching data for 2024-09-23 00:00:00
Starting API call for nfl at 2024-09-23 00:00:00
API call successful for nfl at 2024-09-23 00:00:00
Data for 2024-09-23 00:00:00 added to aggregated dataframe
Fetching data for 2024-09-23 01:00:00
Starting API call for nfl at 2024-09-23 01:00:00
API call successful for nfl at 2024-09-23 01:00:00
Data for 2024-09-23 01:00:00 added to aggregated dataframe
Fetching data for 2024-09-23 02:00:00
Starting API call for nfl at 2024-09-23 02:00:00
API call successful for nfl at 2024-09-23 02:00:00
Data for 2024-09-23 02:00:00 added to aggregated dataframe
Fetching data for 2024-09-23 03:00:00
Starting API call for nfl at 2024-09-23 03:00:00
API call successful for nfl at 2024-09-23 03:00:00
Data for 2024-09-23 03:00:00 added to aggregated dataframe
Fetching data for 2024-09-23 04:00:00
Starting API call for nfl at 2024-09-23 04:00:00
API call successful for nfl at 2024-09-23 04:00:00
Data for 2024-09-23 04:00:00 added to aggregated dataframe
Fetching data f

In [1]:
import dlt
from dlt.sources.helpers import requests
import os
from datetime import datetime, timedelta
import polars as pl
import json
from dotenv import load_dotenv

# Pull variables defined in a .env file into the process environment
load_dotenv()

# The Bloombet API key is taken from the environment, not from the notebook source
API_KEY = os.environ.get("BLOOMBET_API_KEY")

# First snapshot to request: midnight on September 23rd, 2024
start_time = datetime(year=2024, month=9, day=23)

# Example function to fetch data from the Bloombet API
def fetch_bloombet_data(sport, date):
    """
    Request one historical snapshot from the Bloombet API and return its JSON.

    Args:
        sport: Value sent as the ``sport`` query parameter.
        date: Object providing ``strftime``; sent as the ``date`` query
            parameter formatted as ``YYYY-MM-DD HH:MM:SS``.

    Returns:
        The decoded JSON body of the response.

    Raises:
        Whatever ``response.raise_for_status()`` raises for a failed request.
    """
    # Format the timestamp once; it is used in the logs and in the query
    stamp = date.strftime('%Y-%m-%d %H:%M:%S')
    print(f"Starting API call for {sport} at {stamp}")

    # The call goes through DLT's requests helper (imported as `requests`)
    query = {'api_key': API_KEY, 'sport': sport, 'date': stamp}
    response = requests.get('https://getbloombet.com/api/historical', params=query)

    # Turn an unsuccessful HTTP status into an exception
    response.raise_for_status()

    print(f"API call successful for {sport} at {stamp}")
    return response.json()

# Function to run the process and aggregate all results into one Polars DataFrame
def run_bloombet_source(sport, start_time):
    """
    Fetch hourly Bloombet snapshots and aggregate them into one Polars DataFrame.

    Starting at ``start_time`` and stepping one hour at a time up to the
    current local time, each snapshot is fetched with ``fetch_bloombet_data``,
    converted to a Polars DataFrame and concatenated onto the running
    aggregate. An hour that fails is reported and skipped. At the end the
    aggregate's row count and first five rows are printed.

    Args:
        sport: Sport identifier forwarded to ``fetch_bloombet_data``.
        start_time: ``datetime`` of the first snapshot to request.

    Returns:
        None. The aggregate is only summarised on stdout.
    """
    current_time = start_time

    # Initialize an empty Polars DataFrame for aggregation
    aggregated_df = pl.DataFrame()

    while current_time <= datetime.now():  # Fetch data up to the current time
        stamp = current_time.strftime('%Y-%m-%d %H:%M:%S')
        print(f"Fetching data for {stamp}")

        try:
            # Fetch JSON data
            data = fetch_bloombet_data(sport, current_time)
        except Exception as e:
            print(f"Failed to fetch data for {stamp} due to {e}")
        else:
            # Conversion/concat problems are reported separately so they are
            # not mistaken for API failures.
            try:
                # Convert the JSON object to a Polars DataFrame
                df = pl.DataFrame(data)

                # rechunk=False: making the result contiguous on every
                # iteration would re-copy all accumulated rows each hour;
                # that is done once after the loop instead.
                aggregated_df = pl.concat([aggregated_df, df], rechunk=False)

                print(f"Data for {stamp} added to aggregated dataframe")
            except Exception as e:
                print(f"Failed to aggregate data for {stamp} due to {e}")

        # Move to the next hour
        current_time += timedelta(hours=1)

    # Consolidate the accumulated chunks into contiguous memory a single time
    aggregated_df = aggregated_df.rechunk()

    # Print total number of rows and a sample of 5 rows from the aggregated DataFrame
    print(f"Total number of rows in aggregated DataFrame: {aggregated_df.height}")
    print("Sample of 5 rows:")
    print(aggregated_df.head(5))

# Kick off the hourly NFL aggregation, beginning at the configured start_time
run_bloombet_source(start_time=start_time, sport='nfl')


Fetching data for 2024-09-23 00:00:00
Starting API call for nfl at 2024-09-23 00:00:00
API call successful for nfl at 2024-09-23 00:00:00
Data for 2024-09-23 00:00:00 added to aggregated dataframe
Fetching data for 2024-09-23 01:00:00
Starting API call for nfl at 2024-09-23 01:00:00
API call successful for nfl at 2024-09-23 01:00:00
Data for 2024-09-23 01:00:00 added to aggregated dataframe
Fetching data for 2024-09-23 02:00:00
Starting API call for nfl at 2024-09-23 02:00:00
API call successful for nfl at 2024-09-23 02:00:00
Data for 2024-09-23 02:00:00 added to aggregated dataframe
Fetching data for 2024-09-23 03:00:00
Starting API call for nfl at 2024-09-23 03:00:00
API call successful for nfl at 2024-09-23 03:00:00
Data for 2024-09-23 03:00:00 added to aggregated dataframe
Fetching data for 2024-09-23 04:00:00
Starting API call for nfl at 2024-09-23 04:00:00
API call successful for nfl at 2024-09-23 04:00:00
Data for 2024-09-23 04:00:00 added to aggregated dataframe
Fetching data f