In [1]:
# Method 1: Directly reading Parquet file from URL using Polars
import polars as pl

def nfl_pbp_direct(season: int = 2024) -> pl.DataFrame:
    """
    Load one season of NFL play-by-play data straight from its remote Parquet file.

    The file is the `play_by_play_<season>.parquet` asset attached to the `pbp`
    release of the nflverse/nflverse-data GitHub repository; it is read directly
    from the URL with Polars, without saving a local copy.

    Args:
        season: Season year used to build the file name. Defaults to 2024,
            which is the file this function always loaded before the
            parameter existed.

    Returns:
        A Polars DataFrame holding the contents of the Parquet file.
    """
    pbp_url = (
        "https://github.com/nflverse/nflverse-data/releases/download/pbp/"
        f"play_by_play_{season}.parquet"
    )
    # Read the Parquet file directly from URL
    return pl.read_parquet(pbp_url)

# Load the play-by-play frame, then report how many rows came back
df_direct = nfl_pbp_direct()
row_count_direct = df_direct.shape[0]  # shape is (rows, columns); the first entry equals .height
print(f"Row count (Direct method): {row_count_direct}")


Row count (Direct method): 8193


In [None]:
import dlt
from dlt.sources.helpers import requests
import os
from datetime import datetime, timedelta
import polars as pl
import duckdb
import json

# Bloombet API key, read from the environment so the secret is never stored in the notebook.
# Set BLOOMBET_API_KEY before starting the kernel. A key was previously hardcoded on this
# line; treat that key as exposed and rotate it.
API_KEY = os.environ.get("BLOOMBET_API_KEY")
if not API_KEY:
    # Fail here, once, instead of letting every hourly request fail later with an auth error.
    raise RuntimeError(
        "BLOOMBET_API_KEY is not set; export it before running this cell."
    )

# Starting datetime for the data retrieval: September 24th, 2024 at 12am
# NOTE(review): the earlier comment said September 23rd while the code used the 24th;
# the code value is kept unchanged - confirm which date is intended.
start_time = datetime(2024, 9, 24, 0, 0, 0)

# Create an in-memory DuckDB database
con = duckdb.connect(database=':memory:')

# Example function to fetch data from the Bloombet API
def fetch_bloombet_data(sport, date):
    """
    Request one historical snapshot from the Bloombet API and return its JSON body.

    Args:
        sport: Value sent as the `sport` query parameter.
        date: Object with a `strftime` method (the caller passes a `datetime`);
            it is sent as the `date` query parameter formatted as
            `YYYY-MM-DD HH:MM:SS`.

    Returns:
        The decoded JSON payload of the response.

    Raises:
        Whatever `response.raise_for_status()` raises for an unsuccessful
        HTTP status.
    """
    stamp = date.strftime('%Y-%m-%d %H:%M:%S')
    print(f"Starting API call for {sport} at {stamp}")

    query = {
        'api_key': API_KEY,
        'sport': sport,
        'date': stamp,
    }
    # `requests` here is the helper imported from dlt, not the plain requests package
    reply = requests.get('https://getbloombet.com/api/historical', params=query)

    # Surface HTTP error statuses as exceptions instead of returning an error body
    reply.raise_for_status()

    print(f"API call successful for {sport} at {stamp}")
    return reply.json()

# Function to run the process and store data in DuckDB
def run_bloombet_source(sport, start_time):
    """
    Pull hourly Bloombet snapshots into DuckDB and print a short summary.

    Starting at `start_time`, calls `fetch_bloombet_data` once per hour until
    the current time is passed. Each JSON payload is converted to a Polars
    DataFrame and appended to the `bloombet_nfl_data` table on the module-level
    DuckDB connection `con`. The table is created from the first payload that
    loads successfully.

    A failure for a single hour (request, conversion or insert) is printed and
    skipped so the remaining hours are still attempted. If no hour loads at all
    the table never exists, so the summary is skipped instead of querying a
    missing table.

    Args:
        sport: Sport identifier forwarded to `fetch_bloombet_data`.
        start_time: `datetime` of the first hourly snapshot to request.

    Returns:
        None. Progress, the total row count and a 5-row sample are printed.
    """
    # NOTE(review): the table name is fixed and does not vary with `sport`.
    table_name = "bloombet_nfl_data"
    current_time = start_time
    table_created = False

    while current_time <= datetime.now():  # Fetch data up to the current time
        stamp = current_time.strftime('%Y-%m-%d %H:%M:%S')
        print(f"Fetching data for {stamp}")

        try:
            # Fetch JSON data
            data = fetch_bloombet_data(sport, current_time)

            # Convert the JSON object to a Polars DataFrame.
            # The local name `df` matters: the SQL below refers to it by name.
            df = pl.DataFrame(data)

            # On the first successful fetch, create an empty table with the DataFrame's schema
            if not table_created:
                con.execute(f"CREATE TABLE {table_name} AS SELECT * FROM df LIMIT 0")
                table_created = True

            # Insert the Polars DataFrame into DuckDB
            con.execute(f"INSERT INTO {table_name} SELECT * FROM df")

            print(f"Data inserted for {stamp}")

        except Exception as e:
            # Deliberate best-effort: report the failed hour and keep going
            print(f"Failed to fetch data for {stamp} due to {e}")

        # Move to the next hour
        current_time += timedelta(hours=1)

    # Nothing was loaded (every hour failed, or start_time is in the future):
    # the table does not exist, so querying it would raise.
    if not table_created:
        print(f"No data was loaded; table {table_name} was not created.")
        return

    # Print total number of rows and a sample of 5 rows from the DuckDB table
    result = con.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0]
    print(f"Total number of rows in {table_name}: {result}")
    sample = con.execute(f"SELECT * FROM {table_name} LIMIT 5").fetchdf()
    print("Sample of 5 rows:")
    print(sample)

# Kick off the hourly NFL load, beginning at the configured start time
run_bloombet_source(start_time=start_time, sport='nfl')


In [2]:
import bauplan

In [84]:
# SQL text for the crime breakdown query.
# Reads crime_table_renamed, keeps rows whose complaint_from_date falls in
# December 2023 and whose offense_description contains 'RAPE' or 'SEX CRIME',
# then counts rows per combination of victim attributes (age group, race, sex),
# borough, location of occurrence, suspect attributes (age group, race, sex)
# and offense description. Results are ordered by the count, largest first.
sql_query = """
SELECT 
    victim_age_group,
    victim_race,
    victim_sex,
    borough_name,
    location_of_occurrence,
    suspect_age_group,
    suspect_race,
    suspect_sex,
    offense_description,
    COUNT(*) AS crime_count
FROM 
    crime_table_renamed
WHERE 
    EXTRACT(YEAR FROM complaint_from_date) = 2023 AND
    EXTRACT(MONTH FROM complaint_from_date) = 12 AND
    (offense_description LIKE '%RAPE%' OR offense_description LIKE '%SEX CRIME%')
GROUP BY 
    victim_age_group,
    victim_race,
    victim_sex,
    borough_name,
    location_of_occurrence,
    suspect_age_group,
    suspect_race,
    suspect_sex,
    offense_description
ORDER BY 
    crime_count DESC
"""

In [85]:
# Branch the query is executed against
current_branch = "main"
# SDK client used to submit the query
client = bauplan.Client()

# Run the SQL on that branch; the result is returned as an Arrow table
table = client.query(
    sql_query,
    branch_name=current_branch,
    max_rows=None,
)
# Turn the Arrow table into a pandas DataFrame
df = table.to_pandas()
# Leave the DataFrame as the last expression so the notebook renders it
df

Unnamed: 0,victim_age_group,victim_race,victim_sex,borough_name,location_of_occurrence,suspect_age_group,suspect_race,suspect_sex,offense_description,crime_count
0,25-44,WHITE,F,QUEENS,INSIDE,45-64,WHITE,M,RAPE,48
1,<18,WHITE HISPANIC,F,BROOKLYN,INSIDE,25-44,WHITE HISPANIC,M,RAPE,24
2,<18,ASIAN / PACIFIC ISLANDER,F,MANHATTAN,(null),<18,UNKNOWN,M,RAPE,24
3,45-64,WHITE HISPANIC,F,BRONX,INSIDE,45-64,BLACK HISPANIC,M,RAPE,24
4,<18,BLACK,F,BROOKLYN,INSIDE,25-44,BLACK,M,RAPE,24
...,...,...,...,...,...,...,...,...,...,...
103,<18,WHITE,F,QUEENS,INSIDE,18-24,WHITE,M,SEX CRIMES,1
104,45-64,BLACK HISPANIC,F,QUEENS,INSIDE,UNKNOWN,BLACK,M,SEX CRIMES,1
105,<18,ASIAN / PACIFIC ISLANDER,F,QUEENS,FRONT OF,45-64,ASIAN / PACIFIC ISLANDER,M,SEX CRIMES,1
106,<18,WHITE HISPANIC,M,BROOKLYN,INSIDE,25-44,BLACK,M,SEX CRIMES,1
