[Reference](https://towardsdatascience.com/sql-on-pandas-usign-duckdb-f7cd238a0a5a)

In [1]:
pip install duckdb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting duckdb
  Downloading duckdb-0.7.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (15.2 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 15.2/15.2 MB 49.0 MB/s eta 0:00:00
Installing collected packages: duckdb
Successfully installed duckdb-0.7.1


In [2]:
import pandas as pd
import numpy as np

# Seed a dedicated generator so that Restart & Run All rebuilds the exact same dataset.
SEED = 42
rng = np.random.default_rng(SEED)

# Define the number of rows in the dataset
num_rows = 10_000_000

# Generate a random longitude for each row, drawn uniformly from [-94.0, -38.0).
# `low` must be the smaller bound: NumPy documents the result as officially
# undefined when high < low.
pickup_longitude = rng.uniform(low=-94.0, high=-38.0, size=num_rows)

# Generate a random trip duration for each row (normal, mean 10, std 5).
# NOTE(review): with these parameters a small share of durations is negative;
# presumably acceptable for a benchmark dataset -- confirm.
trip_duration = rng.normal(loc=10, scale=5, size=num_rows)

# Create a DataFrame with the pickup longitude and trip duration columns
df = pd.DataFrame(
    {"pickup_longitude": pickup_longitude, "trip_duration": trip_duration}
)

In [4]:
import time

def timing_decorator(func):
    """Decorator that prints how long each call to ``func`` takes.

    Args:
        func: The callable to time.

    Returns:
        A wrapper that forwards all positional and keyword arguments to
        ``func``, prints the elapsed time in seconds, and returns ``func``'s
        result unchanged. Nothing is printed if ``func`` raises.
    """
    import functools  # local import keeps this cell self-contained

    @functools.wraps(func)  # keep __name__ / __doc__ of the timed function
    def wrapper(*args, **kwargs):
        # perf_counter is a monotonic, high-resolution clock meant for
        # measuring intervals; time.time() can jump if the system clock changes.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        end_time = time.perf_counter()
        print(f"Function {func.__name__} took {end_time - start_time} seconds to run.")
        return result

    return wrapper

In [5]:
@timing_decorator
def find_avg_trip_duration_in_the_west():
    """Mean ``trip_duration`` of the rows in ``df`` whose ``pickup_longitude`` is below -73.95 (pandas)."""
    west_mask = df["pickup_longitude"] < -73.95
    west_durations = df.loc[west_mask, "trip_duration"]
    return west_durations.mean()

find_avg_trip_duration_in_the_west()

Function find_avg_trip_duration_in_the_west took 0.23928236961364746 seconds to run.


9.998598864949027

In [7]:
import duckdb

@timing_decorator
def find_avg_trip_duration_in_the_west():
    """Run the average-trip-duration query through DuckDB and return the result as a DataFrame.

    The SQL text names ``df`` as its table and filters on
    ``pickup_longitude < -73.95``; the result is converted with ``.df()``.
    """
    query = 'SELECT AVG(trip_duration) FROM df WHERE pickup_longitude < -73.95'
    executed = duckdb.execute(query)
    return executed.df()

find_avg_trip_duration_in_the_west()

Function find_avg_trip_duration_in_the_west took 0.1311047077178955 seconds to run.


Unnamed: 0,avg(trip_duration)
0,9.998599


In [8]:
import sqlite3

# Open (or create) the on-disk SQLite database used for the comparison.
conn = sqlite3.connect("taxi.db")

# taxi.db persists between runs, so replace any existing "trips" table;
# the default if_exists="fail" raises ValueError when this cell is re-run.
df.to_sql("trips", conn, if_exists="replace")

@timing_decorator
def find_avg_trip_duration_in_the_west():
    """Average ``trip_duration`` over rows of the SQLite ``trips`` table with ``pickup_longitude < -73.95``.

    Returns:
        The single value produced by the ``AVG`` query (first column of the
        first result row).
    """
    cursor = conn.cursor()
    try:
        cursor.execute(
            "SELECT AVG(trip_duration) FROM trips WHERE pickup_longitude < -73.95"
        )
        return cursor.fetchone()[0]
    finally:
        # Release the cursor even if the query or fetch fails.
        cursor.close()

find_avg_trip_duration_in_the_west()

Function find_avg_trip_duration_in_the_west took 0.8607945442199707 seconds to run.


9.99859886494924