[Reference](https://medium.com/@tubelwj/explore-the-speed-of-duckdb-with-hundred-million-row-csv-files-9fd64d4e4105)

In [2]:
!pip install faker

Collecting faker
  Downloading Faker-26.1.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-26.1.0-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-26.1.0


In [4]:
import pandas as pd
from faker import Faker
import numpy as np
import random

# Initialize Faker
fake = Faker()

# Define a function to generate data
def generate_data(n):
    unique_cities_list = [fake.city() for _ in range(1000)]  # Directly call the city() method
    data = {
        'city': [fake.random_element(elements=unique_cities_list) for _ in range(n)],
        'low_temp': [fake.random_int(min=-50, max=50) for _ in range(n)],  # Assume temperature range is -50 to 50 degrees Celsius
        'high_temp': [fake.random_int(min=-50, max=50) for _ in range(n)],
        'precipitation': [round(random.uniform(0, 100), 2) for _ in range(n)],  # Assume precipitation range is 0 to 100 mm
        'humidity': [round(random.uniform(0, 100), 2) for _ in range(n)],  # Assume humidity range is 0% to 100%
        'pressure': [fake.random_int(min=950, max=1050) for _ in range(n)]  # Assume pressure range is 950 to 1050 hPa
    }
    return pd.DataFrame(data)

# Generate 100 million records
n = 1000
df = generate_data(n)

# Due to the large amount of data, we choose to write the data to files
# Writing to CSV files, using chunked writing to avoid running out of memory
chunksize = 1000  # Define the size of each data chunk
for i, chunk in enumerate(np.array_split(df, n // chunksize)):
    chunk.to_csv(f'weather_data_part_{i}.csv', index=False)

print(f'Generated {n} records and saved to CSV files.')

Generated 1000 records and saved to CSV files.


  return bound(*args, **kwds)


In [9]:
import duckdb
from datetime import datetime

start_time = datetime.now()  # recording the start time
data = duckdb.read_csv("./weather_data_part_0.csv")
print(duckdb.sql("select count(*) from data"))
end_time = datetime.now()  # recording the end time
execution_time = (end_time - start_time).total_seconds()
print(f"Execution time: {execution_time} seconds")

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│         1000 │
└──────────────┘

Execution time: 0.02348 seconds


In [10]:
print(duckdb.sql("select city, count(low_temp), count(high_temp) from data group by city limit 10;"))

┌─────────────────┬─────────────────┬──────────────────┐
│      city       │ count(low_temp) │ count(high_temp) │
│     varchar     │      int64      │      int64       │
├─────────────────┼─────────────────┼──────────────────┤
│ New Williambury │               2 │                2 │
│ Dyerview        │               2 │                2 │
│ Pamelaville     │               4 │                4 │
│ North Gary      │               2 │                2 │
│ Katherinemouth  │               1 │                1 │
│ Diazland        │               2 │                2 │
│ Port Johntown   │               3 │                3 │
│ Reginaberg      │               3 │                3 │
│ Jamesland       │               1 │                1 │
│ Lake Trevorton  │               1 │                1 │
├─────────────────┴─────────────────┴──────────────────┤
│ 10 rows                                    3 columns │
└──────────────────────────────────────────────────────┘



In [11]:
print(duckdb.sql("create table city_weather as select * from 'my_folder/*.csv';"))
print(duckdb.sql("select * from city_weather limit 10;"))

IOException: IO Error: No files found that match the pattern "my_folder/*.csv"

In [15]:
duckdb.sql("create table city_weather as select * from 'weather_data_part_0.csv';")
duckdb.sql("copy city_weather to  'city_weather.csv' with(header);")