In [1]:
import pandas as pd
import numpy as np
import pyarrow.parquet as pr
from sqlalchemy import create_engine

In [2]:
trips = pr.read_table('yellow_tripdata_2021-01.parquet')
trips = trips.to_pandas()
trips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1369769 entries, 0 to 1369768
Data columns (total 19 columns):
 #   Column                 Non-Null Count    Dtype         
---  ------                 --------------    -----         
 0   VendorID               1369769 non-null  int64         
 1   tpep_pickup_datetime   1369769 non-null  datetime64[ns]
 2   tpep_dropoff_datetime  1369769 non-null  datetime64[ns]
 3   passenger_count        1271417 non-null  float64       
 4   trip_distance          1369769 non-null  float64       
 5   RatecodeID             1271417 non-null  float64       
 6   store_and_fwd_flag     1271417 non-null  object        
 7   PULocationID           1369769 non-null  int64         
 8   DOLocationID           1369769 non-null  int64         
 9   payment_type           1369769 non-null  int64         
 10  fare_amount            1369769 non-null  float64       
 11  extra                  1369769 non-null  float64       
 12  mta_tax                13697

In [4]:
engine = create_engine('postgresql://root:root@localhost:5432/ny_taxi')

In [5]:
print(pd.io.sql.get_schema(trips, 'yellow_taxi_data', con = engine))


CREATE TABLE yellow_taxi_data (
	"VendorID" BIGINT, 
	tpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	tpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	passenger_count FLOAT(53), 
	trip_distance FLOAT(53), 
	"RatecodeID" FLOAT(53), 
	store_and_fwd_flag TEXT, 
	"PULocationID" BIGINT, 
	"DOLocationID" BIGINT, 
	payment_type BIGINT, 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	congestion_surcharge FLOAT(53), 
	airport_fee FLOAT(53)
)




In [6]:
trips.head(0).to_sql(name = 'yellow_taxi_data', con = engine, if_exists = 'replace')

0

In [7]:
def chunkify(df: pd.DataFrame, chunk_size: int):
    start = 0
    length = df.shape[0]

    # If DF is smaller than the chunk, return the DF
    if length <= chunk_size:
        yield df[:]
        return

    # Yield individual chunks
    while start + chunk_size <= length:
        yield df[start:chunk_size + start]
        start = start + chunk_size

    # Yield the remainder chunk, if needed
    if start < length:
        yield df[start:]

In [14]:
trips_chunks = chunkify(trips, 100000)
import time
counter = 0
while True:
    start = time.time()
    df = next(trips_chunks)
    df.to_sql(name = 'yellow_taxi_data', con = engine, if_exists = 'append')
    end = time.time()
    print(f'chunk {counter} inserted in {end - start} seconds..')
    counter+=1

chunk 0 inserted in 44.93833374977112 seconds..
chunk 1 inserted in 35.5839900970459 seconds..
chunk 2 inserted in 34.344501972198486 seconds..
chunk 3 inserted in 33.56255030632019 seconds..
chunk 4 inserted in 32.5176956653595 seconds..
chunk 5 inserted in 41.004575967788696 seconds..
chunk 6 inserted in 34.550740480422974 seconds..
chunk 7 inserted in 34.286784648895264 seconds..
chunk 8 inserted in 33.53672695159912 seconds..
chunk 9 inserted in 38.30320739746094 seconds..
chunk 10 inserted in 43.902639627456665 seconds..
chunk 11 inserted in 38.34683394432068 seconds..
chunk 12 inserted in 32.6505823135376 seconds..
chunk 13 inserted in 33.03904128074646 seconds..


StopIteration: 