In [44]:
# Import libraries
import pandas as pd
import pyarrow.parquet as pq
from sqlalchemy import create_engine
from time import t

In [45]:
# Create connection with our Postgres database
engine = create_engine("postgresql://root:root@localhost:5431/ny_taxi")
engine.connect()

<sqlalchemy.engine.base.Connection at 0x7f8e93b125e0>

In [33]:
# Read the dataset and identify the columns needed to CREATE TABLE
parquet_file = pq.ParquetFile("yellow_tripdata_2021-01.parquet")

for batch in parquet_file.iter_batches():
    batch_df = batch.to_pandas()
    print(pd.io.sql.get_schema(batch_df, name="yellow_taxi_data", con=engine))
    break


CREATE TABLE yellow_taxi_data (
	"VendorID" BIGINT, 
	tpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	tpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	passenger_count FLOAT(53), 
	trip_distance FLOAT(53), 
	"RatecodeID" FLOAT(53), 
	store_and_fwd_flag TEXT, 
	"PULocationID" BIGINT, 
	"DOLocationID" BIGINT, 
	payment_type BIGINT, 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	congestion_surcharge FLOAT(53), 
	airport_fee FLOAT(53)
)




In [36]:
# CREATE TABLE with .to_sql
batch_df.head(n=0).to_sql(name="yellow_taxi_data", con=engine, if_exists="replace")

0

In [38]:
# Start data ingestion by chuncks.
for batch in parquet_file.iter_batches():
    t_start = time()
    
    batch_df = batch.to_pandas()
    batch_df.to_sql(name="yellow_taxi_data", con=engine, if_exists="append")
    t_end = time()
    
    print("inserted another chunck, took %.3f second"%(t_end - t_start))

inserted another chunck, took 8.389 second
inserted another chunck, took 8.292 second
inserted another chunck, took 8.895 second
inserted another chunck, took 8.290 second
inserted another chunck, took 11.560 second
inserted another chunck, took 9.094 second
inserted another chunck, took 8.213 second
inserted another chunck, took 8.115 second
inserted another chunck, took 8.454 second
inserted another chunck, took 8.568 second
inserted another chunck, took 9.499 second
inserted another chunck, took 8.528 second
inserted another chunck, took 10.213 second
inserted another chunck, took 8.616 second
inserted another chunck, took 8.381 second
inserted another chunck, took 8.558 second
inserted another chunck, took 8.750 second
inserted another chunck, took 8.800 second
inserted another chunck, took 9.303 second
inserted another chunck, took 9.165 second
inserted another chunck, took 6.847 second


In [41]:
# Read taxi zones information 
zones_df = pd.read_csv("taxi+_zone_lookup.csv")

In [42]:
# Identify the columns needed to CREATE TABLE
print(pd.io.sql.get_schema(df, name="taxi_zones", con=engine))


CREATE TABLE taxi_zones (
	"VendorID" BIGINT, 
	tpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	tpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	passenger_count FLOAT(53), 
	trip_distance FLOAT(53), 
	"RatecodeID" FLOAT(53), 
	store_and_fwd_flag TEXT, 
	"PULocationID" BIGINT, 
	"DOLocationID" BIGINT, 
	payment_type BIGINT, 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	congestion_surcharge FLOAT(53), 
	airport_fee FLOAT(53)
)




In [47]:
# Store data to Postgress database with .to_sql
zones_df.to_sql(name="taxi_zones", con=engine, if_exists='replace')

265

In [48]:
zones_df

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone
...,...,...,...,...
260,261,Manhattan,World Trade Center,Yellow Zone
261,262,Manhattan,Yorkville East,Yellow Zone
262,263,Manhattan,Yorkville West,Yellow Zone
263,264,Unknown,NV,
