In [1]:
import pandas as pd
from time import time
from sqlalchemy import create_engine

In [2]:
df = pd.read_csv('yellow_tripdata_2021-01.csv', nrows=100)

In [3]:
pd.io.sql.get_schema(df, 'yellow_taxi_data')

'CREATE TABLE "yellow_taxi_data" (\n"Unnamed: 0" INTEGER,\n  "VendorID" INTEGER,\n  "tpep_pickup_datetime" TEXT,\n  "tpep_dropoff_datetime" TEXT,\n  "passenger_count" REAL,\n  "trip_distance" REAL,\n  "RatecodeID" REAL,\n  "store_and_fwd_flag" TEXT,\n  "PULocationID" INTEGER,\n  "DOLocationID" INTEGER,\n  "payment_type" INTEGER,\n  "fare_amount" REAL,\n  "extra" REAL,\n  "mta_tax" REAL,\n  "tip_amount" REAL,\n  "tolls_amount" REAL,\n  "improvement_surcharge" REAL,\n  "total_amount" REAL,\n  "congestion_surcharge" REAL,\n  "airport_fee" REAL\n)'

In [4]:
df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)

In [5]:
engine  = create_engine('postgresql://root:root@localhost:5432/ny_taxi')

In [6]:
engine.connect()

<sqlalchemy.engine.base.Connection at 0x2aaf943c990>

In [7]:
print(pd.io.sql.get_schema(df, 'yellow_taxi_data', con = engine))


CREATE TABLE yellow_taxi_data (
	"Unnamed: 0" BIGINT, 
	"VendorID" BIGINT, 
	tpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	tpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	passenger_count FLOAT(53), 
	trip_distance FLOAT(53), 
	"RatecodeID" FLOAT(53), 
	store_and_fwd_flag TEXT, 
	"PULocationID" BIGINT, 
	"DOLocationID" BIGINT, 
	payment_type BIGINT, 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	congestion_surcharge FLOAT(53), 
	airport_fee FLOAT(53)
)




In [8]:
df_iter = pd.read_csv('yellow_tripdata_2021-01.csv', iterator=True, chunksize=100000)

In [9]:
df  = next(df_iter)

In [10]:
df.head(n=0).to_sql(name = 'yellow_taxi_data',con=engine, if_exists='replace')

0

In [11]:
%time df.to_sql(name = 'yellow_taxi_data',con=engine, if_exists='append')

CPU times: total: 8.52 s
Wall time: 20.7 s


1000

In [12]:
while True:
    t_start = time()
    
    df = next(df_iter)
    
    df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
    df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)
    
    df.to_sql(name = 'yellow_taxi_data', con=engine, if_exists='append')
    
    t_end = time()
    
    print('inserted anotherchunk..., took %.3f seconds' %(t_end - t_start))
    


inserted anotherchunk..., took 22.932 seconds
inserted anotherchunk..., took 25.403 seconds
inserted anotherchunk..., took 29.918 seconds
inserted anotherchunk..., took 30.240 seconds
inserted anotherchunk..., took 20.851 seconds


StopIteration: 

In [13]:
df_zones = pd.read_csv('taxi+_zone_lookup.csv')

In [14]:
df_zones.head()

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone


In [15]:
df_zones.to_sql(name = 'zones', con=engine, if_exists='replace')

265