## Load New York Taxi Data

In [100]:
import pandas as pd
from sqlalchemy import create_engine, text
from urllib.parse import quote_plus
from time import time

In [101]:
trip_data_file_path = "/workspaces/data-engineering-zoomcamp/data/green_tripdata_2019-09.csv.gz"
green_table_name = "green_tripdata_2019_09"

In [102]:
# !wget https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2019-09.csv.gz -O trip_data_file_path

In [103]:
df_first_rows = pd.read_csv(trip_data_file_path, nrows=1000)

In [104]:
df_first_rows.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2019-09-01 00:10:53,2019-09-01 00:23:46,N,1,65,189,5,2.0,10.5,0.5,0.5,2.36,0.0,,0.3,14.16,1,1,0.0
1,2,2019-09-01 00:31:22,2019-09-01 00:44:37,N,1,97,225,5,3.2,12.0,0.5,0.5,0.0,0.0,,0.3,13.3,2,1,0.0
2,2,2019-09-01 00:50:24,2019-09-01 01:03:20,N,1,37,61,5,2.99,12.0,0.5,0.5,0.0,0.0,,0.3,13.3,2,1,0.0
3,2,2019-09-01 00:27:06,2019-09-01 00:33:22,N,1,145,112,1,1.73,7.5,0.5,0.5,1.5,0.0,,0.3,10.3,1,1,0.0
4,2,2019-09-01 00:43:23,2019-09-01 00:59:54,N,1,112,198,1,3.42,14.0,0.5,0.5,3.06,0.0,,0.3,18.36,1,1,0.0


### Display data types

In [105]:
df_first_rows.dtypes

VendorID                   int64
lpep_pickup_datetime      object
lpep_dropoff_datetime     object
store_and_fwd_flag        object
RatecodeID                 int64
PULocationID               int64
DOLocationID               int64
passenger_count            int64
trip_distance            float64
fare_amount              float64
extra                    float64
mta_tax                  float64
tip_amount               float64
tolls_amount             float64
ehail_fee                float64
improvement_surcharge    float64
total_amount             float64
payment_type               int64
trip_type                  int64
congestion_surcharge     float64
dtype: object

### Change datatype for pickup and dropoff datetime

In [106]:
df_first_rows.lpep_pickup_datetime = pd.to_datetime(df_first_rows.lpep_pickup_datetime)
df_first_rows.lpep_dropoff_datetime = pd.to_datetime(df_first_rows.lpep_dropoff_datetime)

In [107]:
df_first_rows.dtypes

VendorID                          int64
lpep_pickup_datetime     datetime64[ns]
lpep_dropoff_datetime    datetime64[ns]
store_and_fwd_flag               object
RatecodeID                        int64
PULocationID                      int64
DOLocationID                      int64
passenger_count                   int64
trip_distance                   float64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
ehail_fee                       float64
improvement_surcharge           float64
total_amount                    float64
payment_type                      int64
trip_type                         int64
congestion_surcharge            float64
dtype: object

In [108]:
df_first_rows.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2019-09-01 00:10:53,2019-09-01 00:23:46,N,1,65,189,5,2.0,10.5,0.5,0.5,2.36,0.0,,0.3,14.16,1,1,0.0
1,2,2019-09-01 00:31:22,2019-09-01 00:44:37,N,1,97,225,5,3.2,12.0,0.5,0.5,0.0,0.0,,0.3,13.3,2,1,0.0
2,2,2019-09-01 00:50:24,2019-09-01 01:03:20,N,1,37,61,5,2.99,12.0,0.5,0.5,0.0,0.0,,0.3,13.3,2,1,0.0
3,2,2019-09-01 00:27:06,2019-09-01 00:33:22,N,1,145,112,1,1.73,7.5,0.5,0.5,1.5,0.0,,0.3,10.3,1,1,0.0
4,2,2019-09-01 00:43:23,2019-09-01 00:59:54,N,1,112,198,1,3.42,14.0,0.5,0.5,3.06,0.0,,0.3,18.36,1,1,0.0


### Show query to create a table

In [109]:
encoded_password = quote_plus("P@ssw0rd!")
encoded_password

'P%40ssw0rd%21'

In [110]:
engine = create_engine(f"postgresql://postgres:{encoded_password}@db:5432/ny_taxi")

In [111]:
engine.connect()

<sqlalchemy.engine.base.Connection at 0x7ff852239610>

In [112]:
print(pd.io.sql.get_schema(df_first_rows, name=green_table_name, con=engine))


CREATE TABLE green_tripdata_2019_09 (
	"VendorID" BIGINT, 
	lpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	lpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	store_and_fwd_flag TEXT, 
	"RatecodeID" BIGINT, 
	"PULocationID" BIGINT, 
	"DOLocationID" BIGINT, 
	passenger_count BIGINT, 
	trip_distance FLOAT(53), 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	ehail_fee FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	payment_type BIGINT, 
	trip_type BIGINT, 
	congestion_surcharge FLOAT(53)
)




In [113]:
df_without_low_memory = pd.read_csv(trip_data_file_path, parse_dates=['lpep_pickup_datetime', 'lpep_dropoff_datetime'], low_memory=False)

In [114]:
ddl = pd.io.sql.get_schema(df_without_low_memory, name=green_table_name, con=engine)
print(ddl)
# Please notice that three columns ahve different types in the schema in comparison to the previous one


CREATE TABLE green_tripdata_2019_09 (
	"VendorID" FLOAT(53), 
	lpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	lpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	store_and_fwd_flag TEXT, 
	"RatecodeID" FLOAT(53), 
	"PULocationID" BIGINT, 
	"DOLocationID" BIGINT, 
	passenger_count FLOAT(53), 
	trip_distance FLOAT(53), 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	ehail_fee FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	payment_type FLOAT(53), 
	trip_type FLOAT(53), 
	congestion_surcharge FLOAT(53)
)




### Create a table

In [115]:
# Execute the DDL statement to create the table schema
with engine.connect() as connection:
    connection.execute(text(f"DROP TABLE IF EXISTS {green_table_name}"))
    connection.commit()
    connection.execute(text(ddl))
    connection.commit()

In [117]:
pd.read_sql(f"SELECT * FROM {green_table_name} LIMIT 1", engine)

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge


### Use iterator to upload data in chunks of 100000 rows

In [118]:
df_iterator = pd.read_csv(trip_data_file_path, parse_dates=['lpep_pickup_datetime', 'lpep_dropoff_datetime'], iterator=True, chunksize=100000)

In [119]:
df_iterator

<pandas.io.parsers.readers.TextFileReader at 0x7ff85215e960>

In [120]:
while True:
  start = time()
  df = next(df_iterator, None)
  if df is None:
      break
  
  df.to_sql(green_table_name, engine, if_exists="append", index=False)
  end = time()

  print(f"Loaded 100k rows in {end - start} seconds")
  

Loaded 100k rows in 20.12166166305542 seconds
Loaded 100k rows in 18.472597122192383 seconds
Loaded 100k rows in 22.291062831878662 seconds


  df = next(df_iterator, None)


Loaded 100k rows in 19.105767488479614 seconds
Loaded 100k rows in 7.500225067138672 seconds


In [121]:
pd.read_sql(f"SELECT COUNT(*) FROM {green_table_name}", engine)

Unnamed: 0,count
0,449063
