In [1]:
import os
from urllib.request import urlretrieve

import pandas as pd
from sqlalchemy import create_engine

In [2]:
url = "https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2021-01.csv"
outpath = os.path.join(os.path.abspath("."), "ny_taxi_postgres_data_raw", "yellow_tripdata_2021-01.csv")
if not os.path.exists(outpath):
    os.makedirs(os.path.dirname(outpath), exist_ok=True)
    urlretrieve(url=url, filename=outpath)
else:
    print("Already downloaded that data :)")

Already downloaded that data :)


[Yellow Taxi Data Dictionary](https://www1.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf)

In [3]:
taxi_df = pd.read_csv(outpath)

  taxi_df = pd.read_csv(outpath)


In [4]:
taxi_df.head(2)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,1.0,2021-01-01 00:30:10,2021-01-01 00:36:12,1.0,2.1,1.0,N,142,43,2.0,8.0,3.0,0.5,0.0,0.0,0.3,11.8,2.5
1,1.0,2021-01-01 00:51:20,2021-01-01 00:52:19,1.0,0.2,1.0,N,238,151,2.0,3.0,0.5,0.5,0.0,0.0,0.3,4.3,0.0


In [5]:
date_cols = ["tpep_pickup_datetime", "tpep_dropoff_datetime"]
for date_col in date_cols:
    taxi_df[date_col] = pd.to_datetime(taxi_df[date_col], format="%Y-%m-%d %H:%M:%S")

In [6]:
taxi_df.head(2)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,1.0,2021-01-01 00:30:10,2021-01-01 00:36:12,1.0,2.1,1.0,N,142,43,2.0,8.0,3.0,0.5,0.0,0.0,0.3,11.8,2.5
1,1.0,2021-01-01 00:51:20,2021-01-01 00:52:19,1.0,0.2,1.0,N,238,151,2.0,3.0,0.5,0.5,0.0,0.0,0.3,4.3,0.0


In [7]:
data_ddl = pd.io.sql.get_schema(taxi_df, "yellow_taxi_data")

In [8]:
print(data_ddl)

CREATE TABLE "yellow_taxi_data" (
"VendorID" REAL,
  "tpep_pickup_datetime" TIMESTAMP,
  "tpep_dropoff_datetime" TIMESTAMP,
  "passenger_count" REAL,
  "trip_distance" REAL,
  "RatecodeID" REAL,
  "store_and_fwd_flag" TEXT,
  "PULocationID" INTEGER,
  "DOLocationID" INTEGER,
  "payment_type" REAL,
  "fare_amount" REAL,
  "extra" REAL,
  "mta_tax" REAL,
  "tip_amount" REAL,
  "tolls_amount" REAL,
  "improvement_surcharge" REAL,
  "total_amount" REAL,
  "congestion_surcharge" REAL
)


That's fine as generic DDL, but we have a postgres db, so we'll need to use postgres ddl. We can still use the above helper function from pandas, we just need to communicate we want postgres ddl, which we can do by passing in a connection (via a `sqlalchemy` engine) to our postgres database.

The structure of the connection string is 
`<driver>://<password>:<user>@<hostname>:<port_num>/<db_name>`

In [9]:
engine = create_engine("postgresql://root:root@localhost:5432/ny_taxi")

In [10]:
engine.connect()

<sqlalchemy.engine.base.Connection at 0x7ff079a39d00>

In [11]:
data_ddl = pd.io.sql.get_schema(taxi_df, "yellow_taxi_data", con=engine)
print(data_ddl)


CREATE TABLE yellow_taxi_data (
	"VendorID" FLOAT(53), 
	tpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	tpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	passenger_count FLOAT(53), 
	trip_distance FLOAT(53), 
	"RatecodeID" FLOAT(53), 
	store_and_fwd_flag TEXT, 
	"PULocationID" BIGINT, 
	"DOLocationID" BIGINT, 
	payment_type FLOAT(53), 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	congestion_surcharge FLOAT(53)
)




The statement above is what pandas will use to create the empty table in our postgres db.

# Loading data

I loaded the entire file into memory, but in the video, Alexey makes a generator to ingest the data in chunks.

In [12]:
def set_datetime_col_dtypes(taxi_df: pd.DataFrame) -> pd.DataFrame:
    date_cols = ["tpep_pickup_datetime", "tpep_dropoff_datetime"]
    for date_col in date_cols:
        taxi_df[date_col] = pd.to_datetime(taxi_df[date_col], format="%Y-%m-%d %H:%M:%S")
    return taxi_df    

## Creating the empty table

We have to set up the table before we can load data into it, and we want to start clean before we load all of our chunks. We already set up the timestamp datatypes on our `taxi_df`, so we can use the empty head to up the table, then we can use our iterator to start loading chunks.

In [13]:
taxi_df.head(0)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge


In [14]:
taxi_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1369765 entries, 0 to 1369764
Data columns (total 18 columns):
 #   Column                 Non-Null Count    Dtype         
---  ------                 --------------    -----         
 0   VendorID               1271413 non-null  float64       
 1   tpep_pickup_datetime   1369765 non-null  datetime64[ns]
 2   tpep_dropoff_datetime  1369765 non-null  datetime64[ns]
 3   passenger_count        1271413 non-null  float64       
 4   trip_distance          1369765 non-null  float64       
 5   RatecodeID             1271413 non-null  float64       
 6   store_and_fwd_flag     1271413 non-null  object        
 7   PULocationID           1369765 non-null  int64         
 8   DOLocationID           1369765 non-null  int64         
 9   payment_type           1271413 non-null  float64       
 10  fare_amount            1369765 non-null  float64       
 11  extra                  1369765 non-null  float64       
 12  mta_tax                13697

In [15]:
taxi_df.head(0).to_sql(name="yellow_taxi_data", con=engine, if_exists="replace")

0

Now we can start loading chunks.

In [16]:
taxi_df_iter = pd.read_csv(outpath, iterator=True, chunksize=100000)

In [17]:
for taxi_df_chunk in taxi_df_iter:
    taxi_df_chunk = set_datetime_col_dtypes(taxi_df_chunk)
    taxi_df_chunk.to_sql(name="yellow_taxi_data", con=engine, if_exists="append")

  for taxi_df_chunk in taxi_df_iter:


From a command line, we can access `pgcli` while that is running (will take about a minute) via

```bash
pgcli -h localhost -p 5432 -u root -d ny_taxi
```

and we can see how many records have loaded by submitting the query

```sql
root@localhost:ny_taxi> SELECT COUNT(*) FROM yellow_taxi_data;
+---------+
| count   |
|---------|
| 1000000 |
+---------+
SELECT 1
Time: 0.039s
```