In [17]:
import os
import pandas as pd
from sqlalchemy import create_engine
from time import time


In [5]:
# Change the current working directory
os.chdir(r'/Users/aurafrizzati/Desktop/DE-Zoomcamp-AF/Module_1')

**First glance at the data**

In [12]:
df = pd.read_csv('green_tripdata_2019-10.csv.gz')
df

  df = pd.read_csv('green_tripdata_2019-10.csv.gz')


Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2.0,2019-10-01 00:26:02,2019-10-01 00:39:58,N,1.0,112,196,1.0,5.88,18.00,0.50,0.5,0.00,0.00,,0.3,19.30,2.0,1.0,0.0
1,1.0,2019-10-01 00:18:11,2019-10-01 00:22:38,N,1.0,43,263,1.0,0.80,5.00,3.25,0.5,0.00,0.00,,0.3,9.05,2.0,1.0,0.0
2,1.0,2019-10-01 00:09:31,2019-10-01 00:24:47,N,1.0,255,228,2.0,7.50,21.50,0.50,0.5,0.00,0.00,,0.3,22.80,2.0,1.0,0.0
3,1.0,2019-10-01 00:37:40,2019-10-01 00:41:49,N,1.0,181,181,1.0,0.90,5.50,0.50,0.5,0.00,0.00,,0.3,6.80,2.0,1.0,0.0
4,2.0,2019-10-01 00:08:13,2019-10-01 00:17:56,N,1.0,97,188,1.0,2.52,10.00,0.50,0.5,2.26,0.00,,0.3,13.56,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
476381,,2019-10-31 23:30:00,2019-11-01 00:00:00,,,65,102,,7.04,29.57,2.75,0.5,0.00,0.00,,0.0,32.82,,,
476382,,2019-10-31 23:03:00,2019-10-31 23:24:00,,,129,136,,0.00,39.83,2.75,0.5,0.00,6.12,,0.0,49.20,,,
476383,,2019-10-31 23:02:00,2019-10-31 23:23:00,,,61,222,,3.90,23.11,2.75,0.5,0.00,0.00,,0.0,26.36,,,
476384,,2019-10-31 23:42:00,2019-10-31 23:56:00,,,76,39,,3.08,15.23,2.75,0.5,0.00,0.00,,0.0,18.48,,,


In [8]:
## convert the datetime columns into timestamp
df.lpep_pickup_datetime = pd.to_datetime(df['lpep_pickup_datetime'])
df.lpep_dropoff_datetime = pd.to_datetime(df['lpep_dropoff_datetime'])

In [9]:
## generate the table schema for postgres database
## the table schema is in DDL format = Data Definition Language
print(pd.io.sql.get_schema(df, name="green_taxi_data"))

CREATE TABLE "green_taxi_data" (
"VendorID" INTEGER,
  "lpep_pickup_datetime" TIMESTAMP,
  "lpep_dropoff_datetime" TIMESTAMP,
  "store_and_fwd_flag" TEXT,
  "RatecodeID" INTEGER,
  "PULocationID" INTEGER,
  "DOLocationID" INTEGER,
  "passenger_count" INTEGER,
  "trip_distance" REAL,
  "fare_amount" REAL,
  "extra" REAL,
  "mta_tax" REAL,
  "tip_amount" REAL,
  "tolls_amount" REAL,
  "ehail_fee" REAL,
  "improvement_surcharge" REAL,
  "total_amount" REAL,
  "payment_type" INTEGER,
  "trip_type" INTEGER,
  "congestion_surcharge" REAL
)


**Ingesting the green taxi table into the postgres sql server running in the background**

In [10]:
## create a connection to the postgres sql server
engine = create_engine(
    'postgresql://root:root@localhost:5432/ny_taxi')
## postgresql = the database dialect and driver
## root:root = username:password
## localhost:5432 = hostname:port
## ny_taxi = the database name

In [11]:
engine.connect()

<sqlalchemy.engine.base.Connection at 0x1323a6cf0>

In [13]:
# intialise the ingestion of the input file using an iterator
df_iter = pd.read_csv(
    "green_tripdata_2019-10.csv.gz", # 'yellow_tripdata_2021-01.csv' ## this can also be a *.csv.gz file
    iterator=True,
    chunksize=100000
        )

df = next(df_iter)

In [15]:
## convert the datetime columns into timestamp
df.lpep_dropoff_datetime = pd.to_datetime(df['lpep_dropoff_datetime'])
df.lpep_pickup_datetime = pd.to_datetime(df['lpep_pickup_datetime'])

## Add the table column names in the database
df.head(n=0).to_sql(
    con=engine,
    name = "green_taxi_data", 
    if_exists='replace'
    )


0

In [16]:
df.to_sql(
    name = "green_taxi_data", 
    con = engine,
    if_exists='append'
)

1000

In [None]:
## get all the rest of the chunks via the iterator in the database
while True:
    try:
        t_start = time()

        df = next(df_iter)

        ## convert the datetime columns into timestamp
        df.lpep_pickup_datetime = pd.to_datetime(df['lpep_pickup_datetime'])
        df.lpep_dropoff_datetime = pd.to_datetime(df['lpep_dropoff_datetime'])

        df.to_sql(
            name = "green_taxi_data", 
            con = engine,
            if_exists='append'
        )

        t_end = time()

        print(f"inserted another chunk..., took {t_end - t_start:.3f} seconds")
    except StopIteration:
        print("Finished ingesting the data into the postgres database")
        break