In [None]:
import pandas as pd

In [None]:
# Display the installed pandas version.
pd.__version__

In [None]:
# Names of the Postgres tables this notebook writes to.
# The taxi table is either 'yellow_taxi_data' or 'green_taxi_data'.
taxi_table_name = 'yellow_taxi_data'
zones_table_name = 'zones'

## Ingest Taxi table

In [None]:
# One-off download of the January 2021 yellow-taxi trips (gzip-compressed CSV).
# Uncomment the line below to fetch the file into the working directory.
# !wget "https://github.com/DataTalksClub/nyc-tlc-data/releases/download/yellow/yellow_tripdata_2021-01.csv.gz"

In [None]:
# Read only the first 100 rows of the compressed file as a small sample.
df = pd.read_csv(
    'yellow_tripdata_2021-01.csv.gz',
    compression='gzip',
    nrows=100,
)

In [None]:
# Convert the pickup and dropoff timestamp columns to pandas datetimes.
df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])

In [None]:
from sqlalchemy import create_engine

In [None]:
# Connection settings are read from the environment when present, so real
# credentials never have to be written into the notebook. The fallbacks are the
# values this notebook previously hard-coded.
import os
from urllib.parse import quote_plus

pg_user = os.environ.get('POSTGRES_USER', 'root')
pg_password = os.environ.get('POSTGRES_PASSWORD', 'root')
pg_host = os.environ.get('POSTGRES_HOST', 'localhost')
pg_port = os.environ.get('POSTGRES_PORT', '5432')
pg_db = os.environ.get('POSTGRES_DB', 'ny_taxi')

# quote_plus keeps special characters in the credentials from breaking the URL.
engine = create_engine(
    f'postgresql://{quote_plus(pg_user)}:{quote_plus(pg_password)}'
    f'@{pg_host}:{pg_port}/{pg_db}'
)

In [None]:
# Preview the CREATE TABLE statement pandas generates for this frame.
# print() is used so the newlines in the DDL render properly.
schema_ddl = pd.io.sql.get_schema(df, name=taxi_table_name, con=engine)
print(schema_ddl)

In [None]:
# Stream the file in 100,000-row chunks instead of loading it all at once.
# - The file name matches the gzip archive that is downloaded and sampled
#   earlier in this notebook (no uncompressed copy is ever created here).
# - low_memory=False is a read_csv option: each chunk is parsed in one pass,
#   which avoids the "Columns (6) have mixed types" DtypeWarning.
df_iter = pd.read_csv(
    'yellow_tripdata_2021-01.csv.gz',
    compression='gzip',
    iterator=True,
    chunksize=100000,
    low_memory=False,
)

In [None]:
# Take the first chunk from the iterator. It replaces the 100-row sample, so
# the rows appended below are not inserted a second time when the loop later
# consumes the rest of the iterator.
df = next(df_iter)

# The new chunk has freshly parsed (text) timestamps, so convert them again.
df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)

In [None]:
# Display the current frame (pandas truncates long frames in the output).
df

In [None]:
# Create (or recreate) the table from a zero-row slice: columns only, no data.
empty_frame = df.head(n=0)
empty_frame.to_sql(name=taxi_table_name, con=engine, if_exists='replace')

In [None]:
# Append the rows currently held in `df` to the table; %time reports how long the insert takes.
%time df.to_sql(name=taxi_table_name, con=engine, if_exists='append') 

In [None]:
from time import time

In [None]:
# Insert chunks one at a time until the iterator is exhausted, reporting how
# long each chunk (read + convert + insert) took.
while True:
    t_start = time()

    # Only the read can legitimately signal the end of the data, so it is the
    # only statement guarded by the StopIteration handler.
    try:
        df = next(df_iter)
    except StopIteration:
        print("Finished ingesting data into the postgres database")
        break

    df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
    df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)

    # `low_memory` is a `pd.read_csv` option; `DataFrame.to_sql` does not
    # accept it and raises TypeError. The "Columns (6) have mixed types"
    # DtypeWarning has to be addressed where the CSV is read, not here.
    df.to_sql(name=taxi_table_name, con=engine, if_exists='append')

    t_end = time()

    print('inserted another chunk, took %.3f second' % (t_end - t_start))

Which column is column (6)?

In [None]:
# The first seven columns of the table, in order; position 6 (0-based) is the
# last entry of this list.
inspected_columns = [
    'VendorID',
    'tpep_pickup_datetime',
    'tpep_dropoff_datetime',
    'passenger_count',
    'trip_distance',
    'RatecodeID',
    'store_and_fwd_flag',
]
column_list = ', '.join(f'"{name}"' for name in inspected_columns)

query = f"""
SELECT {column_list}
FROM yellow_taxi_data
"""

df_sql = pd.read_sql(sql=query, con=engine)

In [None]:
# Show 10 randomly chosen rows (no seed is set, so the rows differ between runs).
df_sql.sample(10)

In [None]:
# Count rows per RatecodeID value; dropna=False keeps missing values as their own group.
df_sql.value_counts('RatecodeID', dropna=False)

In [None]:
# Count rows per store_and_fwd_flag value; dropna=False keeps missing values as their own group.
df_sql.value_counts('store_and_fwd_flag', dropna=False)

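Note: it seems the mixed types come from missing values (NaN and None) sitting alongside the real values in the column, which makes the column's type expensive for the parser to infer — hence the warning that mentions `low_memory`.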

Suggested solution: explicitly specify the dtypes or set `low_memory=False` when reading the CSV (both are `pd.read_csv` options), as the warning message says. I opted for the `low_memory` setting because the transformations and data cleaning will be handled by Mage pipelines in the following weeks' lessons, so there is no point doing them here.

## Ingest Zones table

In [None]:
# One-off download of the taxi zone lookup table.
# Uncomment the line below to fetch the file into the working directory.
# !wget "https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv"

In [None]:
# Load the zone lookup table. The file name matches what the DataTalksClub
# download URL used in this notebook saves to disk (`taxi_zone_lookup.csv`,
# without the `+` that appears in the original TLC file name).
df_zones = pd.read_csv('taxi_zone_lookup.csv')

In [None]:
# Preview the first rows of the zone lookup table.
df_zones.head()

In [None]:
# Write the whole zones frame to Postgres, replacing the table if it already exists.
df_zones.to_sql(
    name=zones_table_name,
    con=engine,
    if_exists='replace',
)

## Confirm upload by query from database instead of csv

In [None]:
# Row count read back from Postgres. The table name comes from the notebook's
# configuration, so this check follows whichever table the ingest wrote to.
# The identifier is double-quoted so it matches the stored name exactly.
query = f"""
SELECT COUNT(*)
FROM "{taxi_table_name}"
"""

pd.read_sql(sql=query, con=engine)

In [None]:
# Row count read back from Postgres. The table name comes from the notebook's
# configuration, so this check follows whichever table the zones were written to.
# The identifier is double-quoted so it matches the stored name exactly.
query = f"""
SELECT COUNT(*)
FROM "{zones_table_name}"
"""

pd.read_sql(sql=query, con=engine)