# Ingesting taxi zone lookups to postgres container

# Imports

In [None]:
import pandas as pd

# Read csv

In [None]:
# Load the taxi zone lookup table from the local data folder into a DataFrame
zone_lookup_path = '../data/taxi_zone_lookup.csv'
df = pd.read_csv(zone_lookup_path)

# Create connection to postgres with sqlalchemy create_engine

In [None]:
# Import create_engine from sqlalchemy
from sqlalchemy import create_engine

In [None]:
# Create an engine to connect with the container.
# Connection settings are read from environment variables so that real
# credentials never have to be written into the notebook; the fallbacks are
# the local container defaults (root:root@localhost:5432/ny_taxi).
import os
from urllib.parse import quote_plus

# User and password are URL-escaped so characters such as '@' or '/' in a
# real password cannot corrupt the connection URL.
pg_user = quote_plus(os.environ.get('PG_USER', 'root'))
pg_password = quote_plus(os.environ.get('PG_PASSWORD', 'root'))
pg_host = os.environ.get('PG_HOST', 'localhost')
pg_port = os.environ.get('PG_PORT', '5432')
pg_database = os.environ.get('PG_DATABASE', 'ny_taxi')

engine = create_engine(
    f'postgresql://{pg_user}:{pg_password}@{pg_host}:{pg_port}/{pg_database}'
)

In [None]:
# Connect the engine to confirm the postgres container is reachable.
# The `with` block releases the connection as soon as the check is done,
# instead of leaving an unclosed connection behind for the rest of the session.
with engine.connect() as connection:
    print(f'Connection open: {not connection.closed}')

## Print schema for taxi zone lookup

In [None]:
# Print the SQL schema pandas derives for this frame; reviewing it is a quick
# way to spot columns whose dtypes should be changed before loading.
# Arguments: the dataframe, the target table name, and the connector (engine).
zones_schema = pd.io.sql.get_schema(df, name='zones', con=engine)
print(zones_schema)

# We will use the `head` and `to_sql` methods to create a table, then insert the rows.
> Note: `.head(n=0)` returns an empty DataFrame that keeps only the column headers, so `to_sql` creates the table without inserting any rows. The next step ingests the data.

## Create table with schema

> `.to_sql` arguments: name of table, connector, and what to do if the table exists (fail, replace, append)

In [None]:
# Take a zero-row slice of df (column headers only) and write it out, so the
# 'zones' table is (re)created with the right columns but no data yet.
header_only = df.head(n=0)
header_only.to_sql(name='zones', con=engine, if_exists='replace')

## Ingest data
> Note: we use `if_exists='append'` because the previous step already created the table.

That took a lot of time! I need to look into pyarrow and chunking instead of running blind.

In [None]:
# Append the rows of df to the 'zones' table through the engine.
# `%time` is an IPython line magic that reports how long this single statement
# takes, so the whole call has to stay on one line.
%time df.to_sql(name='zones', con=engine, if_exists='append')

## Look at database catalog

In [None]:
# List every table the database knows about, leaving out the rows whose schema
# is 'pg_catalog' or 'information_schema'.
catalog = """
SELECT *
FROM pg_catalog.pg_tables
WHERE schemaname != 'pg_catalog' AND schemaname != 'information_schema';
"""
pd.read_sql(sql=catalog, con=engine)

## Looking at sample of data

In [None]:
# Pull the first ten rows of the 'zones' table as a quick check of the load.
query = """
SELECT *
FROM zones
LIMIT 10;
"""
pd.read_sql(sql=query, con=engine)