# Ingesting taxi zone lookups into a Postgres container

# Imports

In [1]:
import os

import pandas as pd

# Read csv

In [2]:
# Load the taxi zone lookup CSV (path is relative to this notebook) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../data/taxi_zone_lookup.csv')

# Create connection to postgres with sqlalchemy create_engine

In [3]:
# Import create_engine from sqlalchemy
from sqlalchemy import create_engine

In [4]:
# Create an engine to connect with the container.
# The connection URL is taken from the NY_TAXI_DB_URL environment variable so that
# real credentials never have to be written into the notebook. When the variable is
# not set, fall back to the local container's URL (root:root on localhost:5410).
db_url = os.environ.get('NY_TAXI_DB_URL', 'postgresql://root:root@localhost:5410/ny_taxi')
engine = create_engine(db_url)

In [5]:
# Check that the database is reachable.
# The `with` block closes the connection on exit (returning it to the engine's pool),
# so this check does not leave a connection open for the rest of the kernel session.
# pandas opens its own connections from `engine` in the cells below.
with engine.connect():
    print('Connection OK')

<sqlalchemy.engine.base.Connection at 0x7f9b31c5e040>

## Print schema for taxi zone lookup

In [6]:
# Preview the CREATE TABLE statement pandas would generate for this DataFrame.
# Handy for spotting columns whose dtypes should be changed before loading.
# Arguments: the DataFrame, the target table name, and the connector (our engine).
zones_ddl = pd.io.sql.get_schema(df, name='zones', con=engine)
print(zones_ddl)


CREATE TABLE zones (
	"LocationID" BIGINT, 
	"Borough" TEXT, 
	"Zone" TEXT, 
	service_zone TEXT
)




# We will use the `head` and `to_sql` methods to create the table, then insert the rows.
> Note: `.head(n=0)` returns an empty DataFrame that keeps the column definitions, so `to_sql` creates the table without inserting any rows; the next step ingests the data.

## Create table with schema

> `.to_sql` arguments: name of table, connector, and what to do if the table exists (fail, replace, append)

In [7]:
# Zero-row slice: keeps the column names/dtypes but carries no data,
# so writing it only (re)creates the `zones` table.
# NOTE: `index` is left at its default, so the DataFrame index is also written as a column.
empty_zones = df.head(n=0)
empty_zones.to_sql(name='zones', con=engine, if_exists='replace')

0

## Ingest data
> Note we use `if_exists='append'` because the previous step created the table.

This small lookup table (265 rows) loads in milliseconds, but larger ingests can take a lot of time! I need to look into pyarrow and chunking instead of running blind.

In [8]:
# Insert every row of df into the `zones` table; if_exists='append' adds to the
# table instead of replacing it. %time reports how long the insert takes.
%time df.to_sql(name='zones', con=engine, if_exists='append')

CPU times: user 9.44 ms, sys: 0 ns, total: 9.44 ms
Wall time: 23.7 ms


265

## Look at database catalog

In [9]:
# List the tables in the database, leaving out Postgres' own system schemas
# (pg_catalog and information_schema).
catalog = '''
SELECT *
FROM pg_catalog.pg_tables
WHERE schemaname != 'pg_catalog' AND schemaname != 'information_schema';
'''
user_tables = pd.read_sql(sql=catalog, con=engine)
user_tables

Unnamed: 0,schemaname,tablename,tableowner,tablespace,hasindexes,hasrules,hastriggers,rowsecurity
0,public,yellow_taxi_trips,root,,True,False,False,False
1,public,zones,root,,True,False,False,False


## Looking at sample of data

In [10]:
# Pull the first 10 rows of `zones` to confirm the data was ingested.
query = '''
SELECT *
FROM zones
LIMIT 10;
'''
zones_sample = pd.read_sql(sql=query, con=engine)
zones_sample

Unnamed: 0,index,LocationID,Borough,Zone,service_zone
0,0,1,EWR,Newark Airport,EWR
1,1,2,Queens,Jamaica Bay,Boro Zone
2,2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,3,4,Manhattan,Alphabet City,Yellow Zone
4,4,5,Staten Island,Arden Heights,Boro Zone
5,5,6,Staten Island,Arrochar/Fort Wadsworth,Boro Zone
6,6,7,Queens,Astoria,Boro Zone
7,7,8,Queens,Astoria Park,Boro Zone
8,8,9,Queens,Auburndale,Boro Zone
9,9,10,Queens,Baisley Park,Boro Zone
