# Imports

In [1]:
import pandas as pd

# Display the installed pandas version as the cell output
pd.__version__

'2.0.2'

# Read in parquet
> Note: the example in the video uses CSV files. The official website has since switched to parquet files.

In [2]:
# Load the yellow_tripdata_2021-01 parquet file into a single DataFrame
df = pd.read_parquet('../ny_taxi_postgres_data/yellow_tripdata_2021-01.parquet')

# Create connection to postgres with sqlalchemy create_engine

In [3]:
# Import create_engine from sqlalchemy
from sqlalchemy import create_engine

In [4]:
# Create an engine to connect with the Postgres container.
# Connection settings are read from the standard libpq environment variables
# (PGUSER, PGPASSWORD, PGHOST, PGPORT, PGDATABASE) so that real credentials
# never have to be hard-coded in the notebook. The fallbacks reproduce the
# local container settings used throughout this walkthrough.
import os

from sqlalchemy.engine import URL

# URL.create escapes special characters (e.g. '@' or ':' in a password)
# that would break a hand-formatted connection string.
db_url = URL.create(
    drivername='postgresql',
    username=os.getenv('PGUSER', 'root'),
    password=os.getenv('PGPASSWORD', 'root'),
    host=os.getenv('PGHOST', 'localhost'),
    port=int(os.getenv('PGPORT', '5410')),
    database=os.getenv('PGDATABASE', 'ny_taxi'),
)
engine = create_engine(db_url)

In [5]:
# Open a connection to confirm the engine can reach the database.
# The returned Connection object is echoed as the cell output; it is not
# assigned to a name or explicitly closed here.
engine.connect()

<sqlalchemy.engine.base.Connection at 0x7fe0f13230d0>

## Print schema for yellow_taxi_data

In [6]:
# Print the CREATE TABLE statement pandas would generate for this DataFrame.
# This is a great way to check whether any dtypes should be changed before loading.
# Arguments: the DataFrame, the name of the table, and the connector (the engine variable in this case).
print(pd.io.sql.get_schema(df, name='yellow_taxi_data', con=engine))


CREATE TABLE yellow_taxi_data (
	"VendorID" BIGINT, 
	tpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	tpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	passenger_count FLOAT(53), 
	trip_distance FLOAT(53), 
	"RatecodeID" FLOAT(53), 
	store_and_fwd_flag TEXT, 
	"PULocationID" BIGINT, 
	"DOLocationID" BIGINT, 
	payment_type BIGINT, 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	congestion_surcharge FLOAT(53), 
	airport_fee FLOAT(53)
)




> If this were a big CSV, you'd need to create an iterator to ingest it in chunks.  If data types need to be changed, you'd add that code after creating the iterator.

`df_iter = pd.read_csv('../ny_taxi_postgres_data/yellow_tripdata_2021-01.csv', iterator=True, chunksize=1000000)`

# We will use the `head` and `to_sql` methods to create a table, then insert the rows.
> Note: `.head(n=0)` is used to create the empty table first; the next step ingests the data.  I do not know if this step is strictly needed for parquet files that don't need to be chunked.  I'll include it, as I'm sure that I'll run into huge CSV files in the wild.

## Create table with schema

> `.to_sql` arguments: name of table, connector, and what to do if the table exists (fail, replace, append)

In [7]:
# head(n=0) keeps the columns but no rows, so this call only creates the table.
# if_exists='replace' is the "replace" option for a table that already exists.
df.head(n=0).to_sql(name='yellow_taxi_data', con=engine, if_exists='replace')

0

## Ingest data
> Note we use `if_exists='append'` because the previous step created the table.

That took a lot of time!  I need to look into pyarrow and chunked inserts instead of loading everything in one blind call.

In [8]:
# Insert the full DataFrame into the table created in the previous step.
# The %time magic reports CPU and wall-clock time for the call.
%time df.to_sql(name='yellow_taxi_data', con=engine, if_exists='append')

CPU times: user 1min 19s, sys: 2.08 s, total: 1min 21s
Wall time: 2min 19s


769

## Look at database catalog

In [9]:
# List every table that is not in the pg_catalog or information_schema schemas
catalog = '''
SELECT *
FROM pg_catalog.pg_tables
WHERE schemaname != 'pg_catalog' AND schemaname != 'information_schema';
'''
# Run the query through the engine and show the result as a DataFrame
pd.read_sql(catalog, con=engine)

Unnamed: 0,schemaname,tablename,tableowner,tablespace,hasindexes,hasrules,hastriggers,rowsecurity
0,public,yellow_taxi_data,root,,True,False,False,False


## Looking at sample of data

In [10]:
# Fetch 10 rows from the loaded table as a quick sanity check
query = '''
SELECT *
FROM yellow_taxi_data
LIMIT 10;
'''
# Run the query through the engine and show the result as a DataFrame
pd.read_sql(query, con=engine)

Unnamed: 0,index,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,0,1,2021-01-01 00:30:10,2021-01-01 00:36:12,1.0,2.1,1.0,N,142,43,2,8.0,3.0,0.5,0.0,0.0,0.3,11.8,2.5,
1,1,1,2021-01-01 00:51:20,2021-01-01 00:52:19,1.0,0.2,1.0,N,238,151,2,3.0,0.5,0.5,0.0,0.0,0.3,4.3,0.0,
2,2,1,2021-01-01 00:43:30,2021-01-01 01:11:06,1.0,14.7,1.0,N,132,165,1,42.0,0.5,0.5,8.65,0.0,0.3,51.95,0.0,
3,3,1,2021-01-01 00:15:48,2021-01-01 00:31:01,0.0,10.6,1.0,N,138,132,1,29.0,0.5,0.5,6.05,0.0,0.3,36.35,0.0,
4,4,2,2021-01-01 00:31:49,2021-01-01 00:48:21,1.0,4.94,1.0,N,68,33,1,16.5,0.5,0.5,4.06,0.0,0.3,24.36,2.5,
5,5,1,2021-01-01 00:16:29,2021-01-01 00:24:30,1.0,1.6,1.0,N,224,68,1,8.0,3.0,0.5,2.35,0.0,0.3,14.15,2.5,
6,6,1,2021-01-01 00:00:28,2021-01-01 00:17:28,1.0,4.1,1.0,N,95,157,2,16.0,0.5,0.5,0.0,0.0,0.3,17.3,0.0,
7,7,1,2021-01-01 00:12:29,2021-01-01 00:30:34,1.0,5.7,1.0,N,90,40,2,18.0,3.0,0.5,0.0,0.0,0.3,21.8,2.5,
8,8,1,2021-01-01 00:39:16,2021-01-01 01:00:13,1.0,9.1,1.0,N,97,129,4,27.5,0.5,0.5,0.0,0.0,0.3,28.8,0.0,
9,9,1,2021-01-01 00:26:12,2021-01-01 00:39:46,2.0,2.7,1.0,N,263,142,1,12.0,3.0,0.5,3.15,0.0,0.3,18.95,2.5,
