In [97]:
import pandas as pd
import os
import gzip
import requests

In [98]:
pd.__version__

'1.5.2'

In [99]:
def downloadCSVFile(csv_url,name):

    # Define the directory where the CSV file should be saved
    directory = "data"
    if not os.path.exists(directory):
        os.makedirs(directory)

    # Define the filename for the CSV file
    filename = os.path.join(directory, name)

    # Check if the CSV file exists in the directory
    if not os.path.isfile(filename):
        print(f"Downloading {filename}...")
        # Download the CSV file from the URL
        response = requests.get(csv_url)
        # Check if the request was successful
        if response.status_code == 200:
            # Save the CSV data to a file
            with open(filename, 'wb') as file:
                file.write(response.content)
            print("Download completed.")
        else:
            print("Failed to download the file.")
            exit()

    # Read the CSV file into a pandas DataFrame
    print(f"Reading {filename}...")

    return filename

In [100]:
def unzipFile(gzipped_filename,name):
    directory="data"
    gzipped_filename = os.path.join(directory, gzipped_filename)
    csv_filename = os.path.join(directory, name)

    if not os.path.isfile(csv_filename):
        print(f"Unzipping {gzipped_filename}...")
        with gzip.open(gzipped_filename, 'rb') as f_in:
            with open(csv_filename, 'wb') as f_out:
                f_out.write(f_in.read())



In [101]:
zippedname="green.csv.gz"
unzippedname="green.csv"


In [102]:
if not os.path.isfile('./data/green.csv'):
    downloadCSVFile("https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2019-09.csv.gz",zippedname)
    unzipFile(zippedname,unzippedname)

Downloading data/green.csv.gz...
Download completed.
Reading data/green.csv.gz...
Unzipping data/green.csv.gz...


In [103]:
df=pd.read_csv('./data/green.csv')

  df=pd.read_csv('./data/green.csv')


In [104]:
df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)
df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)

In [105]:
from sqlalchemy import create_engine

In [106]:
engine = create_engine('postgresql://root:root@localhost:5432/ny_taxi')

In [107]:
print(pd.io.sql.get_schema(df, name='green_taxi_data', con=engine))


CREATE TABLE green_taxi_data (
	"VendorID" FLOAT(53), 
	lpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	lpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	store_and_fwd_flag TEXT, 
	"RatecodeID" FLOAT(53), 
	"PULocationID" BIGINT, 
	"DOLocationID" BIGINT, 
	passenger_count FLOAT(53), 
	trip_distance FLOAT(53), 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	ehail_fee FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	payment_type FLOAT(53), 
	trip_type FLOAT(53), 
	congestion_surcharge FLOAT(53)
)




In [108]:
df_iter = pd.read_csv('./data/green.csv', iterator=True, chunksize=100000)

In [109]:
df = next(df_iter)

In [110]:
len(df)

100000

In [111]:
df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)
df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)

In [112]:
df

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2019-09-01 00:10:53,2019-09-01 00:23:46,N,1,65,189,5,2.00,10.5,0.5,0.5,2.36,0.0,,0.3,14.16,1,1,0.0
1,2,2019-09-01 00:31:22,2019-09-01 00:44:37,N,1,97,225,5,3.20,12.0,0.5,0.5,0.00,0.0,,0.3,13.30,2,1,0.0
2,2,2019-09-01 00:50:24,2019-09-01 01:03:20,N,1,37,61,5,2.99,12.0,0.5,0.5,0.00,0.0,,0.3,13.30,2,1,0.0
3,2,2019-09-01 00:27:06,2019-09-01 00:33:22,N,1,145,112,1,1.73,7.5,0.5,0.5,1.50,0.0,,0.3,10.30,1,1,0.0
4,2,2019-09-01 00:43:23,2019-09-01 00:59:54,N,1,112,198,1,3.42,14.0,0.5,0.5,3.06,0.0,,0.3,18.36,1,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,2,2019-09-08 21:34:31,2019-09-08 21:42:44,N,1,74,151,1,2.12,8.5,0.5,0.5,2.45,0.0,,0.3,12.25,1,1,0.0
99996,2,2019-09-08 21:06:54,2019-09-08 21:12:38,N,1,130,28,1,0.83,5.5,0.5,0.5,0.00,0.0,,0.3,6.80,2,1,0.0
99997,2,2019-09-08 21:22:10,2019-09-08 21:29:33,N,1,130,10,1,2.56,9.5,0.5,0.5,2.00,0.0,,0.3,12.80,1,1,0.0
99998,2,2019-09-08 21:33:42,2019-09-08 21:33:48,N,5,92,92,1,0.13,22.0,0.0,0.0,0.00,0.0,,0.0,22.00,1,2,0.0


In [113]:
df.head(n=0).to_sql(name='green_taxi_data', con=engine, if_exists='replace')

0

In [114]:
%time df.to_sql(name='green_taxi_data', con=engine, if_exists='append')

CPU times: user 4.62 s, sys: 42.7 ms, total: 4.67 s
Wall time: 7.92 s


1000

In [115]:
from time import time

In [116]:
while True: 
    t_start = time()
    
    try:
        df = next(df_iter)
    except StopIteration:
        print("No more data chunks to process. Exiting loop.")
        break
    
    df = next(df_iter)

    df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)
    df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
    
    df.to_sql(name='green_taxi_data', con=engine, if_exists='append')

    t_end = time()

    print('inserted another chunk, took %.3f second' % (t_end - t_start))

inserted another chunk, took 8.186 second


  df = next(df_iter)


inserted another chunk, took 3.341 second
No more data chunks to process. Exiting loop.


In [117]:
csv_url = "https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv"
name="zones.csv"
df_zones=pd.read_csv(downloadCSVFile(csv_url,name))


Downloading data/zones.csv...
Download completed.
Reading data/zones.csv...


In [118]:
df_zones.head()

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone


In [119]:
df_zones.to_sql(name='zones', con=engine, if_exists='replace')

265