# Understanding Hired Rides in NYC

_[Project prompt](https://docs.google.com/document/d/1VERPjEZcC1XSs4-02aM-DbkNr_yaJVbFjLJxaYQswqA/edit#)_

_This scaffolding notebook may be used to help setup your final project. It's **totally optional** whether you make use of this or not._

_If you do use this notebook, everything provided is optional as well - you may remove or add prose and code as you wish._

_Anything in italics (prose) or comments (in code) is meant to provide you with guidance. **Remove the italic lines and provided comments** before submitting the project, if you choose to use this scaffolding. We don't need the guidance when grading._

_**All code below should be consider "pseudo-code" - not functional by itself, and only a suggestion at the approach.**_

## Project Setup

In [565]:
# all import statements needed for the project, for example:

import os

import bs4
import matplotlib.pyplot as plt
import pandas as pd
import requests
import sqlalchemy as db
from sqlalchemy import text
from sqlalchemy import create_engine

In [892]:
# any constants you might need; some have been added for you, and 
# some you need to fill in

TLC_URL = "https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page"

TAXI_ZONES_DIR = ""
TAXI_ZONES_SHAPEFILE = f"{TAXI_ZONES_DIR}/taxi_zones.shp"
WEATHER_CSV_DIR = ""

CRS = 4326  # coordinate reference system

# (lat, lon)
NEW_YORK_BOX_COORDS = ((40.560445, -74.242330), (40.908524, -73.717047))
LGA_BOX_COORDS = ((40.763589, -73.891745), (40.778865, -73.854838))
JFK_BOX_COORDS = ((40.639263, -73.795642), (40.651376, -73.766264))
EWR_BOX_COORDS = ((40.686794, -74.194028), (40.699680, -74.165205))

DATABASE_URL = "sqlite:///project_1.db"
DATABASE_SCHEMA_FILE = "schema.sql"
QUERY_DIRECTORY = "queries"

In [3]:
# Make sure the QUERY_DIRECTORY exists
try:
    os.mkdir(QUERY_DIRECTORY)
except Exception as e:
    if e.errno == 17:
        # the directory already exists
        pass
    else:
        raise

## Part 1: Data Preprocessing

### Load Taxi Zones

In [6]:
#!pip install geopandas
import geopandas as gpd

In [7]:
def load_taxi_zones(shapefile):
    '''
    This function reads the taxi_zones file and change the location into latitude and longitude format
    '''
    data = gpd.read_file(shapefile)
    
    # change coordinate system since we need to compare to (40.560445, -74.242330) and (40.908524, -73.717047)
    data = data.to_crs(epsg=4326) 

    # calculating latitude and longitude using given geo data
    data["latitude"] = data.geometry.centroid.y
    data["longitude"] = data.geometry.centroid.x
    return data


In [8]:
taxi_zones = load_taxi_zones("taxi_zones.shp")
taxi_zones


  data["latitude"] = data.geometry.centroid.y

  data["longitude"] = data.geometry.centroid.x


Unnamed: 0,OBJECTID,Shape_Leng,Shape_Area,zone,LocationID,borough,geometry,latitude,longitude
0,1,0.116357,0.000782,Newark Airport,1,EWR,"POLYGON ((-74.18445 40.695, -74.18449 40.6951,...",40.691831,-74.174000
1,2,0.433470,0.004866,Jamaica Bay,2,Queens,"MULTIPOLYGON (((-73.82338 40.63899, -73.82277 ...",40.616745,-73.831299
2,3,0.084341,0.000314,Allerton/Pelham Gardens,3,Bronx,"POLYGON ((-73.84793 40.87134, -73.84725 40.870...",40.864474,-73.847422
3,4,0.043567,0.000112,Alphabet City,4,Manhattan,"POLYGON ((-73.97177 40.72582, -73.97179 40.725...",40.723752,-73.976968
4,5,0.092146,0.000498,Arden Heights,5,Staten Island,"POLYGON ((-74.17422 40.56257, -74.17349 40.562...",40.552659,-74.188484
...,...,...,...,...,...,...,...,...,...
258,259,0.126750,0.000395,Woodlawn/Wakefield,259,Bronx,"POLYGON ((-73.85107 40.91037, -73.85207 40.909...",40.897932,-73.852215
259,260,0.133514,0.000422,Woodside,260,Queens,"POLYGON ((-73.90175 40.76078, -73.90147 40.759...",40.744235,-73.906306
260,261,0.027120,0.000034,World Trade Center,261,Manhattan,"POLYGON ((-74.01333 40.70503, -74.01327 40.704...",40.709139,-74.013023
261,262,0.049064,0.000122,Yorkville East,262,Manhattan,"MULTIPOLYGON (((-73.94383 40.78286, -73.94376 ...",40.775932,-73.946510


In [452]:
def lookup_coords_for_taxi_zone_id(zone_loc_id, loaded_taxi_zones):
    '''
    This function load takes id and the loaded taxi zone df and return the id's corresponding lat,lon pair
    '''
    zone = loaded_taxi_zones[loaded_taxi_zones["LocationID"] == zone_loc_id]
    if zone.empty:
        return 0.0,0.0
        
    lat = zone["latitude"].values[0]
    lon= zone["longitude"].values[0]
    return lat,lon

def get_coords(id):
    return lookup_coords_for_taxi_zone_id(id, taxi_zones)
    

   

The coordinates for LocationID 1 are: (40.691831206401545, -74.17400027276305)


### Calculate Sample Size

In [11]:
import math

def calculate_sample_size(population):
    '''
    According to the Youtube video
    https://www.youtube.com/watch?v=dRYKi6pIUaU
    Our sample size should be n1 = 385/(1+ 384/N) where N is our poplulation size
    385 is a constant calculated by the video
    '''
    return math.ceil(385/(1+384/population))

'''
test 
'''
calculate_sample_size(2000)

323

### Common Functions

The cells below takes the TLC_URL linke to obtain a list of all yellow cab and fhvhv data parquet links.

### get_all_urls_from_tlc_page(taxi_page)

Str_of_URL -> Str_of_html_content

get_all_urls_from_tlc_page takes the TLC_URL link and return its html content for filter_parquet_urls to continue



### filter_parquet_urls(all_urls)

Str_of_html_content -> List_of_parquet_links

filter_parquet_urls takes the html content returned by previous function and return a list of all yellow cabs and fhvhv cars' link

In [14]:
def get_all_urls_from_tlc_page(taxi_page):
    response = requests.get(taxi_page)
    html = response.content
    return html

all_urls = get_all_urls_from_tlc_page(TLC_URL)


b'<!DOCTYPE html>\r\n<html>\n<head>\n<META http-equiv="Content-Type" content="text/html; charset=UTF-8">\n<title>TLC Trip Record Data - TLC</title>\n<!--fixed-layout-->\n<!--ls:begin[stylesheet]-->\n<link href="/iwov-resources/fixed-layout/3-Row Simple.css" type="text/css" rel="stylesheet">\n<!--ls:end[stylesheet]-->\n<!--ls:begin[meta-keywords]-->\n<meta name="keywords" content="">\n<!--ls:end[meta-keywords]-->\n<!--ls:begin[meta-description]-->\n<meta name="description" content="">\n<!--ls:end[meta-description]-->\n<!--ls:begin[custom-meta-data]-->\n<!--ls:end[custom-meta-data]-->\n<!--ls:begin[meta-vpath]-->\n<meta name="vpath" content="">\n<!--ls:end[meta-vpath]-->\n<!--ls:begin[meta-page-locale-name]-->\n<meta name="page-locale-name" content="">\n<!--ls:end[meta-page-locale-name]-->\n<!--\n\t\t\t\t\tls:begin[pre-head-injection]\n\t\t\t\t--><!--\n\t\t\t\t\tls:end[pre-head-injection]\n\t\t\t\t--><!--\n\t\t\t\t\tls:begin[social_media_injection]\n\t\t\t\t--><!--\n\t\t\t\t\tls:end[soci

In [15]:
def filter_parquet_urls(all_urls):
    soup = bs4.BeautifulSoup(all_urls, "html.parser")
    yellow_a_tags = soup.find_all("a", attrs={"title": "Yellow Taxi Trip Records"})
    HVFHV_a_tags = soup.find_all("a", attrs={"title": "High Volume For-Hire Vehicle Trip Records"})
    all_a_tags = yellow_a_tags + HVFHV_a_tags
    return [a["href"] for a in all_a_tags]

all_parquet_urls = filter_parquet_urls(all_urls)



['https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet ',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-02.parquet ',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-03.parquet ',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-04.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-05.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-06.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-07.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-08.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-09.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03

In [16]:
import re

### select_parquet(all_urls)

List_of_parquet -> List_of_parquet

select_parquet(all_urls) takes a list of parquets and filter out all parquets not in the Jan 2020 - Aug 2024 period

In [21]:
def select_parquet(all_urls):
    result =[]
    for i in range(len(all_urls)):
        curr_url = all_urls[i]
        pattern = r"(\d{4})-(\d{2})"
        match = re.search(pattern, curr_url)
        year_str = int(match.group(1))
        month_str = int(match.group(2))
        if (year_str < 2020):
            continue
        if (year_str >= 2024 and month_str > 8):
            continue
        result += [curr_url]       
    return result



In [24]:
required_parquests = select_parquet(all_parquet_urls)


['https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet ',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-02.parquet ',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-03.parquet ',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-04.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-05.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-06.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-07.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-08.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-04

### select_yellow(urls)

List_of_parquet -> List_of_parquet

select_yellow(urls) takes the parquet filterd by select_parquet and returns all the yellow cabs link as a list

In [29]:
def select_yellow(urls):
    result =[]
    for i in range(len(urls)):
        curr_url = urls[i]
        if not isinstance(curr_url, str):
            continue
        pattern = r"yellow"
        match = re.search(pattern, curr_url)
        if match is None:
            continue
        else: 
            result += [curr_url]       
    return result

In [35]:
required_parquests
yellow_cabs = select_yellow(required_parquests)


['https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet ',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-02.parquet ',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-03.parquet ',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-04.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-05.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-06.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-07.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-08.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-04

### select_fhvhv(urls)

List_of_parquet -> List_of_parquet

Same as select_yellow, but it returns fhvhv links.

In [39]:
def select_fhvhv(urls):
    result =[]
    for i in range(len(urls)):
        curr_url = urls[i]
        if not isinstance(curr_url, str):
            continue
        pattern = r"fhvhv"
        match = re.search(pattern, curr_url)
        if match is None:
            continue
        else: 
            result += [curr_url]       
    return result

In [41]:
required_parquests
fhvhv_cabs = select_fhvhv(required_parquests)


['https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2024-01.parquet ',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2024-02.parquet ',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2024-03.parquet ',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2024-04.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2024-05.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2024-06.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2024-07.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2024-08.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2023-01.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2023-02.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2023-03.parquet ',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2023-04.parquet',


### download_one(url, save_name)

Str_of_parquet ->None

Output: 1.message of download result
        2.download a file

download_one(url, save_name) takes a parquet link of yellow cab or fhvhv, then write the data into a local file named save_name. It prints a message if download is successful and raise a http error otherwise

In [44]:
def download_one(url, save_name):
    response = requests.get(url, stream=True)
    response.raise_for_status()  

    with open(save_name, 'wb') as file:
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:
                file.write(chunk)

    print(f"File downloaded successfully as {save_name}")
    return None


In [46]:
download_one('https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2024-01.parquet','fhvhv_tripdata_2024-01.parquet')
df = pd.read_parquet('fhvhv_tripdata_2024-01.parquet')
df

File downloaded successfully as fhvhv_tripdata_2024-01.parquet


Unnamed: 0,hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,...,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag
0,HV0003,B03404,B03404,2024-01-01 00:21:47,2024-01-01 00:25:06,2024-01-01 00:28:08,2024-01-01 01:05:39,161,158,2.83,...,4.05,2.75,0.0,0.00,40.18,N,N,N,N,N
1,HV0003,B03404,B03404,2024-01-01 00:10:56,2024-01-01 00:11:08,2024-01-01 00:12:53,2024-01-01 00:20:05,137,79,1.57,...,0.89,2.75,0.0,0.00,6.12,N,N,N,N,N
2,HV0003,B03404,B03404,2024-01-01 00:20:04,2024-01-01 00:21:51,2024-01-01 00:23:05,2024-01-01 00:35:16,79,186,1.98,...,1.60,2.75,0.0,0.00,9.47,N,N,N,N,N
3,HV0003,B03404,B03404,2024-01-01 00:35:46,2024-01-01 00:39:59,2024-01-01 00:41:04,2024-01-01 00:56:34,234,148,1.99,...,1.52,2.75,0.0,0.00,11.35,N,N,N,N,N
4,HV0003,B03404,B03404,2024-01-01 00:48:19,2024-01-01 00:56:23,2024-01-01 00:57:21,2024-01-01 01:10:02,148,97,2.65,...,3.43,2.75,0.0,0.00,28.63,N,N,N,N,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19663925,HV0003,B03404,B03404,2024-01-31 23:24:46,2024-01-31 23:26:11,2024-01-31 23:28:08,2024-01-31 23:32:13,79,113,0.65,...,0.81,2.75,0.0,1.00,5.39,N,N,N,N,N
19663926,HV0003,B03404,B03404,2024-01-31 23:33:02,2024-01-31 23:34:07,2024-01-31 23:34:19,2024-02-01 00:07:53,113,248,13.32,...,3.19,2.75,0.0,0.00,36.43,N,N,N,N,N
19663927,HV0003,B03404,B03404,2024-01-31 23:28:59,2024-01-31 23:30:51,2024-01-31 23:31:14,2024-01-31 23:38:18,161,50,1.31,...,0.89,2.75,0.0,0.00,5.71,N,N,N,N,N
19663928,HV0003,B03404,B03404,2024-01-31 23:39:00,2024-01-31 23:41:03,2024-01-31 23:41:45,2024-01-31 23:52:40,246,163,1.57,...,1.62,2.75,0.0,4.62,8.54,N,N,N,N,N


### data download

The cells below using for loops to call download_one repeatedly to download all yellow cabs and fhvhv parquet data. It uses url.strip() because sometimes we might have a white space in the end of input parquet link which might cause 403 error

In [137]:
uber_urls = [] #create list of urls for cleanning all the datasets
for i in fhvhv_cabs:
    url = i
    url = url.strip()
    pattern = r"fhvhv_tripdata_\d{4}-\d{2}\.parquet"
    match = re.search(pattern, url)
    save_name = match.group()
    download_one(url, save_name)
    uber_urls.append(save_name)

File downloaded successfully as fhvhv_tripdata_2024-01.parquet
File downloaded successfully as fhvhv_tripdata_2024-02.parquet
File downloaded successfully as fhvhv_tripdata_2024-03.parquet
File downloaded successfully as fhvhv_tripdata_2024-04.parquet
File downloaded successfully as fhvhv_tripdata_2024-05.parquet
File downloaded successfully as fhvhv_tripdata_2024-06.parquet
File downloaded successfully as fhvhv_tripdata_2024-07.parquet
File downloaded successfully as fhvhv_tripdata_2024-08.parquet
File downloaded successfully as fhvhv_tripdata_2023-01.parquet
File downloaded successfully as fhvhv_tripdata_2023-02.parquet
File downloaded successfully as fhvhv_tripdata_2023-03.parquet
File downloaded successfully as fhvhv_tripdata_2023-04.parquet
File downloaded successfully as fhvhv_tripdata_2023-05.parquet
File downloaded successfully as fhvhv_tripdata_2023-06.parquet
File downloaded successfully as fhvhv_tripdata_2023-07.parquet
File downloaded successfully as fhvhv_tripdata_2023-08.

In [139]:
taxi_urls = [] #create list of urls for cleanning all the datasets
for i in yellow_cabs:
    url = i
    url = url.strip()
    pattern = r"yellow_tripdata_\d{4}-\d{2}\.parquet"
    match = re.search(pattern, url)
    save_name = match.group()
    download_one(url, save_name)
    taxi_urls.append(save_name)

File downloaded successfully as yellow_tripdata_2024-01.parquet
File downloaded successfully as yellow_tripdata_2024-02.parquet
File downloaded successfully as yellow_tripdata_2024-03.parquet
File downloaded successfully as yellow_tripdata_2024-04.parquet
File downloaded successfully as yellow_tripdata_2024-05.parquet
File downloaded successfully as yellow_tripdata_2024-06.parquet
File downloaded successfully as yellow_tripdata_2024-07.parquet
File downloaded successfully as yellow_tripdata_2024-08.parquet
File downloaded successfully as yellow_tripdata_2023-01.parquet
File downloaded successfully as yellow_tripdata_2023-02.parquet
File downloaded successfully as yellow_tripdata_2023-03.parquet
File downloaded successfully as yellow_tripdata_2023-04.parquet
File downloaded successfully as yellow_tripdata_2023-05.parquet
File downloaded successfully as yellow_tripdata_2023-06.parquet
File downloaded successfully as yellow_tripdata_2023-07.parquet
File downloaded successfully as yellow_t

In [None]:
taxi_urls

In [143]:
taxi_urls

['yellow_tripdata_2024-01.parquet',
 'yellow_tripdata_2024-02.parquet',
 'yellow_tripdata_2024-03.parquet',
 'yellow_tripdata_2024-04.parquet',
 'yellow_tripdata_2024-05.parquet',
 'yellow_tripdata_2024-06.parquet',
 'yellow_tripdata_2024-07.parquet',
 'yellow_tripdata_2024-08.parquet',
 'yellow_tripdata_2023-01.parquet',
 'yellow_tripdata_2023-02.parquet',
 'yellow_tripdata_2023-03.parquet',
 'yellow_tripdata_2023-04.parquet',
 'yellow_tripdata_2023-05.parquet',
 'yellow_tripdata_2023-06.parquet',
 'yellow_tripdata_2023-07.parquet',
 'yellow_tripdata_2023-08.parquet',
 'yellow_tripdata_2023-09.parquet',
 'yellow_tripdata_2023-10.parquet',
 'yellow_tripdata_2023-11.parquet',
 'yellow_tripdata_2023-12.parquet',
 'yellow_tripdata_2022-01.parquet',
 'yellow_tripdata_2022-02.parquet',
 'yellow_tripdata_2022-03.parquet',
 'yellow_tripdata_2022-04.parquet',
 'yellow_tripdata_2022-05.parquet',
 'yellow_tripdata_2022-06.parquet',
 'yellow_tripdata_2022-07.parquet',
 'yellow_tripdata_2022-08.pa

### Process Taxi Data

In [847]:
def get_and_clean_taxi_month(url):
    '''
    This function does the follows:
        1. Drop rows with null values in all columns
        2. Ensure dropoff_datetime > pickup_datetime > request_datetime
        3. Filter rows with invalid or negative location IDs
        4. Remove rows with zero or negative trip durations
        5. Remove location outside of  (40.560445, -74.242330) and (40.908524, -73.717047).
    '''
    
    # Keep only the necessary columns
    df1=pd.read_parquet(url)
    poplulation = df1.shape[0]
    sample_size = calculate_sample_size(poplulation)
    print(f"population is {poplulation} and calculated sample size is {sample_size}")
    df1 = df1.sample(n=sample_size)
    
    df1.columns = (
        df1.columns
        .str.strip()               
        .str.lower()               
        .str.replace(' ', '_')     
        .str.replace(r'\W+', '_')  
    )
    
    columns_to_keep = [
        "tpep_pickup_datetime", "tpep_dropoff_datetime",
        "trip_distance", "pulocationid", "dolocationid", "fare_amount",	"extra",	
        "mta_tax", "tip_amount", "tolls_amount", "improvement_surcharge", "total_amount", "congestion_surcharge", "airport_fee"
    ]
    yellow_data = df1[columns_to_keep]
    
    yellow_data = yellow_data.rename(columns={
        "vendorid": "trip_id",
        "tpep_pickup_datetime": "pickup_time",
        "tpep_dropoff_datetime": "dropoff_time",
        "pulocationid": "pick_up_location",
        "dolocationid": "drop_off_location"
    })
    #replace NaN with 0 for airport_fee
    yellow_data['airport_fee'] = yellow_data['airport_fee'].fillna(0)

    #make sure pickup_time in proper datetime format
    yellow_data['pickup_time'] = pd.to_datetime(yellow_data['pickup_time'], errors='coerce')
    yellow_data['dropoff_time'] = pd.to_datetime(yellow_data['dropoff_time'], errors='coerce')
                                                                  
    # 1. Drop rows with null values in all columns
    all_taxi_columns = ["pickup_time",	"dropoff_time",	"trip_distance", "pick_up_location","drop_off_location","fare_amount",	"extra",	
         "mta_tax", "tip_amount", "tolls_amount", "improvement_surcharge", "total_amount","congestion_surcharge","airport_fee"]
    yellow_data = yellow_data.dropna(subset=all_taxi_columns)
    
    # 2. Ensure dropoff_datetime > pickup_datetime > request_datetime
    yellow_data = yellow_data[
        (yellow_data['dropoff_time'] > yellow_data['pickup_time']) 
    ]
    
    
    # 3. Filter rows with invalid or negative location IDs
    yellow_data = yellow_data[
        (yellow_data['pick_up_location'] > 0 ) &
        (yellow_data['drop_off_location'] > 0 )
    ]

    yellow_data = yellow_data[
        (yellow_data['pick_up_location'] < 264 ) &
        (yellow_data['drop_off_location'] < 264)
    ]
    
    # 4. Remove rows with zero or negative trip durations
    yellow_data = yellow_data[yellow_data['trip_distance'] > 0]
    
    # 5. Remove location outside of  (40.560445, -74.242330) and (40.908524, -73.717047).
    '''
    for index, row in yellow_data.iterrows():
        # get lat,lon using id
        pick_up_coords = lookup_coords_for_taxi_zone_id(row["pick_up_location"], taxi_zones)
        drop_off_coords = lookup_coords_for_taxi_zone_id(row["drop_off_location"], taxi_zones)
        # mutate id with lat and lon
        yellow_data.at[index, "pick_up_latitude"] = pick_up_coords[0]
        yellow_data.at[index, "pick_up_longitude"] = pick_up_coords[1]
        yellow_data.at[index, "drop_off_latitude"] = drop_off_coords[0]
        yellow_data.at[index, "drop_off_longitude"] = drop_off_coords[1]
    '''
    lat_min, lon_min = 40.560445, -74.242330
    lat_max, lon_max = 40.908524, -73.717047
    
    yellow_data["pick_up_coords"] = yellow_data["pick_up_location"].apply(get_coords)
    yellow_data["drop_off_coords"] = yellow_data["drop_off_location"].apply(get_coords)
    
    def is_within_bounding_box(coords):
        lat, lon = coords
        return lat_min <= lat <= lat_max and lon_min <= lon <= lon_max
    
    pick_up_filter = yellow_data["pick_up_coords"].map(is_within_bounding_box)
    drop_off_filter = yellow_data["drop_off_coords"].map(is_within_bounding_box)
    
    yellow_data = yellow_data[pick_up_filter & drop_off_filter]
    
    
    #Coordinates to lat and lon
    def split_coords(coords):
        if isinstance(coords, tuple):
            return coords  
        else:
            lat, lon = coords.strip("()").split(",")
            return float(lat), float(lon)

# Add lon and lat to dataframe
    if "pick_up_coords" in yellow_data.columns and "drop_off_coords" in yellow_data.columns:
        yellow_data[['pick_up_lat', 'pick_up_lon']] = yellow_data['pick_up_coords'].apply(pd.Series)
        yellow_data[['drop_off_lat', 'drop_off_lon']] = yellow_data['drop_off_coords'].apply(pd.Series)
   
        yellow_data = yellow_data.drop(columns=['pick_up_coords', 'drop_off_coords'])
    return yellow_data

a = get_and_clean_taxi_month('yellow_tripdata_2022-12.parquet')
a

population is 3399549 and calculated sample size is 385


Unnamed: 0,pickup_time,dropoff_time,trip_distance,pick_up_location,drop_off_location,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,pick_up_lat,pick_up_lon,drop_off_lat,drop_off_lon
2538498,2022-12-22 19:48:10,2022-12-22 19:57:08,1.08,161,237,9.3,2.50,0.5,3.16,0.00,1.0,18.96,2.5,0.00,40.758028,-73.977698,40.768615,-73.965635
2020030,2022-12-17 22:07:16,2022-12-17 22:13:28,1.35,239,238,6.5,0.50,0.5,2.06,0.00,0.3,12.36,2.5,0.00,40.783961,-73.978632,40.791705,-73.973049
1236985,2022-12-11 09:02:01,2022-12-11 09:13:13,3.31,50,166,11.5,0.00,0.5,2.96,0.00,0.3,17.76,2.5,0.00,40.766238,-73.995135,40.809457,-73.961764
1859102,2022-12-16 16:15:44,2022-12-16 16:39:37,1.64,170,141,14.5,1.00,0.5,0.00,0.00,0.3,18.80,2.5,0.00,40.747746,-73.978492,40.766948,-73.959635
1278239,2022-12-11 16:40:46,2022-12-11 17:16:35,13.00,132,155,38.5,1.25,0.5,15.00,0.00,0.3,55.55,0.0,1.25,40.646985,-73.786533,40.614591,-73.915277
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
870725,2022-12-08 12:33:31,2022-12-08 12:50:01,2.83,231,246,13.5,0.00,0.5,2.00,0.00,0.3,18.80,2.5,0.00,40.717773,-74.007880,40.753309,-74.004015
1535356,2022-12-13 22:47:51,2022-12-13 23:10:22,10.47,132,39,30.0,0.50,0.5,0.00,0.00,0.3,32.55,0.0,1.25,40.646985,-73.786533,40.638037,-73.899735
2928798,2022-12-28 07:15:40,2022-12-28 07:43:56,17.21,229,132,70.0,0.00,0.5,16.11,6.55,1.0,96.66,2.5,0.00,40.756729,-73.965146,40.646985,-73.786533
1529200,2022-12-13 21:19:29,2022-12-13 21:27:36,1.25,48,246,7.5,0.50,0.5,2.26,0.00,0.3,13.56,2.5,0.00,40.762253,-73.989845,40.753309,-74.004015


In [849]:
def get_and_clean_taxi_data(taxi_urls):
    all_taxi_dataframes = []
    for taxi_url in taxi_urls:
        # maybe: first try to see if you've downloaded this exact
        # file already and saved it before trying again
        dataframe = get_and_clean_taxi_month(taxi_url)
        # maybe: if the file hasn't been saved, save it so you can
        # avoid re-downloading it if you re-run the function
        print("Complete cleaning: ", taxi_url)
        all_taxi_dataframes.append(dataframe)
        
    # create one gigantic dataframe with data from every month needed
    taxi_data = pd.concat(all_taxi_dataframes)

    return taxi_data

In [851]:
def get_taxi_data():
    all_urls = get_all_urls_from_tlc_page(TLC_URL)
    all_parquet_urls = filter_parquet_urls(all_urls)
    correct_urls = select_parquet(all_parquet_urls)
    taxi_url_new=select_yellow(correct_urls)
    taxi_data = get_and_clean_taxi_data(taxi_url_new)
    return taxi_data

In [854]:
taxi_data = get_taxi_data()

population is 2964624 and calculated sample size is 385
Complete cleaning:  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet 
population is 3007526 and calculated sample size is 385
Complete cleaning:  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-02.parquet 
population is 3582628 and calculated sample size is 385
Complete cleaning:  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-03.parquet 
population is 3514289 and calculated sample size is 385
Complete cleaning:  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-04.parquet
population is 3723833 and calculated sample size is 385
Complete cleaning:  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-05.parquet
population is 3539193 and calculated sample size is 385
Complete cleaning:  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-06.parquet
population is 3076903 and calculated sample size is 385
Compl

  yellow_data['airport_fee'] = yellow_data['airport_fee'].fillna(0)


Complete cleaning:  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2020-01.parquet
population is 6299367 and calculated sample size is 385


  yellow_data['airport_fee'] = yellow_data['airport_fee'].fillna(0)


Complete cleaning:  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2020-02.parquet
population is 3007687 and calculated sample size is 385


  yellow_data['airport_fee'] = yellow_data['airport_fee'].fillna(0)


Complete cleaning:  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2020-03.parquet
population is 238073 and calculated sample size is 385


  yellow_data['airport_fee'] = yellow_data['airport_fee'].fillna(0)


Complete cleaning:  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2020-04.parquet
population is 348415 and calculated sample size is 385


  yellow_data['airport_fee'] = yellow_data['airport_fee'].fillna(0)


Complete cleaning:  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2020-05.parquet
population is 549797 and calculated sample size is 385


  yellow_data['airport_fee'] = yellow_data['airport_fee'].fillna(0)


Complete cleaning:  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2020-06.parquet
population is 800412 and calculated sample size is 385


  yellow_data['airport_fee'] = yellow_data['airport_fee'].fillna(0)


Complete cleaning:  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2020-07.parquet
population is 1007286 and calculated sample size is 385
Complete cleaning:  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2020-08.parquet
population is 1341017 and calculated sample size is 385
Complete cleaning:  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2020-09.parquet
population is 1681132 and calculated sample size is 385


  yellow_data['airport_fee'] = yellow_data['airport_fee'].fillna(0)


Complete cleaning:  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2020-10.parquet
population is 1509000 and calculated sample size is 385
Complete cleaning:  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2020-11.parquet
population is 1461898 and calculated sample size is 385
Complete cleaning:  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2020-12.parquet


In [858]:
taxi_data.head()

Unnamed: 0,pickup_time,dropoff_time,trip_distance,pick_up_location,drop_off_location,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,pick_up_lat,pick_up_lon,drop_off_lat,drop_off_lon
1104177,2024-01-13 18:47:36,2024-01-13 19:02:36,1.9,164,148,14.2,0.0,0.5,3.64,0.0,1.0,21.84,2.5,0.0,40.748575,-73.985156,40.718938,-73.990896
421296,2024-01-05 23:45:02,2024-01-05 23:54:52,0.93,170,163,10.0,1.0,0.5,3.0,0.0,1.0,18.0,2.5,0.0,40.747746,-73.978492,40.764421,-73.977569
1917164,2024-01-22 16:06:25,2024-01-22 16:49:32,9.14,236,33,46.4,2.5,0.5,10.58,0.0,1.0,63.48,2.5,0.0,40.780436,-73.957012,40.695798,-73.99525
989878,2024-01-12 15:10:02,2024-01-12 15:44:01,8.69,151,138,40.8,5.0,0.5,8.14,6.94,1.0,62.38,0.0,0.0,40.797962,-73.968168,40.774376,-73.873629
2508727,2024-01-28 14:34:49,2024-01-28 15:07:29,3.46,211,230,28.9,0.0,0.5,6.58,0.0,1.0,39.48,2.5,0.0,40.723888,-74.001538,40.759818,-73.984196


In [862]:
taxi_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19818 entries, 1104177 to 147290
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   pickup_time            19818 non-null  datetime64[us]
 1   dropoff_time           19818 non-null  datetime64[us]
 2   trip_distance          19818 non-null  float64       
 3   pick_up_location       19818 non-null  int64         
 4   drop_off_location      19818 non-null  int64         
 5   fare_amount            19818 non-null  float64       
 6   extra                  19818 non-null  float64       
 7   mta_tax                19818 non-null  float64       
 8   tip_amount             19818 non-null  float64       
 9   tolls_amount           19818 non-null  float64       
 10  improvement_surcharge  19818 non-null  float64       
 11  total_amount           19818 non-null  float64       
 12  congestion_surcharge   19818 non-null  float64       
 13 

In [864]:
taxi_data.describe()

Unnamed: 0,pickup_time,dropoff_time,trip_distance,pick_up_location,drop_off_location,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,pick_up_lat,pick_up_lon,drop_off_lat,drop_off_lon
count,19818,19818,19818.0,19818.0,19818.0,19818.0,19818.0,19818.0,19818.0,19818.0,19818.0,19818.0,19818.0,19818.0,19818.0,19818.0,19818.0,19818.0
mean,2022-05-01 00:17:33.181300,2022-05-01 00:33:55.081138,3.150263,164.461096,161.757998,14.747801,1.266477,0.492209,2.722607,0.411657,0.544611,21.870674,2.296397,0.082476,40.753997,-73.967684,40.756018,-73.971564
min,2002-10-27 09:42:30,2002-10-27 09:47:20,0.01,1.0,1.0,-89.69,-5.0,-0.5,0.0,-6.94,-1.0,-87.5,-2.5,-1.75,40.580922,-74.174,40.576961,-74.174
25%,2021-03-09 19:39:14,2021-03-09 19:45:19,1.08,132.0,107.0,7.2,0.0,0.5,0.0,0.0,0.3,12.6,2.5,0.0,40.740439,-73.989845,40.740337,-73.989845
50%,2022-05-04 05:00:41.500000,2022-05-04 05:23:06.500000,1.8,162.0,162.0,10.5,1.0,0.5,2.2,0.0,0.3,16.6,2.5,0.0,40.758028,-73.977698,40.758028,-73.977698
75%,2023-06-21 16:43:57.250000,2023-06-21 16:51:47.250000,3.22,234.0,234.0,16.5,2.5,0.5,3.46,0.0,1.0,23.8,2.5,0.0,40.773633,-73.965146,40.775932,-73.959635
max,2024-08-31 23:46:17,2024-08-31 23:57:03,60.6,263.0,263.0,159.5,11.75,0.8,110.0,45.38,1.0,219.41,2.5,1.75,40.897932,-73.739337,40.899529,-73.735554
std,,,3.935795,64.735904,70.26858,13.528367,1.515005,0.081608,3.192514,1.772398,0.349763,17.387201,0.729325,0.347521,0.030691,0.044261,0.03193,0.03479


### Processing Uber Data

In [855]:
def get_and_clean_uber_month(url):

    df_uber=pd.read_parquet(url)
    poplulation = df_uber.shape[0]
    sample_size = calculate_sample_size(poplulation)
    print(f"population is {poplulation} and calculated sample size is {sample_size}")
    df_uber = df_uber.sample(n=sample_size)
    
    # Define columns to keep
    uber_columns = [
        "hvfhs_license_num", 
         "pickup_datetime",
        "dropoff_datetime", "PULocationID", "DOLocationID", "trip_miles",	
        "sales_tax", "congestion_surcharge", "airport_fee",	"tips",	"driver_pay","base_passenger_fare", 
        "tolls"
    ]
    # Keep only the necessary columns
    uber_cleaned1 = df_uber[uber_columns]
    uber_cleaned = uber_cleaned1[uber_cleaned1['hvfhs_license_num'] == 'HV0003']
    uber_cleaned.columns = (
        uber_cleaned.columns
        .str.strip()               # Remove leading/trailing spaces
        .str.lower()               # Convert to lowercase
        .str.replace(' ', '_')     # Replace spaces with underscores
        .str.replace(r'\W+', '_')  # Replace non-word characters with underscores
    )
    uber_cleaned = uber_cleaned.rename(columns={
        "hvfhs_license_num": "uber_license_num",
        "pulocationid": "pick_up_location",
        "dolocationid": "drop_off_location"
    })
    
    uber_cleaned['airport_fee'] =uber_cleaned['airport_fee'].fillna(0)
    
    #Remove invalid datapoints
    # 1. Drop rows with null values in all columns
    all_columns = ["uber_license_num",	"pickup_datetime", 
                   "dropoff_datetime", "pick_up_location", "drop_off_location","trip_miles",	
                    "sales_tax", "congestion_surcharge", "airport_fee",	"tips",	"driver_pay","base_passenger_fare", "tolls"]
    uber_cleaned = uber_cleaned.dropna(subset=all_columns)
    
    # 2. Ensure dropoff_datetime > pickup_datetime > request_datetime
    uber_cleaned = uber_cleaned[
        (uber_cleaned['dropoff_datetime'] > uber_cleaned['pickup_datetime']) 
    ]
    
    # 3. Filter rows with invalid or negative location IDs
    uber_cleaned = uber_cleaned[
        (uber_cleaned['pick_up_location'] > 0) &
        (uber_cleaned['drop_off_location'] > 0)
    ]

    uber_cleaned = uber_cleaned[
        (uber_cleaned['pick_up_location'] < 264) &
        (uber_cleaned['drop_off_location'] < 264)
    ]

    # 4. Remove rows with zero or negative trip durations
    uber_cleaned['trip_duration'] = (
        pd.to_datetime(uber_cleaned['dropoff_datetime']) - pd.to_datetime(uber_cleaned['pickup_datetime'])
    ).dt.total_seconds()
    uber_cleaned = uber_cleaned[uber_cleaned['trip_duration'] > 0]

    # 5. Remove location outside of  (40.560445, -74.242330) and (40.908524, -73.717047).
    '''
    for index, row in uber_cleaned.iterrows():
        # get lat,lon using id
        pick_up_coords = lookup_coords_for_taxi_zone_id(row["pick_up_location"], taxi_zones)
        drop_off_coords = lookup_coords_for_taxi_zone_id(row["drop_off_location"], taxi_zones)
        # mutate id with lat and lon
        uber_cleaned.at[index, "pick_up_latitude"] = pick_up_coords[0]
        uber_cleaned.at[index, "pick_up_longitude"] = pick_up_coords[1]
        uber_cleaned.at[index, "drop_off_latitude"] = drop_off_coords[0]
        uber_cleaned.at[index, "drop_off_longitude"] = drop_off_coords[1]
    '''
    lat_min, lon_min = 40.560445, -74.242330
    lat_max, lon_max = 40.908524, -73.717047
    
    def is_within_bounding_box(coords):
        lat, lon = coords
        return lat_min <= lat <= lat_max and lon_min <= lon <= lon_max
    uber_cleaned["pick_up_coords"] = uber_cleaned["pick_up_location"].apply(get_coords)
    uber_cleaned["drop_off_coords"] = uber_cleaned["drop_off_location"].apply(get_coords)

    pick_up_filter = uber_cleaned["pick_up_coords"].map(is_within_bounding_box)
    drop_off_filter = uber_cleaned["drop_off_coords"].map(is_within_bounding_box)

    #Coordinates to lat and lon
    def split_coords(coords):
        if isinstance(coords, tuple):
            return coords  
        else:
            lat, lon = coords.strip("()").split(",")
            return float(lat), float(lon)

    # Add lon and lat to dataframe
    uber_cleaned[['pick_up_lat', 'pick_up_lon']] = uber_cleaned['pick_up_coords'].apply(pd.Series)
    uber_cleaned[['drop_off_lat', 'drop_off_lon']] = uber_cleaned['drop_off_coords'].apply(pd.Series)
    uber_cleaned = uber_cleaned.drop(columns=['pick_up_coords', 'drop_off_coords'])
    
    return uber_cleaned







In [511]:
def get_and_clean_uber_data(uber_urls):
    all_uber_dataframes = []
    
    for uber_url in uber_urls:
        # maybe: first try to see if you've downloaded this exact
        # file already and saved it before trying again
        dataframe = get_and_clean_uber_month(uber_url)
        # maybe: if the file hasn't been saved, save it so you can
        # avoid re-downloading it if you re-run the function
        
        all_uber_dataframes.append(dataframe)
        print("Complete cleaning: ", uber_url)
        
    # create one gigantic dataframe with data from every month needed
    uber_data = pd.concat(all_uber_dataframes)
    return uber_data

In [513]:
def load_and_clean_uber_data():
    raise NotImplementedError()

In [515]:
def get_uber_data():
    all_urls = get_all_urls_from_tlc_page(TLC_URL)
    all_parquet_urls = filter_parquet_urls(all_urls)
    correct_urls = select_parquet(all_parquet_urls)
    uber_url_new=select_fhvhv(correct_urls)
    uber_data = get_and_clean_uber_data(uber_url_new)
    return uber_data

In [517]:
uber_data = get_uber_data()

population is 19663930 and calculated sample size is 385
Complete cleaning:  https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2024-01.parquet 
population is 19359148 and calculated sample size is 385
Complete cleaning:  https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2024-02.parquet 
population is 21280788 and calculated sample size is 385
Complete cleaning:  https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2024-03.parquet 
population is 19733038 and calculated sample size is 385
Complete cleaning:  https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2024-04.parquet
population is 20704538 and calculated sample size is 385
Complete cleaning:  https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2024-05.parquet
population is 20123226 and calculated sample size is 385
Complete cleaning:  https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2024-06.parquet
population is 19182934 and calculated sample size is 385
Comp

  uber_cleaned['airport_fee'] =uber_cleaned['airport_fee'].fillna(0)


Complete cleaning:  https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2020-04.parquet
population is 6089999 and calculated sample size is 385
Complete cleaning:  https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2020-05.parquet
population is 7555193 and calculated sample size is 385
Complete cleaning:  https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2020-06.parquet
population is 9958454 and calculated sample size is 385
Complete cleaning:  https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2020-07.parquet
population is 11096852 and calculated sample size is 385
Complete cleaning:  https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2020-08.parquet
population is 12106669 and calculated sample size is 385
Complete cleaning:  https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2020-09.parquet
population is 13268411 and calculated sample size is 385


  uber_cleaned['airport_fee'] =uber_cleaned['airport_fee'].fillna(0)


Complete cleaning:  https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2020-10.parquet
population is 11596865 and calculated sample size is 385
Complete cleaning:  https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2020-11.parquet
population is 11637123 and calculated sample size is 385
Complete cleaning:  https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2020-12.parquet


In [519]:
uber_data.head()

Unnamed: 0,uber_license_num,pickup_datetime,dropoff_datetime,pick_up_location,drop_off_location,trip_miles,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,base_passenger_fare,tolls,trip_duration,pick_up_lat,pick_up_lon,drop_off_lat,drop_off_lon
0,HV0003,2024-01-01 00:28:08,2024-01-01 01:05:39,161,158,2.83,4.05,2.75,0.0,0.0,40.18,45.61,0.0,2251.0,40.758028,-73.977698,40.735035,-74.008984
1,HV0003,2024-01-01 00:12:53,2024-01-01 00:20:05,137,79,1.57,0.89,2.75,0.0,0.0,6.12,10.05,0.0,432.0,40.740439,-73.976495,40.72762,-73.985937
2,HV0003,2024-01-01 00:23:05,2024-01-01 00:35:16,79,186,1.98,1.6,2.75,0.0,0.0,9.47,18.07,0.0,731.0,40.72762,-73.985937,40.748497,-73.992438
3,HV0003,2024-01-01 00:41:04,2024-01-01 00:56:34,234,148,1.99,1.52,2.75,0.0,0.0,11.35,17.17,0.0,930.0,40.740337,-73.990458,40.718938,-73.990896
4,HV0003,2024-01-01 00:57:21,2024-01-01 01:10:02,148,97,2.65,3.43,2.75,0.0,0.0,28.63,38.67,0.0,761.0,40.718938,-73.990896,40.690787,-73.974882


In [521]:
uber_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15663 entries, 0 to 384
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   uber_license_num      15663 non-null  object        
 1   pickup_datetime       15663 non-null  datetime64[us]
 2   dropoff_datetime      15663 non-null  datetime64[us]
 3   pick_up_location      15663 non-null  int64         
 4   drop_off_location     15663 non-null  int64         
 5   trip_miles            15663 non-null  float64       
 6   sales_tax             15663 non-null  float64       
 7   congestion_surcharge  15663 non-null  float64       
 8   airport_fee           15663 non-null  float64       
 9   tips                  15663 non-null  float64       
 10  driver_pay            15663 non-null  float64       
 11  base_passenger_fare   15663 non-null  float64       
 12  tolls                 15663 non-null  float64       
 13  trip_duration         1

In [523]:
uber_data.describe()

Unnamed: 0,pickup_datetime,dropoff_datetime,pick_up_location,drop_off_location,trip_miles,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,base_passenger_fare,tolls,trip_duration,pick_up_lat,pick_up_lon,drop_off_lat,drop_off_lon
count,15663,15663,15663.0,15663.0,15663.0,15663.0,15663.0,15663.0,15663.0,15663.0,15663.0,15663.0,15663.0,15663.0,15663.0,15663.0,15663.0
mean,2022-04-26 00:06:51.749856,2022-04-26 00:21:53.200408,138.57473,133.236609,4.413643,1.819363,1.130531,0.056822,0.63092,15.818875,20.0589,0.451337,901.450552,40.728487,-73.933976,40.724443,-73.918747
min,2020-01-01 00:00:54,2020-01-01 00:09:16,3.0,3.0,0.01,0.0,0.0,0.0,0.0,0.0,-15.7,0.0,78.0,0.0,-74.233534,0.0,-74.233534
25%,2021-02-01 00:57:45.500000,2021-02-01 01:11:50.500000,76.0,68.0,1.63,0.92,0.0,0.0,0.0,8.17,10.46,0.0,503.0,40.694542,-73.985937,40.688168,-73.983025
50%,2022-05-01 00:36:18,2022-05-01 00:51:49,140.0,130.0,3.03,1.48,0.0,0.0,0.0,13.03,16.63,0.0,784.0,40.731821,-73.957012,40.729506,-73.947442
75%,2023-07-01 00:26:54.500000,2023-07-01 00:42:53,211.0,206.0,5.735,2.32,2.75,0.0,0.0,20.38,25.665,0.0,1171.0,40.765484,-73.913632,40.775932,-73.905408
max,2024-08-01 00:59:53,2024-08-01 01:41:24,263.0,263.0,55.88,16.34,2.75,5.0,31.79,127.26,177.57,43.91,7723.0,40.899529,0.0,40.899529,0.0
std,,,75.21596,77.233536,4.07934,1.256872,1.345968,0.373408,1.975997,10.457279,13.298604,1.792293,540.866299,0.464588,0.83788,0.654543,1.183059


### Processing Weather Data

In [757]:
weather_paths = [
    '2020_weather.csv',
    '2021_weather.csv',
    '2022_weather.csv',
    '2023_weather.csv',
    '2024_weather.csv'
]

def get_all_weather_csvs(directory):
    all_dataframes = []
    for file_path in directory:
        df = pd.read_csv(file_path)
        all_dataframes.append(df)
    combined_df = pd.concat(all_dataframes, ignore_index=True)
    
    return combined_df



In [759]:
weather_paths = [
    '2020_weather.csv',
    '2021_weather.csv',
    '2022_weather.csv',
    '2023_weather.csv',
    '2024_weather.csv'
]
def clean_month_weather_data_hourly(weather_paths):
    '''
    This function does the follows:
        1. call get_all_weather_csvs(weather_paths) to obtain a big df
        2. remove all columns other than 'DATE', 'HourlyPrecipitation', 'HourlyWindSpeed'
        3. convert all input of 'HourlyPrecipitation', 'HourlyWindSpeed' into numeric values we use errors='coerce' here
            because we have "T" as trace amount, we will record it as 0 here
    '''
    weather_df_hours = get_all_weather_csvs(weather_paths)
    relevant_columns = ['DATE', 'HourlyPrecipitation', 'HourlyWindSpeed']
    weather_df_hours = weather_df_hours[relevant_columns]
    
    weather_df_hours['HourlyPrecipitation'] = pd.to_numeric(weather_df_hours['HourlyPrecipitation'], errors='coerce')
    weather_df_hours['HourlyWindSpeed'] = pd.to_numeric(weather_df_hours['HourlyWindSpeed'], errors='coerce')
    weather_df_hours.columns = (
        weather_df_hours.columns
        .str.strip()               # Remove leading/trailing spaces
        .str.lower()               # Convert to lowercase
        .str.replace(' ', '_')     # Replace spaces with underscores
        .str.replace(r'\W+', '_')  # Replace non-word characters with underscores
    )
    return weather_df_hours

In [761]:
def clean_month_weather_data_daily(hourly_data):
    """
    This function does the follows:
    1. change all column values of Date into datetime
    2. all values are numeric since it is gathered by hourly_data using previous hourly clean function
    3. merge all rows in the same day and use average of precip and windspeed as new value, if such value is NaN, it will not be covered in denom
        i.e. if we have 24 NaN the avg is NaN, if we have 23 NaN and a 1, the avg is 1
    """
    daily_records = []
    
    hourly_data['date'] = pd.to_datetime(hourly_data['date'])
    

    for date, group in hourly_data.groupby(hourly_data['date'].dt.date):
        avg_precipitation = group['hourlyprecipitation'].mean() if group['hourlyprecipitation'].notna().sum() >= 0 else float('nan')
        avg_wind_speed = group['hourlywindspeed'].mean() if group['hourlywindspeed'].notna().sum() >= 0 else float('nan')
        total_snowfall = (
            group['dailysnowfall'].sum() if 'dailysnowfall' in group.columns and group['dailysnowfall'].notna().sum() > 0 else float('nan')
        )
        daily_records.append({
            'date': date,
            'average_precipitation': avg_precipitation,
            'average_wind_speed': avg_wind_speed,
            'total_snowfall':total_snowfall
        })

    daily_data = pd.DataFrame(daily_records)
    daily_data.columns = (
        daily_data.columns
        .str.strip()               # Remove leading/trailing spaces
        .str.lower()               # Convert to lowercase
        .str.replace(' ', '_')     # Replace spaces with underscores
        .str.replace(r'\W+', '_')  # Replace non-word characters with underscores
    )
    return daily_data

In [763]:
def load_and_clean_weather_data():
    weather_paths = [
    '2020_weather.csv',
    '2021_weather.csv',
    '2022_weather.csv',
    '2023_weather.csv']
    weather_df= get_all_weather_csvs(weather_paths)
    weather_df_hours = clean_month_weather_data_hourly(weather_paths)
    weather_df_days = clean_month_weather_data_daily(weather_df_hours)
    return weather_df_hours, weather_df_days

In [765]:
'''
def load_and_clean_weather_data():
    weather_csv_files = get_all_weather_csvs(WEATHER_CSV_DIR)
    
    hourly_dataframes = []
    daily_dataframes = []
        
    for csv_file in weather_csv_files:
        hourly_dataframe = clean_month_weather_data_hourly(csv_file)
        daily_dataframe = clean_month_weather_data_daily(csv_file)
        hourly_dataframes.append(hourly_dataframe)
        daily_dataframes.append(daily_dataframe)
        
    # create two dataframes with hourly & daily data from every month
    hourly_data = pd.concat(hourly_dataframes)
    daily_data = pd.concat(daily_dataframes)
    
    return hourly_data, daily_data
'''

'\ndef load_and_clean_weather_data():\n    weather_csv_files = get_all_weather_csvs(WEATHER_CSV_DIR)\n    \n    hourly_dataframes = []\n    daily_dataframes = []\n        \n    for csv_file in weather_csv_files:\n        hourly_dataframe = clean_month_weather_data_hourly(csv_file)\n        daily_dataframe = clean_month_weather_data_daily(csv_file)\n        hourly_dataframes.append(hourly_dataframe)\n        daily_dataframes.append(daily_dataframe)\n        \n    # create two dataframes with hourly & daily data from every month\n    hourly_data = pd.concat(hourly_dataframes)\n    daily_data = pd.concat(daily_dataframes)\n    \n    return hourly_data, daily_data\n'

In [767]:
hourly_weather, daily_weather = load_and_clean_weather_data()

  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)


In [768]:
hourly_weather.head()

Unnamed: 0,date,hourlyprecipitation,hourlywindspeed
0,2020-01-01 00:51:00,0.0,8.0
1,2020-01-01 01:51:00,0.0,8.0
2,2020-01-01 02:51:00,0.0,14.0
3,2020-01-01 03:51:00,0.0,11.0
4,2020-01-01 04:51:00,0.0,6.0


In [769]:
hourly_weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46691 entries, 0 to 46690
Data columns (total 3 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   date                 46691 non-null  datetime64[ns]
 1   hourlyprecipitation  36440 non-null  float64       
 2   hourlywindspeed      41420 non-null  float64       
dtypes: datetime64[ns](1), float64(2)
memory usage: 1.1 MB


In [773]:
hourly_weather.describe()

Unnamed: 0,date,hourlyprecipitation,hourlywindspeed
count,46691,36440.0,41420.0
mean,2022-01-03 15:32:30.240945664,0.013248,5.165476
min,2020-01-01 00:51:00,0.0,0.0
25%,2021-01-03 17:57:30,0.0,3.0
50%,2022-01-03 10:51:00,0.0,5.0
75%,2023-01-05 09:12:00,0.0,7.0
max,2023-12-31 23:51:00,3.47,2237.0
std,,0.064017,15.956678


In [775]:
daily_weather.head()

Unnamed: 0,date,average_precipitation,average_wind_speed,total_snowfall
0,2020-01-01,0.0,8.458333,
1,2020-01-02,0.0,5.5,
2,2020-01-03,0.008077,3.305556,
3,2020-01-04,0.017941,3.421053,
4,2020-01-05,0.0,11.333333,


In [777]:
daily_weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1461 entries, 0 to 1460
Data columns (total 4 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   date                   1461 non-null   object 
 1   average_precipitation  1459 non-null   float64
 2   average_wind_speed     1408 non-null   float64
 3   total_snowfall         0 non-null      float64
dtypes: float64(3), object(1)
memory usage: 45.8+ KB


In [779]:
daily_weather.describe()

Unnamed: 0,average_precipitation,average_wind_speed,total_snowfall
count,1459.0,1408.0,0.0
mean,0.00982,5.156086,
std,0.02633,4.193661,
min,0.0,0.447368,
25%,0.0,3.198611,
50%,0.0,4.697322,
75%,0.004702,6.430124,
max,0.355333,108.227273,


## Part 2: Storing Cleaned Data

In [894]:
engine = db.create_engine(DATABASE_URL)

In [897]:
# if using SQL (as opposed to SQLAlchemy), define the commands 
# to create your 4 tables/dataframes
HOURLY_WEATHER_SCHEMA = """
CREATE TABLE IF NOT EXISTS hourly_weather (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    date DATETIME NOT NULL,
    hourlyprecipitation FLOAT,
    hourlywindspeed FLOAT
);
"""

DAILY_WEATHER_SCHEMA = """
CREATE TABLE IF NOT EXISTS hourly_weather (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    date DATE NOT NULL,
    average_precipitation FLOAT,
    average_wind_speed FLOAT,
    total_snowfall FLOAT
);
"""

TAXI_TRIPS_SCHEMA = """
CREATE TABLE IF NOT EXISTS taxi_trips (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    pickup_time DATETIME NOT NULL,
    dropoff_time DATETIME NOT NULL,
    trip_distance FLOAT,
    pick_up_location INT,
    drop_off_location INT,
    fare_amount FLOAT,
    extra FLOAT,
    mta_tax FLOAT,
    tip_amount FLOAT,
    tolls_amount FLOAT,
    improvement_surcharge FLOAT,
    total_amount FLOAT,
    congestion_surcharge FLOAT,
    airport_fee FLOAT,
    pick_up_lat FLOAT,
    pick_up_lon FLOAT,
    drop_off_lat FLOAT,
    drop_off_lon FLOAT
);
"""

UBER_TRIPS_SCHEMA = """
CREATE TABLE IF NOT EXISTS uber_trips (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    uber_license_num STRING,
    pickup_datetime DATETIME NOT NULL,
    dropoff_datetime DATETIME NOT NULL,
    pick_up_location INT,
    drop_off_location INT,
    trip_miles FLOAT,
    sales_tax FLOAT,
    congestion_surcharge FLOAT,
    airport_fee FLOAT,
    tips FLOAT,
    driver_pay FLOAT,
    base_passenger_fare FLOAT,
    tolls FLOAT,
    trip_duration FLOAT,
    pick_up_lat FLOAT,
    pick_up_lon FLOAT,
    drop_off_lat FLOAT,
    drop_off_lon FLOAT
  
);
"""

In [899]:
# create that required schema.sql file
with open(DATABASE_SCHEMA_FILE, "w") as f:
    f.write(HOURLY_WEATHER_SCHEMA)
    f.write(DAILY_WEATHER_SCHEMA)
    f.write(TAXI_TRIPS_SCHEMA)
    f.write(UBER_TRIPS_SCHEMA)

In [901]:
print(engine.url.database)

project_1.db


In [903]:
# create tables with the schema files
with engine.connect() as connection:
    with open(DATABASE_SCHEMA_FILE, "r") as schema_file:
        schema_script = schema_file.read()
        statements = schema_script.split(";")  
        for statement in statements:
            statement = statement.strip()
            #Ignore empty statement
            if statement: 
                connection.execute(text(statement))

print("Databse created successfully.")

Databse created successfully.


### Add Data to Database

In [906]:
# writes the dataframes to the SQL tables
def write_dataframes_to_table(table_to_df_dict, engine):
    with engine.connect() as connection:  
        for table_name, dataframe in table_to_df_dict.items():
            print(f"Dataframe wrote to table: {table_name}")
            dataframe.to_sql(table_name, con=connection, if_exists="append", index=False)

In [908]:
map_table_name_to_dataframe = {
    "taxi_trips": taxi_data,
    "uber_trips": uber_data,
    "hourly_weather": hourly_weather,
    "daily_weather": daily_weather
}

In [910]:
write_dataframes_to_table(map_table_name_to_dataframe,engine)

Dataframe wrote to table: taxi_trips
Dataframe wrote to table: uber_trips
Dataframe wrote to table: hourly_weather
Dataframe wrote to table: daily_weather


## Part 3: Understanding the Data

In [913]:
# Helper function to write the queries to file
def write_query_to_file(query, outfile):
    with open(outfile, 'w') as file:
        file.write(query)

### Query 1
#### Q: What is the most popular hour to take taxi?
#### A: The most popular hour to take taxi is 18:00.

In [925]:
#Query file to analysis the result of most popular hour
QUERY_1_FILENAME = "taxi_most_popular_hour.sql"
QUERY_1 = """
SELECT 
    STRFTIME('%H', pickup_time) AS hour_of_day,
    COUNT(*) AS trip_count
FROM 
    taxi_trips
WHERE 
    pickup_time BETWEEN '2020-01-01' AND '2024-08-31'
GROUP BY 
    hour_of_day
ORDER BY 
    trip_count DESC;
"""


In [931]:
# execute query either via sqlalchemy
with engine.connect() as con:
    results = con.execute(db.text(QUERY_1)).fetchall()
results
# or via pandas
pd.read_sql(QUERY_1, con=engine)


Unnamed: 0,hour_of_day,trip_count
0,18,1397
1,17,1364
2,15,1298
3,14,1298
4,16,1270
5,19,1258
6,12,1253
7,13,1200
8,20,1060
9,11,1028


In [937]:
write_query_to_file(QUERY_1, f"{QUERY_DIRECTORY}/{QUERY_1_FILENAME}")

### Query 2
#### Q: What is the most popular day of the week to take uber?
#### A: The most popular day of week to take uber is Wednesday.

In [1014]:
QUERY_2_FILENAME = "uber_most_popular_day.sql"
QUERY_2 = """
SELECT
    STRFTIME('%w', pickup_datetime) AS day_of_week,
    COUNT(*) AS trip_count
FROM
    uber_trips
GROUP BY
    day_of_week
ORDER BY
    trip_count DESC;
"""

In [1016]:
with engine.connect() as con:
    results = con.execute(db.text(QUERY_2)).fetchall()
results
# or via pandas
print("The day starts from 0 to 6 which indicates Sunday to Saturday.")
pd.read_sql(QUERY_2, con=engine)

# The day starts from 0 to 6 which indicates Sunday to Saturday.

The day starts from 0 to 6 which indicates Sunday to Saturday.


Unnamed: 0,day_of_week,trip_count
0,3,2807
1,1,2494
2,6,2280
3,5,2214
4,4,2209
5,2,1982
6,0,1677


In [1018]:
write_query_to_file(QUERY_2, f"{QUERY_DIRECTORY}/{QUERY_2_FILENAME}")

### Query 3
#### Q: What’s the 95% percentile of trip distance in January 2024?
#### A:The 95% percentile of trip distance in January 2024 is 11.72.

In [1021]:
QUERY_3_FILENAME = "trip_distance_Jan2024.sql"
QUERY_3 = """
WITH rides_data AS (
    SELECT trip_distance
    FROM taxi_trips
    WHERE pickup_time BETWEEN '2024-01-01' AND '2024-01-31'
    UNION ALL
    SELECT trip_miles AS trip_distance
    FROM uber_trips
    WHERE pickup_datetime BETWEEN '2024-01-01' AND '2024-01-31'
),
ordered_distances AS (
    SELECT trip_distance
    FROM rides_data
    ORDER BY trip_distance
),
percentile AS (
    SELECT CAST((COUNT(*) - 1) * 0.95 AS INTEGER) AS position
    FROM ordered_distances
)
SELECT trip_distance AS p95_trip_distance
FROM ordered_distances
LIMIT 1
OFFSET (SELECT position FROM percentile);
"""

In [1023]:
with engine.connect() as con:
    results = con.execute(db.text(QUERY_3)).fetchall()
q3_result = results[0][0]  
print(q3_result)


11.72


In [1025]:
write_query_to_file(QUERY_3, f"{QUERY_DIRECTORY}/{QUERY_3_FILENAME}")

## Part 4: Visualizing the Data

### Visualization 1

In [None]:
# use a more descriptive name for your function
def plot_visual_1(dataframe):
    figure, axes = plt.subplots(figsize=(20, 10))
    
    values = "..."  # use the dataframe to pull out values needed to plot
    
    # you may want to use matplotlib to plot your visualizations;
    # there are also many other plot types (other 
    # than axes.plot) you can use
    axes.plot(values, "...")
    # there are other methods to use to label your axes, to style 
    # and set up axes labels, etc
    axes.set_title("Some Descriptive Title")
    
    plt.show()

In [None]:
def get_data_for_visual_1():
    # Query SQL database for the data needed.
    # You can put the data queried into a pandas dataframe, if you wish
    raise NotImplementedError()

In [None]:
some_dataframe = get_data_for_visual_1()
plot_visual_1(some_dataframe)