# Understanding Hired Rides in NYC

_[Project prompt](https://docs.google.com/document/d/1uAUJGEUzfNj6OsWNAimnYCw7eKaHhMUfU1MTj9YwYw4/edit?usp=sharing), [grading rubric](https://docs.google.com/document/d/1hKuRWqFcIdhOkow3Nljcm7PXzIkoa9c_aHkMKZDxWa0/edit?usp=sharing)_

_This scaffolding notebook may be used to help set up your final project. It's **totally optional** whether you make use of this or not._

_If you do use this notebook, everything provided is optional as well - you may remove or add prose and code as you wish._

_**All code below should be considered "pseudo-code" - not functional by itself, and only an outline to help you with your own approach.**_

## Project Setup

In [None]:
# all import statements needed for the project
import math
import os

import bs4
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
from geopy.distance import distance
import requests
import re
import sqlalchemy as db

In [None]:
# Web page that is scraped for the links to the taxi trip-record files
TAXI_URL = "https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page"
# Local folder where the downloaded monthly parquet files are stored
TAXI_DIR = "data/taxi"

# Location of the taxi-zone shapefile used to turn zone IDs into coordinates
TAXI_ZONES_DIR = "data/taxi_zones"
TAXI_ZONES_SHAPEFILE = f"{TAXI_ZONES_DIR}/taxi_zones.shp"
UBER_CSV = "data/uber_rides_sample.csv"
# NOTE(review): still empty - set this to the weather CSV folder before
# running the weather cells
WEATHER_CSV_DIR = ""

CRS = 4326  # coordinate reference system

# Bounding boxes, each given as ((south lat, west lon), (north lat, east lon))
NEW_YORK_BOX_COORDS = ((40.560445, -74.242330), (40.908524, -73.717047))
LGA_BOX_COORDS = ((40.763589, -73.891745), (40.778865, -73.854838))
JFK_BOX_COORDS = ((40.639263, -73.795642), (40.651376, -73.766264))
EWR_BOX_COORDS = ((40.686794, -74.194028), (40.699680, -74.165205))

# Database connection string, schema output file and folder for saved queries
DATABASE_URL = "sqlite:///project.db"
DATABASE_SCHEMA_FILE = "schema.sql"
QUERY_DIRECTORY = "queries"

In [None]:
# Create the folder for the saved query files unless it is already present
try:
    os.mkdir(QUERY_DIRECTORY)
except FileExistsError:
    print(f"Folder {QUERY_DIRECTORY} already exists.")
else:
    print(f"Folder {QUERY_DIRECTORY} created successfully.")

In [None]:
# Make sure the TAXI_DIR exists.
# TAXI_DIR is a nested path ("data/taxi"), so the parent folder has to be
# created as well; os.mkdir would fail with FileNotFoundError on a fresh
# checkout where "data" does not exist yet.
try:
    os.makedirs(TAXI_DIR)
except FileExistsError:
    print(f"Folder {TAXI_DIR} already exists.")
else:
    print(f"Folder {TAXI_DIR} created successfully.")

## Part 1: Data Preprocessing

### Load Taxi Zones

In [None]:
def load_taxi_zones(shapefile: str) -> pd.DataFrame:
    """Read the taxi-zone shapefile and return its geometries keyed by OBJECTID.

    Only the 'geometry' column is kept, indexed by 'OBJECTID', and the
    geometries are re-projected to the coordinate reference system `CRS`.
    """
    return (
        gpd.read_file(shapefile)[['OBJECTID', 'geometry']]
        .set_index('OBJECTID')
        .to_crs(CRS)
    )

In [None]:
def lookup_coords_for_taxi_zone_id(zone_loc_id: int,
                                   loaded_taxi_zones: pd.DataFrame) -> tuple:
    """Return the centroid of a taxi zone as a (longitude, latitude) tuple.

    The zone is looked up by its index label in `loaded_taxi_zones`, and the
    centroid of its 'geometry' entry is used as the approximate location.
    """
    zone_centroid = loaded_taxi_zones.loc[zone_loc_id, 'geometry'].centroid
    return (zone_centroid.x, zone_centroid.y)

### Calculate distance

In [None]:
def calculate_distance_with_coords(from_coord: tuple, to_coord: tuple) -> float:
    """Return the great-circle distance between two coordinates.

    This function uses the Haversine formula to calculate the distance
    between two coordinates on Earth's surface.

    Parameters
    ----------
    from_coord : tuple of float
        (longitude, latitude) of the starting point, in degrees.
    to_coord : tuple of float
        (longitude, latitude) of the destination point, in degrees.

    Returns
    -------
    float
        The distance between the two coordinates, in kilometers.

    """
    earth_radius_km = 6371  # earth's radius is assumed to be 6371 kilometers

    # Convert the input coordinates from degrees to radians
    from_lon, from_lat = math.radians(from_coord[0]), math.radians(from_coord[1])
    to_lon, to_lat = math.radians(to_coord[0]), math.radians(to_coord[1])
    # Calculate the differences in latitude and longitude
    delta_lon = to_lon - from_lon
    delta_lat = to_lat - from_lat
    # Apply the Haversine formula to calculate the distance
    a = (math.sin(delta_lat / 2) ** 2
         + math.cos(from_lat) * math.cos(to_lat) * math.sin(delta_lon / 2) ** 2)
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (nearly) antipodal points, which would make sqrt(1 - a) fail.
    a = min(1.0, max(0.0, a))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    # Named to avoid shadowing the `distance` imported from geopy
    return earth_radius_km * c

In [None]:
def add_distance_column(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Add a 'distance' column (kilometers) computed from the trip coordinates.

    The frame must contain 'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude' and 'dropoff_latitude'. The column is added to
    `dataframe` in place and the same frame is returned.
    """
    # Iterate over the four coordinate columns directly instead of using a
    # row-wise apply: it avoids building a Series per row, and it also works
    # for an empty frame (where apply(axis=1) returns a DataFrame that cannot
    # be assigned to a single column).
    distances = [
        calculate_distance_with_coords((pickup_lon, pickup_lat),
                                       (dropoff_lon, dropoff_lat))
        for pickup_lon, pickup_lat, dropoff_lon, dropoff_lat in zip(
            dataframe['pickup_longitude'], dataframe['pickup_latitude'],
            dataframe['dropoff_longitude'], dataframe['dropoff_latitude'])
    ]
    # Assign positionally as a float array so the dtype is float even when
    # there are no rows
    dataframe['distance'] = pd.Series(distances, dtype=float).to_numpy()

    return dataframe

### Processing Taxi Data

In [None]:
def get_all_urls_from_taxi_page(taxi_page: str) -> list:
    """Scrape every link target from the given page and return them as a list.

    Parameters
    ----------
    taxi_page : str
        URL of the page to scrape.

    Returns
    -------
    list of str
        The `href` value of every anchor that has one, with surrounding
        whitespace removed.

    Raises
    ------
    requests.HTTPError
        If the page request returns an error status.
    """
    # Fetch the page that was asked for (not the module-level constant), and
    # fail loudly on an HTTP error instead of parsing an error page
    response = requests.get(taxi_page, timeout=60)
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.text, 'lxml')
    # Anchors without an href are skipped so the result never contains None
    return [link['href'].strip() for link in soup.find_all("a", href=True)]

In [None]:
def filter_taxi_parquet_urls(all_urls: list) -> list:
    """Return the yellow-taxi parquet URLs that fall in the project time range.

    A URL is kept when it names a yellow taxi trip-data parquet file and
    contains a year-month between 2009-01 and 2015-06. Entries that are not
    strings (e.g. None for anchors without an href) are ignored, and
    surrounding whitespace is stripped before matching.
    """
    file_pattern = re.compile(r".*yellow_tripdata.*parquet\Z")
    time_pattern = re.compile(
        r"(2009-(0[1-9]|1[0-2]))|(201[0-4]-(0[1-9]|1[0-2]))|(2015-(0[1-6]))")

    all_parquet_urls = []
    for url in all_urls:
        if not isinstance(url, str):
            continue
        # Trailing whitespace in an href would defeat the end-of-string anchor
        url = url.strip()
        # Keep yellow taxi trip data that belongs to the project time range
        if file_pattern.search(url) and time_pattern.search(url):
            all_parquet_urls.append(url)

    return all_parquet_urls

In [None]:
def remove_outside_trip(df: pd.DataFrame, box_coords=None) -> pd.DataFrame:
    """Remove the trip records that start or end outside a bounding box.

    Parameters
    ----------
    df : pd.DataFrame
        Trips with 'pickup_longitude', 'pickup_latitude',
        'dropoff_longitude' and 'dropoff_latitude' columns.
    box_coords : tuple, optional
        Bounding box as ((south lat, west lon), (north lat, east lon)).
        Defaults to NEW_YORK_BOX_COORDS.

    Returns
    -------
    pd.DataFrame
        The rows whose pickup and dropoff both lie inside the box
        (boundaries included). The input frame is not modified.
    """
    if box_coords is None:
        box_coords = NEW_YORK_BOX_COORDS
    (southlimit, westlimit), (northlimit, eastlimit) = box_coords

    # Build one mask for all four conditions; rows with missing coordinates
    # compare as False and are therefore removed as well
    inside_box = (
        df['pickup_longitude'].between(westlimit, eastlimit)
        & df['pickup_latitude'].between(southlimit, northlimit)
        & df['dropoff_longitude'].between(westlimit, eastlimit)
        & df['dropoff_latitude'].between(southlimit, northlimit)
    )

    return df[inside_box]

In [None]:
def _destination_lon_lat(origin_lon_lat: tuple, miles: float, bearing: float) -> tuple:
    """Return the (lon, lat) reached by travelling `miles` from the origin along `bearing`."""
    # geopy points are (latitude, longitude), while this notebook stores
    # coordinates as (longitude, latitude), so flip on the way in
    destination = distance(miles=miles).destination(origin_lon_lat[::-1], bearing=bearing)
    return (destination.longitude, destination.latitude)


def generate_coords_from_zones(dataframe):
    """Replace the zone IDs of each trip with approximate coordinates.

    The pickup location is the centroid of the pickup zone. The dropoff
    location is the centroid of the dropoff zone, or, when both zones are the
    same, the point `trip_distance` miles from the pickup at bearing 0.
    If the dropoff falls outside NEW_YORK_BOX_COORDS, it is re-generated from
    the pickup point at bearings 90, 180 and 270; a trip for which none of
    these lands inside the box is dropped.

    The frame must contain 'pickup_zoneid', 'dropoff_zoneid' and
    'trip_distance'. It is modified in place: the four coordinate columns are
    added, rejected rows are removed, and the three input columns are dropped.
    The same frame is returned.
    """
    loaded_taxi_zones = load_taxi_zones(TAXI_ZONES_SHAPEFILE)
    southlimit, westlimit = NEW_YORK_BOX_COORDS[0]
    northlimit, eastlimit = NEW_YORK_BOX_COORDS[1]

    def inside_box(coords: tuple) -> bool:
        # coords is (longitude, latitude)
        return (westlimit <= coords[0] <= eastlimit
                and southlimit <= coords[1] <= northlimit)

    coord_columns = ['pickup_longitude', 'pickup_latitude',
                     'dropoff_longitude', 'dropoff_latitude']
    no_coords = (float('nan'),) * len(coord_columns)

    # Results are collected per row and applied after the loop, so the frame
    # is never modified while it is being iterated
    generated = []
    rejected_labels = []
    for index, row in dataframe.iterrows():
        pickup_zoneid = row['pickup_zoneid']
        dropoff_zoneid = row['dropoff_zoneid']
        pickup_coords = lookup_coords_for_taxi_zone_id(pickup_zoneid, loaded_taxi_zones)

        if pickup_zoneid == dropoff_zoneid:
            # Same zone: both centroids coincide, so project the trip
            # distance from the pickup point instead
            dropoff_coords = _destination_lon_lat(pickup_coords, row['trip_distance'], 0)
        else:
            dropoff_coords = lookup_coords_for_taxi_zone_id(dropoff_zoneid, loaded_taxi_zones)

        # Try the remaining bearings until the dropoff lands inside the box
        for bearing in (90, 180, 270):
            if inside_box(dropoff_coords):
                break
            dropoff_coords = _destination_lon_lat(pickup_coords, row['trip_distance'], bearing)

        if inside_box(dropoff_coords):
            generated.append(tuple(pickup_coords) + tuple(dropoff_coords))
        else:
            generated.append(no_coords)
            rejected_labels.append(index)

    # Update the dataframe with the generated coordinates (positional assignment)
    for position, column in enumerate(coord_columns):
        dataframe[column] = pd.Series(
            [coords[position] for coords in generated], dtype=float).to_numpy()
    # Drop the trips for which no bearing produced a dropoff inside the box
    dataframe.drop(index=rejected_labels, inplace=True)
    # Drop the unnecessary columns
    dataframe.drop(['trip_distance', 'pickup_zoneid', 'dropoff_zoneid'], axis=1, inplace=True)

    return dataframe

In [None]:
def clean_taxi_df_2009_to_2010(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Clean one month of yellow taxi data in the 2009-2010 layout.

    The input must have the 18 columns of that layout (with pickup/dropoff
    coordinates); its column names are normalized in place. Invalid trips are
    removed, at most 2500 trips are sampled, and a new frame is returned with
    the columns 'pickup_datetime', 'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude' and 'tip_amount'.
    """
    # Normalize the column names
    dataframe.columns = ['vendor_id', 'pickup_datetime', 'dropoff_datetime', 'passenger_count',
                         'trip_distance', 'pickup_longitude', 'pickup_latitude', 'rate_code',
                         'store_and_fwd_flag', 'dropoff_longitude', 'dropoff_latitude',
                         'payment_type', 'fare_amount', 'surcharge', 'mta_tax', 'tip_amount',
                         'tolls_amount', 'total_amount']

    # Remove the trips outside the required coordinate box
    dataframe = remove_outside_trip(dataframe)
    # Remove the trips with zero passenger count
    dataframe = dataframe[dataframe['passenger_count'] != 0]
    # Remove the trips without a fare
    dataframe = dataframe[dataframe['fare_amount'] != 0]
    # Remove the trips with no distance between pickup and dropoff locations
    dataframe = dataframe[dataframe['trip_distance'] != 0]
    # A trip has no distance only when BOTH coordinates are unchanged; trips
    # that merely share a longitude or a latitude are valid and must be kept
    same_location = ((dataframe['pickup_longitude'] == dataframe['dropoff_longitude'])
                     & (dataframe['pickup_latitude'] == dataframe['dropoff_latitude']))
    dataframe = dataframe[~same_location]

    # Sample the taxi data at an appropriate number (all rows if fewer remain)
    dataframe = dataframe.sample(n=min(2500, len(dataframe)), random_state=1)

    # Choose useful columns for the coming analysis; copy so that the
    # assignment below works on an independent frame rather than a slice
    columns_to_keep = ['pickup_datetime', 'pickup_longitude', 'pickup_latitude',
                       'dropoff_longitude', 'dropoff_latitude', 'tip_amount']
    dataframe = dataframe[columns_to_keep].copy()

    # Transform the pickup datetime column from strings to datetime
    dataframe['pickup_datetime'] = pd.to_datetime(dataframe['pickup_datetime'],
                                                  format='%Y-%m-%d %H:%M:%S')

    return dataframe

In [None]:
def clean_taxi_df_2011_to_2015(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Clean one month of yellow taxi data in the 2011-2015 layout.

    The input must have the 19 columns of that layout (with pickup/dropoff
    zone IDs instead of coordinates); its column names are normalized in
    place. Invalid trips are removed, at most 2500 trips are sampled, and the
    zone IDs are converted to coordinates with generate_coords_from_zones.
    """
    # Normalize the column names
    dataframe.columns = ['vendor_id', 'pickup_datetime', 'dropoff_datetime', 'passenger_count',
                         'trip_distance', 'rate_code', 'store_and_fwd_flag', 'pickup_zoneid',
                         'dropoff_zoneid', 'payment_type', 'fare_amount', 'surcharge', 'mta_tax',
                         'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount',
                         'congestion_surcharge', 'airport_fee']

    # Remove the trips outside the 1-263 zones. A range check (rather than
    # excluding only 264 and 265) also removes missing or otherwise invalid
    # IDs, which would fail the zone lookup later.
    valid_zones = (dataframe['pickup_zoneid'].between(1, 263)
                   & dataframe['dropoff_zoneid'].between(1, 263))
    dataframe = dataframe[valid_zones]
    # Remove the trips with zero passenger count
    dataframe = dataframe[dataframe['passenger_count'] != 0]
    # Remove the trips without a fare
    dataframe = dataframe[dataframe['fare_amount'] != 0]
    # Remove the trips with no distance between pickup and dropoff locations
    dataframe = dataframe[dataframe['trip_distance'] != 0]

    # Sample the taxi data at an appropriate number (all rows if fewer remain)
    dataframe = dataframe.sample(n=min(2500, len(dataframe)), random_state=1)

    # Choose useful columns for the coming analysis; copy because the next
    # step modifies the frame in place
    columns_to_keep = ['pickup_datetime', 'trip_distance',
                       'pickup_zoneid', 'dropoff_zoneid', 'tip_amount']
    dataframe = dataframe[columns_to_keep].copy()

    # Generate the coordinates from zone IDs
    dataframe = generate_coords_from_zones(dataframe)

    return dataframe

In [None]:
def get_and_clean_month(url: str) -> pd.DataFrame:
    """Load (downloading if needed) and clean the parquet file for one month.

    Parameters
    ----------
    url : str
        Download URL of a monthly yellow taxi parquet file. It must contain a
        year-month between 2009-01 and 2015-06.

    Returns
    -------
    pd.DataFrame
        The cleaned trips for that month.

    Raises
    ------
    ValueError
        If no supported year-month can be found in `url`.
    requests.HTTPError
        If the download returns an error status.
    """
    time_pattern = r"(2009-(0[1-9]|1[0-2]))|(201[0-4]-(0[1-9]|1[0-2]))|(2015-(0[1-6]))"
    month_match = re.search(time_pattern, url)
    if month_match is None:
        # Fail with a clear message instead of passing an empty frame on to
        # the cleaning step
        raise ValueError(
            f"No year-month between 2009-01 and 2015-06 found in URL: {url}")
    month = month_match.group(0)
    file_path = f"{TAXI_DIR}/yellow_taxi_{month}.parquet"

    # Check if the parquet file has already been downloaded
    if os.path.exists(file_path):
        print(f"File {file_path} already exists.")
    else:
        # If not, download the file from the given URL
        print(f"File {file_path} does not exist. Downloading...")
        # Download to a temporary name first: an interrupted download must
        # not leave a truncated file that later runs would treat as complete
        partial_path = f"{file_path}.part"
        with requests.get(url, stream=True, timeout=60) as response:
            response.raise_for_status()
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)
        os.replace(partial_path, file_path)
        print(f"File {file_path} downloaded successfully.")

    dataframe = pd.read_parquet(file_path, engine='pyarrow')

    # Clean the dataframe with the routine matching the file's layout
    if month.startswith(("2009", "2010")):
        dataframe_cleaned = clean_taxi_df_2009_to_2010(dataframe)
    else:
        dataframe_cleaned = clean_taxi_df_2011_to_2015(dataframe)

    return dataframe_cleaned

In [None]:
def get_and_clean_taxi_data(parquet_urls: list) -> pd.DataFrame:
    """Build one DataFrame holding the cleaned trips of every given month.

    Each URL is loaded and cleaned with get_and_clean_month, gets its
    'distance' column from add_distance_column, and the monthly frames are
    concatenated in the order of `parquet_urls`.
    """
    monthly_frames = [
        add_distance_column(get_and_clean_month(month_url))
        for month_url in parquet_urls
    ]
    return pd.concat(monthly_frames)

In [None]:
def get_taxi_data() -> pd.DataFrame:
    """Return the cleaned yellow taxi data for the project as one DataFrame.

    Chains the three pipeline steps: collect the links on TAXI_URL, keep the
    relevant parquet URLs, then load, clean and combine the monthly files.
    """
    page_links = get_all_urls_from_taxi_page(TAXI_URL)
    wanted_parquet_urls = filter_taxi_parquet_urls(page_links)
    return get_and_clean_taxi_data(wanted_parquet_urls)

In [None]:
# Run the whole taxi pipeline and keep the result for the later cells
taxi_data = get_taxi_data()

In [None]:
# Preview the first rows of the taxi data
taxi_data.head()

### Processing Uber Data

In [None]:
def load_and_clean_uber_data(csv_file: str) -> pd.DataFrame:
    """Load the Uber CSV, clean it, and return it as a DataFrame.

    Only the pickup datetime and the pickup/dropoff coordinates are read.
    The datetime is parsed and converted to timezone-naive UTC, trips outside
    the bounding box are removed, and trips whose pickup and dropoff
    coordinates are identical are removed. An independent copy is returned.
    """
    columns_to_keep = ['pickup_datetime', 'pickup_longitude', 'pickup_latitude',
                       'dropoff_longitude', 'dropoff_latitude']
    dataframe = pd.read_csv(csv_file, usecols=columns_to_keep)
    # Transform the pickup datetime column from strings to datetime
    dataframe['pickup_datetime'] = pd.to_datetime(dataframe['pickup_datetime'],
                                                  format='%Y-%m-%d %H:%M:%S %Z')
    dataframe['pickup_datetime'] = dataframe['pickup_datetime'].dt.tz_convert(None)

    # Remove the trips outside the defined coordinate box
    dataframe = remove_outside_trip(dataframe)
    # Remove the trips with no distance between pickup and dropoff locations.
    # A trip has no distance only when BOTH coordinates are unchanged; trips
    # that merely share a longitude or a latitude are valid and must be kept.
    same_location = ((dataframe['pickup_longitude'] == dataframe['dropoff_longitude'])
                     & (dataframe['pickup_latitude'] == dataframe['dropoff_latitude']))

    # Copy so that callers can add columns without modifying a slice
    return dataframe[~same_location].copy()

In [None]:
def get_uber_data() -> pd.DataFrame:
    """Load the Uber sample, add the trip distance, and drop incomplete rows."""
    # add_distance_column adds 'distance' to the frame it is given and
    # returns that same frame
    uber_dataframe = add_distance_column(load_and_clean_uber_data(UBER_CSV))
    # Discard every row that still has a missing value
    uber_dataframe.dropna(axis=0, inplace=True)
    return uber_dataframe

In [None]:
# Load and clean the Uber sample and keep the result for the later cells
uber_data = get_uber_data()

In [None]:
# Preview the first rows of the Uber data
uber_data.head()

### Processing Weather Data

In [None]:
def get_all_weather_csvs(directory):
    """Placeholder - not implemented yet.

    NOTE(review): load_and_clean_weather_data iterates over the result and
    passes each item on as a CSV file, so this presumably should return the
    weather CSV files found in `directory` - confirm when implementing.
    """
    raise NotImplementedError

In [None]:
def clean_month_weather_data_hourly(csv_file):
    """Placeholder - not implemented yet.

    NOTE(review): the caller concatenates the results with pd.concat, so this
    presumably should return a DataFrame of hourly weather data read from
    `csv_file` - confirm when implementing.
    """
    raise NotImplementedError

In [None]:
def clean_month_weather_data_daily(csv_file):
    """Placeholder - not implemented yet.

    NOTE(review): the caller concatenates the results with pd.concat, so this
    presumably should return a DataFrame of daily weather data read from
    `csv_file` - confirm when implementing.
    """
    raise NotImplementedError

In [None]:
def load_and_clean_weather_data():
    """Return (hourly_data, daily_data) combined over all weather CSV files.

    Every file reported by get_all_weather_csvs(WEATHER_CSV_DIR) is cleaned
    once with the hourly and once with the daily routine; the per-file
    results are concatenated into one hourly and one daily DataFrame.
    """
    hourly_frames, daily_frames = [], []

    for weather_csv in get_all_weather_csvs(WEATHER_CSV_DIR):
        hourly_frames.append(clean_month_weather_data_hourly(weather_csv))
        daily_frames.append(clean_month_weather_data_daily(weather_csv))

    # One frame per granularity, covering every file
    return pd.concat(hourly_frames), pd.concat(daily_frames)

In [None]:
# Load the weather data once; the function returns the hourly frame first
# and the daily frame second
hourly_weather_data, daily_weather_data = load_and_clean_weather_data()

In [None]:
# Preview the first rows of the hourly weather data
hourly_weather_data.head()

In [None]:
# Preview the first rows of the daily weather data
daily_weather_data.head()

## Part 2: Storing Cleaned Data

In [None]:
# SQLAlchemy engine for the database named by DATABASE_URL
engine = db.create_engine(DATABASE_URL)

In [None]:
# if using SQL (as opposed to SQLAlchemy), define the commands 
# to create your 4 tables/dataframes
# NOTE(review): all four schemas are still TODO placeholders; they are written
# verbatim to DATABASE_SCHEMA_FILE, so replace them with real statements first
HOURLY_WEATHER_SCHEMA = """
TODO
"""

DAILY_WEATHER_SCHEMA = """
TODO
"""

TAXI_TRIPS_SCHEMA = """
TODO
"""

UBER_TRIPS_SCHEMA = """
TODO
"""

In [None]:
# Write the four table definitions, in order, to the required schema.sql file
with open(DATABASE_SCHEMA_FILE, "w") as schema_file:
    for table_schema in (HOURLY_WEATHER_SCHEMA, DAILY_WEATHER_SCHEMA,
                         TAXI_TRIPS_SCHEMA, UBER_TRIPS_SCHEMA):
        schema_file.write(table_schema)

In [None]:
# create the tables with the schema files
# NOTE(review): placeholder - the connection is opened but nothing is
# executed yet, so no tables are created by this cell
with engine.connect() as connection:
    pass

### Add Data to Database

In [None]:
def write_dataframes_to_table(table_to_df_dict):
    """Placeholder - not implemented yet.

    NOTE(review): the caller passes a dict mapping table names to DataFrames,
    so this presumably should write each DataFrame to the table of that
    name - confirm when implementing.

    Raises
    ------
    NotImplementedError
        Always, until the function is implemented.
    """
    # NotImplemented is a comparison sentinel, not an exception; calling it
    # raises a confusing TypeError. NotImplementedError is the exception.
    raise NotImplementedError()

In [None]:
# Table name -> DataFrame to store in it.
# The weather frames are named hourly_weather_data / daily_weather_data where
# they are created; hourly_data / daily_data only exist as local variables
# inside load_and_clean_weather_data and would raise NameError here.
map_table_name_to_dataframe = {
    "taxi_trips": taxi_data,
    "uber_trips": uber_data,
    "hourly_weather": hourly_weather_data,
    "daily_weather": daily_weather_data,
}

In [None]:
# Store every DataFrame of the mapping in its table
write_dataframes_to_table(map_table_name_to_dataframe)

## Part 3: Understanding the Data

In [None]:
# Helper function to write the queries to file
def write_query_to_file(query, outfile):
    """Placeholder - not implemented yet.

    NOTE(review): presumably should save the text of `query` to the file
    `outfile` - confirm when implementing.
    """
    raise NotImplementedError

### Query 1

In [None]:
# NOTE(review): both values are placeholders - set the output file name and
# replace TODO with the actual SQL before running the cells below
QUERY_1_FILENAME = ""

QUERY_1 = """
TODO
"""

In [None]:
# Run the query on an explicit connection: Engine.execute() and raw SQL
# strings are no longer supported in SQLAlchemy 2.0, while this form works on
# both 1.4 and 2.0. The connection is closed when the block ends.
with engine.connect() as connection:
    query_1_results = connection.execute(db.text(QUERY_1)).fetchall()
query_1_results

In [None]:
# Save the text of query 1 under its file name
write_query_to_file(QUERY_1, QUERY_1_FILENAME)

## Part 4: Visualizing the Data

### Visualization 1

In [None]:
# use a more descriptive name for your function
def plot_visual_1(dataframe):
    """Template for one visualization: create a figure, plot, title, show.

    NOTE(review): scaffold only - `values` and the "..." argument passed to
    axes.plot are placeholders, and `dataframe` is not used yet.
    """
    # A single axes on a 20 x 10 inch figure
    figure, axes = plt.subplots(figsize=(20, 10))
    
    values = "..."  # use the dataframe to pull out values needed to plot
    
    # you may want to use matplotlib to plot your visualizations;
    # there are also many other plot types (other 
    # than axes.plot) you can use
    axes.plot(values, "...")
    # there are other methods to use to label your axes, to style 
    # and set up axes labels, etc
    axes.set_title("Some Descriptive Title")
    
    plt.show()

In [None]:
def get_data_for_visual_1():
    """Placeholder - not implemented yet.

    Intended to query the SQL database for the data the visualization needs;
    the result may be put into a pandas DataFrame.
    """
    raise NotImplementedError

In [None]:
# Fetch the data for the first visualization and plot it
some_dataframe = get_data_for_visual_1()
plot_visual_1(some_dataframe)