In [1]:
from pyspark.sql import SparkSession
from urllib.request import urlretrieve
def _create_spark_session():
    """Build (or reuse) the Spark session that runs every Spark job in this notebook."""
    session_options = (
        ("spark.sql.repl.eagerEval.enabled", True),
        ("spark.executor.memory", "2G"),
        ("spark.driver.memory", "4G"),
        ("spark.sql.parquet.cacheMetadata", "true"),
        ("spark.sql.session.timeZone", "Etc/UTC"),
    )
    builder = SparkSession.builder.appName("ADS project 1")
    # Apply the options one by one, in the order listed above.
    for option_key, option_value in session_options:
        builder = builder.config(option_key, option_value)
    return builder.getOrCreate()


# Session handle used by the rest of the notebook.
spark = _create_spark_session()

22/08/22 01:44:27 WARN Utils: Your hostname, Luo resolves to a loopback address: 127.0.1.1; using 172.17.1.121 instead (on interface eth0)
22/08/22 01:44:27 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/22 01:44:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/08/22 01:44:29 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
# Set the session's Parquet compression codec option to gzip.
spark.conf.set("spark.sql.parquet.compression.codec", "gzip")

In [3]:
import os

from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sbs
import geopandas as gpd
import folium

In [4]:
# Years and months of TLC data this notebook is configured for.
YEARS = ['2021']
MONTHS = range(1, 13)  # 1 through 12 inclusive

# Root folder of the curated TLC data, relative to this notebook.
# (Plain string: the original f-string had no placeholders.)
path = "../data/curated/tlc_data/"

# Column names of the external dataset (presumably weather data — verify against its source).
# NOTE(review): 'Month,Date' is a single column name containing a comma; confirm this
# matches the dataset header and is not meant to be two columns ('Month', 'Date').
external_cols = ['Month,Date', 'Temperature (F)', 'Wind Speed (mph)', 'is_rainy']

In [5]:
# Cited from ADS tutorial 2
def create_geo():
    """Load the taxi-zone shapes and lookup table and prepare them for mapping.

    Reads the taxi-zone shapefile and the zone lookup CSV from
    ``../data/raw/taxi_zones``, converts the zone geometries to WGS84
    longitude/latitude, and inner-joins the two tables on ``LocationID``.

    Returns:
        tuple: ``(gdf, geoJSON)`` where ``gdf`` is the merged GeoDataFrame and
        ``geoJSON`` is the JSON export of its ``LocationID`` and ``geometry``
        columns, keeping one row per ``LocationID``.
    """
    zone_shapes = gpd.read_file("../data/raw/taxi_zones/taxi_zones.shp")
    zone_lookup = pd.read_csv("../data/raw/taxi_zones/taxi+_zone_lookup.csv")

    # Re-project the zone polygons to longitude/latitude coordinates.
    lonlat_crs = "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
    zone_shapes['geometry'] = zone_shapes['geometry'].to_crs(lonlat_crs)

    merged = pd.merge(zone_lookup, zone_shapes, on='LocationID', how='inner')
    gdf = gpd.GeoDataFrame(merged)

    # The JSON export carries a single feature per LocationID.
    unique_zones = gdf[['LocationID', 'geometry']].drop_duplicates('LocationID')
    geoJSON = unique_zones.to_json()

    return gdf, geoJSON


In [6]:
# create the proportion that are going to be displayed on the map
def create_proportion(sdf, PuDo, gdf):
    """Compute the average tip per location and attach each location's geometry.

    Args:
        sdf: Spark DataFrame containing ``tip_amount`` and the
            ``{PuDo}LocationID`` column.
        PuDo: Column prefix selecting the trip end; the notebook passes
            ``'PU'`` (pick-up) or ``'DO'`` (drop-off).
        gdf: Zone table providing ``LocationID`` and ``geometry`` columns.

    Returns:
        pandas DataFrame with ``{PuDo}LocationID``, ``count({PuDo}LocationID)``,
        ``avg_tip_amount`` and ``geometry``. The merge uses pandas' default
        join, so only locations present in both tables are kept.
    """
    location_col = f'{PuDo}LocationID'
    tip_sum_col = 'sum(tip_amount)'
    trip_count_col = f'count({PuDo}LocationID)'

    per_location = (
        sdf[[location_col, 'tip_amount']]
        .groupby(location_col)
        .agg(
            {
                'tip_amount': 'sum',      # total tips collected at this location
                location_col: 'count',    # number of trips at this location
            }
        )
        # average tip = total tips / number of trips
        .withColumn('avg_tip_amount', F.col(tip_sum_col) / F.col(trip_count_col))
    )

    # Collect to pandas and link every LocationID with its zone geometry.
    zone_geometry = gdf[['LocationID', 'geometry']]
    merged = per_location.toPandas().merge(
        zone_geometry, left_on=location_col, right_on='LocationID'
    )

    # The duplicated join key and the raw tip total are not needed downstream.
    return merged.drop(columns=['LocationID', 'sum(tip_amount)'])

In [7]:
# Plot the map
# Cited from ADS Tutorial 2
def plot_map(df, geoJson, PuDo, tiles="OpenStreetMap"):
    """Draw a choropleth of the average tip amount per taxi zone.

    Args:
        df: Table with a ``{PuDo}LocationID`` column and an ``avg_tip_amount``
            column.
        geoJson: GeoJSON whose features carry a ``LocationID`` property.
        PuDo: Column prefix selecting the trip end; the notebook passes
            ``'PU'`` or ``'DO'``.
        tiles: Base-map tileset handed to ``folium.Map``. Defaults to
            ``"OpenStreetMap"``; the previously hard-coded "Stamen Terrain"
            is no longer a built-in tileset in recent folium releases and
            makes ``folium.Map`` raise there.

    Returns:
        folium.Map: the map with the choropleth layer added.
    """
    # initialise the map
    m = folium.Map(location=[40.73, -73.74], tiles=tiles, zoom_start=10)

    choropleth = folium.Choropleth(
        geo_data=geoJson,  # zone shapes
        name='choropleth',  # layer name
        data=df,  # values to colour by
        columns=[f'{PuDo}LocationID', 'avg_tip_amount'],  # [key column, value column]
        key_on='properties.LocationID',  # property in the GeoJSON matched against the key column
        fill_color='PuRd',  # colour scheme
        fill_opacity=0.7,
        line_opacity=0,
        nan_fill_color='black',  # zones without a value are drawn in black
        legend_name='Average Tip Earnings USD$'
    )
    choropleth.add_to(m)

    return m

In [8]:
# Map the average tip amount per pick-up and per drop-off location in 2021.
sdf = spark.read.parquet(f'{path}2021/final_data/*/')
gdf, geoJSON = create_geo()

# One aggregated table per trip end, then one choropleth for each table.
df_pu, df_do = (create_proportion(sdf, trip_end, gdf) for trip_end in ('PU', 'DO'))
m_pu, m_do = (
    plot_map(zone_tips, geoJSON, trip_end)
    for zone_tips, trip_end in ((df_pu, 'PU'), (df_do, 'DO'))
)

                                                                                

In [9]:
# Create the output folder if it is missing so the save also works on a fresh checkout.
os.makedirs('../plots', exist_ok=True)
m_pu.save('../plots/Pick Up Map.html')

In [10]:
# Create the output folder if it is missing so the save also works on a fresh checkout.
os.makedirs('../plots', exist_ok=True)
m_do.save('../plots/Drop Off Map.html')