# K-Means Clustering of Pickup and Dropoff Locations

In [1]:
# Import
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.ml.clustering import KMeans

In [2]:
# Obtain the Spark session for this notebook; getOrCreate() reuses an
# already-running session when one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("Pickup Location Clustering")
    .getOrCreate()
)

# Report the Spark version the notebook is running against.
print("Spark Version: {}".format(spark.version))

Spark Version: 3.5.1


24/12/27 01:55:20 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
# Path lists
# The HDFS namenode address and the warehouse root are defined once so that
# relocating the cluster only requires editing a single line.
HDFS_NAMENODE = "hdfs://10.128.0.59:8020"
WAREHOUSE_ROOT = HDFS_NAMENODE + "/data_warehouse"

# Zone lookup file kept under the raw data directory.
zone_lookup = HDFS_NAMENODE + "/raw_data/updated_zone_lookup.csv"

# Fact and dimension tables stored under the warehouse root.
fact_trip = WAREHOUSE_ROOT + "/fact_trip"
dim_vendor = WAREHOUSE_ROOT + "/dim_vendor"
dim_datetime = WAREHOUSE_ROOT + "/dim_datetime"
dim_rate_code = WAREHOUSE_ROOT + "/dim_rate_code"
dim_pickup_location = WAREHOUSE_ROOT + "/dim_pickup_location"
dim_dropoff_location = WAREHOUSE_ROOT + "/dim_dropoff_location"
dim_payment = WAREHOUSE_ROOT + "/dim_payment"

# Result
# BigQuery destination tables for the clustering output; both share one
# project/dataset prefix.
BQ_RESULT_DATASET = "uber-analysis-439804.query_result"
pickup_kmean = BQ_RESULT_DATASET + ".kmean_pickup_location"
dropoff_kmean = BQ_RESULT_DATASET + ".kmean_dropoff_location"

In [4]:
# Load the pickup location dimension (parquet) and rename its generic X / Y
# columns so they stay distinguishable from the dropoff coordinates.
df_pickup_location = (
    spark.read
    .format("parquet")
    .option("path", dim_pickup_location)
    .load()
    .withColumnRenamed("X", "pickup_x")
    .withColumnRenamed("Y", "pickup_y")
)

# Same treatment for the dropoff location dimension.
df_dropoff_location = (
    spark.read
    .format("parquet")
    .option("path", dim_dropoff_location)
    .load()
    .withColumnRenamed("X", "dropoff_x")
    .withColumnRenamed("Y", "dropoff_y")
)

# Uncomment to inspect the loaded schemas.
# df_pickup_location.printSchema()
# df_dropoff_location.printSchema()

                                                                                

In [5]:
# Feature assembly: pack the two coordinate columns of each table into a
# single vector column named "features". No scaling is applied here.

# Pickup table.
vector_assembler_pu = VectorAssembler(inputCols=["pickup_x", "pickup_y"], outputCol="features")
df_pickup_location = vector_assembler_pu.transform(df_pickup_location)

# Dropoff table.
vector_assembler_do = VectorAssembler(inputCols=["dropoff_x", "dropoff_y"], outputCol="features")
df_dropoff_location = vector_assembler_do.transform(df_dropoff_location)

In [6]:
# Standardise the assembled vectors: withMean centres each component and
# withStd scales it by its standard deviation. Output goes to "scaledFeatures".
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withMean=True, withStd=True)

# The one estimator is fitted separately per table, so pickup and dropoff
# coordinates are each scaled with their own statistics.
scaler_model_pu = scaler.fit(df_pickup_location)
df_pickup_location = scaler_model_pu.transform(df_pickup_location)

scaler_model_do = scaler.fit(df_dropoff_location)
df_dropoff_location = scaler_model_do.transform(df_dropoff_location)

                                                                                

In [7]:
# K-Means clustering
# Five clusters on the standardised coordinates; the fixed seed keeps the
# initialisation repeatable. Predictions land in the "cluster" column.
kmeans = KMeans(k=5, seed=42, featuresCol="scaledFeatures", predictionCol="cluster")

# Pickup: fit, assign clusters, then keep only the descriptive columns plus
# the cluster label (the intermediate vector columns are dropped).
model_pu = kmeans.fit(df_pickup_location)
df_pickup_location = model_pu.transform(df_pickup_location).select(
    "PULocationID", "pickup_x", "pickup_y",
    "zone", "borough", "service_zone",
    "cluster",
)

# Dropoff: an independent model fitted on the dropoff table.
model_do = kmeans.fit(df_dropoff_location)
df_dropoff_location = model_do.transform(df_dropoff_location).select(
    "DOLocationID", "dropoff_x", "dropoff_y",
    "zone", "borough", "service_zone",
    "cluster",
)

                                                                                

In [8]:
# Write to BigQuery
# Each clustered table is saved to its own destination table in overwrite
# mode, using the configured temporary GCS bucket option for the write.
(
    df_pickup_location.write
    .format("bigquery")
    .option("table", pickup_kmean)
    .option("temporaryGcsBucket", "uber-pyspark-jobs/temp")
    .mode("overwrite")
    .save()
)

(
    df_dropoff_location.write
    .format("bigquery")
    .option("table", dropoff_kmean)
    .option("temporaryGcsBucket", "uber-pyspark-jobs/temp")
    .mode("overwrite")
    .save()
)

                                                                                

In [9]:
# Preview the first five rows of each clustered table.
df_pickup_location.show(n=5)
df_dropoff_location.show(n=5)

+------------+------------+-----------+--------------------+---------+------------+-------+
|PULocationID|    pickup_x|   pickup_y|                zone|  borough|service_zone|cluster|
+------------+------------+-----------+--------------------+---------+------------+-------+
|          65|-73.98557106|40.69537261|Downtown Brooklyn...| Brooklyn|   Boro Zone|      3|
|         243|-73.93282432|40.85867029|Washington Height...|Manhattan|   Boro Zone|      1|
|          77|-73.89571716|40.66770187|East New York/Pen...| Brooklyn|   Boro Zone|      0|
|         188|-73.94520016|40.65756006|Prospect-Lefferts...| Brooklyn|   Boro Zone|      0|
|         149|-73.94847421|40.60655799|             Madison| Brooklyn|   Boro Zone|      0|
+------------+------------+-----------+--------------------+---------+------------+-------+
only showing top 5 rows

+------------+------------+-----------+--------------------+---------+------------+-------+
|DOLocationID|   dropoff_x|  dropoff_y|                

In [10]:
# Shut down the Spark session now that the results have been written and previewed.
spark.stop()