## Feature Selection

In [1]:
# Import
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
from pyspark.ml.linalg import Vectors

In [2]:
# Paths
# All datasets live on the same HDFS namenode, so its address is declared once
# and every location below is derived from it; pointing the notebook at another
# cluster only requires changing HDFS_ROOT.
HDFS_ROOT = "hdfs://10.128.0.59:8020"
RAW_DATA_DIR = HDFS_ROOT + "/raw_data"
WAREHOUSE_DIR = HDFS_ROOT + "/data_warehouse"

# Raw lookup file
zone_lookup = RAW_DATA_DIR + "/updated_zone_lookup.csv"

# Data-warehouse locations
fact_trip = WAREHOUSE_DIR + "/fact_trip"
dim_vendor = WAREHOUSE_DIR + "/dim_vendor"
dim_datetime = WAREHOUSE_DIR + "/dim_datetime"
dim_rate_code = WAREHOUSE_DIR + "/dim_rate_code"
dim_pickup_location = WAREHOUSE_DIR + "/dim_pickup_location"
dim_dropoff_location = WAREHOUSE_DIR + "/dim_dropoff_location"
dim_payment = WAREHOUSE_DIR + "/dim_payment"

In [3]:
# Obtain the SparkSession for this notebook; getOrCreate() hands back an
# already-running session when one exists instead of building a new one.
spark = (
    SparkSession.builder
    .appName("Feature Selection")
    .getOrCreate()
)

24/12/29 04:30:30 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [4]:
# Trip fact table, read as parquet
df_fact = spark.read.parquet(fact_trip)

# Datetime dimension, trimmed to its key plus the hour / weekday columns
datetime_columns = [
    "datetime_id",
    "pick_hour",
    "pick_weekday_id",
    "drop_hour",
    "drop_weekday_id",
]
df_datetime = spark.read.parquet(dim_datetime).select(*datetime_columns)

# Inner-join the datetime columns onto the trips, then discard both join keys
# since they are no longer needed once the join is done.
join_condition = df_fact["datetimestamp_id"] == df_datetime["datetime_id"]
df_joined = (
    df_fact
    .join(df_datetime, on=join_condition, how="inner")
    .drop("datetimestamp_id", "datetime_id")
)

df_joined.printSchema()

                                                                                

root
 |-- trip_id: long (nullable = true)
 |-- vendor_id: long (nullable = true)
 |-- pu_location_id: long (nullable = true)
 |-- do_location_id: long (nullable = true)
 |-- ratecode_id: long (nullable = true)
 |-- payment_id: long (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- pick_hour: double (nullable = true)
 |-- pick_weekday_id: integer (nullable = true)
 |-- drop_hour: double (nullable = true)
 |-- drop_weekday_id: integer (nullable = true)



In [5]:
# Candidate columns, grouped by theme. The concatenation order below is the
# order in which the values appear inside the assembled "features" vector.
id_columns = [
    "vendor_id",
    "pu_location_id",
    "do_location_id",
    "ratecode_id",
    "payment_id",
]
trip_columns = ["passenger_count", "trip_distance"]
amount_columns = [
    "fare_amount",
    "extra",
    "mta_tax",
    "tip_amount",
    "tolls_amount",
]
time_columns = [
    "pick_hour",
    "pick_weekday_id",
    "drop_hour",
    "drop_weekday_id",
]
selected_columns = id_columns + trip_columns + amount_columns + time_columns

# Pack the candidate columns into a single vector column named "features"
assembler = VectorAssembler(inputCols=selected_columns, outputCol="features")
data_transformed = assembler.transform(df_joined)

In [6]:
# Calculate Correlation Matrix
# Pairwise correlation of the assembled "features" vector (Correlation.corr
# uses Pearson unless another method is requested). Row/column i corresponds
# to selected_columns[i]. head()[0] pulls the matrix out of the single-row
# result DataFrame.
# NOTE(review): total_amount is not one of the assembled columns, so this
# matrix contains no correlation against it -- confirm whether the target
# should be added to the analysis.
correlation_matrix = Correlation.corr(data_transformed, "features").head()[0]

                                                                                

In [7]:
# Print the matrix under a heading line
matrix_heading = "Correlation Matrix:\n"
print(matrix_heading, correlation_matrix)

Correlation Matrix:
 DenseMatrix([[ 1.00000000e+00, -1.12586140e-02, -5.43222565e-03,
              -2.75311440e-02, -3.90845551e-02,  2.19539365e-01,
              -6.83620222e-04,  2.35051692e-03, -2.41183647e-01,
              -2.56260811e-02,  5.44782393e-04,  1.74531049e-02,
               1.01276146e-02,  9.96225826e-04,  1.10144255e-02,
               1.17125830e-03],
             [-1.12586140e-02,  1.00000000e+00,  9.73711081e-02,
              -1.78964300e-02, -1.22525009e-02, -8.48152757e-03,
              -1.24612028e-04, -6.92467668e-03,  4.13715730e-03,
               3.64392619e-03, -2.56773549e-04, -4.14928555e-02,
               7.65160508e-04,  1.18298441e-04,  8.57771155e-04,
               1.12474329e-04],
             [-5.43222565e-03,  9.73711081e-02,  1.00000000e+00,
              -1.30290551e-02, -1.35494608e-02, -6.74468988e-03,
              -1.12744970e-04, -7.38824213e-03,  5.35659787e-03,
               2.59881299e-02, -4.92959767e-04, -4.37570258e-02,
     

In [8]:
# Stop the Spark session now that the analysis is finished.
# NOTE(review): the startup log reports an existing session was reused --
# confirm nothing else still depends on that session before stopping it.
spark.stop()

### Result Features

- 