In [0]:
display(dbutils.fs.ls("/dbfs/FileStore/tables/"))

path,name,size,modificationTime
dbfs:/dbfs/FileStore/tables/Data_Processing_Pulic_Holiday/,Data_Processing_Pulic_Holiday/,0,1732626342968
dbfs:/dbfs/FileStore/tables/Eda_univariate_taxi_data/,Eda_univariate_taxi_data/,0,1732626342968
dbfs:/dbfs/FileStore/tables/cleaned_nyc_taxi_fare/,cleaned_nyc_taxi_fare/,0,1732626342968
dbfs:/dbfs/FileStore/tables/data_processed_lat_long/,data_processed_lat_long/,0,1732626342968
dbfs:/dbfs/FileStore/tables/data_processed_taxi_zones/,data_processed_taxi_zones/,0,1732626342968
dbfs:/dbfs/FileStore/tables/data_processed_trip_data/,data_processed_trip_data/,0,1732626342968


In [0]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("OptimizedRF") \
    .config("spark.driver.memory", "16g") \
    .config("spark.executor.memory", "16g") \
    .config("spark.executor.cores", "4") \
    .config("spark.sql.shuffle.partitions", "200") \
    .config("spark.driver.maxResultSize", "4g") \
    .getOrCreate()

In [0]:
# Load Delta table with PySpark
taxi_df = spark.read.format("delta").load("dbfs:/dbfs/FileStore/tables/cleaned_nyc_taxi_fare/")

In [0]:
from pyspark.sql.functions import col
# Summary Statistics for 'trip_distance'
taxi_df.describe("trip_distance").show()

+-------+------------------+
|summary|     trip_distance|
+-------+------------------+
|  count|          36395607|
|   mean|3.4915285498000985|
| stddev| 54.90094331972668|
|    min|               0.0|
|    max|          161726.1|
+-------+------------------+



In [0]:
import matplotlib.pyplot as plt
import seaborn as sns

# Convert the 'trip_distance' column to a Pandas DataFrame or Series
trip_distance_df = taxi_df.select("trip_distance").toPandas()["trip_distance"]

# Assuming 'trip_distance' column has been converted to Pandas Series for plotting
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
sns.histplot(trip_distance_df, bins=50, kde=True)
plt.title("Distribution of Trip Distance")

plt.subplot(1, 2, 2)
sns.boxplot(x=trip_distance_df)
plt.title("Box Plot of Trip Distance")
plt.show()

In [0]:
from pyspark.sql.functions import col

# Filter out rows where trip_duration is less than or equal to zero
taxi_df = taxi_df.filter(col("trip_distance") > 0)

In [0]:
from pyspark.sql import functions as F

# Calculate Q1 and Q3
q1, q3 = taxi_df.approxQuantile("trip_distance", [0.25, 0.75], 0.05)
iqr = q3 - q1

# Define lower and upper bounds
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

In [0]:
# Filter for outliers based on updated bounds
outliers = taxi_df.filter((col("trip_distance") < lower_bound) | (col("trip_distance") > upper_bound))
outliers.select("trip_distance").describe().show()

+-------+------------------+
|summary|     trip_distance|
+-------+------------------+
|  count|           5117008|
|   mean|13.453778076241504|
| stddev|145.99119718177963|
|    min|              6.21|
|    max|          161726.1|
+-------+------------------+



In [0]:
from pyspark.sql.functions import when

# Cap trip_distance values at the upper bound
taxi_df = taxi_df.withColumn("trip_distance", when(col("trip_distance") > upper_bound, upper_bound).otherwise(col("trip_distance")))

In [0]:
# Summary Statistics for 'trip_duration'
taxi_df.describe("trip_distance").show()

+-------+--------------------+
|summary|       trip_distance|
+-------+--------------------+
|  count|            35979605|
|   mean|   2.500268124273792|
| stddev|  1.8919592804162233|
|    min|0.009999999776482582|
|    max|   6.199999809265137|
+-------+--------------------+



In [0]:
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql import functions as F

# Calculate skewness and kurtosis for the trip_distance column
trip_distance_stats = taxi_df.select(
    F.skewness("trip_distance").alias("skewness"),
    F.kurtosis("trip_distance").alias("kurtosis")
)

# Display the results
trip_distance_stats.show()

+------------------+-------------------+
|          skewness|           kurtosis|
+------------------+-------------------+
|0.9742337813233402|-0.4076524384780611|
+------------------+-------------------+



In [0]:
import matplotlib.pyplot as plt
import seaborn as sns

# Convert the 'trip_distance' column to a Pandas DataFrame or Series
trip_distance_df = taxi_df.select("trip_distance").toPandas()["trip_distance"]

# Assuming 'trip_distance' column has been converted to Pandas Series for plotting
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
sns.histplot(trip_distance_df, bins=50, kde=True)
plt.title("Distribution of Trip Distance")

plt.subplot(1, 2, 2)
sns.boxplot(x=trip_distance_df)
plt.title("Box Plot of Trip Distance")
plt.show()

In [0]:
from pyspark.sql.functions import col
# Summary Statistics for 'trip_duration'
taxi_df.describe("trip_duration").show()

+-------+------------------+
|summary|     trip_duration|
+-------+------------------+
|  count|          35979605|
|   mean|17.375825997669924|
| stddev| 41.14495583559163|
|    min|           -1177.0|
|    max|         7053.6167|
+-------+------------------+



In [0]:
import matplotlib.pyplot as plt
import seaborn as sns

# Convert the 'trip_duration' column to a Pandas DataFrame or Series
trip_duration_df = taxi_df.select("trip_duration").toPandas()["trip_duration"]

# Assuming 'trip_duration' column has been converted to Pandas Series for plotting
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
sns.histplot(trip_duration_df, bins=50, kde=True)
plt.title("Distribution of Trip Duration")

plt.subplot(1, 2, 2)
sns.boxplot(x=trip_duration_df)
plt.title("Box Plot of Trip Duration")
plt.show()

In [0]:
from pyspark.sql.functions import col

# Filter out rows where trip_duration is less than or equal to zero
taxi_df = taxi_df.filter(col("trip_duration") > 0)

In [0]:
from pyspark.sql import functions as F

# Calculate Q1 and Q3
q1, q3 = taxi_df.approxQuantile("trip_duration", [0.25, 0.75], 0.05)
iqr = q3 - q1

# Define lower and upper bounds
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

In [0]:
# Filter for outliers based on updated bounds
outliers = taxi_df.filter((col("trip_duration") < lower_bound) | (col("trip_duration") > upper_bound))
outliers.select("trip_duration").describe().show()

+-------+------------------+
|summary|     trip_duration|
+-------+------------------+
|  count|           2495853|
|   mean| 68.02173419564333|
| stddev|144.11624241816043|
|    min|         37.633335|
|    max|         7053.6167|
+-------+------------------+



In [0]:
from pyspark.sql.functions import when

# Cap trip_duration values at the upper bound
taxi_df = taxi_df.withColumn("trip_duration", when(col("trip_duration") > upper_bound, upper_bound).otherwise(col("trip_duration")))

In [0]:
# Summary Statistics for 'trip_duration'
taxi_df.describe("trip_duration").show()

+-------+-------------------+
|summary|      trip_duration|
+-------+-------------------+
|  count|           35978259|
|   mean| 15.268764828400286|
| stddev|  9.904807674270454|
|    min|0.01666666753590107|
|    max|  37.62500190734863|
+-------+-------------------+



In [0]:
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql import functions as F

# Calculate skewness and kurtosis for the trip_duration column
fare_stats = taxi_df.select(
    F.skewness("trip_duration").alias("skewness"),
    F.kurtosis("trip_duration").alias("kurtosis")
)

# Display the results
fare_stats.show()

+------------------+-------------------+
|          skewness|           kurtosis|
+------------------+-------------------+
|0.9097695585248387|-0.1044367628372842|
+------------------+-------------------+



In [0]:
import matplotlib.pyplot as plt
import seaborn as sns

# Convert the 'trip_duration' column to a Pandas DataFrame or Series
trip_duration_df = taxi_df.select("trip_duration").toPandas()["trip_duration"]

# Assuming 'trip_duration' column has been converted to Pandas Series for plotting
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
sns.histplot(trip_duration_df, bins=50, kde=True)
plt.title("Distribution of Trip Duration")

plt.subplot(1, 2, 2)
sns.boxplot(x=trip_duration_df )
plt.title("Box Plot of Trip Duration")
plt.show()

In [0]:
from pyspark.sql.functions import col
import matplotlib.pyplot as plt

# Define city border coordinates
city_long_border = (-74.1,73.8)
city_lat_border = (40.5, 40.9)

# Filter data for valid latitude and longitude values
taxi_df = taxi_df.filter(
    (col("pickup_longitude") >= city_long_border[0]) & 
    (col("pickup_longitude") <= city_long_border[1]) &
    (col("pickup_latitude") >= city_lat_border[0]) & 
    (col("pickup_latitude") <= city_lat_border[1]) &
    (col("dropoff_longitude") >= city_long_border[0]) & 
    (col("dropoff_longitude") <= city_long_border[1]) &
    (col("dropoff_latitude") >= city_lat_border[0]) & 
    (col("dropoff_latitude") <= city_lat_border[1]))

In [0]:
# import folium
# from folium.plugins import MarkerCluster

# # Initialize the map centered on New York City
# m = folium.Map(location=[40.7128, -74.0060], zoom_start=12)

# # Add a MarkerCluster layer
# marker_cluster = MarkerCluster().add_to(m)

# # Take a 1% random sample of the data and select only the required columns
# sampled_taxi_df = taxi_df.sample(fraction=0.01).select("pickup_latitude", "pickup_longitude", "dropoff_latitude", "dropoff_longitude").collect()

# # Add markers for each pickup and dropoff location in the sampled data
# for row in sampled_taxi_df:
#     # Add a marker for the pickup location
#     folium.Marker([row['pickup_latitude'], row['pickup_longitude']], 
#                   popup="Pickup Location").add_to(marker_cluster)
    
#     # Add a marker for the dropoff location
#     folium.Marker([row['dropoff_latitude'], row['dropoff_longitude']], 
#                   popup="Dropoff Location").add_to(marker_cluster)

# # Display the map
# m


In [0]:
display(taxi_df)

In [0]:
# Function to display value counts for a specific column using display()
def display_value_counts(dataframe, column):
    print(f"Value counts for {column}:")
    value_counts = dataframe.groupBy(column).count().orderBy("count", ascending=False)
    display(value_counts)

# Example usage for a specific column
display_value_counts(taxi_df, 'pickup_service_zone')

Value counts for pickup_service_zone:


pickup_service_zone,count
Yellow Zone,31545375
Airports,3129124
Boro Zone,1190267


In [0]:
# Example usage for a specific column
display_value_counts(taxi_df, 'pickup_zone')

Value counts for pickup_zone:


pickup_zone,count
JFK Airport,1864633
Upper East Side South,1740117
Midtown Center,1705992
Upper East Side North,1534583
Midtown East,1312557
Penn Station/Madison Sq West,1269132
LaGuardia Airport,1264491
Times Sq/Theatre District,1210820
Lincoln Square East,1206132
Murray Hill,1086165


| Column               | Type            | Purpose                                                    |
|----------------------|-----------------|------------------------------------------------------------|
| `DOLocationID`       | Categorical     | Destination location                                       |
| `PULocationID`       | Categorical     | Pickup location                                            |
| `passenger_count`    | Numerical       | Number of passengers                                       |
| `trip_distance`      | Numerical       | Trip distance                                              |
| `total_amount`       | Numerical       | Target variable (taxi fare)                                |
| `trip_duration`      | Numerical       | Trip duration                                              |
| `pickup_hour`        | Categorical     | Hour of pickup                                             |
| `pickup_day_of_week` | Categorical     | Day of the week (pickup)                                   |
| `pickup_month`       | Categorical     | Month of pickup                                            |
| `distance_km`        | Numerical       | Calculated trip distance in km                             |
| `pickup_borough`     | Categorical     | Pickup borough                                             |
| `dropoff_borough`    | Categorical     | Drop-off borough                                           |
| `is_holiday`         | Categorical     | Indicator for holidays                                     |
| `temp`               | Numerical       | Temperature (optional)                                     |

####  Bins for Distance

In [0]:
# Binning trip_distance into categories: Short, Medium, Long
taxi_df = taxi_df.withColumn(
    "distance_bin", 
    when(col("trip_distance") <= 2, "Short")
    .when((col("trip_distance") > 2) & (col("trip_distance") <= 5), "Medium")
    .otherwise("Long")
)

In [0]:
from pyspark.sql import functions as F
import pandas as pd
import matplotlib.pyplot as plt

distance_distribution = (
    taxi_df.groupBy("distance_bin")
    .agg(F.count("*").alias("count"))
)

total_count = distance_distribution.agg(F.sum("count")).collect()[0][0]
distance_distribution = distance_distribution.withColumn("percentage", (F.col("count") / total_count) * 100)

distance_distribution_pd = distance_distribution.toPandas()

plt.figure(figsize=(8, 6))
plt.pie(distance_distribution_pd["percentage"], labels=distance_distribution_pd["distance_bin"], autopct='%1.1f%%')
plt.title("Distribution of Trip Distances")
plt.show()


####  Bins for  Temperature

In [0]:

# Binning temperature into ranges: Cold, Cool, Warm, Hot
taxi_df = taxi_df.withColumn(
    "temp_bin", 
    when(col("temp") <= 10, "Cold")
    .when((col("temp") > 10) & (col("temp") <= 20), "Cool")
    .when((col("temp") > 20) & (col("temp") <= 30), "Warm")
    .otherwise("Hot")
)

In [0]:
temp_distribution = (
    taxi_df.groupBy("temp_bin")
    .agg(F.count("*").alias("count"))
)

total_count = temp_distribution.agg(F.sum("count")).collect()[0][0]
temp_distribution = temp_distribution.withColumn("percentage", (F.col("count") / total_count) * 100)

temp_distribution_pd = temp_distribution.toPandas()

plt.figure(figsize=(8, 6))
plt.pie(temp_distribution_pd["percentage"], labels=temp_distribution_pd["temp_bin"], autopct='%1.1f%%')
plt.title("Distribution of Temperature Bins")
plt.show()


#### Binning Time of Day

In [0]:
# Binning pickup_hour into time of day categories
taxi_df = taxi_df.withColumn(
    "time_of_day_bin",
    when((col("pickup_hour") >= 0) & (col("pickup_hour") < 6), "Late Night")
    .when((col("pickup_hour") >= 6) & (col("pickup_hour") < 12), "Morning")
    .when((col("pickup_hour") >= 12) & (col("pickup_hour") < 18), "Afternoon")
    .otherwise("Evening")
)

In [0]:
time_of_day_distribution = (
    taxi_df.groupBy("time_of_day_bin")
    .agg(F.count("*").alias("count"))
)

total_count = time_of_day_distribution.agg(F.sum("count")).collect()[0][0]
time_of_day_distribution = time_of_day_distribution.withColumn("percentage", (F.col("count") / total_count) * 100)

time_of_day_distribution_pd = time_of_day_distribution.toPandas()

plt.figure(figsize=(8, 6))
plt.pie(time_of_day_distribution_pd["percentage"], labels=time_of_day_distribution_pd["time_of_day_bin"], autopct='%1.1f%%')
plt.title("Distribution of Trips by Time of Day")
plt.show()

#### Binning Time of Day

In [0]:
# Binning pickup_hour into time of day categories
taxi_df = taxi_df.withColumn(
    "time_of_day_bin",
    when((col("pickup_hour") >= 0) & (col("pickup_hour") < 6), "Late Night")
    .when((col("pickup_hour") >= 6) & (col("pickup_hour") < 12), "Morning")
    .when((col("pickup_hour") >= 12) & (col("pickup_hour") < 18), "Afternoon")
    .otherwise("Evening")
)

In [0]:
time_of_day_distribution = (
    taxi_df.groupBy("time_of_day_bin")
    .agg(F.count("*").alias("count"))
)

total_count = time_of_day_distribution.agg(F.sum("count")).collect()[0][0]
time_of_day_distribution = time_of_day_distribution.withColumn("percentage", (F.col("count") / total_count) * 100)

time_of_day_distribution_pd = time_of_day_distribution.toPandas()

plt.figure(figsize=(8, 6))
plt.pie(time_of_day_distribution_pd["percentage"], labels=time_of_day_distribution_pd["time_of_day_bin"], autopct='%1.1f%%')
plt.title("Distribution of Trips by Time of Day")
plt.show()

#### Binning Date into Season

In [0]:
from pyspark.sql.functions import month, when

# Create season column based on the pickup month
taxi_df = taxi_df.withColumn(
    "season",
    when(month("pickup_datetime").isin([12, 1, 2]), "Winter")
    .when(month("pickup_datetime").isin([3, 4, 5]), "Spring")
    .when(month("pickup_datetime").isin([6, 7, 8]), "Summer")
    .otherwise("Fall")
)


In [0]:
season_distribution = (
    taxi_df.groupBy("season")
    .agg(F.count("*").alias("count"))
)

total_count = season_distribution.agg(F.sum("count")).collect()[0][0]
season_distribution = season_distribution.withColumn("percentage", (F.col("count") / total_count) * 100)

season_distribution_pd = season_distribution.toPandas()

plt.figure(figsize=(8, 6))
plt.pie(season_distribution_pd["percentage"], labels=season_distribution_pd["season"], autopct='%1.1f%%')
plt.title("Distribution of Trips by Season")
plt.show()

#### near_airport

In [0]:
from pyspark.sql.functions import col, when

# Define near_airport based on pickup or dropoff being in an airport zone
taxi_df = taxi_df.withColumn(
    "near_airport",
    when(
        (col("pickup_service_zone") == "Airports") | 
        (col("dropoff_service_zone") == "Airports"),
        1
    ).otherwise(0)
)

In [0]:
airport_distribution = (
    taxi_df.groupBy("near_airport")
    .agg(F.count("*").alias("count"))
)

total_count = airport_distribution.agg(F.sum("count")).collect()[0][0]
airport_distribution = airport_distribution.withColumn("percentage", (F.col("count") / total_count) * 100)

airport_distribution_pd = airport_distribution.toPandas()

airport_distribution_pd["near_airport"] = airport_distribution_pd["near_airport"].map({1: "Near Airport", 0: "Not Near Airport"})

plt.figure(figsize=(8, 6))
plt.pie(airport_distribution_pd["percentage"], labels=airport_distribution_pd["near_airport"], autopct='%1.1f%%')
plt.title("Distribution of Trips Near Airport")
plt.show()

In [0]:
display(taxi_df.limit(20))

In [0]:
# Get the number of rows and columns
num_rows = taxi_df.count()
num_cols = len(taxi_df.columns)
print(f"Shape: ({num_rows}, {num_cols})")

Shape: (35864766, 45)


In [0]:
# Display column names of the DataFrame
taxi_df.columns

['DOLocationID',
 'PULocationID',
 'pickup_datetime',
 'dropoff_datetime',
 'passenger_count',
 'trip_distance',
 'payment_type',
 'fare_amount',
 'tolls_amount',
 'improvement_surcharge',
 'total_amount',
 'trip_duration',
 'pickup_day_of_week',
 'pickup_hour',
 'Month_Num',
 'pickup_month',
 'dropoff_hour',
 'dropoff_day_of_week',
 'dropoff_month',
 'dropoff_week_of_year',
 'pickup_date',
 'pickup_time',
 'dropoff_date',
 'dropoff_time',
 'pickup_latitude',
 'pickup_longitude',
 'dropoff_latitude',
 'dropoff_longitude',
 'distance_km',
 'pickup_borough',
 'pickup_zone',
 'pickup_service_zone',
 'LocationID',
 'dropoff_borough',
 'dropoff_zone',
 'dropoff_service_zone',
 'holidayName',
 'normalizeHolidayName',
 'is_holiday',
 'temp',
 'distance_bin',
 'temp_bin',
 'time_of_day_bin',
 'season',
 'near_airport']

In [0]:
# Select only the columns needed for price prediction
taxi_df_cleaned = taxi_df.select(
    'DOLocationID',
    'PULocationID',
    'passenger_count',
    'payment_type',
    'total_amount',
    'trip_duration',
    'pickup_day_of_week',
    'pickup_hour',
    'pickup_month',
    'pickup_borough',
    'dropoff_borough',
    'temp',
    'distance_km',
    'is_holiday',
    'distance_bin',
    'temp_bin',
    'time_of_day_bin',
    'season',
    'near_airport'
)


In [0]:
display(taxi_df_cleaned.limit(10))

DOLocationID,PULocationID,passenger_count,payment_type,total_amount,trip_duration,pickup_day_of_week,pickup_hour,pickup_month,pickup_borough,dropoff_borough,temp,distance_km,is_holiday,distance_bin,temp_bin,time_of_day_bin,season,near_airport
132,63,1,1,103.2,10.666666984558104,6,18,11,Brooklyn,Queens,25.1,8.748496195162065,1,Long,Warm,Evening,Fall,1
37,63,1,2,19.9,15.18333339691162,2,12,1,Brooklyn,Brooklyn,14.5,3.9170631517232,1,Medium,Cool,Afternoon,Winter,0
100,63,1,1,66.35,26.46666717529297,2,2,5,Brooklyn,Manhattan,28.3,12.12122772136462,1,Long,Warm,Late Night,Spring,0
225,63,1,1,23.0,16.350000381469727,2,23,1,Brooklyn,Brooklyn,14.5,4.554718927286833,1,Medium,Cool,Evening,Winter,0
96,63,1,1,36.0,9.333333015441896,2,10,1,Brooklyn,Queens,15.3,1.5660724941116029,1,Medium,Cool,Morning,Winter,0
68,10,4,1,87.0,26.450000762939453,2,5,10,Queens,Manhattan,26.5,19.2293942470498,1,Long,Warm,Late Night,Fall,0
170,10,2,2,88.5,35.71666717529297,2,19,10,Queens,Manhattan,26.5,17.557568294766163,1,Long,Warm,Evening,Fall,0
215,10,1,4,71.5,0.7666666507720947,2,17,10,Queens,Queens,26.5,1.7206584626666377,1,Short,Warm,Afternoon,Fall,0
166,10,1,2,78.05,27.01666641235352,2,0,5,Queens,Manhattan,28.3,20.43445253984307,1,Long,Warm,Late Night,Spring,0
161,10,2,2,92.44,30.483333587646484,2,12,12,Queens,Manhattan,21.0,18.025506385625672,1,Long,Warm,Afternoon,Winter,0


In [0]:
# Get the number of rows and columns
num_rows = taxi_df_cleaned.count()
num_cols = len(taxi_df_cleaned.columns)
print(f"Shape: ({num_rows}, {num_cols})")

Shape: (35864766, 19)


| Column Name         | Type        | Description                               | Preprocessing Steps                           |
|---------------------|-------------|-------------------------------------------|-----------------------------------------------|
| `DOLocationID`      | Categorical | Dropoff Location ID                       | Encode (One-Hot or Label Encoding)           |
| `PULocationID`      | Categorical | Pickup Location ID                        | Encode (One-Hot or Label Encoding)           |
| `passenger_count`   | Numerical   | Number of passengers                      | Scale (Min-Max or Standard Scaling)          |
| `payment_type`      | Categorical | Type of payment (e.g., cash, card)      | Encode (One-Hot or Label Encoding)           |
| `total_amount`      | Numerical   | Total fare amount                         | Scale (Min-Max or Standard Scaling)          |
| `trip_duration`     | Numerical   | Duration of the trip (in seconds)        | Scale (Min-Max or Standard Scaling)          |
| `pickup_day_of_week`| Categorical | Day of the week (0-6, where 0 = Sunday) | Encode (One-Hot Encoding)                    |
| `pickup_hour`       | Numerical   | Hour of pickup (0-23)                    | Scale (Min-Max or Standard Scaling)          |
| `pickup_month`      | Numerical   | Month of pickup (1-12)                   | Encode (One-Hot Encoding)                     |
| `pickup_borough`    | Categorical | Borough where pickup occurs               | Encode (One-Hot Encoding)                     |
| `dropoff_borough`   | Categorical | Borough where dropoff occurs              | Encode (One-Hot Encoding)                     |
| `temp`              | Numerical   | Temperature (in °C)                      | Scale (Min-Max or Standard Scaling)          |
| `distance_km`     | Numerical   | Distance of the trip (in km)             | Scale (Min-Max or Standard Scaling)          |
| `is_holiday`        | Categorical | Whether the trip occurs on a holiday     | Encode (Binary Encoding)                      |
| `distance_bin`      | Categorical | Binned distance categories (Short, Medium, Long) | Encode (One-Hot Encoding)         |
| `temp_bin`          | Categorical | Binned temperature categories             | Encode (One-Hot Encoding)                     |
| `time_of_day_bin`   | Categorical | Binned time of day (e.g., Morning, Afternoon, Night) | Encode (One-Hot Encoding)    |
| `season`            | Categorical | Season during the trip                    | Encode (One-Hot Encoding)                     |
| `near_airport`      | Categorical | Proximity to airport (0 or 1)            | Encode (Binary Encoding)                      |

In [0]:
taxi_df_cleaned.printSchema()

root
 |-- DOLocationID: integer (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- total_amount: float (nullable = true)
 |-- trip_duration: double (nullable = true)
 |-- pickup_day_of_week: integer (nullable = true)
 |-- pickup_hour: integer (nullable = true)
 |-- pickup_month: integer (nullable = true)
 |-- pickup_borough: string (nullable = true)
 |-- dropoff_borough: string (nullable = true)
 |-- temp: double (nullable = true)
 |-- distance_km: double (nullable = true)
 |-- is_holiday: integer (nullable = true)
 |-- distance_bin: string (nullable = false)
 |-- temp_bin: string (nullable = false)
 |-- time_of_day_bin: string (nullable = false)
 |-- season: string (nullable = false)
 |-- near_airport: integer (nullable = false)



In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, MinMaxScaler
from pyspark.ml.regression import LinearRegression

# Define categorical and numerical columns
categorical_cols = [
    'payment_type', 'pickup_day_of_week',
    'pickup_month', 'pickup_borough', 'dropoff_borough', 'is_holiday',
    'distance_bin', 'temp_bin', 'time_of_day_bin', 'season', 'near_airport'
]

numerical_cols = ['passenger_count', 'trip_duration', 'pickup_hour', 'temp', 'distance_km']

the column needs to be transformed into a vector format before it can be used in your pipeline.

In [0]:
# Apply StringIndexer for ordinal encoding on categorical columns
indexers = [StringIndexer(inputCol=col, outputCol=col + "_index") for col in categorical_cols]

In [0]:

# Assemble all features into a single vector, using ordinal indices for categorical features
assembler = VectorAssembler(inputCols=[col + "_index" for col in categorical_cols] + numerical_cols, outputCol="features")

In [0]:
# Scale the features
scaler = MinMaxScaler(inputCol="features", outputCol="scaled_features")

In [0]:
# Create the pipeline without one-hot encoding
pipeline = Pipeline(stages=indexers + [assembler, scaler])

In [0]:
# Fit the pipeline to the DataFrame
pipeline_model = pipeline.fit(taxi_df_cleaned)

In [0]:
# Transform the DataFrame using the fitted pipeline
preprocessed_df = pipeline_model.transform(taxi_df_cleaned)

In [0]:
preprocessed_df.printSchema()

root
 |-- DOLocationID: integer (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- total_amount: float (nullable = true)
 |-- trip_duration: double (nullable = true)
 |-- pickup_day_of_week: integer (nullable = true)
 |-- pickup_hour: integer (nullable = true)
 |-- pickup_month: integer (nullable = true)
 |-- pickup_borough: string (nullable = true)
 |-- dropoff_borough: string (nullable = true)
 |-- temp: double (nullable = true)
 |-- distance_km: double (nullable = true)
 |-- is_holiday: integer (nullable = true)
 |-- distance_bin: string (nullable = false)
 |-- temp_bin: string (nullable = false)
 |-- time_of_day_bin: string (nullable = false)
 |-- season: string (nullable = false)
 |-- near_airport: integer (nullable = false)
 |-- payment_type_index: double (nullable = false)
 |-- pickup_day_of_week_index: double (nullable = false)
 |-- pickup_month_index: double (nullable =

In [0]:
from pyspark.ml.stat import Correlation
from pyspark.sql.functions import col

# Compute the correlation matrix using Pearson correlation (default)
correlation_matrix = Correlation.corr(preprocessed_df, "scaled_features").head()[0]

# Convert correlation matrix to a readable format
correlation_array = correlation_matrix.toArray()

# Display the correlation matrix
print("Correlation Matrix:\n", correlation_array)


Correlation Matrix:
 [[ 1.00000000e+00  1.60558507e-02  1.15605511e-02  3.81384993e-02
   5.40724278e-02  8.38274689e-03 -2.00983934e-02 -5.16075839e-03
  -1.03002713e-02  1.16758837e-02  2.87780254e-02  2.51935908e-02
  -2.95889730e-02 -2.29000522e-02 -3.08431346e-04 -1.00297917e-02]
 [ 1.60558507e-02  1.00000000e+00  1.44823078e-03  3.03968912e-02
   1.93907442e-02  1.37150455e-01  3.15790439e-02 -6.31825516e-02
   3.38253446e-02  1.59582076e-03  3.30033468e-02  3.27644228e-02
  -4.43414475e-02 -6.57599034e-02 -1.02827679e-02  3.44100554e-02]
 [ 1.15605511e-02  1.44823078e-03  1.00000000e+00  7.19463762e-03
   7.32651023e-03 -1.27105632e-02  5.77702505e-03 -2.38864255e-01
   4.03386659e-03  5.69972958e-01  4.71068472e-03  5.78305064e-03
  -2.14377217e-02 -9.57019062e-03 -2.47864685e-01  5.63747608e-03]
 [ 3.81384993e-02  3.03968912e-02  7.19463762e-03  1.00000000e+00
   3.68764936e-01  1.39492970e-02  5.00057956e-01 -5.58991408e-03
   2.63683745e-03  4.78308914e-03  7.15415313e-01  1

In [0]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor, GBTRegressor
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.sql.types import NumericType
from pyspark.sql import functions as F
from pyspark.sql.functions import col

In [0]:
from pyspark.sql.types import NumericType

# Define the target column
target_column = "total_amount"

# Automatically identify numerical and indexed categorical features
numerical_features = [
    col for col in preprocessed_df.columns 
    if isinstance(preprocessed_df.schema[col].dataType, NumericType) and col != target_column
]

# Identify ordinally encoded categorical features (those ending in '_index')
indexed_features = [col for col in preprocessed_df.columns if col.endswith('_index')]

In [0]:

# Combine all feature columns (numerical + indexed)
all_features = numerical_features + indexed_features

In [0]:
# Sample data to optimize runtime
sample_fraction = 0.05
sampled_df = preprocessed_df.sample(fraction=sample_fraction, seed=42).cache()

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import FloatType, StructType, StructField, StringType

# Updated helper function to extract feature importances and normalize them
def get_feature_importance(model, features, importance_col):
    importances = model.stages[-1].featureImportances.toArray()
    
    # Specify schema with types
    schema = StructType([
        StructField("feature", StringType(), True),
        StructField(importance_col, FloatType(), True)
    ])
    
    importance_df = spark.createDataFrame([(feature, float(importance)) for feature, importance in zip(features, importances)], schema)
    total_importance = importance_df.select(F.sum(importance_col)).collect()[0][0]
    return importance_df.withColumn(importance_col, F.col(importance_col) / total_importance)

#### Technique 1: Correlation Analysis

In [0]:
# Define categorical and numerical columns
categorical_cols = [
    'payment_type', 'pickup_day_of_week',
    'pickup_month', 'pickup_borough', 'dropoff_borough', 'is_holiday',
    'distance_bin', 'temp_bin', 'time_of_day_bin', 'season', 'near_airport'
]

numerical_cols = ['passenger_count', 'trip_duration', 'pickup_hour', 'temp', 'distance_km']

# Define the target column
target_column = 'total_amount'

# Collect all feature columns (numerical and categorical index columns)
all_feature_cols = numerical_cols + [col + "_index" for col in categorical_cols]

In [0]:
# Prepare a dictionary to hold correlations
correlations = {}

# Calculate correlations for numerical features
for col in numerical_cols:
    if col in preprocessed_df.columns:  # Check if the column exists
        correlation = preprocessed_df.stat.corr(col, target_column)
        correlations[col] = correlation

In [0]:
# Calculate correlations for ordinally encoded categorical features
for col in categorical_cols:
    indexed_col = col + "_index"
    if indexed_col in preprocessed_df.columns:  # Check if the indexed column exists
        correlation = preprocessed_df.stat.corr(indexed_col, target_column)
        correlations[indexed_col] = correlation

In [0]:
# Calculate mean target for each category in categorical features
for col in categorical_cols:
    if col in preprocessed_df.columns:  # Check if the column exists
        mean_target = preprocessed_df.groupBy(col).agg({target_column: 'mean'}).collect()
        for row in mean_target:
            category = row[col]
            mean_value = row[f'avg({target_column})']  # Access the mean value
            correlations[f'Mean {target_column} for {col} = {category}'] = mean_value

In [0]:
# Convert correlations dictionary to a Spark DataFrame
correlation_importance_df = (
    spark.createDataFrame([(k, abs(v)) for k, v in correlations.items()], ["feature", "correlation_importance"])
)

In [0]:
# Display correlations
display(correlation_importance_df)

feature,correlation_importance
passenger_count,0.0104262769727422
trip_duration,0.2149400315417186
pickup_hour,0.0072058517171343
temp,0.0049022132354904
distance_km,0.2330978747789861
payment_type_index,0.0412747944897081
pickup_day_of_week_index,0.000183357485704726
pickup_month_index,0.0015721197051095
pickup_borough_index,0.1535000428128753
dropoff_borough_index,0.0986033715384023


#### Technique 2: Random Forest Feature Importance

In [0]:
# 2. Random Forest Feature Importance
assembler_rf = VectorAssembler(inputCols=all_features, outputCol="rf_features")

In [0]:
rf = RandomForestRegressor(labelCol=target_column, featuresCol="rf_features", numTrees=50)

In [0]:
pipeline_rf = Pipeline(stages=[assembler_rf, rf])

In [0]:
rf_model = pipeline_rf.fit(sampled_df)

In [0]:
rf_importance_df = get_feature_importance(rf_model, all_features, "rf_importance")

In [0]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Convert specific numerical feature columns from both dataframes to pandas for visualization
feature = numerical_features[0]  # Replace this with the desired feature name
original_feature_data = preprocessed_df.select(feature).toPandas()
sampled_feature_data = sampled_df.select(feature).toPandas()

# Add a column to differentiate original vs. sampled data
original_feature_data['Source'] = 'Original'
sampled_feature_data['Source'] = 'Sampled'

# Concatenate both DataFrames for easy plotting
combined_data = pd.concat([original_feature_data, sampled_feature_data], ignore_index=True)

# Plot distributions side by side on the same plot
plt.figure(figsize=(10, 6))
sns.histplot(data=combined_data, x=feature, hue='Source', kde=True, stat="density", common_norm=False, bins=30)

plt.title(f"Comparison of {feature} Distribution: Original vs Sampled Data")
plt.xlabel(f"{feature}")
plt.ylabel("Density")
plt.legend(title="Data Source")
plt.show()


In [0]:
display(rf_importance_df)

feature,rf_importance
DOLocationID,0.0003642070146488376
PULocationID,0.0015173256076086
passenger_count,1.719316933292378e-05
trip_duration,0.0997924634553572
pickup_day_of_week,3.311540624901743e-07
pickup_hour,0.00020590009372261524
pickup_month,6.352929496780461e-06
temp,1.0165824972696574e-05
distance_km,0.2534162050491099
is_holiday,0.0


#### Technique 3: Gradient Boosting Feature Importance

In [0]:
# 3. Gradient Boosting Feature Importance
assembler_gbt = VectorAssembler(inputCols=all_features, outputCol="gbt_features")

In [0]:
gbt = GBTRegressor(labelCol=target_column, featuresCol="gbt_features", maxIter=50)

In [0]:
pipeline_gbt = Pipeline(stages=[assembler_gbt, gbt])

In [0]:
gbt_model = pipeline_gbt.fit(sampled_df)

In [0]:
gbt_importance_df = get_feature_importance(gbt_model, all_features, "gbt_importance")

In [0]:
display(gbt_importance_df)

feature,gbt_importance
DOLocationID,0.0039745500037976
PULocationID,0.0130816225265678
passenger_count,0.0026635614199107
trip_duration,0.1746674954007604
pickup_day_of_week,3.745181009596559e-05
pickup_hour,0.014672788102965
pickup_month,0.0003005431549091441
temp,0.00031704886875700834
distance_km,0.553901681533922
is_holiday,5.083236549158062e-05


#### Technique 4: Lasso Regression

In [0]:
# 4. Lasso Regression (Linear Regression with L1 regularization) Feature Importance
assembler_lasso = VectorAssembler(inputCols=all_features, outputCol="lasso_features")

In [0]:
lasso = LinearRegression(labelCol=target_column, featuresCol="lasso_features", regParam=0.1, elasticNetParam=1.0)

In [0]:
pipeline_lasso = Pipeline(stages=[assembler_lasso, lasso])

In [0]:
lasso_model = pipeline_lasso.fit(sampled_df)

In [0]:
lasso_coefficients = lasso_model.stages[-1].coefficients.toArray()

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType
from pyspark.sql import functions as F
from pyspark.sql.functions import col

# Define the schema explicitly
schema = StructType([
    StructField("feature", StringType(), True),
    StructField("lasso_importance", DoubleType(), True)
])

# Convert the coefficients to native Python float before creating the DataFrame
lasso_importance_data = [(feature, float(abs(coeff))) for feature, coeff in zip(all_features, lasso_coefficients)]

In [0]:
# Create the lasso importance DataFrame with the explicit schema
lasso_importance_df = spark.createDataFrame(lasso_importance_data, schema)

In [0]:
# Normalize Lasso importances
total_lasso_importance = lasso_importance_df.select(F.sum("lasso_importance")).collect()[0][0]

lasso_importance_df = lasso_importance_df.withColumn("lasso_importance", col("lasso_importance") / total_lasso_importance)

In [0]:
# Show the final DataFrame with normalized importance values
display(lasso_importance_df)

feature,lasso_importance
DOLocationID,1.627338799889582e-05
PULocationID,0.0
passenger_count,0.0015260583953919
trip_duration,0.024749044204021
pickup_day_of_week,0.0
pickup_hour,0.0012512864137911
pickup_month,0.0
temp,0.0
distance_km,0.0632088329495043
is_holiday,0.0


#### Technique 5: Recursive Feature Elimination (RFE)

In [0]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.sql.functions import col

# List of all feature names
rfe_features = all_features.copy()  
numTrees = 30

In [0]:
# Assemble the features into a single vector column
assembler_rfe = VectorAssembler(inputCols=rfe_features, outputCol="rfe_features")
sampled_df_transformed = assembler_rfe.transform(sampled_df)

In [0]:
# Define and fit the Random Forest model on the transformed data
rf_rfe = RandomForestRegressor(labelCol=target_column, featuresCol="rfe_features", numTrees=numTrees)
rfe_model = rf_rfe.fit(sampled_df_transformed.select("rfe_features", target_column))

In [0]:
# Capture feature importances
importances = rfe_model.featureImportances.toArray()

In [0]:
# Convert all importance scores to Python's native float type
fi_data = [(feature, float(importance)) for feature, importance in zip(rfe_features, importances)]

In [0]:
# Now create the DataFrame with the specified schema
from pyspark.sql.types import StructType, StructField, StringType, FloatType

# Define schema with expected types
schema = StructType([
    StructField("feature", StringType(), True),
    StructField("rfe_score", FloatType(), True)
])

# Create DataFrame with the specified schema
rfe_importance_df = spark.createDataFrame(fi_data, schema=schema)

In [0]:
# Sort the DataFrame by importance scores in descending order
rfe_importance_df_sorted = rfe_importance_df.orderBy(col("rfe_score").desc())

In [0]:
display(rfe_importance_df)

feature,rfe_score
DOLocationID,0.0002555016
PULocationID,0.00043272635
passenger_count,1.7906184e-05
trip_duration,0.10261339
pickup_day_of_week,7.72838e-06
pickup_hour,0.00023615516
pickup_month,1.9048645e-06
temp,9.3054605e-06
distance_km,0.26247764
is_holiday,0.0


#### Technique 6: SHAP Values

In [0]:
import xgboost as xgb
import shap
import numpy as np
import pandas as pd

# Convert the Spark DataFrame to Pandas for model training
sampled_pd_df = sampled_df.select(all_features + [target_column]).toPandas()

In [0]:
# Define the features and target
X = sampled_pd_df[all_features]
y = sampled_pd_df[target_column]

In [0]:
# Ensure X and y are in the correct shape
X = np.array(X)
y = np.array(y).ravel()

In [0]:
# Train an XGBoost model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=50)
xgb_model.fit(X, y)

In [0]:
# Use SHAP to explain the model with correct array format
explainer = shap.Explainer(xgb_model, X)

In [0]:
shap_values = explainer(X).values

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, FloatType
# Compute mean absolute SHAP values for each feature (importance scores) and convert to Python floats
shap_importances = [(feature, float(np.abs(shap_values[:, i]).mean())) for i, feature in enumerate(all_features)]

In [0]:
# Define schema and create a PySpark DataFrame for SHAP importances
schema = StructType([
    StructField("feature", StringType(), True),
    StructField("shap_importance", FloatType(), True)
])

In [0]:
# Convert list to PySpark DataFrame with the specified schema
shap_importance_df = spark.createDataFrame(shap_importances, schema=schema)

In [0]:
# Show SHAP feature importances
display(shap_importance_df)

feature,shap_importance
DOLocationID,0.1358158
PULocationID,0.3837969
passenger_count,0.2419425
trip_duration,5.517781
pickup_day_of_week,0.11919089
pickup_hour,0.77487856
pickup_month,0.088868536
temp,0.035204288
distance_km,2.2857487
is_holiday,0.01680682


#### Technique 7: Statistical Tests

#### Chi-Square Test for categorical features

In [0]:
# df.printSchema()

In [0]:
# from pyspark.ml.stat import ChiSquareTest
# from pyspark.ml.feature import VectorAssembler

# # Step 1: Select relevant categorical columns for the Chi-square test
# categorical_columns = [
#     "pickup_day_of_week_index",
#     "distance_bin_index",
#     "temp_bin_index",
#     "season_index",
#     "near_airport_index"
# ]

# # Step 2: Vectorize the categorical features (you might need to ensure 'payment_type_index' is numeric)
# assembler = VectorAssembler(
#     inputCols=categorical_columns,
#     outputCol="features"
# )

# # Apply vector assembler to the dataframe
# df = assembler.transform(df)

# # Step 3: Perform the Chi-square test for each feature individually against the label
# for column in categorical_columns:
#     # Perform the Chi-square test on the vectorized features and label (payment_type_index)
#     chi_square_result = ChiSquareTest.test(df, "features", "payment_type_index")
    
#     # Step 4: Show the results for the current feature
#     chi_square_result.select("pValues", "degreesOfFreedom", "statistics").show(truncate=False)
    
#     # Step 5: Extract and print the chi-square statistics and p-values
#     result = chi_square_result.collect()[0]
#     p_values = result['pValues']
#     statistics = result['statistics']
    
#     print(f"Feature: {column}")
#     print(f"Chi-Square Statistic: {statistics[0]}")
#     print(f"P-Value: {p_values[0]}\n")


#### ANOVA F-Test for continuous features

In [0]:
# from pyspark.ml.stat import ANOVA
# import pandas as pd

# # Define categorical features
# categorical_cols = ['pickup_day_of_week', 'payment_type']

# # Calculate ANOVA F-Test for each categorical feature
# anova_results = {}
# for col in categorical_cols:
#     groups = sampled_df.groupBy(col).agg(F.collect_list(target_col).alias("values"))
#     # Create a DataFrame for ANOVA
#     anova_df = groups.toPandas()

#     f_stat, p_val = ANOVA.test(anova_df['values'], sampled_df[target_col])
#     anova_results[col] = (f_stat, p_val)

# # Display ANOVA results
# for feature, (f_stat, p_val) in anova_results.items():
#     print(f"Feature: {feature}, F-Statistic: {f_stat}, P-Value: {p_val}")


#### Mutual Information for non-linear associations

In [0]:
#  

#### Technique 8: Feature Selection Based on Statistical Variance

In [0]:
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.types import NumericType

# Define the variance threshold
variance_threshold = 0.05  # Adjust as needed

# Identify numerical columns
numerical_columns = [col for col, dtype in sampled_df.dtypes if isinstance(sampled_df.schema[col].dataType, NumericType) and col != target_col]

# Calculate variance for each numerical column
variance_df = sampled_df.select(
    *[F.variance(F.col(c)).alias(c) for c in numerical_columns]
).toPandas()

# Filter columns based on the variance threshold
selected_features = [col for col, var in variance_df.loc[0].items() if var > variance_threshold]

# Display selected features
print("Selected Features based on Variance:")
print(selected_features)


[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
File [0;32m<command-3885246210596822>, line 9[0m
[1;32m      6[0m variance_threshold [38;5;241m=[39m [38;5;241m0.05[39m  [38;5;66;03m# Adjust as needed[39;00m
[1;32m      8[0m [38;5;66;03m# Identify numerical columns[39;00m
[0;32m----> 9[0m numerical_columns [38;5;241m=[39m [col [38;5;28;01mfor[39;00m col, dtype [38;5;129;01min[39;00m sampled_df[38;5;241m.[39mdtypes [38;5;28;01mif[39;00m [38;5;28misinstance[39m(sampled_df[38;5;241m.[39mschema[col][38;5;241m.[39mdataType, NumericType) [38;5;129;01mand[39;00m col [38;5;241m!=[39m target_col]
[1;32m     11[0m [38;5;66;03m# Calculate variance for each numerical column[39;00m
[1;32m     12[0m variance_df [38;5;241m=[39m sampled_df[38;5;241m.[39mselect(
[1;32m     13[0m     [38;5;241m*[39m[F[38;5;241m.[39mvariance(F[38

#### Merge Feature Importances

In [0]:
final_importance_df = rf_importance_df \
    .join(gbt_importance_df, on="feature", how="outer") \
    .join(lasso_importance_df, on="feature", how="outer") \
    .join(rfe_importance_df, on="feature", how="outer") \
    .join(shap_importance_df, on="feature", how="outer") \
    .fillna(0)

In [0]:
from pyspark.sql.functions import col
# Calculate average importance across all methods
final_importance_df = final_importance_df.withColumn(
    "average_importance", 
    (col("rf_importance") + col("gbt_importance") + col("lasso_importance") +
     col("rfe_score") + col("shap_importance")) / 5
)

In [0]:
# Show the final DataFrame with average feature importance
display(final_importance_df.orderBy("average_importance", ascending=False))

In [0]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Collect the Spark DataFrame into Pandas for plotting
final_importance_pd = final_importance_df.select("feature", "average_importance").toPandas()

# Sort values by average_importance in descending order to have the highest on top
final_importance_pd = final_importance_pd.sort_values(by="average_importance", ascending=False)

# Create the horizontal bar plot
plt.figure(figsize=(10, 6))
sns.barplot(data=final_importance_pd, y='feature', x='average_importance', palette='viridis')

# Set the plot labels and title
plt.title('Average Feature Importance Across Methods', fontsize=16)
plt.xlabel('Average Importance', fontsize=12)
plt.ylabel('Feature', fontsize=12)

# Show the plot
plt.tight_layout()  # Adjust layout to avoid label overlap
plt.show()

In [0]:
display(final_importance_pd)

In [0]:
# # Export reduced feature dataset if needed
# selected_features = [row["feature"] for row in final_fi_df.select("feature").collect()]
# reduced_data = transformed_data.select(["total_amount", "features"] + selected_features)


In [0]:
display(taxi_df_cleaned)

In [0]:
# Select only the columns needed for price prediction
taxi_final_df_cleaned = taxi_df_cleaned.select(
    'passenger_count',
    'payment_type',
    'total_amount',
    'trip_duration',
    'pickup_day_of_week',
    'pickup_hour',
    'pickup_month',
    'pickup_borough',
    'dropoff_borough',
    'is_holiday',
    'distance_bin',
    'time_of_day_bin',
    'near_airport'
)


In [0]:
display(taxi_final_df_cleaned)

In [0]:
# Check the Shape and Schema
# Get the number of rows and columns
num_rows = taxi_final_df_cleaned.count()
num_cols = len(taxi_final_df_cleaned.columns)
print(f"Shape: ({num_rows}, {num_cols})")

Shape: (35864766, 13)


In [0]:
# Define a more descriptive Delta Lake storage path
delta_path = "/dbfs/FileStore/tables/taxi_final_df_cleaned/"
# Write the DataFrame to Delta format
taxi_final_df_cleaned.write.format("delta").mode("overwrite").option("overwriteSchema", "true").save(delta_path)