## nn_model_stage4

This optimization attempt uses spark and pandas to create a neural network on Google Colab. The loading and preprocessing is done with spark, but pandas is used for training the neural network. The process works but the results are very poor.

In [170]:
# Import necessary libraries
import numpy as np
import pandas as pd
import os
import math

from pyspark.sql import SparkSession
from pyspark.sql.functions import to_timestamp
from pyspark.sql.functions import to_timestamp, hour, dayofweek, month, dayofyear

from pyspark.sql.types import DoubleType, FloatType, IntegerType
from pyspark.sql.functions import col, radians, sin, cos, asin, sqrt, lit, atan2, degrees
from pyspark.sql.functions import udf
from pyspark.sql.functions import monotonically_increasing_id

from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.functions import collect_list

import plotly.graph_objects as go
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error

In [51]:
# spark version 3.4.4 performs best; other versions here https://downloads.apache.org/spark/
# spark_version = 'spark-3.4.4'
spark_version = 'spark-3.4.4'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Fetched 257 kB in 1s (181 kB/s)
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry miss

In [52]:
# Initialize SparkSession
spark = SparkSession.builder.appName("NYC Taxi Data Analysis") \
    .config("spark.executor.memory", "4g") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.cores", "2") \
    .getOrCreate()


# URL of the CSV file
data_url = "https://project4-nyctaxi.s3.us-east-1.amazonaws.com/train.csv"

# Download the file using Pandas
local_file = "train.csv"
pd.read_csv(data_url).to_csv(local_file, index=False)

# Read the local file into a Spark DataFrame
df = spark.read.csv(local_file, header=True, inferSchema=True)

# Show the DataFrame
df.show()

+---------+---------+-------------------+-------------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------+
|       id|vendor_id|    pickup_datetime|   dropoff_datetime|passenger_count|  pickup_longitude|   pickup_latitude| dropoff_longitude|  dropoff_latitude|store_and_fwd_flag|trip_duration|
+---------+---------+-------------------+-------------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------+
|id2875421|        2|2016-03-14 17:24:55|2016-03-14 17:32:30|              1|-73.98215484619139| 40.76793670654297|-73.96463012695312|40.765602111816406|                 N|          455|
|id2377394|        1|2016-06-12 00:43:35|2016-06-12 00:54:38|              1|-73.98041534423827|40.738563537597656|-73.99948120117188|40.731151580810554|                 N|          663|
|id3858529|        2|2016-01-19 11:35:24|2016-01-19 12:10:48|    

In [53]:
# Create a sample for testing, this can be dropped for full data later
df = df.limit(50000)

We can immediately drop 'id' column, this unique qualifier will not be helpful for the machine learning model.



In [54]:
# Drop 'id' column
df = df.drop('id')
df.show()

+---------+-------------------+-------------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------+
|vendor_id|    pickup_datetime|   dropoff_datetime|passenger_count|  pickup_longitude|   pickup_latitude| dropoff_longitude|  dropoff_latitude|store_and_fwd_flag|trip_duration|
+---------+-------------------+-------------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------+
|        2|2016-03-14 17:24:55|2016-03-14 17:32:30|              1|-73.98215484619139| 40.76793670654297|-73.96463012695312|40.765602111816406|                 N|          455|
|        1|2016-06-12 00:43:35|2016-06-12 00:54:38|              1|-73.98041534423827|40.738563537597656|-73.99948120117188|40.731151580810554|                 N|          663|
|        2|2016-01-19 11:35:24|2016-01-19 12:10:48|              1|-73.97902679443358|40.763938903808594|-74.005332

In [55]:
df.dtypes

[('vendor_id', 'int'),
 ('pickup_datetime', 'timestamp'),
 ('dropoff_datetime', 'timestamp'),
 ('passenger_count', 'int'),
 ('pickup_longitude', 'double'),
 ('pickup_latitude', 'double'),
 ('dropoff_longitude', 'double'),
 ('dropoff_latitude', 'double'),
 ('store_and_fwd_flag', 'string'),
 ('trip_duration', 'int')]

In [56]:
# Get the number of rows
num_rows = df.count()

# Get the number of columns
num_cols = len(df.columns)

# Print the shape
print(f"Shape: ({num_rows}, {num_cols})")

Shape: (50000, 10)


Part of the optimization here is identifying outliers. From research and analysis in other parts of this project, we've seen large outliers skewing data. That will be investigated a little bit here.

In [57]:
# Collect `trip_duration` taget data
trip_duration_data = df.select("trip_duration").rdd.flatMap(lambda x: x).collect()

# Calculate quantiles using numpy
Q1 = np.quantile(trip_duration_data, 0.25)
Q3 = np.quantile(trip_duration_data, 0.75)
IQR = Q3 - Q1

# Defining the outlier range
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter the outliers using PySpark
outliers = df.filter((df['trip_duration'] < lower_bound) | (df['trip_duration'] > upper_bound))

# Create Plotly box plot using collected data
fig = go.Figure(data=[go.Box(y=trip_duration_data, name="Trip Duration")])
fig.update_layout(title='Trip Duration Box Plot')
fig.show()

# Print outliers
outliers.show()

+---------+-------------------+-------------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------+
|vendor_id|    pickup_datetime|   dropoff_datetime|passenger_count|  pickup_longitude|   pickup_latitude| dropoff_longitude|  dropoff_latitude|store_and_fwd_flag|trip_duration|
+---------+-------------------+-------------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------+
|        2|2016-01-19 11:35:24|2016-01-19 12:10:48|              1|-73.97902679443358|40.763938903808594|-74.00533294677734|40.710086822509766|                 N|         2124|
|        2|2016-04-11 17:29:50|2016-04-11 18:08:26|              1|-73.99116516113281| 40.75556182861328| -73.9992904663086|  40.7253532409668|                 N|         2316|
|        2|2016-06-03 08:15:05|2016-06-03 08:56:30|              1|-73.95523071289062| 40.77713394165039|-73.788749

In [58]:
# Remove the outliers from the original DataFrame using a filter
df = df.filter(~df["trip_duration"].isin([row["trip_duration"] for row in outliers.collect()]))

# Get the number of rows
num_rows = df.count()

# Get the number of columns
num_cols = len(df.columns)

# Print the shape
print(f"Shape: ({num_rows}, {num_cols})")

# Display
df.show(5)

Shape: (47458, 10)
+---------+-------------------+-------------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------+
|vendor_id|    pickup_datetime|   dropoff_datetime|passenger_count|  pickup_longitude|   pickup_latitude| dropoff_longitude|  dropoff_latitude|store_and_fwd_flag|trip_duration|
+---------+-------------------+-------------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------+
|        2|2016-03-14 17:24:55|2016-03-14 17:32:30|              1|-73.98215484619139| 40.76793670654297|-73.96463012695312|40.765602111816406|                 N|          455|
|        1|2016-06-12 00:43:35|2016-06-12 00:54:38|              1|-73.98041534423827|40.738563537597656|-73.99948120117188|40.731151580810554|                 N|          663|
|        2|2016-04-06 19:32:31|2016-04-06 19:39:40|              1|-74.01004028320312|   40.7199

Rows before: 1458644

Rows after:  1384424

In total 74,220 outlier rows were removed. Normally this might be a lot but for this large dataset it's only a 5% decrease in size.

In [59]:
# Convert to datetime format using PySpark's to_timestamp function
df = df.withColumn('pickup_datetime', to_timestamp('pickup_datetime'))

# Extract features using PySpark functions
df = df.withColumn('hour', hour('pickup_datetime'))  # Hour of the day (0-23)
df = df.withColumn('day_of_week', dayofweek('pickup_datetime'))  # Day of the week (1=Sunday, 7=Saturday)
df = df.withColumn('month', month('pickup_datetime'))  # Month (1-12)
df = df.withColumn('day_of_year', dayofyear('pickup_datetime'))  # Day of the year (1-366)
df = df.withColumn('is_weekend', (dayofweek('pickup_datetime') >= 6).cast("boolean"))  # Weekend flag (True/False)


df.show(5)

+---------+-------------------+-------------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------+----+-----------+-----+-----------+----------+
|vendor_id|    pickup_datetime|   dropoff_datetime|passenger_count|  pickup_longitude|   pickup_latitude| dropoff_longitude|  dropoff_latitude|store_and_fwd_flag|trip_duration|hour|day_of_week|month|day_of_year|is_weekend|
+---------+-------------------+-------------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------+----+-----------+-----+-----------+----------+
|        2|2016-03-14 17:24:55|2016-03-14 17:32:30|              1|-73.98215484619139| 40.76793670654297|-73.96463012695312|40.765602111816406|                 N|          455|  17|          2|    3|         74|     false|
|        1|2016-06-12 00:43:35|2016-06-12 00:54:38|              1|-73.98041534423827|40.738563537597656|-73

Create some continuous values for cyclic values

In [60]:
# Calculate cyclic features using a literal pi value
df = df.withColumn('hour_sin', sin(2 * lit(np.pi) * df['hour'] / 24))
df = df.withColumn('hour_cos', cos(2 * lit(np.pi) * df['hour'] / 24))

# Encode day of the week
df = df.withColumn('day_of_week_sin', sin(2 * lit(np.pi) * df['day_of_week'] / 7))
df = df.withColumn('day_of_week_cos', cos(2 * lit(np.pi) * df['day_of_week'] / 7))

df[['hour', 'hour_sin', 'hour_cos', 'day_of_week', 'day_of_week_sin', 'day_of_week_cos']].show()

+----+--------------------+--------------------+-----------+--------------------+--------------------+
|hour|            hour_sin|            hour_cos|day_of_week|     day_of_week_sin|     day_of_week_cos|
+----+--------------------+--------------------+-----------+--------------------+--------------------+
|  17| -0.9659258262890683|-0.25881904510252063|          2|  0.9749279121818236|-0.22252093395631434|
|   0|                 0.0|                 1.0|          1|  0.7818314824680298|  0.6234898018587336|
|  19| -0.9659258262890684|  0.2588190451025203|          4|  -0.433883739117558| -0.9009688679024191|
|  13| -0.2588190451025208| -0.9659258262890683|          7|-2.44929359829470...|                 1.0|
|  22| -0.5000000000000004|  0.8660254037844384|          7|-2.44929359829470...|                 1.0|
|  22| -0.5000000000000004|  0.8660254037844384|          6| -0.7818314824680299|  0.6234898018587334|
|   7|  0.9659258262890683|-0.25881904510252063|          7|-2.4492935982

Here is a tricky stage: creating a custom function in spark to calculate the distance between points. The pandas function was much less complex, the spark function is longer.

In [61]:


def haversine(lat1, lon1, lat2, lon2):
    """
    Calculate haversine distance between two points on Earth

    :param lat1: Latitude of first point
    :param lon1: Longitude of first point
    :param lat2: Latitude of second point
    :param lon2: Longitude of second point
    :return: Distance in kilometers
    """
    # Radius of Earth in kilometers
    R = 6371.0

    # Convert decimal degrees to radians
    def deg2rad(deg):
        return deg * (math.pi / 180)

    # Ensure input values are converted to floats
    try:
        lat1, lon1, lat2, lon2 = map(float, [lat1, lon1, lat2, lon2])
    except (ValueError, TypeError):
        return None

    # Convert to radians
    lat1_rad = deg2rad(lat1)
    lon1_rad = deg2rad(lon1)
    lat2_rad = deg2rad(lat2)
    lon2_rad = deg2rad(lon2)

    # Haversine formula
    dlat = lat2_rad - lat1_rad
    dlon = lon2_rad - lon1_rad

    a = (math.sin(dlat/2)**2 +
         math.cos(lat1_rad) * math.cos(lat2_rad) *
         math.sin(dlon/2)**2)

    c = 2 * math.asin(math.sqrt(a))

    return R * c

# Create UDF using standard Python math library
def create_haversine_udf(spark):
    """
    Create a Spark UDF for haversine distance calculation

    :param spark: SparkSession
    :return: Registered UDF
    """
    return spark.udf.register("haversine_distance", haversine, DoubleType())

# Example usage function
def calculate_distances(spark_df):
    """
    Calculate distances using haversine formula

    :param spark_df: Input Spark DataFrame
    :return: DataFrame with distance column added
    """
    # Create the UDF
    haversine_udf = create_haversine_udf(spark)

    # Calculate distances and add new column
    return spark_df.withColumn(
        "distance_km",
        haversine_udf(
            col("pickup_latitude"),
            col("pickup_longitude"),
            col("dropoff_latitude"),
            col("dropoff_longitude")
        )
    )

# Recommended usage in Colab/PySpark
result_df = calculate_distances(df)
result_df.select(
     "pickup_latitude",
     "pickup_longitude",
    "dropoff_latitude",
     "dropoff_longitude",
     "distance_km"
 ).show()

+------------------+------------------+------------------+------------------+------------------+
|   pickup_latitude|  pickup_longitude|  dropoff_latitude| dropoff_longitude|       distance_km|
+------------------+------------------+------------------+------------------+------------------+
| 40.76793670654297|-73.98215484619139|40.765602111816406|-73.96463012695312|1.4985207796458557|
|40.738563537597656|-73.98041534423827|40.731151580810554|-73.99948120117188|1.8055071687965203|
|   40.719970703125|-74.01004028320312| 40.70671844482422|-74.01226806640625|1.4854984227709382|
| 40.79320907592773|-73.97305297851561|40.782520294189446| -73.9729232788086|1.1885884593338754|
| 40.74219512939453|-73.98285675048828| 40.74918365478516|-73.99208068847656| 1.098942459306554|
| 40.75783920288086| -73.9690170288086| 40.76589584350586|-73.95740509033203|1.3262785770590748|
| 40.79777908325195|-73.96927642822266| 40.76055908203125|-73.92247009277344| 5.714980630789905|
|40.738399505615234|-73.999481

Cluster pickup and dropoff zones, problems with changing values while creating KMeans clusters using VectorAssembler. Check df shape to verify no new rows are created, and only two new columns (dropoff/pickup zones).

If this continues to cause problems there's a shapefile with neighborhood zones that could maybe replace this process. That shapefile has I think close to 300 zones, as opposed to the 20 that we're creating here.

In [62]:
# Get the number of rows
num_rows = df.count()

# Get the number of columns
num_cols = len(df.columns)

# Print the shape
print(f"Shape: ({num_rows}, {num_cols})")

Shape: (47458, 19)


Try to add pickup/dropoff zones using KMeans.

In [63]:
# Create feature vectors for pickup and dropoff coordinates separately
pickup_assembler = VectorAssembler(inputCols=["pickup_latitude", "pickup_longitude"], outputCol="pickup_features")
dropoff_assembler = VectorAssembler(inputCols=["dropoff_latitude", "dropoff_longitude"], outputCol="dropoff_features")

# Apply the feature assemblers
df_with_features = pickup_assembler.transform(dropoff_assembler.transform(df))

# Apply KMeans clustering
kmeans = KMeans().setK(20).setSeed(42)

# Fit models and predict clusters
pickup_model = kmeans.setFeaturesCol("pickup_features").setPredictionCol("pickup_zone").fit(df_with_features)
dropoff_model = kmeans.setFeaturesCol("dropoff_features").setPredictionCol("dropoff_zone").fit(df_with_features)

# Transform to add cluster predictions
df_with_clusters = pickup_model.transform(
    dropoff_model.transform(df_with_features)
)

# Select only the original columns plus the two new zone columns
df = df_with_clusters.select(df.columns + ["pickup_zone", "dropoff_zone"])

In [64]:
df.select("pickup_latitude", "pickup_longitude", "pickup_zone", "dropoff_zone").show()

+------------------+------------------+-----------+------------+
|   pickup_latitude|  pickup_longitude|pickup_zone|dropoff_zone|
+------------------+------------------+-----------+------------+
| 40.76793670654297|-73.98215484619139|         12|          13|
|40.738563537597656|-73.98041534423827|          8|          11|
|   40.719970703125|-74.01004028320312|         15|           2|
| 40.79320907592773|-73.97305297851561|          6|          19|
| 40.74219512939453|-73.98285675048828|          8|           3|
| 40.75783920288086| -73.9690170288086|         13|          13|
| 40.79777908325195|-73.96927642822266|          6|           9|
|40.738399505615234|-73.99948120117188|          0|          18|
|40.744338989257805|-73.98104858398438|          4|          19|
| 40.76383972167969|-73.98265075683594|         12|          11|
| 40.74943923950195| -73.9915313720703|         16|          10|
|  40.7566795349121|-73.96298217773438|         13|           0|
| 40.76794052124024|-73.9

In [65]:
# Get the number of rows
num_rows = df.count()

# Get the number of columns
num_cols = len(df.columns)

# Print the shape
print(f"Shape: ({num_rows}, {num_cols})")

Shape: (47458, 21)


The shape of the df remains the same except for the addition of the two new rows. The df appears to have identical values after this KMeans prediction.

Before -> Shape: (1458644, 19)

After  -> Shape: (1458644, 21)



Another helpful tool for machine learning is creating a bearing column, to determine which direction the taxi ride was headed. This could help with accuracy of the model.

In [66]:
# Create calculate_bearing function
def calculate_bearing(lat1, lon1, lat2, lon2):
    """
    Calculate the bearing between two geographical points.

    :param lat1: Latitude of the first point (pickup)
    :param lon1: Longitude of the first point (pickup)
    :param lat2: Latitude of the second point (dropoff)
    :param lon2: Longitude of the second point (dropoff)
    :return: Bearing in degrees (0-360)
    """
    # Convert to radians
    lat1_rad = math.radians(lat1)
    lon1_rad = math.radians(lon1)
    lat2_rad = math.radians(lat2)
    lon2_rad = math.radians(lon2)

    # Calculate differences
    dlon = lon2_rad - lon1_rad

    # Calculate bearing
    x = math.sin(dlon) * math.cos(lat2_rad)
    y = math.cos(lat1_rad) * math.sin(lat2_rad) - math.sin(lat1_rad) * math.cos(lat2_rad) * math.cos(dlon)

    # Convert back to degrees and ensure 0-360 range
    bearing = math.degrees(math.atan2(x, y)) % 360

    return bearing

# Create a Spark UDF
bearing_udf = udf(calculate_bearing, DoubleType())

# Usage example
df = df.withColumn('bearing', bearing_udf(
     df['pickup_latitude'],
     df['pickup_longitude'],
     df['dropoff_latitude'],
     df['dropoff_longitude']
 ))

In [67]:
df.show()

+---------+-------------------+-------------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------+----+-----------+-----+-----------+----------+--------------------+--------------------+--------------------+--------------------+-----------+------------+------------------+
|vendor_id|    pickup_datetime|   dropoff_datetime|passenger_count|  pickup_longitude|   pickup_latitude| dropoff_longitude|  dropoff_latitude|store_and_fwd_flag|trip_duration|hour|day_of_week|month|day_of_year|is_weekend|            hour_sin|            hour_cos|     day_of_week_sin|     day_of_week_cos|pickup_zone|dropoff_zone|           bearing|
+---------+-------------------+-------------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------+----+-----------+-----+-----------+----------+--------------------+--------------------+--------------------+-------------

A lot of new columns have been added to optimize the neural network model. However, many remain that are potentially not helpful for the model. These columns can't be meaningfully scaled (datetime or coordinates), or their cyclical nature may confuse the model (hours, days of the week). Now that all the calculations are done, these columns can be dropped.

''



In [68]:
# Drop columns
df = df.drop('pickup_datetime', 'dropoff_datetime', 'pickup_latitude',
             'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude',
             'hour', 'day_of_week')
df.show()


+---------+---------------+------------------+-------------+-----+-----------+----------+--------------------+--------------------+--------------------+--------------------+-----------+------------+------------------+
|vendor_id|passenger_count|store_and_fwd_flag|trip_duration|month|day_of_year|is_weekend|            hour_sin|            hour_cos|     day_of_week_sin|     day_of_week_cos|pickup_zone|dropoff_zone|           bearing|
+---------+---------------+------------------+-------------+-----+-----------+----------+--------------------+--------------------+--------------------+--------------------+-----------+------------+------------------+
|        2|              1|                 N|          455|    3|         74|     false| -0.9659258262890683|-0.25881904510252063|  0.9749279121818236|-0.22252093395631434|         12|          13| 99.97019564715873|
|        1|              1|                 N|          663|    6|        164|     false|                 0.0|                 1

Some of the remaining columns are categorical. In pandas we used `get_dummies` to transform the categorical values into binaries. Convert to pandas to use `get_dummies` and run neural network model. `is_weekend` is a Boolean value which `get_dummies` won't handle.

In [81]:
# Cast the 'is_weekend' column to integer type
df = df.withColumn('is_weekend', df['is_weekend'].cast(IntegerType()))

Export to pandas DataFrame for scaling and machine learning.

`df = df.toPandas()` not functioning. Export and load back in as pandas df. This work around will hopefully allow a neural network to be run.

In [82]:
df.coalesce(1).write.csv("/content/spark_df_single_file.csv", header=True, mode="overwrite")

!mv /content/spark_df_single_file.csv/part-00000-*.csv /content/df_single_file.csv


In [86]:
df = pd.read_csv("/content/df_single_file.csv")

In [87]:
df.head()

Unnamed: 0,vendor_id,passenger_count,store_and_fwd_flag,trip_duration,month,day_of_year,is_weekend,hour_sin,hour_cos,day_of_week_sin,day_of_week_cos,pickup_zone,dropoff_zone,bearing
0,2,1,N,455,3,74,0,-0.965926,-0.258819,0.9749279,-0.222521,12,13,99.970196
1,1,1,N,663,6,164,0,0.0,1.0,0.7818315,0.62349,8,11,242.846232
2,2,1,N,429,4,97,0,-0.965926,0.258819,-0.4338837,-0.900969,15,2,187.2623
3,2,1,N,435,3,86,1,-0.258819,-0.965926,-2.449294e-16,1.0,6,19,179.473585
4,2,6,N,443,1,30,1,-0.5,0.866025,-2.449294e-16,1.0,8,3,315.004404


In [88]:
df.dtypes

Unnamed: 0,0
vendor_id,int64
passenger_count,int64
store_and_fwd_flag,object
trip_duration,int64
month,int64
day_of_year,int64
is_weekend,int64
hour_sin,float64
hour_cos,float64
day_of_week_sin,float64


`get_dummies` is required for categorical data.

In [94]:
# List of categorical columns
categorical_cols = ['vendor_id', 'store_and_fwd_flag', 'pickup_zone', 'dropoff_zone']

# Convert integer columns to 'category' dtype
for col in categorical_cols:
    df[col] = df[col].astype('category')

# Separate the numeric and categorical columns
df_numeric = df.drop(columns=categorical_cols)
df_categorical = df[categorical_cols]

In [95]:
# Get_dummies for categorical data
dummies = pd.get_dummies(df_categorical, drop_first=True)

In [96]:
dummies.head()

Unnamed: 0,vendor_id_2,store_and_fwd_flag_Y,pickup_zone_1,pickup_zone_2,pickup_zone_3,pickup_zone_4,pickup_zone_5,pickup_zone_6,pickup_zone_7,pickup_zone_8,...,dropoff_zone_10,dropoff_zone_11,dropoff_zone_12,dropoff_zone_13,dropoff_zone_14,dropoff_zone_15,dropoff_zone_16,dropoff_zone_17,dropoff_zone_18,dropoff_zone_19
0,True,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,True,...,False,True,False,False,False,False,False,False,False,False
2,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,True,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,True
4,True,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False


In [97]:
# Convert boolean columns to numeric
dummies = dummies.astype(int)

In [129]:
# Re-combine the two df
df = pd.concat([df_numeric, dummies], axis=1)
df.head()

Unnamed: 0,passenger_count,trip_duration,month,day_of_year,is_weekend,hour_sin,hour_cos,day_of_week_sin,day_of_week_cos,bearing,...,dropoff_zone_10,dropoff_zone_11,dropoff_zone_12,dropoff_zone_13,dropoff_zone_14,dropoff_zone_15,dropoff_zone_16,dropoff_zone_17,dropoff_zone_18,dropoff_zone_19
0,1,455,3,74,0,-0.965926,-0.258819,0.9749279,-0.222521,99.970196,...,0,0,0,1,0,0,0,0,0,0
1,1,663,6,164,0,0.0,1.0,0.7818315,0.62349,242.846232,...,0,1,0,0,0,0,0,0,0,0
2,1,429,4,97,0,-0.965926,0.258819,-0.4338837,-0.900969,187.2623,...,0,0,0,0,0,0,0,0,0,0
3,1,435,3,86,1,-0.258819,-0.965926,-2.449294e-16,1.0,179.473585,...,0,0,0,0,0,0,0,0,0,1
4,6,443,1,30,1,-0.5,0.866025,-2.449294e-16,1.0,315.004404,...,0,0,0,0,0,0,0,0,0,0


In [130]:
# Define X and y correctly
y = df['trip_duration'].values  # y should only contain the target column, trip_duration
X = df.drop(columns=['trip_duration']).values  # X contains all features except for trip_duration


In [131]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Check the shapes
print(X_train.shape)  # Expected output: (22500, n_features)
print(y_train.shape)  # Expected output: (22500,)

(35593, 49)
(35593,)


Log transform the target first

y_train_log = np.log1p(y_train)  # log1p instead of log to handle zeros

y_test_log = np.log1p(y_test)

In [132]:
# Preprocess numerical data for neural network

# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [133]:
# Keep StandardScaler
target_scaler = StandardScaler()
y_train_scaled = target_scaler.fit_transform(y_train_log.reshape(-1, 1))
y_test_scaled = target_scaler.transform(y_test_log.reshape(-1, 1))

# Check the shapes after scaling
print(y_train_scaled.shape)  # Should be (22500, 1)
print(y_test_scaled.shape)  # Should be (22500, 1)

(35593, 1)
(11865, 1)


In [173]:
# Define the model architecture (same as before)
nn_model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(X_train_scaled.shape[1],)),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='linear')  # Output layer
])
# Compile the model (same as before)
nn_model.compile(
    loss=tf.keras.losses.Huber(delta=1.0),
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    metrics=['mae', tf.keras.metrics.R2Score()]
)
# Add early stopping
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)
# Train the model with more epochs and validation
history = nn_model.fit(
    X_train_scaled,
    y_train_scaled,
    epochs=50,
    batch_size=32,
    validation_split=0.2,  # Add validation monitoring
    callbacks=[early_stopping]
)

Epoch 1/50



Argument `input_shape` is deprecated. Use `shape` instead.



[1m890/890[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 10ms/step - loss: 0.2902 - mae: 0.6214 - r2_score: 0.2446 - val_loss: 0.2387 - val_mae: 0.5480 - val_r2_score: 0.4076
Epoch 2/50
[1m890/890[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 0.2254 - mae: 0.5330 - r2_score: 0.4479 - val_loss: 0.2252 - val_mae: 0.5298 - val_r2_score: 0.4566
Epoch 3/50
[1m890/890[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - loss: 0.2191 - mae: 0.5218 - r2_score: 0.4474 - val_loss: 0.2255 - val_mae: 0.5309 - val_r2_score: 0.4589
Epoch 4/50
[1m890/890[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - loss: 0.2117 - mae: 0.5120 - r2_score: 0.4649 - val_loss: 0.2254 - val_mae: 0.5325 - val_r2_score: 0.4650
Epoch 5/50
[1m890/890[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 0.2098 - mae: 0.5090 - r2_score: 0.4834 - val_loss: 0.2269 - val_mae: 0.5319 - val_r2_score: 0.4546
Epoch 6/50
[1m890/890[0m [32m━━━━━━━

In [163]:
# Make predictions on the test set (scaled)
y_pred_scaled = nn_model.predict(X_test_scaled)

# Rescale the predictions back to the original range of trip_duration
y_pred_rescaled = target_scaler.inverse_transform(y_pred_scaled)

[1m371/371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step


In [172]:
# Calculate MAE and MSE on the rescaled predictions
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

mae = mean_absolute_error(y_test, y_pred_rescaled)
mse = mean_squared_error(y_test, y_pred_rescaled)
r2 = r2_score(y_test, y_pred_rescaled)

print(f"Test MAE (in original scale): {mae}")
print(f"Test MSE (in original scale): {mse}")
print(f"Test R2 Score: {r2}")


Test MAE (in original scale): 723.2311917431587
Test MSE (in original scale): 720807.5652912519
Test R2 Score: -2.639204740524292
