In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from datetime import datetime
from geopy.distance import geodesic

# Step 1: Load the NYC Taxi Dataset


In [9]:
# Read the raw trip records from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='NYC.csv')

In [15]:
# Show the first five rows as this cell's rich output.
data.head(n=5)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [16]:
# Plain-text preview: a heading line followed by the first rows of the frame.
print("Dataset preview:", data.head(), sep="\n")

Dataset preview:
          id  vendor_id      pickup_datetime     dropoff_datetime  \
0  id2875421          2  2016-03-14 17:24:55  2016-03-14 17:32:30   
1  id2377394          1  2016-06-12 00:43:35  2016-06-12 00:54:38   
2  id3858529          2  2016-01-19 11:35:24  2016-01-19 12:10:48   
3  id3504673          2  2016-04-06 19:32:31  2016-04-06 19:39:40   
4  id2181028          2  2016-03-26 13:30:55  2016-03-26 13:38:10   

   passenger_count  pickup_longitude  pickup_latitude  dropoff_longitude  \
0                1        -73.982155        40.767937         -73.964630   
1                1        -73.980415        40.738564         -73.999481   
2                1        -73.979027        40.763939         -74.005333   
3                1        -74.010040        40.719971         -74.012268   
4                1        -73.973053        40.793209         -73.972923   

   dropoff_latitude store_and_fwd_flag  trip_duration  
0         40.765602                  N            455  

In [18]:
# Drop rows with a missing value in any column the pipeline depends on:
# the four coordinates (distance feature), the pickup timestamp (day/hour
# features), the passenger count (used directly as a feature) and the
# target duration. Leaving any of these out would let NaNs reach the model.
REQUIRED_COLUMNS = [
    "pickup_datetime",
    "passenger_count",
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude",
    "trip_duration",
]
data = data.dropna(subset=REQUIRED_COLUMNS)


# Step 2: Preprocess the data — filter out invalid or unrealistic trips


In [19]:
# A coordinate of exactly 0 is treated as invalid: a trip is kept only when
# all four of its pickup/dropoff coordinates are non-zero.
coords_present = (
    data["pickup_longitude"].ne(0)
    & data["pickup_latitude"].ne(0)
    & data["dropoff_longitude"].ne(0)
    & data["dropoff_latitude"].ne(0)
)
# Keep only durations strictly between 0 and 7200 (i.e. trips under 2 hours);
# everything else is discarded as an outlier.
duration_plausible = data["trip_duration"].gt(0) & data["trip_duration"].lt(7200)
data = data[coords_present & duration_plausible]


# Add feature: Trip distance (using geodesic distance between pickup and dropoff points)


In [20]:
def calculate_distance(row):
    """Return the geodesic pickup-to-dropoff distance of one trip, in kilometers.

    `row` must support lookup by the keys "pickup_latitude",
    "pickup_longitude", "dropoff_latitude" and "dropoff_longitude".
    Points are handed to `geodesic` as (latitude, longitude) pairs.
    """
    start_lat, start_lon = row["pickup_latitude"], row["pickup_longitude"]
    end_lat, end_lon = row["dropoff_latitude"], row["dropoff_longitude"]
    trip = geodesic((start_lat, start_lon), (end_lat, end_lon))
    return trip.kilometers

In [21]:
# Row-wise apply over the whole frame would box every column (ids,
# timestamps, flags) into each row object. `calculate_distance` reads only
# the four coordinate columns, so hand it just those: the resulting
# distances (in kilometers) are the same, with far less per-row overhead.
COORDINATE_COLUMNS = [
    "pickup_latitude",
    "pickup_longitude",
    "dropoff_latitude",
    "dropoff_longitude",
]
data["trip_distance"] = data[COORDINATE_COLUMNS].apply(calculate_distance, axis=1)


# Add feature: Day of the week and hour of the day


In [22]:
# Parse the pickup timestamps once, store the parsed column back, and derive
# the two calendar features from it: weekday number and hour of the day.
pickup_ts = pd.to_datetime(data["pickup_datetime"])
data["pickup_datetime"] = pickup_ts
data["day_of_week"] = pickup_ts.dt.dayofweek
data["hour_of_day"] = pickup_ts.dt.hour

# Select relevant features and target


In [23]:
# Model inputs (X) are the four engineered/raw feature columns; the target
# (y) is the trip duration.
feature_columns = ["trip_distance", "day_of_week", "hour_of_day", "passenger_count"]
X, y = data[feature_columns], data["trip_duration"]

# Step 3: Train-Test Split


In [24]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


# Step 4: Train the Random Forest Regressor


In [25]:
# Build the 100 trees on all available CPU cores (n_jobs=-1) instead of one;
# the fixed random_state keeps the fitted forest reproducible.
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# Step 5: Evaluate the Model


In [26]:
# Score the held-out rows: mean absolute error, and root mean squared error
# computed as the square root of the mean squared error.
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))


In [27]:
# Report both error metrics, each rounded to two decimals, one per line.
report_lines = (
    "\nModel Performance:",
    f"Mean Absolute Error (MAE): {mae:.2f} seconds",
    f"Root Mean Squared Error (RMSE): {rmse:.2f} seconds",
)
print(*report_lines, sep="\n")


Model Performance:
Mean Absolute Error (MAE): 257.93 seconds
Root Mean Squared Error (RMSE): 385.68 seconds


In [28]:
# Step 6: Example Prediction
# Predict on a one-row DataFrame rather than a list wrapping a Series: the
# model was fitted on a DataFrame, and a bare list drops the column names,
# which makes scikit-learn warn about invalid feature names.
sample_frame = X_test.iloc[[0]]  # first test row, kept as a DataFrame
sample_data = X_test.iloc[0]  # same row as a Series, used only for display
predicted_duration = model.predict(sample_frame)[0]
print("\nExample Prediction:")
print(f"Input features: {sample_data.to_dict()}")
print(f"Predicted trip duration: {predicted_duration:.2f} seconds")


Example Prediction:
Input features: {'trip_distance': 1.6504515245614915, 'day_of_week': 4.0, 'hour_of_day': 17.0, 'passenger_count': 5.0}
Predicted trip duration: 580.56 seconds


