In [1]:
# pip install pandas numpy scikit-learn

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

# 1. LOAD DATA
# Read the trip records; change the path if the CSV is named differently.
df = pd.read_csv("uber_fare.csv")

# 2. BASIC CLEANING
# Drop every row that has a missing value, then keep only rows whose fare is
# strictly between 0 and 200 and whose passenger count is above 0 and at most 6.
df = (
    df.dropna()
    .loc[lambda d: (d["fare_amount"] > 0) & (d["fare_amount"] < 200)]
    .loc[lambda d: (d["passenger_count"] > 0) & (d["passenger_count"] <= 6)]
)

# Parse the pickup timestamp once and derive the calendar features from it.
# assign() replaces pickup_datetime in place and appends hour, day, month.
pickup_ts = pd.to_datetime(df["pickup_datetime"])
df = df.assign(
    pickup_datetime=pickup_ts,
    hour=pickup_ts.dt.hour,
    day=pickup_ts.dt.day,
    month=pickup_ts.dt.month,
)

# Model inputs: the raw pickup/dropoff coordinates, the passenger count and
# the derived time features. The target is the fare.
feature_columns = [
    "pickup_longitude", "pickup_latitude",
    "dropoff_longitude", "dropoff_latitude",
    "passenger_count", "hour", "day", "month",
]
X = df[feature_columns]
y = df["fare_amount"]

# 3. TRAIN TEST SPLIT
# Hold out 20% of the rows for evaluation; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 4. TRAIN MODELS
# fit() hands back the estimator itself, so each model is created and
# trained in a single expression.
lr = LinearRegression().fit(X_train, y_train)
rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# 5. PREDICT
# Score the held-out rows with both models, linear regression first.
lr_p, rf_p = (model.predict(X_test) for model in (lr, rf))

# 6. EVALUATE FUNCTION
def evaluate(name, y_true, y_pred):
    """Print a short regression report for one set of predictions.

    Writes a blank line followed by ``name``, then one line each for R2,
    RMSE (square root of the mean squared error) and MAE, in that order.

    Args:
        name: Heading printed above the metrics.
        y_true: Observed target values.
        y_pred: Predicted target values to compare against ``y_true``.

    Returns:
        None. The metrics are only printed.
    """
    print(f"\n{name}")
    # Each metric is computed right before its line is printed, so the
    # output appears in the same order the scores are evaluated.
    scorers = (
        ("R2:", r2_score),
        ("RMSE:", lambda truth, pred: np.sqrt(mean_squared_error(truth, pred))),
        ("MAE:", mean_absolute_error),
    )
    for label, scorer in scorers:
        print(label, scorer(y_true, y_pred))

# 7. RESULTS
# Report both models against the same held-out targets, linear model first.
for label, preds in (("Linear Regression", lr_p), ("Random Forest", rf_p)):
    evaluate(label, y_test, preds)




Linear Regression
R2: 0.0015180716145960504
RMSE: 9.753347423157859
MAE: 6.0399183340491645

Random Forest
R2: 0.771058568967941
RMSE: 4.67031083840896
MAE: 2.2260202242722675
