# In [None]:
# pip install numpy pandas scikit-learn matplotlib seaborn xgboost joblib kaggle
# kaggle datasets download -d yasserh/uber-fares-dataset
# unzip uber-fares-dataset.zip -d uber_fares
# You'll get a CSV file (adjust path/name if different)

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# ---------------------------
# 1. Load Dataset
# ---------------------------
# Read the ride records; change the path if the CSV lives elsewhere.
df = pd.read_csv("uber.csv")
print("Initial shape:", df.shape)
print(df.head())

# ---------------------------
# 2. Basic Preprocessing
# ---------------------------
# Parse pickup_datetime (unparseable values are coerced to NaT), then discard
# every row that has a missing value in any column.
df = df.assign(
    pickup_datetime=pd.to_datetime(df['pickup_datetime'], errors='coerce')
).dropna()

# Keep only the rides whose fare is strictly positive.
df = df[df['fare_amount'].gt(0)]

# Compute distance (Haversine formula)
def haversine(lat1, lon1, lat2, lon2, *, radius=6371):
    """Return the great-circle (haversine) distance between two points.

    All four coordinate arguments are decimal degrees and are processed
    element-wise, so scalars, NumPy arrays and pandas Series are accepted.

    Parameters
    ----------
    lat1, lon1 : latitude / longitude of the first point, in degrees.
    lat2, lon2 : latitude / longitude of the second point, in degrees.
    radius : radius of the sphere. Defaults to 6371 (kilometres, the
        approximate mean radius of the Earth); the result is expressed in
        the same unit as this value.

    Returns
    -------
    The distance along the sphere's surface, same shape as the inputs.
    """
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Mathematically 0 <= a <= 1, but floating-point rounding can push it
    # marginally outside that interval (e.g. near-antipodal points), which
    # would make sqrt/arcsin return NaN. Clamp before inverting.
    a = np.clip(a, 0.0, 1.0)
    return radius * (2 * np.arcsin(np.sqrt(a)))

# Great-circle length of every trip, pickup -> dropoff, stored as distance_km.
df['distance_km'] = haversine(
    *(df[column] for column in ('pickup_latitude', 'pickup_longitude',
                                'dropoff_latitude', 'dropoff_longitude'))
)

# Remove unrealistic distances: keep trips strictly between 0 and 100.
df = df[df['distance_km'].gt(0) & df['distance_km'].lt(100)]

# ---------------------------
# 3. Identify Outliers
# ---------------------------
# IQR fences on fare_amount: values further than 1.5 * IQR below the first
# quartile or above the third quartile are treated as outliers.
Q1, Q3 = (df['fare_amount'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

print("Fare range (IQR filter):", lower, "to", upper)

# Keep only the fares inside the fences (both bounds inclusive).
df = df[df['fare_amount'].between(lower, upper)]
print("After removing outliers:", df.shape)

# ---------------------------
# 4. Check Correlation
# ---------------------------
# Pairwise correlation between the fare and the two candidate predictors,
# drawn as an annotated heatmap.
corr = df.loc[:, ['fare_amount', 'distance_km', 'passenger_count']].corr()
sns.heatmap(corr, annot=True, cmap='coolwarm').set_title("Correlation Matrix")
plt.show()

# ---------------------------
# 5. Train Models
# ---------------------------
# Predictors: trip distance and passenger count. Target: the fare.
X = df.loc[:, ['distance_km', 'passenger_count']]
y = df.loc[:, 'fare_amount']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Linear Regression (fit() hands back the fitted estimator, so it can be chained)
lr = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Random Forest Regression: 100 trees, seeded for repeatability
rf = RandomForestRegressor(random_state=42, n_estimators=100).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# ---------------------------
# 6. Evaluation Metrics
# ---------------------------
def evaluate(y_true, y_pred, model_name):
    """Print R-squared, RMSE and MAE for one model's predictions.

    Parameters
    ----------
    y_true : observed target values.
    y_pred : predicted values, aligned element-wise with ``y_true``.
    model_name : label shown in the report header.

    Returns
    -------
    None. The three metrics are written to standard output with four
    decimal places.
    """
    r2 = r2_score(y_true, y_pred)
    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    print(f"\n{model_name} Results:")
    # "\u00b2" is the superscript-two character; spelling it as an escape keeps
    # this source ASCII-only so the label cannot be mangled by a wrong file
    # encoding (it previously printed as a garbled two-character sequence).
    print(f"R\u00b2 Score: {r2:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")

# Print the test-set metrics of both models, linear regression first.
for label, predictions in (
    ("Linear Regression", y_pred_lr),
    ("Random Forest Regression", y_pred_rf),
):
    evaluate(y_test, predictions, label)

# Optional: compare visually (actual fare on x, predicted fare on y)
plt.figure(figsize=(6, 4))
for predictions, colour, label in (
    (y_pred_rf, 'blue', 'Random Forest'),
    (y_pred_lr, 'red', 'Linear Regression'),
):
    plt.scatter(y_test, predictions, color=colour, alpha=0.4, label=label)
plt.title("Actual vs Predicted Fares")
plt.xlabel("Actual Fare")
plt.ylabel("Predicted Fare")
plt.legend()
plt.show()


# :