In [None]:

# Assignment ML2: Uber Fare Prediction (Linear, Ridge, Lasso)
# Educational version: clear comments and printed outputs.
# NOTE: Replace './uber.csv' with your dataset path if different.

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset (user should provide file)
# If the CSV cannot be read, fall back to a small synthetic dataset so that the
# rest of the script can still be run end to end for demonstration purposes.
path = "./uber.csv"
try:
    df = pd.read_csv(path)
    print("Loaded dataset:", path)
except Exception as e:
    # Deliberately broad: this is a best-effort demo fallback and the
    # underlying error is reported to the user rather than swallowed.
    # The message uses `path` (not a hard-coded name) so it stays accurate when
    # the path above is edited; the em dash is written as an escape so it
    # cannot be garbled again by a wrong file encoding.
    print(
        f"Could not load {path!r} \u2014 generating synthetic sample dataset instead. "
        "Replace with your file for real results.\nError:",
        e,
    )
    np.random.seed(42)  # fixed seed so the synthetic sample is reproducible
    n = 1000
    df = pd.DataFrame({
        "pickup_latitude": np.random.uniform(40.5, 41.0, n),
        "pickup_longitude": np.random.uniform(-74.3, -73.7, n),
        "dropoff_latitude": np.random.uniform(40.5, 41.0, n),
        "dropoff_longitude": np.random.uniform(-74.3, -73.7, n),
        "passenger_count": np.random.randint(1, 5, n),
        "distance_km": np.random.exponential(2, n),
    })
    # synthetic fare (target): linear in distance and passenger count,
    # plus Gaussian noise
    df["fare_amount"] = 2.5 + 1.2 * df["distance_km"] + 0.5 * df["passenger_count"] + np.random.normal(0, 2, n)

print("Dataset shape:", df.shape)
display(df.head())

# Basic preprocessing
# Drop rows with missing target: a row without a fare can be neither trained
# on nor scored.
df = df.dropna(subset=["fare_amount"])
# Identify numeric and categorical columns automatically.
# "number" matches every numeric dtype (int32, float32, ...), not just
# int64/float64. Columns that land in neither list are dropped by the
# ColumnTransformer (remainder="drop"), so a too-narrow dtype filter would
# silently discard features.
numeric_cols = [
    c
    for c in df.select_dtypes(include="number").columns
    if c != "fare_amount"  # the target must not be used as a feature
]
categorical_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()

print("\nNumeric cols:", numeric_cols)
print("Categorical cols:", categorical_cols)

# Outlier detection using IQR for target
# A fare is flagged when it lies more than 1.5 * IQR below the first quartile
# or above the third quartile.
fare = df["fare_amount"]
Q1, Q3 = fare.quantile([0.25, 0.75])
IQR = Q3 - Q1
lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
outlier_mask = fare.lt(lower) | fare.gt(upper)
outliers = df.loc[outlier_mask]
print(f"\nDetected {len(outliers)} outliers in fare_amount (using IQR).")
# For demonstration, the flagged rows are only counted, not removed;
# the boxplot shows the distribution they sit in.
plt.figure(figsize=(6, 3))
sns.boxplot(x=fare)
plt.title("Fare Amount Boxplot")
plt.show()

# Correlation (numeric features)
# Skipped entirely when there are no numeric feature columns.
if numeric_cols:
    corr = df[[*numeric_cols, "fare_amount"]].corr()
    # Column of the matrix that relates each feature to the target,
    # strongest positive correlation first.
    target_corr = corr["fare_amount"].sort_values(ascending=False)
    print("\nCorrelation with target (fare_amount):")
    print(target_corr)
    plt.figure(figsize=(6, 5))
    sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm")
    plt.title("Correlation matrix")
    plt.show()

# Define X and y
X = df.drop(columns=["fare_amount"])
y = df["fare_amount"]

# Preprocessing pipeline
# Local import, so this section is self-contained.
from sklearn.impute import SimpleImputer

# Numeric features: fill missing values with the column median, then
# standardize. Only the target has been NaN-filtered so far; StandardScaler
# passes NaNs through and the linear models reject them at fit time, so
# without imputation a single missing feature value would abort training.
num_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])
# Categorical features: one-hot encode; categories unseen during fit are
# encoded as all zeros instead of raising.
cat_transformer = Pipeline(steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))])

# Columns in neither list are dropped (remainder="drop").
preprocessor = ColumnTransformer(transformers=[
    ("num", num_transformer, numeric_cols),
    ("cat", cat_transformer, categorical_cols)
], remainder="drop")

# Split: hold out 20% for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Models to evaluate
# Plain least squares plus its L2- (Ridge) and L1- (Lasso) regularized variants.
models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(alpha=1.0, random_state=42),
    "Lasso": Lasso(alpha=0.1, random_state=42)
}

# Fit each model behind the shared preprocessor and score it on the held-out
# test split. `results` maps model name -> {"r2": ..., "rmse": ...}.
results = {}
for name, model in models.items():
    pipe = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)
    r2 = r2_score(y_test, preds)
    # RMSE computed as sqrt(MSE). The `squared=False` flag of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in
    # 1.6, so this form works on both old and new releases.
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    results[name] = {"r2": r2, "rmse": rmse}
    print(f"\n{name} -> R2: {r2:.4f}, RMSE: {rmse:.4f}")

# Show a summary table (one row per model, one column per metric)
res_df = pd.DataFrame(results).T
display(res_df)

# Cross-validation example on Linear Regression
# The full pipeline (preprocessing + model) is handed to cross_val_score, so
# scoring is done on the complete X / y with 5 folds and R2 as the metric.
from sklearn.model_selection import cross_val_score
cv_steps = [
    ("preprocessor", preprocessor),
    ("model", LinearRegression()),
]
pipe = Pipeline(steps=cv_steps)
cv_scores = cross_val_score(pipe, X, y, scoring="r2", cv=5)
mean_cv_r2 = cv_scores.mean()
print("\nLinearRegression CV R2 scores:", cv_scores)
print("Mean CV R2:", mean_cv_r2)

print("\nDone. Replace synthetic data with your real './uber.csv' and re-run for actual results.")
