# Cab Fare Prediction
## Data Preprocessing (Final Version)

In [None]:

import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from math import radians


### Load Dataset

In [None]:

# Read the raw training data; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer="train_cab_fare.csv")


### Clean Target Variable

In [None]:

# Force the target to be numeric; values that cannot be parsed become NaN.
df["fare_amount"] = pd.to_numeric(df["fare_amount"], errors="coerce")

# Discard rows left without a usable fare.
df = df.dropna(subset=["fare_amount"])

# Keep only fares strictly greater than 2 and strictly less than 100.
df = df[df["fare_amount"].gt(2) & df["fare_amount"].lt(100)]


### Median Imputation

In [None]:

# Every int64/float64 column except the target is a candidate for imputation.
numeric_columns = (
    df.select_dtypes(include=["int64", "float64"])
    .columns
    .drop("fare_amount")
)

# Replace missing values in those columns with the per-column median.
imputer = SimpleImputer(strategy="median")
df[numeric_columns] = imputer.fit_transform(df[numeric_columns])


### Feature Engineering: Distance Calculation

In [None]:

def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    Implements the haversine formula with an Earth radius of 6371 km.
    Every step is a NumPy operation, so the arguments may be scalars,
    NumPy arrays, or pandas Series (evaluated element-wise).

    Args:
        lat1: Latitude of the first point(s), in degrees.
        lon1: Longitude of the first point(s), in degrees.
        lat2: Latitude of the second point(s), in degrees.
        lon2: Longitude of the second point(s), in degrees.

    Returns:
        The distance(s) in kilometres, with the same container type as the
        inputs (e.g. a Series when Series are passed in).
    """
    # np.radians is vectorised. math.radians only converts a single float and
    # raises TypeError when given a multi-row Series, which is how this
    # function is called on DataFrame columns.
    lat1, lon1, lat2, lon2 = (
        np.radians(value) for value in (lat1, lon1, lat2, lon2)
    )
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Mathematically `a` lies in [0, 1]; floating-point rounding can push it
    # marginally outside, which would make sqrt/arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)
    return 6371 * 2 * np.arcsin(np.sqrt(a))

# Trip length between the pickup and dropoff coordinates, stored as a new
# feature column.
df["distance_km"] = haversine(
    lat1=df["pickup_latitude"],
    lon1=df["pickup_longitude"],
    lat2=df["dropoff_latitude"],
    lon2=df["dropoff_longitude"],
)


### Final Feature Set

In [None]:

# Separate the target from the predictors: y is the fare column, X is every
# remaining column of df.
y = df["fare_amount"]
X = df.drop(columns="fare_amount")

print("Preprocessing Completed Successfully")
