In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# Read uber.csv from the working directory and preview the first rows
df = pd.read_csv(filepath_or_buffer='uber.csv')
df.head(n=5)

In [None]:
# Report the per-column null counts, drop every row that contains a null,
# then report the counts again to confirm none remain
missing_before = df.isna().sum()
print(missing_before)
df.dropna(inplace=True)
missing_after = df.isna().sum()
print(missing_after)

In [None]:
# Inspect the frame's dimensions and column types.
# A notebook only echoes the last expression of a cell, so the intermediate
# values are printed explicitly; df.info() writes its own summary.
print(df.shape)
print(df.dtypes)
df.info()

In [None]:
# Convert pickup_datetime to datetime format.
# errors='coerce' turns unparseable values into NaT instead of raising, so
# drop those rows right away to keep the frame free of missing values.
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'], errors='coerce')
df.dropna(subset=['pickup_datetime'], inplace=True)

In [None]:
# Derive day-of-month, hour and weekday columns from the pickup timestamp
pickup_dt = df['pickup_datetime'].dt
df['pickup_day'] = pickup_dt.day
df['pickup_hour'] = pickup_dt.hour
df['pickup_weekday'] = pickup_dt.weekday

In [None]:
# Drop columns that are not required for the rest of the analysis
df.drop(columns=['pickup_datetime', 'key', 'Unnamed: 0'], inplace=True)

In [None]:
# Box plots of every column, drawn before the outliers are removed
fig, ax = plt.subplots(figsize=(15, 6))
sns.boxplot(data=df, ax=ax)
plt.show()

In [None]:
# Remove rows whose coordinates are invalid:
# latitude must lie within [-90, 90] and longitude within [-180, 180].
incorrect_coordinates = (
    (df.pickup_latitude.abs() > 90)
    | (df.dropoff_latitude.abs() > 90)
    | (df.pickup_longitude.abs() > 180)
    | (df.dropoff_longitude.abs() > 180)
)
df = df[~incorrect_coordinates]

In [None]:
def remove_outliers(df, column_name, iqr_multiplier=1.5):
    """Return the rows of ``df`` whose ``column_name`` value is not an IQR outlier.

    A value is treated as an outlier when it lies below
    ``Q1 - iqr_multiplier * IQR`` or above ``Q3 + iqr_multiplier * IQR``,
    where Q1 and Q3 are the 25th and 75th percentiles of the column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    column_name : str
        Name of the numeric column used to compute the bounds.
    iqr_multiplier : float, default 1.5
        Width of the accepted band, in multiples of the IQR.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that fall inside the accepted band. Rows where the
        column is NaN compare False against both bounds and are kept.
    """
    values = df[column_name]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - iqr_multiplier * iqr
    upper_bound = q3 + iqr_multiplier * iqr
    outliers = (values < lower_bound) | (values > upper_bound)
    return df[~outliers]

In [None]:
# Apply the IQR filter to each column in turn; every pass only sees the rows
# that survived the previous passes.
new_df = df.copy()
for col in df.columns:
    new_df = remove_outliers(new_df, col)
df = new_df
# Preview the filtered frame (it must be the cell's last expression to be shown)
df.head()

In [None]:
# Calculate distance
def get_distance(longx, latx, longy, laty):
    """Great-circle (haversine) distance in kilometres between two points.

    Plain Euclidean distance on raw degrees is not a physical distance,
    because the length of one degree of longitude shrinks with latitude.
    The haversine formula accounts for that.

    Parameters
    ----------
    longx, latx : float or array-like
        Longitude and latitude of the first point, in decimal degrees.
    longy, laty : float or array-like
        Longitude and latitude of the second point, in decimal degrees.

    Returns
    -------
    float or array-like
        Distance in kilometres, element-wise for array-like inputs.
    """
    earth_radius_km = 6371.0
    lat1 = np.radians(latx)
    lat2 = np.radians(laty)
    diffLat = lat2 - lat1
    diffLong = np.radians(longy) - np.radians(longx)
    a = np.sin(diffLat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(diffLong / 2) ** 2
    # Clamp at 1 so floating-point round-off cannot push arcsin out of its domain
    return 2 * earth_radius_km * np.arcsin(np.minimum(1.0, np.sqrt(a)))
# Store the pickup-to-dropoff distance of every ride as a new column
df['distance'] = get_distance(
    longx=df['pickup_longitude'],
    latx=df['pickup_latitude'],
    longy=df['dropoff_longitude'],
    laty=df['dropoff_latitude'],
)

In [None]:
# Preview of the cleaned dataset
df.head(n=5)

In [None]:
# Heatmap of the pairwise column correlations, annotated with the coefficients
correlations = df.corr()
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(correlations, annot=True, ax=ax)
plt.show()

In [None]:
# Splitting train and test data
X = df[["pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude", "distance"]]
y = df['fare_amount']
# Fixed seed so the 70/30 split, and every metric computed from it, is
# reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# LR MODEL

# Train on the training split (fit returns the estimator itself)
lr_model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out test rows
y_pred_lr = lr_model.predict(X_test)

In [None]:
# RF MODEL

# Train a seeded forest on the training split (fit returns the estimator itself)
rf_model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Predictions for the held-out test rows
y_pred_rf = rf_model.predict(X_test)

In [None]:
# Evaluating the results

def evaluate_model(y_test, y_pred, model_name):
    """Print the R2 score, RMSE and MAE of ``y_pred`` measured against ``y_test``.

    Parameters
    ----------
    y_test : array-like
        True target values.
    y_pred : array-like
        Predicted target values.
    model_name : str
        Label placed at the start of the printed line.

    Returns
    -------
    None
        The metrics are only printed.
    """
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    # RMSE is the square root of the mean squared error
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    print(f"{model_name} - R2 Score: {r2}, RMSE: {rmse}, MAE: {mae}")

# Report the same metrics for both fitted models, in the original order
for model_label, predictions in (
    ("Linear Regression", y_pred_lr),
    ("Random Forest", y_pred_rf),
):
    evaluate_model(y_test, predictions, model_label)