# In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from geopy.distance import great_circle

import warnings
warnings.filterwarnings('ignore')

# Read the ride data and discard two columns that are not used afterwards.
df = pd.read_csv('uber.csv').drop(columns=['Unnamed: 0', 'key'])

# NOTE: the bare expressions in this section only render output in an
# interactive session (notebook/REPL); they do not change `df`.
df.head()

# Missing values per column before cleaning.
df.isna().sum()

# Discard every row that has at least one missing value.
df = df.dropna()

# Missing values per column after cleaning.
df.isna().sum()

df.shape

# --- Feature engineering ----------------------------------------------------
# 'pickup_datetime' is sliced by position: characters 5-6 are taken as the
# month and 11-12 as the hour (assumes a 'YYYY-MM-DD HH:MM:SS ...' layout --
# TODO confirm against the CSV).  The slices are cast to int so both columns
# are numeric; left as strings, the correlation, plotting and model-fitting
# steps further down would rely on implicit string-to-number coercion.
df['month'] = df['pickup_datetime'].str[5:7].astype(int)
df['hour'] = df['pickup_datetime'].str[11:13].astype(int)
df.drop(['pickup_datetime'], axis=1, inplace=True)

# Keep only rows whose pickup and dropoff latitudes both lie in [-90, 90]
# (bounds inclusive).  .copy() makes the filtered frame independent, so the
# column assignment below does not write into a slice of the unfiltered data.
valid_latitude = (
    df['pickup_latitude'].between(-90, 90)
    & df['dropoff_latitude'].between(-90, 90)
)
df = df[valid_latitude].copy()

# Great-circle distance of each trip in kilometres, computed row by row from
# the (latitude, longitude) pairs of the pickup and dropoff points.
df['distance'] = df.apply(
    lambda row: great_circle(
        (row['pickup_latitude'], row['pickup_longitude']),
        (row['dropoff_latitude'], row['dropoff_longitude']),
    ).kilometers,
    axis=1,
)

# Box plots of the columns, one subplot each, on a 6x2 grid.
df.plot(kind="box", subplots=True, layout=(6, 2), figsize=(15, 20))

def remove_outliers(df, col, factor=1.5):
    """Clip the values of ``df[col]`` to its interquartile-range fences.

    Despite the name, no rows are removed: values below
    ``Q1 - factor * IQR`` are raised to that bound and values above
    ``Q3 + factor * IQR`` are lowered to it, so the result keeps the length
    and index of the original column.

    Args:
        df: DataFrame that holds the column.
        col: Label of a numeric column in ``df``.
        factor: Multiple of the IQR that sets the fence distance from the
            quartiles. Defaults to 1.5, the value that used to be hard-coded.

    Returns:
        A new Series with the clipped values; ``df`` itself is not modified.
    """
    values = df[col]
    # One quantile call yields both quartiles (in the order requested).
    q1, q3 = values.quantile([0.25, 0.75])
    fence = factor * (q3 - q1)
    return values.clip(lower=q1 - fence, upper=q3 + fence)
# Columns whose extreme values are limited via remove_outliers().
cols = [
    'fare_amount',
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'passenger_count',
    'distance',
]

# Work out the replacement for every listed column, then write each one back
# over the original column.
clipped = {name: remove_outliers(df, name) for name in cols}
for col, limited in clipped.items():
    df[col] = limited

# Box plots again, now on the adjusted data.
df.plot(kind="box", figsize=(15, 20), subplots=True, layout=(6, 2))

# Annotated heatmap of the pairwise column correlations.
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True)

# Target is the fare; every remaining column is a predictor.
y = df['fare_amount']
x = df.drop(columns=['fare_amount'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# the same from run to run.
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, random_state=42, test_size=0.20
)

def _print_regression_metrics(title, y_true, y_pred):
    """Print MAE, MSE, RMSE and R² for one set of predictions.

    Args:
        title: Heading line printed before the metrics.
        y_true: Observed target values.
        y_pred: Predicted target values, in the same order as ``y_true``.
    """
    print(title)
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_true, y_pred))
    # The MSE is computed once and reused for the RMSE line.
    mse = metrics.mean_squared_error(y_true, y_pred)
    print('Mean Squared Error:', mse)
    print('Root Mean Squared Error:', np.sqrt(mse))
    print('R Squared (R²):', metrics.r2_score(y_true, y_pred))


# Model 1: linear regression fitted on the training split and scored on the
# held-out split.
lr = LinearRegression().fit(xtrain, ytrain)
ypredlr = lr.predict(xtest)
_print_regression_metrics('Linear Regression Metrics:', ytest, ypredlr)

# Model 2: random forest with 10 trees and a fixed seed, evaluated the same
# way so the two reports are directly comparable.
rf = RandomForestRegressor(n_estimators=10, random_state=42).fit(xtrain, ytrain)
ypredrf = rf.predict(xtest)
_print_regression_metrics('Random Forest Metrics:', ytest, ypredrf)