In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Read the trip records from 'uber.csv' (path is relative to the notebook's
# working directory) into the frame every later cell works on.
ds = pd.read_csv(filepath_or_buffer='uber.csv')
# Bare trailing expression: the notebook renders the frame as the cell output.
ds

In [None]:
# Discard the two columns the analysis does not use, then every row that has a
# missing value in any remaining column.
#   * 'Unnamed: 0'      - presumably a leftover saved index column (confirm against the CSV)
#   * 'pickup_datetime' - dropped here; the 'key' column is the one converted to
#                         an hour-of-day feature in a later cell
# errors='ignore' keeps this cell idempotent: re-running it after the columns
# are already gone is a no-op instead of a KeyError. The chain is non-mutating
# (no inplace=True), so `ds` is only ever rebound, never modified in place.
ds = (
    ds
    .drop(columns=['Unnamed: 0', 'pickup_datetime'], errors='ignore')
    .dropna()
)
ds

In [None]:
# Turn the timestamp stored in 'key' into an hour-of-day feature (0-23, stored
# as float32) and expose it under the name 'time'.
#
# The guard makes the cell safe to re-run: once 'key' has been renamed there is
# nothing left to convert, and parsing the already-numeric 'time' column as a
# datetime would silently corrupt it.
if "key" in ds.columns:
    # .dt.hour yields the same value as strftime("%H") without the
    # datetime -> string -> float round trip.
    pickup_hour = pd.to_datetime(ds["key"]).dt.hour.astype("float32")
    # rename() returns a new frame; assigning to the existing 'time' label
    # keeps the column in its original position.
    ds = ds.rename(columns={"key": "time"}).assign(time=pickup_hour)
ds
    

In [None]:
import math

def euclidean_distance(lat1, lon1, lat2, lon2):
    """Return the straight-line distance between two (lat, lon) points.

    The result is the Euclidean norm of the coordinate differences, so it is
    expressed in the same units as the inputs (coordinate space), not as a
    distance along the Earth's surface.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Coordinates of the first point(s).
    lat2, lon2 : float or array-like
        Coordinates of the second point(s).

    Returns
    -------
    float or array-like
        ``sqrt((lat2 - lat1)**2 + (lon2 - lon1)**2)``. Scalars in give a
        NumPy float (a ``float`` subclass); arrays / Series in give an
        element-wise result of the same shape.
    """
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    # np.hypot works element-wise on arrays and Series as well as on scalars,
    # and avoids overflow/underflow in the intermediate squares.
    return np.hypot(dlat, dlon)

# Add a 'distance' column: pickup -> dropoff distance for every trip, computed
# by euclidean_distance.
#
# Iterating over the four zipped coordinate columns gives the same values, in
# the same row order, as DataFrame.apply(axis=1), but skips building a
# temporary Series for every row, which dominates the runtime on large frames.
ds['distance'] = [
    euclidean_distance(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon)
    for pickup_lat, pickup_lon, dropoff_lat, dropoff_lon in zip(
        ds['pickup_latitude'],
        ds['pickup_longitude'],
        ds['dropoff_latitude'],
        ds['dropoff_longitude'],
    )
]
     

In [None]:
# Plot fare against passenger count BEFORE filtering so that any implausible
# passenger counts are visible in the figure.
# The frame is passed as the `data=` keyword rather than positionally, so the
# call does not depend on the position of `data` in seaborn's signature.
sns.scatterplot(data=ds, x="passenger_count", y="fare_amount")

# Keep only trips with fewer than 100 passengers.
# NOTE(review): the 100 cut-off is presumably chosen from the plot above -
# confirm it against the data.
# .copy() detaches the result from the original frame so later column
# assignments on `ds` do not trigger SettingWithCopyWarning.
ds = ds[ds["passenger_count"] < 100].copy()

In [None]:
def remove_outliers(feature, factor=1.5):
    """Drop rows of the global ``ds`` whose ``feature`` value is an IQR outlier.

    A row is kept when its value lies inside the closed interval
    ``[Q1 - factor * IQR, Q3 + factor * IQR]``, where Q1 / Q3 are the 25th /
    75th percentiles of the column and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    feature : str
        Name of the column of ``ds`` to filter on.
    factor : float, default 1.5
        Multiplier applied to the IQR to set the fence width.

    Returns
    -------
    None
        The module-level ``ds`` is rebound to the filtered frame; the previous
        frame object is not modified in place.
    """
    global ds
    q1, q3 = np.percentile(ds[feature], [25, 75])
    iqr = q3 - q1
    # Series.between is inclusive on both ends by default, matching >= / <=.
    ds = ds[ds[feature].between(q1 - factor * iqr, q3 + factor * iqr)]

# Filter the four coordinate columns one after another. The order matters:
# each call recomputes its quartiles on the rows that survived the previous one.
for coordinate_column in (
    "pickup_latitude",
    "pickup_longitude",
    "dropoff_latitude",
    "dropoff_longitude",
):
    remove_outliers(coordinate_column)

In [None]:
# Pairwise Pearson correlation between all columns of the cleaned frame.
corr = ds.corr(method="pearson")
# Render the matrix as a heatmap, writing each coefficient into its cell.
sns.heatmap(data=corr, annot=True)
     

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Target is the fare; every other column of `ds` is used as a feature.
y = ds['fare_amount']
X = ds.drop(columns=['fare_amount'])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the same
# transform to both splits so no test-set information leaks into the scaler.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the scaled training features.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train_scaled, y_train)

# Predictions for the held-out rows; the metrics cell below reads `y_pred`.
y_pred = model.predict(X_test_scaled)
y_pred

In [None]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Score the linear-regression predictions against the held-out targets.
# `y_pred` is whatever the most recently executed model cell produced, so this
# cell must run directly after the linear-regression cell.
lr_RMSE = np.sqrt(mean_squared_error(y_test, y_pred))  # root mean squared error
lr_R2 = r2_score(y_test, y_pred)                       # coefficient of determination
lr_mae = mean_absolute_error(y_test, y_pred)           # mean absolute error

print("RMSE -> ", lr_RMSE)
print("R2 -> ", lr_R2)
print("MAE -> ", lr_mae)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a random forest on the same scaled training features.
# random_state pins the bootstrap sampling and feature selection so the reported
# metrics are reproducible under Restart & Run All; 42 matches the seed used
# for the train/test split.
model = RandomForestRegressor(random_state=42)
model.fit(X_train_scaled, y_train)

# Overwrites the linear-regression `y_pred`; the metrics cell that follows
# therefore scores the random forest.
y_pred = model.predict(X_test_scaled)
y_pred
     

In [None]:
# Score the random-forest predictions against the held-out targets.
# `y_pred` is whatever the most recently executed model cell produced, so this
# cell must run directly after the random-forest cell.
rf_RMSE = np.sqrt(mean_squared_error(y_test, y_pred))  # root mean squared error
rf_R2 = r2_score(y_test, y_pred)                       # coefficient of determination
rf_mae = mean_absolute_error(y_test, y_pred)           # mean absolute error

print("RMSE -> ", rf_RMSE)
print("R2 -> ", rf_R2)
print("MAE -> ", rf_mae)