In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

import seaborn as sns
from math import sqrt
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the dataset from the CSV file and preview the first rows.
csv_path = 'uber.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Show the (rows, columns) dimensions of the loaded frame.
data.shape

# Data Preprocessing

In [None]:
# Drop the 'Unnamed: 0' column. Reassigning (instead of inplace=True) with
# errors='ignore' keeps this cell safe to re-run after the column is gone.
data = data.drop(columns='Unnamed: 0', errors='ignore')

# Per-column count of missing values. Kept as the cell's last expression so the
# notebook actually displays it (previously the result was silently discarded).
data.isnull().sum()

In [None]:
# Discard every row that contains at least one missing value.
data = data.dropna(axis=0, how='any')

In [None]:
# Remove fully duplicated rows (first occurrence is kept), then preview.
data = data.drop_duplicates()
data.head()

In [None]:
# Drop the 'key' and 'pickup_datetime' columns so they are not used as features.
# Reassigning with errors='ignore' (rather than inplace=True) lets this cell be
# re-run without a KeyError once the columns have already been removed.
data = data.drop(columns=['key', 'pickup_datetime'], errors='ignore')

# Separate the target column from the features and hold out 25% for testing.
# NOTE(review): this split is taken before the outlier treatment later in the
# notebook, so rows dropped there are still present in these arrays.
target_column = 'fare_amount'
X = data.drop(columns=[target_column])
y = data[target_column]
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

# Correlation

In [None]:
# Pairwise Pearson correlation between the remaining columns.
data.corr(method='pearson')

# Outlier Detection

In [None]:
# Box plot of the fare column to eyeball extreme values.
fare_column = data['fare_amount']
sns.boxplot(fare_column)

In [None]:
# First and third quartiles of the fare column, using midpoint interpolation.
# Series.quantile is used instead of np.percentile(..., interpolation=...):
# NumPy deprecated that keyword in 1.22 in favour of `method`, and the
# notebook-wide warnings filter was hiding the deprecation warning.
fare_amounts = data['fare_amount']
Q1 = fare_amounts.quantile(0.25, interpolation='midpoint')
Q3 = fare_amounts.quantile(0.75, interpolation='midpoint')
print(Q1, Q3)

# Outlier Treatment

In [None]:
# Discard every row whose fare is at or above the cutoff, then re-plot.
# NOTE(review): 12.5 is hard-coded; presumably it is the Q3 value printed by
# the quartile cell, which would make this drop the upper quartile rather than
# values beyond an IQR fence -- confirm the intended threshold.
fare_cutoff = 12.5
rows_to_drop = data.index[data['fare_amount'].values >= fare_cutoff]
data.drop(rows_to_drop, inplace=True)
sns.boxplot(data['fare_amount'])

In [None]:
# Re-create the train/test split from the current `data` so the models are
# trained on the frame *after* outlier treatment. The split made earlier in the
# notebook was taken before those rows were dropped and still contains them,
# which meant the outlier step had no effect on any model.
X = data.drop(columns=['fare_amount'])
y = data['fare_amount']
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

# Fit ordinary least squares on the training rows and predict the held-out rows.
linear_reg_model = LinearRegression()
linear_reg_model.fit(x_train, y_train)
y_pred_linear = linear_reg_model.predict(x_test)
def evaluate_model(y_true, y_pred, model_name):
    """Print the RMSE and R-squared of a set of predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.
        model_name: Label prefixed to each printed line.

    Returns:
        None. The two metrics are written to stdout, rounded to two decimals.
    """
    mse = mean_squared_error(y_true, y_pred)
    report = [
        f"{model_name} RMSE: {sqrt(mse):.2f}",
        f"{model_name} R-squared (R2): {r2_score(y_true, y_pred):.2f}",
    ]
    for line in report:
        print(line)

# Report the linear model's metrics on the held-out rows.
print("Linear Regression Model:")
evaluate_model(
    y_true=y_test,
    y_pred=y_pred_linear,
    model_name="Linear Regression",
)

In [None]:
# Fit a 100-tree random forest (fixed seed for reproducibility) on the same
# split used for the linear model and predict the held-out rows.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(x_train, y_train)
y_pred_rf = rf_model.predict(x_test)

# `evaluate_model` is defined once, in the Linear Regression cell above; the
# byte-identical copy that used to live in this cell only shadowed it.
print("\nRandom Forest Regression Model:")
evaluate_model(y_test, y_pred_rf, "Random Forest Regression")