In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

In [None]:
# Load the ride data. The CSV may carry a leftover positional index column
# named 'Unnamed: 0'; errors='ignore' makes the drop a no-op when it is absent.
df = pd.read_csv("uber - uber.csv")
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Quick structural overview of the loaded frame: dimensions, column names,
# per-column null counts, then summary statistics from describe().
print("Shape:", df.shape)
print("Columns:", list(df.columns))
print("Missing Values:\n", df.isna().sum())
print(df.describe())

In [None]:
# Basic EDA visualization: histogram of fare_amount with a KDE overlay,
# drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(df['fare_amount'], bins=50, kde=True, color='skyblue', ax=ax)
ax.set(
    title='Distribution of Fare Amounts',
    xlabel='Fare Amount',
    ylabel='Frequency',
)
plt.show()

In [None]:
# Correlation heatmap over the float64/int64 columns only.
corr_matrix = (
    df
    .select_dtypes(include=['float64', 'int64'])
    .corr()
)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    cmap="coolwarm",
    annot=True,
    fmt='.2f',
    square=True,
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Feature Correlation Heatmap")
fig.tight_layout()
plt.show()

In [None]:
# ----- Data Preprocessing -----
# Drop every row with a missing value in any column, then keep only the
# float64/int64 columns as candidate model inputs.
df = df.dropna()
df_num = df.select_dtypes(include=['float64', 'int64'])

# The target is the first numeric column whose name mentions "price" or "fare".
target = [c for c in df_num.columns if 'price' in c.lower() or 'fare' in c.lower()]
if not target:
    # Fail with an actionable message instead of an opaque IndexError on target[0].
    raise ValueError(
        "No numeric column containing 'price' or 'fare' was found; "
        f"numeric columns are: {df_num.columns.tolist()}"
    )

# y is the target column; X is every remaining numeric column.
y = df_num[target[0]]
X = df_num.drop(columns=[target[0]])

In [None]:
# ----- Train/Test Split -----
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# ----- Without PCA -----
# Learn the scaling statistics from the training split only, then apply the
# same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

# Baseline: ordinary least squares on all standardized features.
lr = LinearRegression()
lr.fit(X_train_s, y_train)
pred = lr.predict(X_test_s)

In [None]:
r2_no_pca = r2_score(y_test, pred)
rmse_no_pca = mean_squared_error(y_test, pred)
print("\nWithout PCA -> R2:", r2_no_pca, "RMSE:", rmse_no_pca)

In [None]:
# ----- With PCA -----
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_s)
X_test_pca = pca.transform(X_test_s)
lr_pca = LinearRegression().fit(X_train_pca, y_train)
pred_pca = lr_pca.predict(X_test_pca)

In [None]:
r2_pca = r2_score(y_test, pred_pca)
rmse_pca = mean_squared_error(y_test, pred_pca)
print("With PCA -> R2:", r2_pca, "RMSE:", rmse_pca)
print("No. of PCA Components:", pca.n_components_)

In [None]:
# ----- Visual Comparisons -----
# Explained Variance Plot
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# Plot against 1..k rather than the implicit 0-based index, so each point sits
# at the number of components it actually represents on the labelled x-axis.
component_counts = np.arange(1, len(cumulative_variance) + 1)

plt.figure(figsize=(8,5))
plt.plot(component_counts, cumulative_variance, marker='o')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('PCA Explained Variance')
plt.grid(True)
plt.show()

# R2 & RMSE Comparison
# One row per metric, one column per model, so the bars are grouped by metric.
metrics = pd.DataFrame({
    'Metric': ['R2 Score', 'RMSE'],
    'Without PCA': [r2_no_pca, rmse_no_pca],
    'With PCA': [r2_pca, rmse_pca]
})

# Create the axes once and hand them to pandas. Calling plt.figure() first and
# then DataFrame.plot(figsize=...) makes pandas open a second figure, leaving
# the first one blank in the output.
fig, ax = plt.subplots(figsize=(6, 4))
metrics.set_index('Metric').plot(kind='bar', ax=ax, color=['steelblue','orange'])
ax.set_title('Model Performance: With vs Without PCA')
ax.set_ylabel('Value')
ax.grid(True)
plt.show()

# Actual vs Predicted: overlay both models' test-set predictions against the
# true values on a single axes.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(x=y_test, y=pred, label='Without PCA', alpha=0.5, ax=ax)
sns.scatterplot(x=y_test, y=pred_pca, label='With PCA', alpha=0.5, ax=ax)
ax.set(
    xlabel="Actual Fare",
    ylabel="Predicted Fare",
    title="Actual vs Predicted Comparison",
)
ax.legend()
plt.show()