In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [None]:
# --- LOAD DATA ---
# Read the ride records from CSV (adjust the path if the file lives elsewhere).
df = pd.read_csv("uber - uber.csv")
# Discard the leftover 'Unnamed: 0' index column when present; a no-op otherwise.
df = df.drop(columns='Unnamed: 0', errors='ignore')

# --- EXPLORATORY DATA ANALYSIS (EDA) ---
# Quick overview: dimensions, column names, per-column null counts, describe() output.
print("Dataset Shape:", df.shape)
print("\nColumns:", list(df.columns))
print("\nMissing Values:\n", df.isna().sum())
print("\nSummary Statistics:\n", df.describe())

In [None]:
# Distribution of target variable
# Histogram (50 bins) of 'fare_amount' with a KDE curve overlaid, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df['fare_amount'], bins=50, kde=True, color='skyblue', ax=ax)
ax.set_title("Distribution of Fare Amounts")
ax.set_xlabel("Fare Amount")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Correlation heatmap
# Pairwise correlations are computed over the float64/int64 columns only.
numeric_columns = df.select_dtypes(include=['float64', 'int64'])
corr_matrix = numeric_columns.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    cmap='coolwarm',
    annot=True,      # write each coefficient inside its cell
    fmt='.2f',
    square=True,
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Feature Correlation Heatmap")
fig.tight_layout()
plt.show()

In [None]:
# --- DATA CLEANING ---
# Drop every row that has at least one missing value, then keep only the
# float64/int64 columns for modelling.
df = df.dropna()
df_num = df.select_dtypes(include=['float64', 'int64'])

# Separate features and target
# Candidate targets are numeric columns whose name contains "fare" or "price".
target = [c for c in df_num.columns if 'fare' in c.lower() or 'price' in c.lower()]
if not target:
    # Fail with an actionable message instead of an opaque IndexError on target[0].
    raise ValueError(
        "No numeric column containing 'fare' or 'price' was found; "
        f"numeric columns available: {df_num.columns.tolist()}"
    )
# If several columns match, the first one in column order is used as the target;
# every other numeric column becomes a feature.
y = df_num[target[0]]
X = df_num.drop(columns=[target[0]])

In [None]:
# --- TRAIN/TEST SPLIT ---
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# --- SCALING ---
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- MODEL WITHOUT PCA ---
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
pred = lr.predict(X_test_scaled)

r2_no_pca = r2_score(y_test, pred)
rmse_no_pca = mean_squared_error(y_test, pred)
mae_no_pca = mean_absolute_error(y_test, pred)

print("\n----- MODEL WITHOUT PCA -----")
print("R²:", r2_no_pca)
print("RMSE:", rmse_no_pca)
print("MAE:", mae_no_pca)

In [None]:
# --- MODEL WITH PCA ---
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

lr_pca = LinearRegression()
lr_pca.fit(X_train_pca, y_train)
pred_pca = lr_pca.predict(X_test_pca)

r2_pca = r2_score(y_test, pred_pca)
rmse_pca = mean_squared_error(y_test, pred_pca)
mae_pca = mean_absolute_error(y_test, pred_pca)

print("\n----- MODEL WITH PCA -----")
print("R²:", r2_pca)
print("RMSE:", rmse_pca)
print("MAE:", mae_pca)
print("No. of PCA Components:", pca.n_components_)


In [None]:
# --- VISUAL COMPARISONS ---

# Explained Variance by PCA
# Entry i of the cumulative sum is the variance explained by the first i+1
# components, so plot against 1..k rather than the 0-based array index;
# otherwise the "Number of Components" axis reads one too low.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
component_counts = np.arange(1, len(cumulative_variance) + 1)

plt.figure(figsize=(8,5))
plt.plot(component_counts, cumulative_variance, marker='o', color='purple')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('PCA Explained Variance')
plt.grid(True)
plt.show()

# Comparison of Metrics
# One row per metric, one column per model; the index is named 'Metric'.
metric_names = pd.Index(['R²', 'RMSE', 'MAE'], name='Metric')
metrics = pd.DataFrame(
    {
        'Without PCA': [r2_no_pca, rmse_no_pca, mae_no_pca],
        'With PCA': [r2_pca, rmse_pca, mae_pca],
    },
    index=metric_names,
)

# Grouped bar chart: the two models side by side for each metric.
ax = metrics.plot(kind='bar', figsize=(7,5), color=['#00BFC4', '#F8766D'])
ax.set_title("Model Performance: With vs Without PCA")
ax.set_ylabel("Metric Value")
ax.grid(True)
plt.show()

# Actual vs Predicted Comparison
# Both models' test-set predictions are overlaid on a single Axes, in the
# same draw order as before (without PCA first, then with PCA).
fig, ax = plt.subplots(figsize=(8, 5))
for series_label, predictions in (('Without PCA', pred), ('With PCA', pred_pca)):
    sns.scatterplot(x=y_test, y=predictions, label=series_label, alpha=0.5, ax=ax)
ax.set_xlabel("Actual Fare")
ax.set_ylabel("Predicted Fare")
ax.set_title("Actual vs Predicted (With & Without PCA)")
ax.legend()
plt.show()