##  Imports

In [None]:
import sys
import os
sys.path.append('../src')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
plt.style.use('default')
sns.set_palette("husl")
%matplotlib inline
print("All libraries imported successfully!")

## Loading the Dataset

In [None]:

# Read the California housing CSV into a DataFrame.
file_path = "../data/california_houses/California_Houses.csv"
data = pd.read_csv(file_path)

# Print a short textual overview; the cell's final expression then renders
# the first rows through the notebook's rich display.
print(
    " DATASET OVERVIEW",
    "=" * 50,
    f"Dataset shape: {data.shape}",
    f"\nColumns: {list(data.columns)}",
    "\nFirst 5 rows:",
    sep="\n",
)
data.head()

##  Visualization

In [None]:

# Side-by-side histograms of the house value and median income columns.
fig, (ax_price, ax_income) = plt.subplots(1, 2, figsize=(12, 5))

# (axes, column, x-axis label, title, extra hist() styling) for each panel.
histogram_specs = [
    (ax_price, 'Median_House_Value', 'Median House Value ($)',
     'Distribution of House Prices', {}),
    (ax_income, 'Median_Income', 'Median Income (Tens of Thousands $)',
     'Distribution of Income', {'color': 'orange'}),
]

for axis, column, x_label, title, extra_style in histogram_specs:
    axis.hist(data[column], bins=50, alpha=0.7, edgecolor='black', **extra_style)
    axis.set_xlabel(x_label)
    axis.set_ylabel('Frequency')
    axis.set_title(title)
    axis.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Manual Regression 

In [None]:
from src.models import ManualLinearRegression

# Section banner: title line followed by a 60-character rule.
print(" MANUAL REGRESSION IMPLEMENTATIONS", "=" * 60, sep="\n")

###  Manual Linear Regression

In [None]:

# --- Train / validation / test split and feature scaling ---------------------
# The arrays used below (X_*_scaled, y_train, y_test) were not created anywhere
# in the notebook, so this cell raised NameError on a fresh kernel. They are
# built here from `data`: the target is Median_House_Value and every other
# column is treated as a feature.
# NOTE(review): the split proportions and seed of the original experiment are
# not recorded in the notebook; 70/15/15 with a fixed seed is an assumption —
# confirm against the intended setup.
TARGET_COLUMN = 'Median_House_Value'
RANDOM_STATE = 42

X_all = data.drop(columns=[TARGET_COLUMN])
y_all = data[TARGET_COLUMN].to_numpy()

X_train, X_holdout, y_train, y_holdout = train_test_split(
    X_all, y_all, test_size=0.30, random_state=RANDOM_STATE
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.50, random_state=RANDOM_STATE
)
assert len(X_train) + len(X_val) + len(X_test) == len(data)

# Fit the scaler on the training rows only, then apply the same transform to
# the validation and test rows so their statistics do not leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# --- Manual linear regression -------------------------------------------------
# Prepend a column of ones to each design matrix.
# NOTE(review): presumably ManualLinearRegression does not add its own
# intercept term and relies on this column — verify against src/models.
X_train_bias = np.c_[np.ones(X_train_scaled.shape[0]), X_train_scaled]
X_val_bias = np.c_[np.ones(X_val_scaled.shape[0]), X_val_scaled]
X_test_bias = np.c_[np.ones(X_test_scaled.shape[0]), X_test_scaled]

# Unregularised fit; method='normal' presumably selects a closed-form
# normal-equation solve — confirm in the class implementation.
manual_linear = ManualLinearRegression(alpha=0, regularization='none')
manual_linear.fit(X_train_bias, y_train, method='normal')

# Score on the held-out test split.
y_pred_manual_linear = manual_linear.predict(X_test_bias)
mse_manual_linear = mean_squared_error(y_test, y_pred_manual_linear)
mae_manual_linear = mean_absolute_error(y_test, y_pred_manual_linear)
r2_manual_linear = r2_score(y_test, y_pred_manual_linear)
print("MANUAL LINEAR REGRESSION RESULTS")
print("-" * 40)
print(f"Mean Squared Error:  {mse_manual_linear:.2f}")
print(f"Mean Absolute Error: ${mae_manual_linear:.2f}")
print(f"R² Score:           {r2_manual_linear:.4f}")
print(f"\nModel explains {r2_manual_linear*100:.1f}% of house price variance")

##  Scikit-Learn Implementation

In [None]:
print(" SCIKIT-LEARN IMPLEMENTATIONS")
print("=" * 60)

# Reference model: scikit-learn's LinearRegression fitted on the scaled
# training features and evaluated on the scaled test features.
sklearn_linear = LinearRegression().fit(X_train_scaled, y_train)
y_pred_sklearn_linear = sklearn_linear.predict(X_test_scaled)

# Evaluate the three metrics in one pass over the metric functions.
mse_sklearn_linear, mae_sklearn_linear, r2_sklearn_linear = (
    metric(y_test, y_pred_sklearn_linear)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(
    " SCIKIT-LEARN LINEAR REGRESSION RESULTS",
    "-" * 40,
    f"Mean Squared Error:  {mse_sklearn_linear:.2f}",
    f"Mean Absolute Error: ${mae_sklearn_linear:.2f}",
    f"R² Score:           {r2_sklearn_linear:.4f}",
    sep="\n",
)

## Comparison: Manual vs Scikit-Learn

In [None]:

# Values to compare, one entry per model (only the linear model here).
models = ['Linear']
manual_mses, sklearn_mses = [mse_manual_linear], [mse_sklearn_linear]
manual_r2, sklearn_r2 = [r2_manual_linear], [r2_sklearn_linear]

# Bar geometry: one group per model, two bars per group.
x = np.arange(len(models))
width = 0.35

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Each panel: (axes, manual values, sklearn values, y-axis label, title).
panels = (
    (ax1, manual_mses, sklearn_mses,
     'Mean Squared Error', 'MSE Comparison: Manual vs Scikit-Learn'),
    (ax2, manual_r2, sklearn_r2,
     'R² Score', 'R² Comparison: Manual vs Scikit-Learn'),
)

for axis, manual_vals, sklearn_vals, y_label, title in panels:
    axis.bar(x - width/2, manual_vals, width, label='Manual', alpha=0.7, color='blue')
    axis.bar(x + width/2, sklearn_vals, width, label='Scikit-Learn', alpha=0.7, color='red')
    axis.set_xlabel('Regression Model')
    axis.set_ylabel(y_label)
    axis.set_title(title)
    axis.set_xticks(x)
    axis.set_xticklabels(models)
    axis.legend()
    axis.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

## Findings

In [None]:
print("🔍 KEY FINDINGS AND INSIGHTS")
print("=" * 50)

print(f"\n1. MODEL PERFORMANCE:")
print(f"   • R² Score: {r2_manual_linear:.4f} ({r2_manual_linear*100:.1f}% variance explained)")
print(f"   • Average prediction error: ${mae_manual_linear:.2f}")

print(f"\n2. IMPLEMENTATION ACCURACY:")
mse_difference = abs(mse_manual_linear - mse_sklearn_linear)

# Judge agreement on the *relative* difference. MSE is in squared target
# units, so a fixed absolute threshold says nothing about whether the two
# implementations agree; the relative difference is scale-free.
mse_scale = max(abs(mse_manual_linear), abs(mse_sklearn_linear))
if not np.isfinite(mse_difference):
    relative_mse_difference = float('inf')  # NaN/inf metrics never count as a match
elif mse_scale == 0:
    relative_mse_difference = 0.0           # both errors are exactly zero
else:
    relative_mse_difference = mse_difference / mse_scale

# Tolerance tiers for the verdict (relative difference, chosen for this report).
if relative_mse_difference < 1e-9:
    agreement = 'PERFECT MATCH'
elif relative_mse_difference < 1e-6:
    agreement = 'EXCELLENT'
elif relative_mse_difference < 1e-3:
    agreement = 'GOOD'
else:
    agreement = 'MISMATCH'
implementations_match = agreement != 'MISMATCH'

print(f"   • MSE difference: {mse_difference:.2f} (relative: {relative_mse_difference:.2e})")
print(f"   • Manual vs Scikit-Learn: {agreement}")
print(f"\n3. DATASET ANALYSIS:")
print(f"   • Samples: {data.shape[0]}")
print(f"   • Features: {data.shape[1] - 1}")  # every column except the target
print(f"   • Price range: ${data['Median_House_Value'].min():.0f} - ${data['Median_House_Value'].max():.0f}")

# Only claim a match when the numbers above actually support it.
if implementations_match:
    print(f"\nManual implementation matches scikit-learn!")
else:
    print(f"\nWARNING: manual implementation does NOT match scikit-learn - investigate before relying on these results.")