In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')
print("Environment setup complete")

In [None]:
# Phase 1: Load Data
# `load_boston` was removed from scikit-learn in release 1.2. Use it when the
# installed version still ships it; otherwise rebuild the same feature matrix
# and target from the original StatLib text file.
try:
    from sklearn.datasets import load_boston
except ImportError:
    BOSTON_URL = "http://lib.stat.cmu.edu/datasets/boston"
    feature_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                     'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
    raw_df = pd.read_csv(BOSTON_URL, sep=r"\s+", skiprows=22, header=None)
    # Each record is wrapped over two physical lines: 11 values on the first,
    # then 2 more features followed by the target on the second.
    data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
    target = raw_df.values[1::2, 2]
else:
    boston = load_boston()
    data, target, feature_names = boston.data, boston.target, boston.feature_names

X = pd.DataFrame(data, columns=feature_names)
y = pd.Series(target, name='PRICE')

# Fail fast if the source was parsed incorrectly; later cells index columns
# such as 'RM' and 'LSTAT' by name.
assert X.shape[1] == 13 and len(X) == len(y), "unexpected Boston dataset layout"

print(f"Dataset shape: {X.shape}")
print(f"\nFeatures: {list(X.columns)}")
print(f"Target range: ${y.min():.1f}k - ${y.max():.1f}k")
print("\nDataset Info:")
print(X.describe())

In [None]:
# Phase 2: Exploratory Data Analysis
# 2x2 overview: target distribution, two feature-vs-price scatter plots, and
# the features most strongly correlated with the target.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

# Top-left: distribution of the target.
axes[0, 0].hist(y, bins=30, edgecolor='black')
axes[0, 0].set_title('Price Distribution')
axes[0, 0].set_xlabel('Price ($1000s)')

# Top-right: average room count against price.
axes[0, 1].scatter(X['RM'], y, alpha=0.5)
axes[0, 1].set_title('Rooms vs Price')
axes[0, 1].set_xlabel('Avg Rooms')
axes[0, 1].set_ylabel('Price')

# Bottom-left: LSTAT against price.
axes[1, 0].scatter(X['LSTAT'], y, alpha=0.5)
axes[1, 0].set_title('Low Status % vs Price')
axes[1, 0].set_xlabel('LSTAT (%)')
axes[1, 0].set_ylabel('Price')

# Bottom-right: correlate every feature with the target (not with the 'RM'
# column, which would rank RM itself at 1.0 and say nothing about price).
# Rank by absolute value so strong negative relationships are kept too.
corr = X.corrwith(y)
corr = corr.reindex(corr.abs().sort_values(ascending=False).index)
top_corr = corr.iloc[:10]
axes[1, 1].barh(top_corr.index, top_corr.values)
axes[1, 1].invert_yaxis()  # barh draws the first row at the bottom; put the strongest on top
axes[1, 1].set_title('Top 10 Features Correlation')
axes[1, 1].set_xlabel('Correlation with Price')

plt.tight_layout()
plt.show()

In [None]:
# Phase 3: Preprocessing
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply that same fitted
# transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

for label, matrix in (("Training set", X_train_scaled), ("Test set", X_test_scaled)):
    print(f"{label}: {matrix.shape}")

In [None]:
# Phase 4: Model Building - Linear Models
# The linear estimators are fitted on the standardized features.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(alpha=1.0),
    'Lasso': Lasso(alpha=0.1),
}

# `results` maps model name -> {'MSE': ..., 'R2': ...}; the tree-model cell
# appends its own entries to the same dict.
results = {}
for label in models:
    estimator = models[label]
    estimator.fit(X_train_scaled, y_train)
    predictions = estimator.predict(X_test_scaled)
    scores = {
        'MSE': mean_squared_error(y_test, predictions),
        'R2': r2_score(y_test, predictions),
    }
    results[label] = scores
    print(f"{label}: MSE={scores['MSE']:.3f}, R²={scores['R2']:.3f}")

# One row per model, one column per metric.
results_df = pd.DataFrame(results).transpose()
print(f"\n{results_df}")

In [None]:
# Phase 5: Model Building - Tree-Based Models
# Unlike the linear models, these are fitted on the unscaled features.
tree_models = {
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
}

for label, regressor in tree_models.items():
    regressor.fit(X_train, y_train)
    predicted = regressor.predict(X_test)
    test_mse = mean_squared_error(y_test, predicted)
    test_r2 = r2_score(y_test, predicted)
    # Added to the shared `results` dict so the comparison table covers every model.
    results[label] = dict(MSE=test_mse, R2=test_r2)
    print(f"{label}: MSE={test_mse:.3f}, R²={test_r2:.3f}")

# Rebuild the comparison table now that the tree models are included.
results_df = pd.DataFrame(results).transpose()
print(f"\n{results_df}")

In [None]:
# Phase 6: Hyperparameter Tuning
# Exhaustive search over 3 x 3 random-forest settings with 5-fold CV on the
# training split. No `scoring` is passed, so the search uses its default.
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
}
rf_grid = GridSearchCV(
    RandomForestRegressor(random_state=42),
    rf_param_grid,
    cv=5,
    n_jobs=-1,  # use every available core
)
rf_grid.fit(X_train, y_train)

best_params = rf_grid.best_params_
best_cv_score = rf_grid.best_score_
print(f"Best RF params: {best_params}")
print(f"Best CV score: {best_cv_score:.3f}")

# Evaluate the winning configuration on the held-out test split.
best_rf = rf_grid.best_estimator_
y_pred_best = best_rf.predict(X_test)
tuned_test_r2 = r2_score(y_test, y_pred_best)
print(f"Test R²: {tuned_test_r2:.3f}")

In [None]:
# Phase 7: Model Comparison
# Side-by-side bar charts of the two metrics stored in `results_df`.
plt.figure(figsize=(12, 6))

# (column in results_df, y-axis label, panel title)
panels = [
    ('MSE', 'MSE', 'Mean Squared Error Comparison'),
    ('R2', 'R² Score', 'R² Score Comparison'),
]
for position, (column, y_label, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.bar(results_df.index, results_df[column])
    plt.ylabel(y_label)
    plt.title(panel_title)
    plt.xticks(rotation=45)

# The R² panel is still the current axes here; pin its y-range to [0, 1].
plt.ylim([0, 1])

plt.tight_layout()
plt.show()

r2_column = results_df['R2']
print(f"\nBest Model: {r2_column.idxmax()}")
print(f"Best R²: {r2_column.max():.3f}")

In [None]:
# Phase 8: Feature Importance
# Pair each column name with the tuned forest's importance score and order the
# table from most to least important.
importances = best_rf.feature_importances_
feature_table = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
importance_df = feature_table.sort_values(by='Importance', ascending=False)

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(importance_df['Feature'], importance_df['Importance'])
ax.set_xlabel('Importance')
ax.set_title('Random Forest Feature Importance')
fig.tight_layout()
plt.show()

print("Top 5 Features:")
print(importance_df.head())

In [None]:
# Phase 9: Residual Analysis
# Residual = actual - predicted for the tuned random forest on the test split.
y_pred_final = best_rf.predict(X_test)
residuals = y_test - y_pred_final

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
scatter_ax, hist_ax = axes

# Left panel: residuals against predictions, with a zero reference line.
scatter_ax.scatter(y_pred_final, residuals, alpha=0.5)
scatter_ax.axhline(y=0, color='r', linestyle='--')
scatter_ax.set(xlabel='Predicted', ylabel='Residuals', title='Residual Plot')

# Right panel: histogram of the residuals.
hist_ax.hist(residuals, bins=20, edgecolor='black')
hist_ax.set(xlabel='Residuals', title='Residual Distribution')

plt.tight_layout()
plt.show()

In [None]:
# Phase 10: Testing and Validation
# Each entry pairs a display label with the boolean outcome of one sanity check.
test_results = [
    # Test 1: Data loaded correctly
    ("Test 1: Dataset loaded", X.shape[0] == 506 and len(y) == 506),
    # Test 2: Preprocessing done
    ("Test 2: Train-test split",
     X_train_scaled.shape[0] > 0 and X_test_scaled.shape[0] > 0),
    # Test 3: Linear models trained (a fitted linear model exposes `coef_`)
    ("Test 3: Linear models trained",
     all(hasattr(m, 'coef_') for m in models.values())),
    # Test 4: Tree models trained (a fitted ensemble exposes `feature_importances_`)
    ("Test 4: Tree models trained",
     all(hasattr(m, 'feature_importances_') for m in tree_models.values())),
    # Test 5: Best model has good R²
    ("Test 5: Best model R² > 0.7", r2_score(y_test, y_pred_final) > 0.7),
]

banner = "=" * 60
print("\n" + banner)
print("PRACTICAL 14: BOSTON HOUSING - TEST RESULTS")
print(banner)
passed = len([label for label, ok in test_results if ok])
for label, ok in test_results:
    if ok:
        status = "✅ PASS"
    else:
        status = "❌ FAIL"
    print(f"{status} | {label}")
print(f"\nTotal: {passed}/{len(test_results)} tests passed")
print(banner)

In [None]:
# Summary
# Report the dataset dimensions from the data itself rather than hard-coding them.
n_samples, n_features = X.shape

# `y_pred_final` holds the tuned random forest's test predictions. That model
# is not a row in `results_df`, so the report names it separately from the
# best-scoring row of the comparison table.
final_r2 = r2_score(y_test, y_pred_final)
final_rmse = np.sqrt(mean_squared_error(y_test, y_pred_final))
final_mae = mean_absolute_error(y_test, y_pred_final)

print(f"""
PROJECT SUMMARY
===============
Dataset: Boston Housing ({n_samples} samples, {n_features} features)
Task: Regression (Predict house prices)

Models Tested:
{results_df.to_string()}

Best Untuned Model: {results_df['R2'].idxmax()}
Final Model: Tuned Random Forest
Final R² Score: {final_r2:.3f}
Final RMSE: ${final_rmse:.2f}k
Final MAE: ${final_mae:.2f}k

Top 3 Important Features:
{importance_df.head(3).to_string(index=False)}

Status: ✅ COMPLETE
""")