# Predicting House Prices
**Dataset:** California Housing Dataset
**Task:** Regression (predict `median_house_value`)

## Step 1: Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

## Step 2: Load and Inspect Data

In [None]:
# Download the housing CSV from the handson-ml GitHub repository into a DataFrame.
url = 'https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.csv'
df = pd.read_csv(url)
print('Shape:', df.shape)
# info() writes its own summary (column dtypes, non-null counts) to stdout.
df.info()
# A notebook cell only auto-renders its *last* expression, so a bare
# `df.head()` in the middle of the cell shows nothing; print it explicitly.
print(df.head())
# Kept as the final expression so the notebook renders the summary table.
df.describe()

## Step 3: Data Cleaning & Train/Test Split

In [None]:
# Remove exact duplicate rows in place before separating features and target.
df.drop_duplicates(inplace=True)

# Target is the median house value; every other column is a feature.
y = df['median_house_value']
X = df.drop(columns='median_house_value')

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## Step 4: Preprocessing Pipeline

In [None]:
# Column groups: one categorical column, plus every numeric column of X.
cat_attribs = ['ocean_proximity']
num_attribs = list(X.select_dtypes(include=[np.number]).columns)

# Numeric branch: fill gaps with the column median, then standardise.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch.
full_pipeline = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_attribs),
    ('cat', cat_pipeline, cat_attribs),
])

# Fit on the training split only, then apply the fitted transform to the test split.
X_train_prepared = full_pipeline.fit_transform(X_train)
X_test_prepared = full_pipeline.transform(X_test)

# Persist the fitted preprocessor to disk.
joblib.dump(full_pipeline, 'preprocessing_pipeline.pkl')

## Step 5: Exploratory Data Analysis (EDA) & Visualizations

In [None]:
# Pearson correlation is only defined for numeric data. Restrict the frame to
# numeric columns first: since pandas 2.0, DataFrame.corr() defaults to
# numeric_only=False and raises on non-numeric columns such as
# 'ocean_proximity' (the column this notebook one-hot encodes).
corr_matrix = df.select_dtypes(include=[np.number]).corr()

plt.figure(figsize=(10,8))
# annot=True writes each coefficient into its cell.
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Histogram (50 bins) of the target with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(8,6))
sns.histplot(df['median_house_value'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Median House Value')
ax.set_xlabel('Median House Value')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Scatter of median income against the target; semi-transparent markers
# keep dense regions readable.
fig, ax = plt.subplots(figsize=(8,6))
sns.scatterplot(x='median_income', y='median_house_value', data=df, alpha=0.5, ax=ax)
ax.set_title('Median Income vs. House Value')
ax.set_xlabel('Median Income')
ax.set_ylabel('Median House Value')
plt.show()

In [None]:
# Horizontal boxplot summarising the spread of housing_median_age.
fig, ax = plt.subplots(figsize=(8,6))
sns.boxplot(x=df['housing_median_age'], ax=ax)
ax.set_title('Boxplot of Housing Median Age')
ax.set_xlabel('Housing Median Age')
plt.show()

In [None]:
# Pairwise scatter/histogram grid over three features plus the target.
pair_cols = [
    'median_income',
    'total_rooms',
    'housing_median_age',
    'median_house_value',
]
sns.pairplot(df[pair_cols])
# y=1.02 lifts the title just above the top row of the grid.
plt.suptitle('Pairplot of Key Features', y=1.02)
plt.show()

In [None]:
# 1) Matplotlib: histogram of total_rooms (50 bins)
fig, ax = plt.subplots(figsize=(8,6))
ax.hist(df['total_rooms'], bins=50)
ax.set_title('Distribution of Total Rooms')
ax.set_xlabel('Total Rooms')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# 2) Matplotlib: bar chart of the mean target value per ocean_proximity category
value_by_proximity = df.groupby('ocean_proximity')['median_house_value']
avg_values = value_by_proximity.mean()

fig, ax = plt.subplots(figsize=(10,6))
ax.bar(avg_values.index, avg_values.values)
ax.set_title('Average House Value by Ocean Proximity')
ax.set_xlabel('Ocean Proximity')
ax.set_ylabel('Average Median House Value')
# Tilt the category labels.
plt.xticks(rotation=45)
plt.show()

In [None]:
# 3) Seaborn: violin plot of median_income within each ocean_proximity category
fig, ax = plt.subplots(figsize=(10,6))
sns.violinplot(x='ocean_proximity', y='median_income', data=df, ax=ax)
ax.set_title('Median Income Distribution by Ocean Proximity')
ax.set_xlabel('Ocean Proximity')
ax.set_ylabel('Median Income')
# Tilt the category labels.
plt.xticks(rotation=45)
plt.show()

In [None]:
# 4) Seaborn: boxplot of the target within each ocean_proximity category
fig, ax = plt.subplots(figsize=(10,6))
sns.boxplot(x='ocean_proximity', y='median_house_value', data=df, ax=ax)
ax.set_title('House Value Distribution by Ocean Proximity')
ax.set_xlabel('Ocean Proximity')
ax.set_ylabel('Median House Value')
# Tilt the category labels.
plt.xticks(rotation=45)
plt.show()

In [None]:
# 5) Matplotlib: line plot of the mean target value per income decile
# labels=False makes qcut return integer bin codes (0-9) rather than intervals.
# NOTE: this adds an 'income_decile' column to df itself.
df['income_decile'] = pd.qcut(df['median_income'], q=10, labels=False)
decile_avg = df.groupby('income_decile')['median_house_value'].mean()

fig, ax = plt.subplots(figsize=(8,6))
ax.plot(decile_avg.index, decile_avg.values, marker='o')
ax.set_title('Average House Value Across Income Deciles')
ax.set_xlabel('Income Decile')
ax.set_ylabel('Average Median House Value')
plt.show()

## Step 6: Model Training & Evaluation

In [None]:
# Fit an ordinary least-squares baseline on the prepared training matrix
# (fit() returns the estimator itself) and persist it to disk.
model = LinearRegression().fit(X_train_prepared, y_train)
joblib.dump(model, 'house_price_model.pkl')

# Score the held-out split.
y_pred = model.predict(X_test_prepared)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # same units as the target, unlike MSE
r2 = r2_score(y_test, y_pred)

# Metrics Summary
scores = {'MAE': mae, 'MSE': mse, 'RMSE': rmse, 'R²': r2}
metrics_df = pd.DataFrame({
    'Metric': list(scores.keys()),
    'Value': list(scores.values()),
})
print('Model Performance Summary:')
print(metrics_df)

In [None]:
# Actual vs. predicted values on the test split
fig, ax = plt.subplots(figsize=(8,6))
ax.scatter(y_test, y_pred, alpha=0.5)

# Dashed red y = x reference line spanning the range of actual values;
# a perfect model would put every point on it.
lo, hi = y_test.min(), y_test.max()
ax.plot([lo, hi], [lo, hi], 'r--', lw=2)

ax.set_title('Actual vs. Predicted House Values')
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
plt.show()

In [None]:
# Residuals vs. predicted values on the test split
# Residual = actual - predicted, so positive means the model under-predicted.
residuals = y_test - y_pred

fig, ax = plt.subplots(figsize=(8,6))
sns.scatterplot(x=y_pred, y=residuals, alpha=0.5, ax=ax)
# Zero-error reference line.
ax.axhline(0, color='red', linestyle='--')
ax.set_title('Residuals vs. Predicted')
ax.set_xlabel('Predicted Values')
ax.set_ylabel('Residuals')
plt.show()

In [None]:
# Cross-Validation R² Scores
# Cross-validate preprocessing and regressor together on the *raw* training
# frame. X_train_prepared was imputed/scaled with statistics computed over the
# whole training set, so scoring on it lets every validation fold influence
# the transform applied to its own training folds. Wrapping both steps in a
# Pipeline makes cross_val_score re-fit the imputers, scaler and encoder
# inside each fold. cross_val_score clones the estimator it is given, so the
# already-fitted full_pipeline object is left untouched.
cv_pipeline = Pipeline([
    ('preprocess', full_pipeline),
    ('regressor', LinearRegression()),
])
cv_scores = cross_val_score(cv_pipeline, X_train, y_train, scoring='r2', cv=5)
print(f"Cross-Validation R²: {cv_scores.mean():.3f} ± {cv_scores.std():.3f}")

## Conclusion
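- Baseline linear model achieved moderate performance (R² ≈ 0.48; confirm this figure against the Step 6 metrics summary and cross-validation output whenever the notebook is re-run).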
- Future work: advanced models, feature engineering, hyperparameter tuning.