# AAM-IPL Week-3 — Boston Housing Price Prediction (Corrected)
**Converted notebook — cleaned and made robust**  
- Author: K V Amarnath Reddy  
- Roll No.: 229X1A33B2  
- Branch: CSM — Emerging Technologies in Computer Science  
- Date converted: 2025-11-13  

**What changed / improvements:**  
- Robust target detection (`MEDV`, `medv`, `medians`, `target`, `PRICE`, `price`), falling back to the last column when none of these is present.  
- Uses scikit-learn's `mean_absolute_percentage_error` when available; otherwise computes MAPE manually, skipping zero-valued targets to avoid division by zero.  
- Uses matplotlib for all plots (no seaborn).  
- Provides a small synthetic fallback dataset if the CSV is missing so the notebook cells can be executed for testing.
- Clear save/load checks and friendly messages.


In [None]:
# Requirements (optional)
# This notebook imports numpy, pandas, matplotlib, scikit-learn and joblib.
# If you run into missing packages, install them in your environment, for example:
# pip install numpy scikit-learn matplotlib pandas joblib

# Informational only: this cell installs nothing and has no other side effects.
print('This cell lists recommended packages. Install them only if missing.')

In [None]:

# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
import os
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
# Probe for the optional MAPE metric. Only a failed import means "not available",
# so catch ImportError (which also covers ModuleNotFoundError) instead of every
# Exception; an unrelated error raised during import should stay visible.
try:
    # newer sklearn versions include this
    from sklearn.metrics import mean_absolute_percentage_error
except ImportError:
    _mape_available = False
else:
    _mape_available = True

print('Libraries loaded. mean_absolute_percentage_error available:', _mape_available)


In [None]:

# Load dataset: use the first well-known filename that exists in the working
# directory; if none is present, build a small synthetic stand-in instead.
possible_filenames = [
    "BostonHousing - BostonHousing.csv",
    "BostonHousing.csv",
    "boston_housing.csv",
    "BostonHousing - BostonHousing Data.csv",
    "Boston.csv"
]

# First existing candidate wins; None means nothing was found.
csv_path = next((name for name in possible_filenames if os.path.exists(name)), None)

if csv_path is not None:
    print('Loading dataset from:', csv_path)
    df = pd.read_csv(csv_path)
    print('Dataset shape:', df.shape)
else:
    print('No CSV found in the notebook directory.')
    print('If you want to use the original dataset, place one of these filenames in the same folder as the notebook:')
    print(*possible_filenames, sep='\n')
    print('\nProceeding with a small synthetic sample dataset so you can run the notebook immediately.')
    # Synthetic data shaped like Boston: 13 features plus a linear target.
    # The order of the random draws (features, coefficients, noise) is kept
    # fixed so the seeded output is reproducible.
    np.random.seed(0)
    X_synth = np.random.randn(50, 13)
    coef = np.random.randn(13)
    noise = np.random.randn(50)
    y_synth = np.dot(X_synth, coef) + noise * 0.5 + 22.0
    columns = [f'feature_{k}' for k in range(1, X_synth.shape[1] + 1)]
    columns.append('MEDV')
    df = pd.DataFrame(np.column_stack((X_synth, y_synth)), columns=columns)
    print('Synthetic sample created with shape', df.shape)

# show top rows
df.head()


In [None]:

# Detect target column
def detect_target_column(df):
    """Return the name of the column to use as the regression target.

    Lookup order:
      1. An exact match against the known target names, in priority order.
      2. A tolerant match that ignores letter case and surrounding
         whitespace, so headers such as ``'Medv'`` or ``' MEDV '`` are
         recognised instead of silently falling through.
      3. The last column of the frame.

    Parameters:
        df: DataFrame whose columns are inspected.

    Returns:
        The matching column label exactly as it appears in ``df.columns``.

    Raises:
        IndexError: if ``df`` has no columns at all (nothing to fall back to).
    """
    candidates = ['MEDV', 'medv', 'medians', 'target', 'PRICE', 'price']

    # Pass 1: exact match keeps the original priority semantics.
    for c in candidates:
        if c in df.columns:
            return c

    # Pass 2: compare normalised labels. setdefault keeps the left-most
    # column when several labels normalise to the same key.
    normalised = {}
    for col in df.columns:
        normalised.setdefault(str(col).strip().lower(), col)
    for c in candidates:
        key = c.lower()
        if key in normalised:
            return normalised[key]

    # fallback to last column
    return df.columns[-1]

# Resolve the target once; later cells read `target_col` for preprocessing
# and for the y-axis label of the prediction plot.
target_col = detect_target_column(df)
print('Detected target column:', target_col)


In [None]:

# Watermark helper (optional). Place 'AAM-IPL-Header-6.png' in the notebook folder to use.
def add_watermark(ax, logo_path='AAM-IPL-Header-6.png', alpha=0.25, zoom=0.6):
    """Draw a semi-transparent logo in the centre of ``ax`` when the logo file exists.

    This is a best-effort decoration: if ``logo_path`` does not exist nothing
    is drawn, and any error raised while reading or placing the image is
    printed rather than propagated.

    Parameters:
        ax: matplotlib Axes that receives the watermark.
        logo_path: path of the image file to use as the watermark.
        alpha: opacity applied to the logo image (0 = invisible, 1 = opaque).
        zoom: scaling factor passed to ``OffsetImage``.

    Returns:
        None.
    """
    try:
        if os.path.exists(logo_path):
            logo = mpimg.imread(logo_path)
            # Forward alpha to the image artist. It used to be accepted but
            # ignored, so the logo was drawn fully opaque over the plot.
            imagebox = OffsetImage(logo, zoom=zoom, alpha=alpha)
            ab = AnnotationBbox(imagebox, (0.5, 0.5), frameon=False, xycoords='axes fraction', box_alignment=(0.5,0.5), pad=0)
            ax.add_artist(ab)
    except Exception as e:
        print('Watermark error:', e)


In [None]:

# Basic EDA: summary statistics and one histogram per numeric column
print('Dataframe shape:', df.shape)
print('\nSummary statistics:')
display(df.describe())

# Histograms for all numeric columns (small sample sets will still show useful plots)
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
figsize = (8, 4)
for col in num_cols:
    fig, ax = plt.subplots(figsize=figsize)
    # NaNs are dropped so they do not break the binning
    ax.hist(df[col].dropna(), bins=30)
    ax.set_title(f'Histogram: {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    try:
        add_watermark(ax)
    except Exception as exc:
        # The watermark is decoration only, so report the problem and keep
        # plotting. A bare `except:` would also swallow KeyboardInterrupt and
        # SystemExit, making the loop impossible to interrupt cleanly.
        print('Watermark skipped:', exc)
    plt.tight_layout()
    plt.show()


In [None]:

# Correlation matrix heatmap constructed using matplotlib (no seaborn)
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
corr = df[num_cols].corr()

fig, ax = plt.subplots(figsize=(10,8))
# Pin the colour scale to the full correlation range. Without vmin/vmax the
# diverging 'coolwarm' map is stretched over the observed min/max, so zero
# correlation does not land on the neutral colour and plots from different
# datasets are not comparable.
cax = ax.matshow(corr, cmap='coolwarm', vmin=-1.0, vmax=1.0)
fig.colorbar(cax)
ax.set_xticks(range(len(num_cols)))
ax.set_yticks(range(len(num_cols)))
ax.set_xticklabels(num_cols, rotation=90)
ax.set_yticklabels(num_cols)
ax.set_title('Correlation matrix (matplotlib)')
try:
    add_watermark(ax)
except Exception as exc:
    # Decoration only: report and continue, but let KeyboardInterrupt and
    # SystemExit propagate (a bare `except:` would swallow them).
    print('Watermark skipped:', exc)
plt.tight_layout()
plt.show()


In [None]:

# Preprocess: prepare X and y, split, then scale using training statistics only
target = target_col
X = df.drop(columns=[target])
y = df[target].astype(float)

print('Feature matrix shape:', X.shape)
print('Target vector shape:', y.shape)

# Split BEFORE scaling. Fitting the scaler on the full dataset leaks the test
# rows' mean/std into training and makes the reported test metrics optimistic.
# random_state and test_size are unchanged, so the same rows land in each split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so that code referring to the fully scaled matrix still works; it uses
# the training-set statistics as well.
X_scaled = scaler.transform(X)

print('Train size:', X_train.shape[0], 'Test size:', X_test.shape[0])


In [None]:

# Train Linear Regression
# Fit on the training split produced by the preprocessing cell, then predict
# the held-out test split.
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)  # reused by the evaluation and plotting cells
print('Model trained.')


In [None]:

# Evaluation metrics: R2, MSE, RMSE, MAE and a guarded MAPE
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)

if not _mape_available:
    # Manual MAPE: rows whose true value is zero are excluded so the
    # relative error never divides by zero.
    y_test_arr = np.array(y_test, dtype=float)
    y_pred_arr = np.array(y_pred, dtype=float)
    nonzero_mask = y_test_arr != 0
    if nonzero_mask.any():
        rel_err = (y_test_arr[nonzero_mask] - y_pred_arr[nonzero_mask]) / y_test_arr[nonzero_mask]
        mape = np.abs(rel_err).mean() * 100
    else:
        mape = np.nan
else:
    try:
        mape = mean_absolute_percentage_error(y_test, y_pred) * 100
    except Exception:
        # Any failure in the library metric is reported as "not computable"
        mape = np.nan

for metric_name, metric_value in (('R^2', r2), ('MSE', mse), ('RMSE', rmse), ('MAE', mae)):
    print(f'{metric_name}: {metric_value:.4f}')

if np.isnan(mape):
    print('MAPE: NaN (could not compute)')
else:
    print(f'MAPE: {mape:.2f}%')


In [None]:

# Plot actual vs predicted in test-set order (values are NOT sorted; the index
# is only reset so both series share positions 0..n-1)
import pandas as pd
y_test_series = pd.Series(y_test).reset_index(drop=True)
y_pred_series = pd.Series(y_pred).reset_index(drop=True)

fig, ax = plt.subplots(figsize=(10,5))
ax.plot(y_test_series.values, label='Actual', linewidth=2)
ax.plot(y_pred_series.values, label='Predicted', linestyle='--', linewidth=2)
ax.set_xlabel('Sample index (sorted by original order in test set)')
ax.set_ylabel(target_col)
ax.set_title('Actual vs Predicted')
ax.legend()
try:
    add_watermark(ax)
except Exception as exc:
    # Decoration only: report and continue, but let KeyboardInterrupt and
    # SystemExit propagate (a bare `except:` would swallow them).
    print('Watermark skipped:', exc)
plt.tight_layout()
plt.show()


In [None]:

# Show the fitted coefficients, largest magnitude first, and the intercept
feature_names = X.columns.tolist()
coeffs = (
    pd.DataFrame({'feature': feature_names, 'coefficient': model.coef_})
    # rank by absolute value so strong negative effects are not pushed to the bottom
    .sort_values(by='coefficient', key=abs, ascending=False)
)
display(coeffs.reset_index(drop=True))
print('Intercept:', float(model.intercept_))


In [None]:

# Persist the fitted model and scaler so they can be reloaded later
import joblib

artifacts = (
    (model, 'linear_regression_model.joblib'),
    (scaler, 'scaler.joblib'),
)
for fitted_obj, out_path in artifacts:
    joblib.dump(fitted_obj, out_path)
print('Saved model to linear_regression_model.joblib and scaler to scaler.joblib')