<a href="https://colab.research.google.com/github/Dhruv-5903/Housing/blob/main/Housing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# --- 1) Imports ---
# pandas/numpy for data handling
import pandas as pd
import numpy as np

# train/test split and scaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Linear Regression model and metrics
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# pickle to save models/scaler
import pickle


In [None]:
# --- 2) Load dataset ---
# Read the housing data into `df`; every later cell works from this frame.
df = pd.read_csv("housing.csv")   # change path if needed

# Quick look: shape, first rows, dtypes / non-null counts, missing values.
print("Rows, cols:", df.shape)
print(df.head())           # shows first 5 rows
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
df.info()
print("Missing values per column:\n", df.isnull().sum())

# --- 3) Impute missing values ---
# Only 'total_bedrooms' is checked here; any gaps are replaced with the column
# median (robust to outliers).
if df["total_bedrooms"].isnull().any():
    median_tb = df["total_bedrooms"].median()
    # Assign the filled column back to df. Calling fillna(..., inplace=True) on
    # df["total_bedrooms"] operates on an intermediate object: pandas emits a
    # FutureWarning for it and, from pandas 3.0, df itself is left unchanged.
    df["total_bedrooms"] = df["total_bedrooms"].fillna(median_tb)
    print(f"Filled total_bedrooms NaNs with median = {median_tb}")
else:
    print("No missing values in total_bedrooms.")


Rows, cols: (20640, 10)
   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["total_bedrooms"].fillna(median_tb, inplace=True)


In [None]:
# --- 4) Select features (5 independent variables) and target ---
# Column to predict.
target = "median_house_value"

# Predictor columns for Model-1; the order here is the column order of X.
features = [
    "median_income",
    "housing_median_age",
    "total_rooms",
    "population",
    "households",
]

# Take independent copies so later work on X / y cannot alter df.
X = df.loc[:, features].copy()
y = df.loc[:, target].copy()

print("Feature columns used:", list(X.columns))
print("X shape:", X.shape, "y shape:", y.shape)


Feature columns used: ['median_income', 'housing_median_age', 'total_rooms', 'population', 'households']
X shape: (20640, 5) y shape: (20640,)


In [None]:
# --- 5) Train-test split ---
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

for label, part in (("X_train shape:", X_train), ("X_test shape :", X_test)):
    print(label, part.shape)


X_train shape: (16512, 5)
X_test shape : (4128, 5)


In [None]:
# --- 6) Scale features ---
# Standardise every feature to zero mean and unit variance.
# The scaling statistics are learned from the training rows only and then
# applied unchanged to both the training and the test rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# DataFrame views of the scaled arrays (original row labels and column names)
# purely for readability.
X_train_scaled_df = pd.DataFrame(data=X_train_scaled, index=X_train.index, columns=features)
X_test_scaled_df = pd.DataFrame(data=X_test_scaled, index=X_test.index, columns=features)

print("Scaled training data sample:\n", X_train_scaled_df.head(5))


Scaled training data sample:
        median_income  housing_median_age  total_rooms  population  households
14196      -0.326196            0.348490     0.222569    0.768276    0.322906
8267       -0.035843            1.618118     0.340293   -0.098901    0.672027
17445       0.144701           -1.952710    -0.342597   -0.449818   -0.430461
14265      -1.017864            0.586545    -0.561490   -0.007434   -0.380587
2271       -0.171488            1.142008    -0.119565   -0.485877   -0.314962


In [None]:
# --- 7) Train Linear Regression ---
# fit() returns the estimator itself, so creation and training are chained.
lr = LinearRegression().fit(X_train_scaled, y_train)

# Keep the learned parameters under their own names for the reporting cells.
intercept, coeffs = lr.intercept_, lr.coef_

print("Intercept:", intercept)
print("Coefficients (in order of features):")
for name, weight in zip(features, coeffs):
    print(f"  {name}: {weight:.6f}")


Intercept: 207194.6937378876
Coefficients (in order of features):
  median_income: 88013.373819
  housing_median_age: 23385.722514
  total_rooms: -31208.212799
  population: -44870.059344
  households: 82767.245067


In [None]:
# --- 8) Predict on test set and evaluate ---
y_pred = lr.predict(X_test_scaled)

# Hold-out metrics: mean absolute error, root-mean-squared error and R².
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(
    f"R2 Score: {r2:.6f}",
    f"RMSE    : {rmse:.2f}",
    f"MAE     : {mae:.2f}",
    sep="\n",
)


R2 Score: 0.549001
RMSE    : 76876.06
MAE     : 56722.43


In [None]:
# --- 9) Coefficients dataframe (for reporting) ---
# One row per feature. The model was fitted on scaled features, so each
# coefficient applies to the standardized version of its input.
coeff_columns = {"Feature": features, "Coefficient": coeffs}
coeff_df = pd.DataFrame(data=coeff_columns)

print("\nCoefficient table:\n", coeff_df)



Coefficient table:
               Feature   Coefficient
0       median_income  88013.373819
1  housing_median_age  23385.722514
2         total_rooms -31208.212799
3          population -44870.059344
4          households  82767.245067


In [None]:
# --- 11) Example prediction for a new house ---
# The values below are positional: they must line up with the 'features' list.
new_house = [
    8.5,   # median_income
    25.0,  # housing_median_age
    2000,  # total_rooms
    800,   # population
    300,   # households
]

# Single-row frame carrying the same column names, in the same order.
new_df = pd.DataFrame(data=[new_house], columns=features)

# Reuse the scaler fitted earlier (transform only, never re-fit here).
new_scaled = scaler.transform(new_df)

# predict() returns one value per input row; unpack the single prediction.
(pred_price,) = lr.predict(new_scaled)
print("Predicted median house value for new sample:", round(pred_price, 2))


Predicted median house value for new sample: 404485.04


Model-2: now with 7 variables (adds longitude and latitude)

In [None]:
# STEP 1: Define features for MODEL-2 (7 variables)

# Same target column as Model-1.
target = "median_house_value"

# The five Model-1 predictors followed by the two new location columns.
features_model2 = [
    # used in Model-1
    "median_income",
    "housing_median_age",
    "total_rooms",
    "population",
    "households",
    # added for Model-2
    "longitude",
    "latitude",
]

# X2 holds only the 7 selected columns, y2 the target; both are copies of df data.
X2 = df.loc[:, features_model2].copy()
y2 = df.loc[:, target].copy()

print("Model-2 features:", list(X2.columns))
print("X2 shape:", X2.shape, " y2 shape:", y2.shape)


Model-2 features: ['median_income', 'housing_median_age', 'total_rooms', 'population', 'households', 'longitude', 'latitude']
X2 shape: (20640, 7)  y2 shape: (20640,)


In [None]:
from sklearn.model_selection import train_test_split

# Prefer the row partition already made for Model-1, so both linear models are
# evaluated on the same held-out rows; otherwise make a fresh 80/20 split.
try:
    # Label-based lookup with the Model-1 indices selects the same rows of X2 / y2.
    X2_train, X2_test = X2.loc[X_train.index], X2.loc[X_test.index]
    y2_train, y2_test = y2.loc[y_train.index], y2.loc[y_test.index]
except NameError:
    # One of the Model-1 split variables is not defined in this session.
    X2_train, X2_test, y2_train, y2_test = train_test_split(
        X2, y2, random_state=42, test_size=0.2
    )
    print("ℹ️ Model-1 split not found — created a new train/test split for Model-2.")
else:
    print("✅ Reused existing train/test split from Model-1.")


✅ Reused existing train/test split from Model-1.


In [None]:
from sklearn.preprocessing import StandardScaler

# STEP 3: Scaling features for Model-2

# A separate scaler for the 7-feature matrix. Its statistics come from the
# training rows only and are then applied to both train and test rows.
scaler2 = StandardScaler()
scaler2.fit(X2_train)
X2_train_scaled = scaler2.transform(X2_train)
X2_test_scaled = scaler2.transform(X2_test)

print("Scaled training sample (first 5 rows):")
scaled_preview = pd.DataFrame(data=X2_train_scaled, columns=features_model2)
print(scaled_preview.head(5))


Scaled training sample (first 5 rows):
   median_income  housing_median_age  total_rooms  population  households  \
0      -0.326196            0.348490     0.222569    0.768276    0.322906   
1      -0.035843            1.618118     0.340293   -0.098901    0.672027   
2       0.144701           -1.952710    -0.342597   -0.449818   -0.430461   
3      -1.017864            0.586545    -0.561490   -0.007434   -0.380587   
4      -0.171488            1.142008    -0.119565   -0.485877   -0.314962   

   longitude  latitude  
0   1.272587 -1.372811  
1   0.709162 -0.876696  
2  -0.447603 -0.460146  
3   1.232698 -1.382172  
4  -0.108551  0.532084  


In [None]:
from sklearn.linear_model import LinearRegression

# STEP 4: Train Linear Regression for Model-2

# fit() returns the estimator, so the model is created and trained in one step.
lr2 = LinearRegression().fit(X2_train_scaled, y2_train)

# Learned parameters, kept under their own names for the reporting cells.
intercept2, coeffs2 = lr2.intercept_, lr2.coef_

print("MODEL-2 Intercept:", intercept2)
print("MODEL-2 Coefficients:")
for name, weight in zip(features_model2, coeffs2):
    print(f"  {name}: {weight:.6f}")


MODEL-2 Intercept: 207194.69373788778
MODEL-2 Coefficients:
  median_income: 73530.710688
  housing_median_age: 14480.021271
  total_rooms: -3865.090667
  population: -48954.446516
  households: 57163.917499
  longitude: -83894.850115
  latitude: -90006.277442


In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np

# STEP 5: Evaluate Model-2

# Predictions for the held-out rows.
y2_pred = lr2.predict(X2_test_scaled)

# Hold-out metrics: mean absolute error, root-mean-squared error and R².
mae_m2 = mean_absolute_error(y2_test, y2_pred)
mse_m2 = mean_squared_error(y2_test, y2_pred)
rmse_m2 = np.sqrt(mse_m2)
r2_m2 = r2_score(y2_test, y2_pred)

print(
    "\n===== MODEL-2 (7 features) RESULTS =====",
    f"R² Score : {r2_m2:.6f}",
    f"RMSE     : {rmse_m2:.2f}",
    f"MAE      : {mae_m2:.2f}",
    sep="\n",
)



===== MODEL-2 (7 features) RESULTS =====
R² Score : 0.620518
RMSE     : 70517.83
MAE      : 51657.47


In [None]:
# STEP 6: Create a clean coefficient table for Model-2

# One row per feature, pairing each name with its fitted coefficient
# (the model was trained on standardized inputs).
coeffs2_columns = {
    "Feature": features_model2,
    "Coefficient (Standardized)": coeffs2,
}
coeffs2_df = pd.DataFrame(data=coeffs2_columns)

print("\nMODEL-2 Coefficients Table:", coeffs2_df, sep="\n")



MODEL-2 Coefficients Table:
              Feature  Coefficient (Standardized)
0       median_income                73530.710688
1  housing_median_age                14480.021271
2         total_rooms                -3865.090667
3          population               -48954.446516
4          households                57163.917499
5           longitude               -83894.850115
6            latitude               -90006.277442


In [None]:
# STEP 8: Example prediction using Model-2 for a new data point

# Positional values: they must line up with features_model2.
new_house_m2 = [
    8.5,      # median_income
    25.0,     # housing_median_age
    2000,     # total_rooms
    800,      # population
    300,      # households
    -122.25,  # longitude
    37.85,    # latitude
]

# Single-row frame with the Model-2 column names, in the same order.
new_df_m2 = pd.DataFrame(data=[new_house_m2], columns=features_model2)

# Apply the already-fitted Model-2 scaler (transform only).
new_scaled_m2 = scaler2.transform(new_df_m2)

# predict() returns one value per input row; unpack the single prediction.
(pred_price_m2,) = lr2.predict(new_scaled_m2)
print("\nPredicted median house value (Model-2) for new sample:", round(pred_price_m2, 2))



Predicted median house value (Model-2) for new sample: 398147.66


Comparison

In [None]:
import pandas as pd

# ==========================
# MODEL COMPARISON: M1 vs M2
# ==========================

# Inputs computed by the evaluation cells:
#   Model-1 (5 features): r2, rmse, mae
#   Model-2 (7 features): r2_m2, rmse_m2, mae_m2

# 1) Create a comparison table, one row per model
comparison_rows = [
    ("Model-1 (5 features)", 5, r2, rmse, mae),
    ("Model-2 (7 features)", 7, r2_m2, rmse_m2, mae_m2),
]
comparison_df = pd.DataFrame(
    comparison_rows,
    columns=["Model", "No_of_features", "R2_Score", "RMSE", "MAE"],
)

# 2) Print table
print("===== MODEL COMPARISON (Linear Regression) =====")
print(comparison_df.to_string(index=False))

# 3) Find best model based on highest R²
best_idx = comparison_df["R2_Score"].idxmax()
best_model_row = comparison_df.loc[best_idx]

print(
    "\n✅ Best model based on R²:",
    f"Model          : {best_model_row['Model']}",
    f"No. of features: {int(best_model_row['No_of_features'])}",
    f"R² Score       : {best_model_row['R2_Score']:.6f}",
    f"RMSE           : {best_model_row['RMSE']:.2f}",
    f"MAE            : {best_model_row['MAE']:.2f}",
    sep="\n",
)


===== MODEL COMPARISON (Linear Regression) =====
               Model  No_of_features  R2_Score         RMSE          MAE
Model-1 (5 features)               5  0.549001 76876.056381 56722.434316
Model-2 (7 features)               7  0.620518 70517.833856 51657.465162

✅ Best model based on R²:
Model          : Model-2 (7 features)
No. of features: 7
R² Score       : 0.620518
RMSE           : 70517.83
MAE            : 51657.47


ANN (artificial neural network)

In [None]:
# ========= STEP 1: Imports =========

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# TensorFlow / Keras for ANN
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping


In [None]:
# ========= STEP 2: Load and prepare data for ANN =========

# Load dataset (change path if needed).
# NOTE: this re-reads the CSV and rebinds df, X and y, replacing the objects
# the linear-regression cells created under the same names.
df = pd.read_csv("housing.csv")

# Fill missing total_bedrooms values with the column median.
# The filled column is assigned back to df: calling fillna(..., inplace=True)
# on df["total_bedrooms"] acts on an intermediate object, raises a pandas
# FutureWarning, and from pandas 3.0 leaves df unchanged.
df["total_bedrooms"] = df["total_bedrooms"].fillna(df["total_bedrooms"].median())

# 7 independent variables (same as Model-2)
features_ann = [
    "median_income",
    "housing_median_age",
    "total_rooms",
    "population",
    "households",
    "longitude",
    "latitude"
]

target = "median_house_value"

# X = features, y = target (copies, so later edits do not touch df)
X = df[features_ann].copy()
y = df[target].copy()

print("Features used in ANN:", X.columns.tolist())
print("X shape:", X.shape, " y shape:", y.shape)


Features used in ANN: ['median_income', 'housing_median_age', 'total_rooms', 'population', 'households', 'longitude', 'latitude']
X shape: (20640, 7)  y shape: (20640,)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["total_bedrooms"].fillna(df["total_bedrooms"].median(), inplace=True)


In [None]:
# ========= STEP 3: Train–Test split =========

# 80% train / 20% test; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

for label, part in (("Train shape :", X_train), ("Test shape  :", X_test)):
    print(label, part.shape)


Train shape : (16512, 7)
Test shape  : (4128, 7)


In [None]:
# ========= STEP 4: Feature scaling =========

# Learn the scaling statistics from the training rows only, then apply them to
# both the training and the test rows.
# NOTE: this rebinds `scaler`, X_train_scaled and X_test_scaled, names that the
# Model-1 cells also define.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Scaled train sample (first 5 rows):")
scaled_preview = pd.DataFrame(data=X_train_scaled, columns=features_ann)
print(scaled_preview.head(5))


Scaled train sample (first 5 rows):
   median_income  housing_median_age  total_rooms  population  households  \
0      -0.326196            0.348490     0.222569    0.768276    0.322906   
1      -0.035843            1.618118     0.340293   -0.098901    0.672027   
2       0.144701           -1.952710    -0.342597   -0.449818   -0.430461   
3      -1.017864            0.586545    -0.561490   -0.007434   -0.380587   
4      -0.171488            1.142008    -0.119565   -0.485877   -0.314962   

   longitude  latitude  
0   1.272587 -1.372811  
1   0.709162 -0.876696  
2  -0.447603 -0.460146  
3   1.232698 -1.382172  
4  -0.108551  0.532084  


In [None]:
# ========= STEP 5: Build ANN model =========

# Imported here so this cell carries the two extra names it needs.
from tensorflow.keras.layers import Input
from tensorflow.keras.utils import set_random_seed

# Seed the Python, NumPy and TensorFlow random generators before any layer is
# created, so the randomly initialised weights do not change between runs.
set_random_seed(42)

# Fully connected regression network.
# The input width is declared with an explicit Input layer; passing
# input_shape= to the first Dense layer of a Sequential model makes Keras
# emit a warning.
model = Sequential([
    Input(shape=(X_train_scaled.shape[1],)),   # one input per scaled feature
    Dense(64, activation="relu"),              # first hidden layer
    Dense(32, activation="relu"),              # second hidden layer
    Dense(1, activation="linear"),             # single linear output for regression
])

# Show model summary
model.summary()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# ========= STEP 6: Compile ANN =========

# Train with the Adam optimizer to minimize mean squared error; mean absolute
# error is tracked as an additional metric.
model.compile(loss="mse", optimizer="adam", metrics=["mae"])


In [None]:
# ========= STEP 7: Train ANN =========

# Stop once val_loss has not improved for 10 consecutive epochs and roll the
# model back to its best weights.
early_stop = EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)

# Up to 100 epochs in mini-batches of 32, holding out 20% of the training data
# for validation; verbose=1 prints a progress bar per epoch.
history = model.fit(
    X_train_scaled,
    y_train,
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stop],
    verbose=1,
)


Epoch 1/100
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 55984652288.0000 - mae: 206744.1094 - val_loss: 56492437504.0000 - val_mae: 207098.7656
Epoch 2/100
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 55067152384.0000 - mae: 204625.9688 - val_loss: 53819887616.0000 - val_mae: 201341.0938
Epoch 3/100
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 51791486976.0000 - mae: 198152.7188 - val_loss: 47767207936.0000 - val_mae: 187619.6719
Epoch 4/100
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 44398518272.0000 - mae: 180402.5781 - val_loss: 38906052608.0000 - val_mae: 165625.3750
Epoch 5/100
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 36201328640.0000 - mae: 158688.5469 - val_loss: 29122105344.0000 - val_mae: 138350.8594
Epoch 6/100
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/ste

In [None]:
# ========= STEP 8: Evaluate ANN on test set =========

# Predict on the test set and collapse the output to a 1-D array.
y_pred_ann = model.predict(X_test_scaled).ravel()

# Hold-out metrics: mean absolute error, root-mean-squared error and R².
mae_ann = mean_absolute_error(y_test, y_pred_ann)
mse_ann = mean_squared_error(y_test, y_pred_ann)
rmse_ann = np.sqrt(mse_ann)
r2_ann = r2_score(y_test, y_pred_ann)

print(
    "\n===== ANN MODEL (7 features) RESULTS =====",
    f"R² Score : {r2_ann:.6f}",
    f"RMSE     : {rmse_ann:.2f}",
    f"MAE      : {mae_ann:.2f}",
    sep="\n",
)


[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step

===== ANN MODEL (7 features) RESULTS =====
R² Score : 0.660377
RMSE     : 66711.71
MAE      : 47797.52
