# Boston House Price Prediction

## Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

## Step A: Load and preprocess the dataset

In [2]:
# Read the raw housing table; the path is relative to the working directory.
df = pd.read_csv('HousingData.csv')

# Per-column count of missing entries, kept around for inspection.
missing_values = df.isna().sum()

# Keep only fully populated rows (imputing the gaps would be the alternative).
df_clean = df.dropna(axis=0, how='any')

# Remove outliers using IQR method
def remove_outliers(df, columns, factor=1.5):
    """Drop rows whose values fall outside the IQR fences of the given columns.

    Columns are processed one after another, and each column's quartiles are
    computed on the rows that survived the previous columns, so the result
    can depend on the order of ``columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a filtered frame is returned.
    columns : iterable of str
        Labels of the columns to filter on.
    factor : float, default 1.5
        Multiplier applied to the IQR to build the inclusive fences
        ``[Q1 - factor * IQR, Q3 + factor * IQR]``.

    Returns
    -------
    pandas.DataFrame
        Rows whose value lies within the fences for every listed column.
        Rows holding NaN in a listed column are dropped, because NaN never
        satisfies the range check.

    Raises
    ------
    ValueError
        If ``factor`` is negative.
    """
    if factor < 0:
        raise ValueError("factor must be non-negative")

    for col in columns:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        spread = factor * (q3 - q1)
        # `between` is inclusive on both ends, matching `>= lower` and `<= upper`.
        df = df[df[col].between(q1 - spread, q3 + spread)]
    return df

# Candidate columns for outlier filtering: every numeric column except the target.
numeric_cols = list(df_clean.select_dtypes(include=[np.number]).columns)
numeric_cols.remove('MEDV')  # the target itself is never filtered

# NOTE(review): a column whose IQR is 0 keeps only the rows equal to its
# quartile value; confirm that indicator-like columns should be filtered here.
df_clean = remove_outliers(df_clean, numeric_cols)

## Step B: Model Selection (Decision Tree)
## Step C: Split and Train the model

In [3]:
# Separate the target column (MEDV) from the predictors.
y = df_clean['MEDV']
X = df_clean.drop(columns='MEDV')

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Baseline: a decision tree with default hyperparameters.
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train, y_train)

In [4]:
# Preview the feature matrix built from the cleaned frame.
X

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94
5,0.02985,0.0,2.18,0.0,0.458,6.430,58.7,6.0622,3,222,18.7,394.12,5.21
...,...,...,...,...,...,...,...,...,...,...,...,...,...
499,0.17783,0.0,9.69,0.0,0.585,5.569,73.5,2.3999,6,391,19.2,395.77,15.10
500,0.22438,0.0,9.69,0.0,0.585,6.027,79.7,2.4982,6,391,19.2,396.90,14.33
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64


In [5]:
# Preview the target series (MEDV) built from the cleaned frame.
y

0      24.0
1      21.6
2      34.7
3      33.4
5      28.7
       ... 
499    17.5
500    16.8
502    20.6
503    23.9
504    22.0
Name: MEDV, Length: 181, dtype: float64

In [6]:
# (rows, columns) of the training features.
X_train.shape

(144, 13)

In [7]:
# (rows, columns) of the held-out test features.
X_test.shape

(37, 13)

In [8]:
# Number of training targets; should match the row count of X_train.
y_train.shape

(144,)

In [9]:
# Held-out target values that the predictions are scored against.
y_test

26     16.6
70     24.2
325    24.6
120    22.0
317    19.8
21     19.6
38     24.7
106    19.5
231    31.7
243    23.7
164    22.7
333    22.2
107    20.4
313    21.6
75     21.4
22     15.2
82     24.8
251    24.8
175    29.4
88     23.6
176    23.2
246    24.3
314    23.8
49     19.4
13     20.4
495    23.1
93     25.0
24     15.6
320    23.8
496    19.7
217    28.7
86     22.5
311    22.1
252    29.6
104    20.1
46     20.0
242    22.2
Name: MEDV, dtype: float64

## Step D: Evaluate the model

In [10]:
# Score the baseline tree on the held-out split.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

In [11]:
# Baseline predictions for the held-out rows.
y_pred

array([23.1, 23.9, 24.4, 20.3, 16.2, 20.3, 21.7, 18.8, 31.5, 23.3, 21.4,
       24.4, 19.4, 23. , 27.1, 21. , 22.9, 24.4, 23.3, 28.7, 19. , 21.2,
       22.8, 19.3, 19.9, 17.5, 22.9, 17.4, 25. , 13.6, 27.1, 19. , 20.4,
       26.2, 19.4, 18.7, 20. ])

In [12]:
# Baseline mean squared error on the test split.
mse

9.553243243243243

In [13]:
# Baseline R² on the test split.
r2

0.23358450009259069

## Step E: Fine-tuning with Grid Search

In [14]:
# Hyperparameter candidates: 5 x 3 x 3 = 45 combinations.
param_grid = {
    'max_depth': [3, 5, 7, 9, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# 5-fold cross-validated search over the training split, ranked by R².
grid_search = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='r2',
)
grid_search.fit(X_train, y_train)

# Score the best estimator found on the same held-out split as the baseline.
best_model = grid_search.best_estimator_
y_pred_tuned = best_model.predict(X_test)
mse_tuned = mean_squared_error(y_true=y_test, y_pred=y_pred_tuned)
r2_tuned = r2_score(y_true=y_test, y_pred=y_pred_tuned)

# Baseline vs. tuned metrics plus the winning hyperparameters (shown as the cell result).
{
    "Initial Model": {"MSE": mse, "R2": r2},
    "Tuned Model": {"MSE": mse_tuned, "R2": r2_tuned},
    "Best Parameters": grid_search.best_params_,
}

{'Initial Model': {'MSE': 9.553243243243243, 'R2': 0.23358450009259069},
 'Tuned Model': {'MSE': 8.633561131055892, 'R2': 0.30736662913724766},
 'Best Parameters': {'max_depth': 3,
  'min_samples_leaf': 2,
  'min_samples_split': 10}}

## Testing on new data

In [15]:
# New sample input (must match the structure of X)
new_instance = pd.DataFrame([{
    'CRIM': 0.1,
    'ZN': 12.5,
    'INDUS': 7.87,
    'CHAS': 0.0,
    'NOX': 0.524,
    'RM': 6.5,
    'AGE': 70.0,
    'DIS': 4.5,
    'RAD': 5,
    'TAX': 300,
    'PTRATIO': 18.0,
    'B': 390.0,
    'LSTAT': 12.0
}])

# Ensure the column order matches training features
new_instance = new_instance[X.columns]

# Predict the house price
predicted_price = best_model.predict(new_instance)

# Display prediction
print(f"Predicted House Price (MEDV): {predicted_price[0]:.2f}")


Predicted House Price (MEDV): 19.42


## Save model for deployment

In [16]:
# Persist the tuned estimator to a file in the working directory.
# NOTE(review): joblib files are pickle-based, so only load this file from a trusted source.
joblib.dump(best_model, "house_model.pkl")
print("✅ Model saved as 'house_model.pkl'")

✅ Model saved as 'house_model.pkl'
