# Import Libraries 

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score




In [2]:
data=pd.read_csv("housing_price_dataset.csv")
data.head()

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,YearBuilt,Price
0,2126,4,1,Rural,1969,215355.283618
1,2459,3,2,Rural,1980,195014.221626
2,1860,2,1,Suburb,1970,306891.012076
3,2294,2,1,Urban,1996,206786.787153
4,2130,5,2,Suburb,2001,272436.239065


In [3]:
df=data.sample(frac=0.25)

In [4]:
df.shape

(12500, 6)

# Data Preprocessing

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12500 entries, 21282 to 15536
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   SquareFeet    12500 non-null  int64  
 1   Bedrooms      12500 non-null  int64  
 2   Bathrooms     12500 non-null  int64  
 3   Neighborhood  12500 non-null  object 
 4   YearBuilt     12500 non-null  int64  
 5   Price         12500 non-null  float64
dtypes: float64(1), int64(4), object(1)
memory usage: 683.6+ KB


In [6]:
df.shape

(12500, 6)

In [7]:
df.describe()

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,YearBuilt,Price
count,12500.0,12500.0,12500.0,12500.0,12500.0
mean,2012.11168,3.50688,2.0036,1985.42408,225471.396133
std,572.508452,1.117557,0.816603,20.824199,75573.812836
min,1000.0,2.0,1.0,1950.0,-13692.026068
25%,1523.0,3.0,1.0,1967.0,170787.659789
50%,2013.0,4.0,2.0,1985.0,226276.20934
75%,2509.0,5.0,3.0,2004.0,280714.929675
max,2999.0,5.0,3.0,2021.0,468470.378004


In [8]:
df.corr()

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,YearBuilt,Price
SquareFeet,1.0,0.011139,-0.012251,0.003838,0.746506
Bedrooms,0.011139,1.0,0.009573,0.003362,0.079511
Bathrooms,-0.012251,0.009573,1.0,0.010167,0.02063
YearBuilt,0.003838,0.003362,0.010167,1.0,-0.006147
Price,0.746506,0.079511,0.02063,-0.006147,1.0


In [9]:
df.isnull().sum()

SquareFeet      0
Bedrooms        0
Bathrooms       0
Neighborhood    0
YearBuilt       0
Price           0
dtype: int64

# Decision tree model 

In [10]:
X=df.drop(["Neighborhood","Price"],axis=1)
y=df["Price"]

In [11]:
print(X)


       SquareFeet  Bedrooms  Bathrooms  YearBuilt
21282        2041         4          1       1959
48368        2448         5          1       2015
34006        1741         5          2       1958
1891         1186         3          1       1954
41444        1739         5          1       1967
...           ...       ...        ...        ...
15704        1898         2          2       2009
29116        2548         2          1       1986
41432        1348         3          2       1998
24058        1427         5          3       1982
15536        2793         2          2       1958

[12500 rows x 4 columns]


In [12]:
print(y)

21282    165731.796806
48368    294736.914738
34006    267705.154387
1891     191803.334801
41444    191763.163920
             ...      
15704    211937.807206
29116    232176.150091
41432    120036.806871
24058    159194.627550
15536    216293.384429
Name: Price, Length: 12500, dtype: float64


In [13]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
regressor=DecisionTreeRegressor()
regressor.fit(X_train,y_train)

DecisionTreeRegressor()

# Evalution of baseline model 

In [15]:
y_pred=regressor.predict(X_test)

In [16]:
print(y_pred)

[ 88472.7555982  224632.13152961 252142.73685628 ...  97808.59675103
 200186.42656164 390061.90285802]


In [17]:
score=r2_score(y_pred,y_test)
print(f"Baseline R2: {score}")

Baseline R2: 0.11129317306839315


# Hyperparameter tuning

In [18]:
# Define the hyperparameters to tune
param_grid = {
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'auto', 'sqrt', 'log2']
}

In [19]:
# Initialize the GridSearchCV
grid_search = GridSearchCV(estimator=DecisionTreeRegressor(random_state=42),
                           param_grid=param_grid,
                           cv=5,  # 5-fold cross-validation
                           n_jobs=-1,  # Use all available cores
                           verbose=2,
                           scoring='neg_mean_squared_error')

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


GridSearchCV(cv=5, estimator=DecisionTreeRegressor(random_state=42), n_jobs=-1,
             param_grid={'max_depth': [None, 10, 20, 30, 40, 50],
                         'max_features': [None, 'auto', 'sqrt', 'log2'],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 5, 10]},
             scoring='neg_mean_squared_error', verbose=2)

# Evaluate the Tuned Model

In [21]:
# Best hyperparameters
best_params = grid_search.best_params_
print(f"Best Hyperparameters: {best_params}")

# Train the best model
best_model = grid_search.best_estimator_

# Predict and evaluate the tuned model
y_pred_tuned = best_model.predict(X_test)
tuned_r2_score = r2_score(y_test, y_pred_tuned)
print(f"Tuned MSE: {tuned_r2_score:.2f}")


Best Hyperparameters: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2}
Tuned MSE: 0.50
