# IMPLEMENTATION OF DECISION TREE

## DECISISON TREE FOR REGRESSION

### Importing Modules

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Loading the datasets

In [7]:
data = pd.read_csv(r"C:\Users\DJ COMPUTERS\OneDrive\Desktop\Jupyter\House Price Prediction Dataset.csv")
data.head(10)

Unnamed: 0,Id,Area,Bedrooms,Bathrooms,Floors,YearBuilt,Location,Condition,Garage,Price
0,1,1360,5,4,3,1970,Downtown,Excellent,No,149919
1,2,4272,5,4,3,1958,Downtown,Excellent,No,424998
2,3,3592,2,2,3,1938,Downtown,Good,No,266746
3,4,966,4,2,2,1902,Suburban,Fair,Yes,244020
4,5,4926,1,4,2,1975,Downtown,Fair,Yes,636056
5,6,3944,1,2,1,1906,Urban,Poor,No,93262
6,7,3671,1,1,2,1948,Rural,Poor,Yes,448722
7,8,3419,2,4,1,1925,Suburban,Good,Yes,594893
8,9,630,2,2,1,1932,Rural,Poor,Yes,652878
9,10,2185,3,3,1,2000,Downtown,Poor,No,340375


### Data Pre-processing

In [10]:
data.isna().sum()   # checking for na values

Id           0
Area         0
Bedrooms     0
Bathrooms    0
Floors       0
YearBuilt    0
Location     0
Condition    0
Garage       0
Price        0
dtype: int64

In [12]:
data.isnull()  # checking for null values

Unnamed: 0,Id,Area,Bedrooms,Bathrooms,Floors,YearBuilt,Location,Condition,Garage,Price
0,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
1995,False,False,False,False,False,False,False,False,False,False
1996,False,False,False,False,False,False,False,False,False,False
1997,False,False,False,False,False,False,False,False,False,False
1998,False,False,False,False,False,False,False,False,False,False


In [14]:
data.duplicated()   # checking for duplicate records

0       False
1       False
2       False
3       False
4       False
        ...  
1995    False
1996    False
1997    False
1998    False
1999    False
Length: 2000, dtype: bool

In [16]:
data.shape   # dimensions of the data

(2000, 10)

In [18]:
data = data.drop(columns = "Id")


# Data Mining

In [21]:
data.info()   # overview of the data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Area       2000 non-null   int64 
 1   Bedrooms   2000 non-null   int64 
 2   Bathrooms  2000 non-null   int64 
 3   Floors     2000 non-null   int64 
 4   YearBuilt  2000 non-null   int64 
 5   Location   2000 non-null   object
 6   Condition  2000 non-null   object
 7   Garage     2000 non-null   object
 8   Price      2000 non-null   int64 
dtypes: int64(6), object(3)
memory usage: 140.8+ KB


In [23]:
data["Location"].value_counts()


Location
Downtown    558
Urban       485
Suburban    483
Rural       474
Name: count, dtype: int64

In [25]:
data["Condition"].value_counts()

Condition
Fair         521
Excellent    511
Poor         507
Good         461
Name: count, dtype: int64

In [27]:
data["Garage"].value_counts()

Garage
No     1038
Yes     962
Name: count, dtype: int64

In [29]:
data.describe()   # summary statistics of the data

Unnamed: 0,Area,Bedrooms,Bathrooms,Floors,YearBuilt,Price
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,2786.2095,3.0035,2.5525,1.9935,1961.446,537676.855
std,1295.146799,1.424606,1.10899,0.809188,35.926695,276428.845719
min,501.0,1.0,1.0,1.0,1900.0,50005.0
25%,1653.0,2.0,2.0,1.0,1930.0,300098.0
50%,2833.0,3.0,3.0,2.0,1961.0,539254.0
75%,3887.5,4.0,4.0,3.0,1993.0,780086.0
max,4999.0,5.0,4.0,3.0,2023.0,999656.0


### Feature Engineering

In [35]:
# Label encode categorical columns
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for col in data.select_dtypes(include="object").columns:
    data[col] = le.fit_transform(data[col])


In [67]:
# Correlation Analysis

# Correlation with the target variable
correlation = data.corr(numeric_only=True)["Price"].sort_values(ascending=False)
correlation
# Select features with correlation > threshold
threshold = 0.0
important_features = correlation[abs(correlation) > threshold].index.tolist()
# Separate features and target
features_to_scale = [col for col in important_features if col != "Price"]


In [69]:


# Apply MinMax scaling
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaled_array = scaler.fit_transform(data[features_to_scale])

# Create new scaled DataFrame
scaled_data = pd.DataFrame(scaled_array, columns=features_to_scale)

# Add target column back
scaled_data["Price"] = data["Price"].values

# Final engineered data
scaled_data.head()

Unnamed: 0,Floors,YearBuilt,Garage,Area,Bedrooms,Location,Condition,Bathrooms,Price
0,1.0,0.569106,0.0,0.190974,1.0,0.0,0.0,1.0,149919
1,1.0,0.471545,0.0,0.838373,1.0,0.0,0.0,1.0,424998
2,1.0,0.308943,0.0,0.687194,0.25,0.0,0.666667,0.333333,266746
3,0.5,0.01626,1.0,0.103379,0.75,0.666667,0.333333,0.333333,244020
4,0.5,0.609756,1.0,0.983771,0.0,0.0,0.333333,1.0,636056


In [71]:
data = scaled_data
data.head()   ## final data for model fitting

Unnamed: 0,Floors,YearBuilt,Garage,Area,Bedrooms,Location,Condition,Bathrooms,Price
0,1.0,0.569106,0.0,0.190974,1.0,0.0,0.0,1.0,149919
1,1.0,0.471545,0.0,0.838373,1.0,0.0,0.0,1.0,424998
2,1.0,0.308943,0.0,0.687194,0.25,0.0,0.666667,0.333333,266746
3,0.5,0.01626,1.0,0.103379,0.75,0.666667,0.333333,0.333333,244020
4,0.5,0.609756,1.0,0.983771,0.0,0.0,0.333333,1.0,636056


### Model Training

In [74]:
## Feature and Label Separation

X = data.drop(columns = "Price",axis =1)
Y = data["Price"]

## Train test splitting
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = 0.2,random_state = 42)



In [86]:
## fitting the model

from sklearn.tree import DecisionTreeRegressor
reg = DecisionTreeRegressor(max_depth = None,random_state=42)
reg.fit(X_train,Y_train)

### Model Evaluation

In [91]:
from sklearn.metrics import root_mean_squared_error, r2_score
y_pred = reg.predict(X_test)

# Evaluation
print("Mean Squared Error:", root_mean_squared_error(Y_test, y_pred))
print("R² Score:", r2_score(Y_test, y_pred))

Mean Squared Error: 410219.4376555431
R² Score: -1.1630144082103673
