# **Predicting Median Prices With Regression Trees**


In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split

## Read the Data


In [2]:
data = pd.read_csv("https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillsNetwork/labs/Module%203/data/real_estate_data.csv")

In [3]:
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1,296,15.3,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3,222,18.7,,36.2


In [4]:
data.shape

(506, 13)

In [5]:
data.isna().sum()

Unnamed: 0,0
CRIM,20
ZN,20
INDUS,20
CHAS,20
NOX,0
RM,0
AGE,20
DIS,0
RAD,0
TAX,0


## Data Pre-Processing


In [6]:
data.dropna(inplace=True)

In [7]:
data.isna().sum()

Unnamed: 0,0
CRIM,0
ZN,0
INDUS,0
CHAS,0
NOX,0
RM,0
AGE,0
DIS,0
RAD,0
TAX,0


In [8]:
X = data.drop(columns=["MEDV"])
Y = data["MEDV"]

In [9]:
X.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1,296,15.3,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,2.94
5,0.02985,0.0,2.18,0.0,0.458,6.43,58.7,6.0622,3,222,18.7,5.21


In [10]:
Y.head()

Unnamed: 0,MEDV
0,24.0
1,21.6
2,34.7
3,33.4
5,28.7


In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, random_state=1)

## Creating Regression Tree


In [14]:
regression_tree = DecisionTreeRegressor(criterion = 'friedman_mse')

## Training


Now lets train our model using the `fit` method on the `DecisionTreeRegressor` object providing our training data


In [15]:
regression_tree.fit(X_train, Y_train)

## Evaluation


In [16]:
regression_tree.score(X_test, Y_test)

0.8392027773934491

In [17]:
prediction = regression_tree.predict(X_test)

print("$",(prediction - Y_test).abs().mean()*1000)

$ 2827.848101265822


Trying with Different criterions

In [32]:
criterions = ['poisson', 'absolute_error', 'squared_error', 'friedman_mse']
score_dict = {}
for criterion in criterions:
  regression_tree = DecisionTreeRegressor(criterion = criterion)
  regression_tree.fit(X_train, Y_train)
  score = regression_tree.score(X_test, Y_test)
  score_dict[criterion] = score
  prediction = regression_tree.predict(X_test)
  print("$",(prediction - Y_test).abs().mean()*1000)
print(score_dict)

$ 2983.5443037974683
$ 2702.5316455696207
$ 2984.810126582279
$ 2767.0886075949365
{'poisson': 0.8241831786656274, 'absolute_error': 0.8713568319369365, 'squared_error': 0.7603786069805204, 'friedman_mse': 0.8514663953555555}
