In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [5]:
from sklearn.datasets import fetch_california_housing

In [13]:
data = fetch_california_housing()

In [17]:
print(data.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

:Number of Instances: 20640

:Number of Attributes: 8 numeric, predictive attributes and the target

:Attribute Information:
    - MedInc        median income in block group
    - HouseAge      median house age in block group
    - AveRooms      average number of rooms per household
    - AveBedrms     average number of bedrooms per household
    - Population    block group population
    - AveOccup      average number of household members
    - Latitude      block group latitude
    - Longitude     block group longitude

:Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived from the 1990 U.S. census, using one row per ce

In [21]:
df = pd.DataFrame(data.data, columns= data.feature_names)

In [25]:
df['Price'] = data.target

In [26]:
print(df)

       MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0      8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1      8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2      7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3      5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4      3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   
...       ...       ...       ...        ...         ...       ...       ...   
20635  1.5603      25.0  5.045455   1.133333       845.0  2.560606     39.48   
20636  2.5568      18.0  6.114035   1.315789       356.0  3.122807     39.49   
20637  1.7000      17.0  5.205543   1.120092      1007.0  2.325635     39.43   
20638  1.8672      18.0  5.329513   1.171920       741.0  2.123209     39.43   
20639  2.3886      16.0  5.254717   1.162264      1387.0  2.616981     39.37   

       Longitude  Target  Price  
0    

In [27]:
df.drop('Target', axis = 1, inplace= True)

In [28]:
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Price
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [30]:
df.sample(10)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Price
9386,8.4888,30.0,5.954114,1.134494,1292.0,2.044304,37.9,-122.45,5.00001
17437,3.2167,17.0,5.473214,1.236607,484.0,2.160714,34.64,-120.45,1.125
16893,8.4704,52.0,7.012346,0.930041,671.0,2.761317,37.58,-122.38,5.00001
15164,3.7784,25.0,5.928571,1.037267,1043.0,3.23913,32.95,-117.02,1.601
2452,2.5033,9.0,5.012987,1.050649,2157.0,2.801299,36.51,-119.56,0.701
12828,3.9853,24.0,6.08169,1.039437,1141.0,3.214085,38.7,-121.45,0.904
13241,6.075,19.0,7.848723,1.088409,1523.0,2.992141,34.13,-117.66,2.541
853,4.6786,16.0,5.350061,1.088127,2530.0,3.096695,37.59,-122.03,2.56
20326,6.907,16.0,6.64454,1.03212,1567.0,3.35546,34.3,-118.96,5.00001
8576,5.9596,43.0,4.709459,1.084459,552.0,1.864865,33.9,-118.42,5.00001


In [35]:
#Seperate Independent Variable and Dependent Variable

In [37]:
X = df.iloc[:, :-1]
Y= df.iloc[:, -1]

In [38]:
print(X.shape)
print(Y.shape)

(20640, 8)
(20640,)


In [39]:
from sklearn.model_selection import train_test_split

In [48]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size= 0.2, random_state= 1)

In [49]:
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)


(16512, 8)
(4128, 8)
(16512,)
(4128,)


In [51]:
from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor()
model.fit(X_train,Y_train)

In [52]:
y_pred = model.predict(X_test)

In [54]:
print(y_pred)

[4.429 0.683 2.301 ... 1.667 2.707 0.983]


In [56]:
from sklearn.metrics import r2_score
r2_score(y_pred, Y_test)

0.6035326989144342

In [58]:
#Hyperparameter Tuning

In [72]:
params = {
    'criterion' : ['squared_error','absolute_error', 'friedman_mse'],
    'splitter': ['best', 'random'],
    'max_depth' : [1,2,3,4,5],
    'max_features' : ['auto', 'sqrt', 'log2']
}

In [73]:
from sklearn.model_selection import GridSearchCV

In [74]:
regressior = DecisionTreeRegressor()


In [75]:
model1  = GridSearchCV(regressior, param_grid= params, cv = 2, scoring= 'neg_mean_squared_error', verbose = 2) 

In [76]:
model1.fit(X_train, Y_train)

Fitting 2 folds for each of 90 candidates, totalling 180 fits
[CV] END criterion=squared_error, max_depth=1, max_features=auto, splitter=best; total time=   0.0s
[CV] END criterion=squared_error, max_depth=1, max_features=auto, splitter=best; total time=   0.0s
[CV] END criterion=squared_error, max_depth=1, max_features=auto, splitter=random; total time=   0.0s
[CV] END criterion=squared_error, max_depth=1, max_features=auto, splitter=random; total time=   0.0s
[CV] END criterion=squared_error, max_depth=1, max_features=sqrt, splitter=best; total time=   0.0s
[CV] END criterion=squared_error, max_depth=1, max_features=sqrt, splitter=best; total time=   0.0s
[CV] END criterion=squared_error, max_depth=1, max_features=sqrt, splitter=random; total time=   0.0s
[CV] END criterion=squared_error, max_depth=1, max_features=sqrt, splitter=random; total time=   0.0s
[CV] END criterion=squared_error, max_depth=1, max_features=log2, splitter=best; total time=   0.0s
[CV] END criterion=squared_err

In [77]:
model1.best_params_

{'criterion': 'absolute_error',
 'max_depth': 5,
 'max_features': 'log2',
 'splitter': 'best'}

In [88]:
from sklearn import tree

In [89]:
plt.figure(figsize= (10, 8))
tree.plot_tree(model1, filled= True)
plt.show()

InvalidParameterError: The 'decision_tree' parameter of plot_tree must be an instance of 'sklearn.tree._classes.DecisionTreeClassifier' or an instance of 'sklearn.tree._classes.DecisionTreeRegressor'. Got GridSearchCV(cv=2, estimator=DecisionTreeRegressor(),
             param_grid={'criterion': ['squared_error', 'absolute_error',
                                       'friedman_mse'],
                         'max_depth': [1, 2, 3, 4, 5],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'splitter': ['best', 'random']},
             scoring='neg_mean_squared_error', verbose=2) instead.

<Figure size 1000x800 with 0 Axes>