## # Machine Learning - Decision Tree Regressor

### Step-01: Import all the libraries required to perform Decision Tree Regression

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

### Step-02: Import fetch_california_housing from sklearn.datasets 

In [2]:
from sklearn.datasets import fetch_california_housing

In [3]:
california = fetch_california_housing()

In [4]:
california.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])

In [5]:
print(california.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

In [6]:
x = pd.DataFrame(california.data, columns = california.feature_names)
y = california.target

In [7]:
x.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [8]:
y

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894])

In [9]:
print("x.shape: {}" .format(x.shape))
print("y.shape: {}" .format(y.shape))

x.shape: (20640, 8)
y.shape: (20640,)


### Step-03: Import train_test_split import sklearn.model_selection

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

In [12]:
x_train.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
3132,2.875,15.0,5.891892,1.124324,960.0,2.594595,35.16,-117.99
3887,2.9389,18.0,4.308966,1.097931,1855.0,2.558621,34.21,-118.53
14898,2.3929,30.0,3.0,1.083333,220.0,4.583333,32.59,-117.08
12444,1.2656,17.0,6.466667,1.6,29.0,1.933333,33.92,-114.67
3504,4.8359,35.0,5.473016,0.987302,992.0,3.149206,34.26,-118.45


In [13]:
x_test.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
7521,1.9038,39.0,3.777344,1.0,903.0,3.527344,33.91,-118.26
15675,4.1047,52.0,3.935484,1.052109,1259.0,1.562035,37.8,-122.43
15893,2.9063,52.0,6.590444,1.122867,1025.0,3.498294,37.73,-122.39
18071,9.1569,22.0,7.252669,0.925267,773.0,2.75089,37.28,-122.01
12842,2.5062,50.0,5.945946,1.013514,1148.0,7.756757,38.66,-121.4


In [14]:
y_train

array([0.878, 2.132, 1.344, ..., 1.361, 1.905, 1.663])

In [15]:
y_test

array([0.931, 4.667, 1.92 , ..., 2.399, 2.318, 2.018])

In [16]:
print("x_train.shape: {}" .format(x_train.shape))
print("x_test.shape: {}" .format(x_test.shape))
print("y_train.shape: {}" .format(y_train.shape))
print("y_test.shape: {}" .format(y_test.shape))

x_train.shape: (16512, 8)
x_test.shape: (4128, 8)
y_train.shape: (16512,)
y_test.shape: (4128,)


### Step-04: Import DecisionTreeRegressor from sklearn.tree

In [17]:
from sklearn.tree import DecisionTreeRegressor

In [18]:
decision_tree_regressor = DecisionTreeRegressor()

In [19]:
decision_tree_regressor.fit(x_train, y_train)

In [20]:
y_pred = decision_tree_regressor.predict(x_test)

In [21]:
y_pred

array([0.986  , 5.00001, 1.875  , ..., 2.455  , 2.182  , 2.129  ])

### Step-05: Import r2_score from sklearn.metrics

In [22]:
from sklearn.metrics import r2_score

In [23]:
print("R Squared Score: {}" .format(r2_score(y_test, y_pred)))

R Squared Score: 0.6159802388423319


### Step-06: Hyperparameter Tuning

In [33]:
from sklearn.model_selection import GridSearchCV

In [35]:
parameters = {
    
    "criterion": ["squared_error", "friedman_mse", "absolute_error", "poisson"],
    "splitter": ["best", "random"],
    "max_depth": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
    "max_features": ["sqrt", "log2", "None"]
}

In [36]:
grid_search_cv = GridSearchCV(decision_tree_regressor, param_grid = parameters, cv = 2, scoring = "neg_mean_squared_error")

In [37]:
grid_search_cv.fit(x_train, y_train)

In [38]:
grid_search_cv.best_params_

{'criterion': 'friedman_mse',
 'max_depth': 10,
 'max_features': 'log2',
 'splitter': 'best'}

In [41]:
y_pred = grid_search_cv.predict(x_test)

In [42]:
y_pred

array([1.22352982, 4.99519688, 2.2664    , ..., 2.51225081, 2.16289655,
       1.59714286])

In [43]:
print("R Squared Score: {}" .format(r2_score(y_test, y_pred)))

R Squared Score: 0.6612669859914158
