In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.datasets import fetch_california_housing

In [3]:
dataset = fetch_california_housing()

In [4]:
d1 = pd.DataFrame(dataset.data,columns=dataset.feature_names)
d2 = pd.DataFrame(dataset.target,columns=['price'])
df = pd.concat([d1,d2],axis=1)

In [5]:
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,price
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [6]:
x = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [7]:
x

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [8]:
y

0        4.526
1        3.585
2        3.521
3        3.413
4        3.422
         ...  
20635    0.781
20636    0.771
20637    0.923
20638    0.847
20639    0.894
Name: price, Length: 20640, dtype: float64

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.33,random_state=42)

In [15]:
parameter = {
    'splitter':('best','random'),
    'max_depth':[2,3,4,5],
    'min_samples_split':[2,3,4,5,6],
    'min_samples_leaf':[1,2,3],
    'max_features':('auto','sqrt','log2')
}

In [16]:
from sklearn.model_selection import GridSearchCV

In [17]:
from sklearn.tree import DecisionTreeRegressor
obj = GridSearchCV(DecisionTreeRegressor(),param_grid=parameter,cv=10,scoring='neg_mean_squared_error')

In [18]:
obj.fit(x_train,y_train)

In [19]:
obj.best_params_

{'max_depth': 5,
 'max_features': 'auto',
 'min_samples_leaf': 2,
 'min_samples_split': 5,
 'splitter': 'best'}

In [20]:
treeClassifier = DecisionTreeRegressor(max_depth=5,max_features='auto',min_samples_leaf=2,min_samples_split=5,splitter='best')

In [22]:
treeClassifier.fit(x_train,y_train)

In [23]:
y_pred = treeClassifier.predict(x_test)

In [24]:
y_pred

array([1.33321946, 1.31363306, 3.36939367, ..., 1.67781843, 1.93822755,
       2.24615597])

In [25]:
y_test

20046    0.47700
3024     0.45800
15663    5.00001
20484    2.18600
9814     2.78000
          ...   
15316    1.66100
14772    0.93600
12870    1.07000
13476    1.30700
16123    3.07100
Name: price, Length: 6812, dtype: float64

In [26]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

In [27]:
print(mean_squared_error(y_test,y_pred))
print(mean_absolute_error(y_test,y_pred))
print(r2_score(y_test,y_pred))

0.5343628073475464
0.5295647885818756
0.5990048856345183
