In [6]:
import pandas as pd
from pandas_datareader import data
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV

In [7]:
from sklearn.datasets import fetch_california_housing

# Fetch the California housing data and assemble features and target
# into a single DataFrame; the target is appended as the last column.
california = fetch_california_housing()
df = pd.DataFrame(
    data=california.data,
    columns=california.feature_names,
).assign(MedHouseVal=california.target)

# Preview the first rows of the assembled frame.
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [9]:
# Dimensions of the assembled frame as a (rows, columns) tuple.
df.shape

(20640, 9)

In [12]:
# Prepare data: every column except the target is used as a feature.
X = df.drop(columns='MedHouseVal')
y = df['MedHouseVal']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Baseline Decision Tree Regressor (depth capped at 5).
# random_state is pinned because scikit-learn permutes the features at every
# split, so equally good splits can otherwise be picked differently from run
# to run. The same seed is used for the tuned estimator in the grid search.
model = DecisionTreeRegressor(criterion='squared_error', max_depth=5, random_state=42)
model.fit(X_train, y_train)

# Predict on the held-out rows.
y_pred = model.predict(X_test)

# Report error metrics on the test set.
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('R2 Score:', r2_score(y_test, y_pred))

Mean Absolute Error: 0.5222592972077787
Mean Squared Error: 0.5245146178314737
R2 Score: 0.5997321244428705


# Hyperparameter Tuning

In [13]:
# Candidate hyperparameter values explored by the search.
param_grid = dict(
    max_depth=[None, 5, 10, 15, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive 5-fold cross-validated search, scored by R2, over a seeded tree.
base_tree = DecisionTreeRegressor(random_state=42)
grid_search = GridSearchCV(
    estimator=base_tree,
    param_grid=param_grid,
    cv=5,
    scoring='r2',
)

# Run the search on the training split only.
grid_search.fit(X_train, y_train)

# Report the winning combination and its cross-validated score.
print("Best parameters:", grid_search.best_params_)
print("Best R2 score:", grid_search.best_score_)

# Score the best estimator found by the search on the held-out test set.
best_model = grid_search.best_estimator_
y_pred_tuned = best_model.predict(X_test)
tuned_report = [
    ('Mean Absolute Error (Tuned Model):', metrics.mean_absolute_error),
    ('Mean Squared Error (Tuned Model):', metrics.mean_squared_error),
    ('R2 Score (Tuned Model):', r2_score),
]
for label, score_fn in tuned_report:
    print(label, score_fn(y_test, y_pred_tuned))

Best parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2}
Best R2 score: 0.7100830403599112
Mean Absolute Error (Tuned Model): 0.4311152226926481
Mean Squared Error (Tuned Model): 0.40840458615620523
R2 Score (Tuned Model): 0.6883380738855668


# Feature Importance


In [15]:
# Rank features from most to least important and print one per line.
# NOTE(review): this reads `model` (the max_depth=5 baseline), not the tuned
# `best_model` — confirm that is the model you intend to inspect.
ranked_features = sorted(
    zip(model.feature_importances_, X_train.columns),
    reverse=True,
)
for weight, feature in ranked_features:
    print(feature, weight)

MedInc 0.7712117162048102
AveOccup 0.12840674614895986
HouseAge 0.04162087993607815
AveRooms 0.03126072126800427
Latitude 0.022049480286783087
Population 0.0024849982871781528
Longitude 0.002096950201377061
AveBedrms 0.0008685076668091814
