In [1]:
import pandas as pd, warnings, numpy as np, matplotlib.pyplot as plt, seaborn as sns,graphviz
from sklearn.datasets import load_boston,load_iris
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score,RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree,export_graphviz
from sklearn.metrics import accuracy_score, classification_report,r2_score,confusion_matrix,plot_confusion_matrix
warnings.filterwarnings('ignore')

# PARAMETERS

In [2]:
# Hyper-parameter search space for DecisionTreeRegressor, collected into
# `decision_params` for the RandomizedSearchCV / GridSearchCV helpers.
criterion = ['squared_error','friedman_mse','absolute_error','poisson']
splitter = ['best','random']
# NOTE(review): depth 9 is absent from this list — confirm whether that is intentional.
max_depth = [1,2,3,4,5,6,7,8,10,11,12]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
# `None` replaces the deprecated 'auto' option: for a regression tree both mean
# "consider every feature at each split", but 'auto' emits a FutureWarning in
# scikit-learn 1.1 and is rejected from 1.3 onwards. The number of candidates
# in the grid is unchanged.
max_features = ['log2','sqrt',None]
decision_params = {
    'criterion':criterion,
    'splitter':splitter,
    'max_depth':max_depth,
    'min_samples_split':min_samples_split,
    'min_samples_leaf':min_samples_leaf,
    'max_features':max_features,
}

In [3]:
def Evaluate_Regressor(model,X_test,y_test):
    """Print and return a MAPE-based accuracy score for a fitted regressor.

    Parameters
    ----------
    model : object exposing ``predict(X)``.
    X_test : features passed unchanged to ``model.predict``.
    y_test : true target values, element-wise compatible with the predictions.

    Returns
    -------
    float
        ``100 - MAPE`` where MAPE is the mean of ``|error| / |y_test|`` in percent.

    Notes
    -----
    Targets equal to zero are not special-cased: they lead to a division by
    zero and therefore an ``inf``/``nan`` result.
    """
    y_pred = model.predict(X_test)
    errors = np.abs(y_pred - y_test)
    # Divide by the magnitude of the target: with a signed denominator, negative
    # targets produce negative ratios that cancel positive ones and understate MAPE.
    mape = 100 * np.mean(errors / np.abs(y_test))
    accuracy = 100 - mape
    print('Model Performance')
    # The unit wording of this message is kept exactly as in the original output.
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    return accuracy

In [4]:
def R2_score(model,X,y,name_r_score):
    """Print the R² of ``model`` on ``(X, y)`` as a percentage, prefixed by ``name_r_score``.

    Nothing is returned; the score is only written to standard output.
    """
    predictions = model.predict(X)
    score = r2_score(y, predictions)
    report_line = '{}: {:.2%}'.format(name_r_score, score)
    print(report_line)

# DATA

In [5]:
# Load the Boston housing dataset and hold out 30% of the rows for testing.
# NOTE(review): `load_boston` was removed in scikit-learn 1.2, so this cell
# needs an older release — confirm the pinned version.
boston_df = load_boston()  # dataset container, not a DataFrame despite the name
X = pd.DataFrame(data=boston_df.data, columns=boston_df.feature_names)
y = boston_df.target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,  # fixed seed so the split is reproducible
)

# BASE  MODEL

In [6]:
# Baseline: an unconstrained regression tree fitted on the training split.
# random_state is pinned (42, like the train/test split) because the tree
# shuffles features at every split and breaks ties randomly, so an unseeded
# fit can differ between runs.
base_model = DecisionTreeRegressor(random_state=42)
base_model.fit(X_train,y_train)

# PRE-PRUNING BY MAX DEPTH (giới hạn độ sâu của cây để tránh overfitting)

In [7]:
# Depth-limited tree (max_depth=2). Despite the variable name, capping the depth
# up front is a pre-pruning (early stopping) technique.
# random_state is pinned so refitting this cell reproduces the same tree.
post_model = DecisionTreeRegressor(max_depth=2, random_state=42)
post_model.fit(X_train,y_train)

# POST-PRUNING BY COST COMPLEXITY (chọn ccp_alpha: ccp_alpha càng tăng thì số nút lá càng giảm)
+ CHOOSE CCP_ALPHA
+ RANDOMIZEDSEARCHCV / GRIDSEARCHCV

CHOOSE IDEAL CCP_ALPHA

In [8]:
# Compute the cost-complexity pruning path of a fully grown tree.
# random_state is pinned so the path (and therefore the candidate alphas) is
# the same on every run.
model = DecisionTreeRegressor(random_state=42).fit(X_train,y_train)
path = model.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
# Drop the last alpha of the path — presumably the largest one, which prunes
# the tree down to a single node (see the scikit-learn pruning example).
ccp_alphas = ccp_alphas[:-1]

In [9]:
# For every candidate alpha, record [alpha, mean R², std of R²] from 10-fold CV
# on the training split.
alpha_loop_values = []
for ccp_alpha in ccp_alphas:
    # No explicit .fit() here: cross_val_score clones and fits the estimator on
    # each fold itself, so a prior fit on the full training set is wasted work.
    # random_state is pinned so the scores are reproducible.
    final_tree = DecisionTreeRegressor(ccp_alpha=ccp_alpha, random_state=42)
    r_scores = cross_val_score(final_tree,X_train,y_train,cv=10,scoring='r2')
    alpha_loop_values.append([ccp_alpha,np.mean(r_scores),np.std(r_scores)])

In [10]:
# Tabulate the cross-validation results, one row per candidate alpha.
alpha_results = pd.DataFrame(alpha_loop_values,columns=['Alpha','Mean R2 score','Standard'])
# Keep the 9 rows with the highest mean R², then pick the one whose fold scores
# vary the least; its row label is used to index into `ccp_alphas`.
top_by_mean = alpha_results.sort_values(by='Mean R2 score',ascending=False).head(9)
steadiest_row = top_by_mean['Standard'].sort_values().index[0]
ideal_ccp_alpha = ccp_alphas[steadiest_row]

In [11]:
# alpha_results.plot(x='Alpha',y='Mean R2 score',yerr='Standard',marker='o',linestyle='--');

RandomizedSearchCV

In [12]:
def Hypertuning_rscv(model,params,n_iter,cv,X_train,y_train):
    """Run a randomized hyper-parameter search and return its best result.

    ``n_iter`` parameter settings are sampled from ``params`` and evaluated with
    ``cv``-fold cross-validation on ``(X_train, y_train)``. The sampling seed is
    fixed to 42 and all available cores are used.

    Returns
    -------
    tuple
        ``(best_params, best_score, best_estimator)`` of the fitted search.
    """
    search = RandomizedSearchCV(
        estimator=model,
        param_distributions=params,
        n_iter=n_iter,
        cv=cv,
        n_jobs=-1,
        verbose=2,
        random_state=42,
    )
    search.fit(X_train, y_train)
    return search.best_params_, search.best_score_, search.best_estimator_

In [13]:
# Randomized search: 10 sampled settings, 3-fold CV, on a tree pre-configured
# with the chosen ccp_alpha. The estimator gets its own random_state because the
# search seed only controls which settings are sampled, not how each tree is
# grown (the grid includes splitter='random' and feature subsampling).
best_params_random, best_score_random, best_estimator_random = Hypertuning_rscv(
    DecisionTreeRegressor(ccp_alpha=ideal_ccp_alpha, random_state=42),
    decision_params, 10, 3, X_train, y_train,
)
# The generic names are kept for backward compatibility; the grid-search cell
# reassigns them, so use the *_random names to refer to this search's result.
best_params, best_score = best_params_random, best_score_random

Fitting 3 folds for each of 10 candidates, totalling 30 fits


# GridSearchCV

In [14]:
def Hypertuning_gscv(model,params,cv,X_train,y_train):
    """Run an exhaustive grid search and return its best result.

    Every combination in ``params`` is evaluated with ``cv``-fold
    cross-validation on ``(X_train, y_train)`` using all available cores.

    Returns
    -------
    tuple
        ``(best_params, best_score, best_estimator)`` of the fitted search.
    """
    search = GridSearchCV(
        estimator=model,
        param_grid=params,
        cv=cv,
        n_jobs=-1,
        verbose=2,
    )
    search.fit(X_train, y_train)
    return search.best_params_, search.best_score_, search.best_estimator_

In [15]:
# Exhaustive grid search with 10-fold CV on a tree pre-configured with the
# chosen ccp_alpha. The estimator is seeded because the grid includes
# splitter='random' and feature subsampling, which would otherwise make the
# cross-validation scores differ between runs.
best_params_grid, best_score_grid, best_estimator_grid = Hypertuning_gscv(
    DecisionTreeRegressor(ccp_alpha=ideal_ccp_alpha, random_state=42),
    decision_params, 10, X_train, y_train,
)
# The generic names are kept for backward compatibility; they are also assigned
# by the randomized-search cell, so use the *_grid names for this search's result.
best_params, best_score = best_params_grid, best_score_grid

Fitting 10 folds for each of 2376 candidates, totalling 23760 fits


# Evaluation

R2 SCORE

In [23]:
# Disabled evaluation calls: uncomment to print train and test R² for the base,
# depth-limited, randomized-search and grid-search models.
# R2_score(base_model,X_train,y_train,'Train R2 score Base Model')
# R2_score(base_model,X_test,y_test,'Test R2 score Base Model')
# print('')
# R2_score(post_model,X_train,y_train,'Train R2 score Post Model')
# R2_score(post_model,X_test,y_test,'Test R2 score Post Model')
# print('')
# R2_score(best_estimator_random,X_train,y_train,'Train R2 score Randomized')
# R2_score(best_estimator_random,X_test,y_test,'Test R2 score Randomized')
# print('')
# R2_score(best_estimator_grid,X_train,y_train,'Train R2 score Grid')
# R2_score(best_estimator_grid,X_test,y_test,'Test R2 score Grid')

MEAN ERROR & MAPE

In [22]:
# Disabled evaluation calls: uncomment to print the mean absolute error and the
# MAPE-based accuracy of each model on the test split.
# NOTE: the second result is stored in `post_accuracy`; assigning it back to
# `post_model` would overwrite the fitted tree with a float.
# base_accuracy = Evaluate_Regressor(base_model,X_test,y_test)
# post_accuracy = Evaluate_Regressor(post_model,X_test,y_test)
# random_accuracy_regressor = Evaluate_Regressor(best_estimator_random,X_test,y_test)
# grid_accuracy_regressor = Evaluate_Regressor(best_estimator_grid,X_test,y_test)

In [21]:
# plt.figure(figsize=(100,100))
# plot_tree(best_estimator_random,filled=True,rounded=True,feature_names=X_train.columns);