### 1. Classification with Tree:

In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn import metrics, preprocessing
from sklearn.datasets import load_boston
warnings.filterwarnings(action='ignore')                  # Turn off the warnings.
%matplotlib inline

#### 1.1. Read in data:

In [None]:
# Go to the directory where the data file is located. 
# os.chdir(r'~~')                # Please, replace the path with your own.   

In [12]:
# Read the already pre-processed data.
df = pd.read_csv('data_titanic_2.csv', header='infer')

In [13]:
df.shape

(889, 21)

In [14]:
df.head(4)

Unnamed: 0,Embarked_Q,Embarked_S,Sex_male,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,SibSp_1,...,SibSp_3,SibSp_4,SibSp_5,SibSp_8,Pclass_2,Pclass_3,"Age_(21.0, 30.0]","Age_(30.0, 35.0]","Age_(35.0, 80.0]",Survived
0,0,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,1,1,0,0,0
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,1
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,1
3,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,1


In [16]:
X = df.drop(columns=['Survived'])
Y = df.Survived

In [17]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1234)
model = DecisionTreeClassifier(random_state=4,max_depth=6,criterion='entropy')
model.fit(x_train,y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=6, random_state=4)

In [19]:
from sklearn import tree
print(tree.export_text(model))

|--- feature_2 <= 0.50
|   |--- feature_16 <= 0.50
|   |   |--- feature_9 <= 0.50
|   |   |   |--- feature_17 <= 0.50
|   |   |   |   |--- class: 1
|   |   |   |--- feature_17 >  0.50
|   |   |   |   |--- feature_15 <= 0.50
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- feature_15 >  0.50
|   |   |   |   |   |--- feature_1 <= 0.50
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- feature_1 >  0.50
|   |   |   |   |   |   |--- class: 1
|   |   |--- feature_9 >  0.50
|   |   |   |--- feature_1 <= 0.50
|   |   |   |   |--- class: 1
|   |   |   |--- feature_1 >  0.50
|   |   |   |   |--- feature_17 <= 0.50
|   |   |   |   |   |--- feature_3 <= 0.50
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- feature_3 >  0.50
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |--- feature_17 >  0.50
|   |   |   |   |   |--- feature_3 <= 0.50
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- feature_3 >  0.50
|   |   |   |   |   |   |--- class: 0
|   |

#### 1.2. Tree hyperparameter optimization:

In [37]:

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix 

In [43]:

param_grid = { 'criterion':['gini','entropy'],'max_depth': np.arange(3, 15)}
# decision tree model
dtree_model=DecisionTreeClassifier()
#use gridsearch to test all values
dtree_gscv = GridSearchCV(dtree_model, param_grid, cv=5)
#fit model to data
dtree_gscv.fit(x_train, y_train)
print(dtree_gscv.best_params_)


{'criterion': 'gini', 'max_depth': 3}


### 2. Regression with Tree:

#### 2.1. Read in data: 

In [22]:
data = pd.read_csv("boston_test.csv")
data

Unnamed: 0,ID,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat
0,3,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
1,6,0.02985,0.0,2.18,0,0.458,6.430,58.7,6.0622,3,222,18.7,394.12,5.21
2,8,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.90,19.15
3,9,0.21124,12.5,7.87,0,0.524,5.631,100.0,6.0821,5,311,15.2,386.63,29.93
4,10,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168,496,0.17899,0.0,9.69,0,0.585,5.670,28.8,2.7986,6,391,19.2,393.29,17.60
169,497,0.28960,0.0,9.69,0,0.585,5.390,72.9,2.7986,6,391,19.2,396.90,21.14
170,499,0.23912,0.0,9.69,0,0.585,6.019,65.3,2.4091,6,391,19.2,396.90,12.92
171,501,0.22438,0.0,9.69,0,0.585,6.027,79.7,2.4982,6,391,19.2,396.90,14.33


In [25]:
data.shape

(173, 14)

In [24]:
# Display the description on the data.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173 entries, 0 to 172
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   ID       173 non-null    int64  
 1   crim     173 non-null    float64
 2   zn       173 non-null    float64
 3   indus    173 non-null    float64
 4   chas     173 non-null    int64  
 5   nox      173 non-null    float64
 6   rm       173 non-null    float64
 7   age      173 non-null    float64
 8   dis      173 non-null    float64
 9   rad      173 non-null    int64  
 10  tax      173 non-null    int64  
 11  ptratio  173 non-null    float64
 12  black    173 non-null    float64
 13  lstat    173 non-null    float64
dtypes: float64(10), int64(4)
memory usage: 19.0 KB


In [27]:
data.describe()

Unnamed: 0,ID,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat
count,173.0,173.0,173.0,173.0,173.0,173.0,173.0,173.0,173.0,173.0,173.0,173.0,173.0,173.0
mean,258.404624,4.100862,12.66185,10.835145,0.086705,0.549981,6.321237,69.245665,3.958865,9.387283,406.231214,18.469942,351.299711,12.917977
std,143.289788,10.607761,24.536277,6.596488,0.282219,0.117826,0.700621,28.248244,2.324131,8.662621,164.480626,2.196196,99.781464,7.293408
min,3.0,0.01381,0.0,0.46,0.0,0.392,4.138,2.9,1.1781,1.0,187.0,12.6,0.32,1.92
25%,136.0,0.08221,0.0,5.32,0.0,0.447,5.895,42.8,2.0107,4.0,279.0,17.0,371.72,6.87
50%,268.0,0.25199,0.0,8.56,0.0,0.538,6.223,79.2,3.4211,5.0,330.0,19.1,390.07,12.12
75%,381.0,3.67367,20.0,18.1,0.0,0.624,6.674,94.6,5.4007,24.0,666.0,20.2,396.06,17.21
max,505.0,88.9762,95.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,34.37


In [29]:
data.isnull().sum()

ID         0
crim       0
zn         0
indus      0
chas       0
nox        0
rm         0
age        0
dis        0
rad        0
tax        0
ptratio    0
black      0
lstat      0
dtype: int64

In [30]:
# The explanatory variables.
X=data.iloc[:,0:-1]
Y=data.iloc[:,-1]

In [31]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=1234)

#### 2.2. Tree hyperparameter optimization:

In [32]:
regressor = DecisionTreeRegressor(max_depth=6)
DT_reg=regressor.fit(X_train, Y_train)

In [33]:
DecisionTreeRegressor(max_depth=6)

DecisionTreeRegressor(max_depth=6)

NOTE: We can compare the above result with that obtained using linear regression where the RMSE was 5.33.

In [34]:
print(DT_reg.score(X_train,Y_train))
print(DT_reg.score(X_test,Y_test))

0.9321758169058988
0.5313874807460731


In [35]:
Y_pred=DT_reg.predict(X_test)


In [49]:
from sklearn.metrics import make_scorer, r2_score


scoring = make_scorer(r2_score)
g_cv = GridSearchCV(DecisionTreeRegressor(random_state=0),
              param_grid={'min_samples_split': range(2, 10)},
              scoring=scoring, cv=5, refit=True)

g_cv.fit(X_train, Y_train)
g_cv.best_params_



{'min_samples_split': 9}

In [50]:
result = g_cv.cv_results_
# print(result)
r2_score(Y_test, g_cv.best_estimator_.predict(X_test))

0.3906608572776046

Hint : 

https://www.youtube.com/watch?v=KzIQ3G_TEFg

https://towardsdatascience.com/gridsearchcv-for-beginners-db48a90114ee


https://www.mygreatlearning.com/blog/gridsearchcv/