## 作業

1. 試著調整 DecisionTreeClassifier(...) 中的參數，並觀察是否會改變結果？
2. 改用其他資料集 (boston, wine)，並與回歸模型的結果進行比較

In [1]:
import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

#%matplotlib inline

### Boston dataset

#### Load and examine the data

In [2]:
# Load the Boston housing dataset bundled with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2;
# confirm the installed version still provides it before re-running this cell.
dataset = datasets.load_boston()

# Wrap the feature matrix in a DataFrame labelled with the feature names,
# then preview the first five rows.
df = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)
df.head(5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [3]:
# Summarize the frame: index range, per-column non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 13 columns):
CRIM       506 non-null float64
ZN         506 non-null float64
INDUS      506 non-null float64
CHAS       506 non-null float64
NOX        506 non-null float64
RM         506 non-null float64
AGE        506 non-null float64
DIS        506 non-null float64
RAD        506 non-null float64
TAX        506 non-null float64
PTRATIO    506 non-null float64
B          506 non-null float64
LSTAT      506 non-null float64
dtypes: float64(13)
memory usage: 51.5 KB


#### Linear regression - baseline

In [4]:
# Hold out 10% of the samples as the test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data,
    dataset.target,
    test_size=0.1,
    random_state=0,
)

# Create the linear regression model and fit it on the training split
# (fit returns the estimator itself, so the calls can be chained)
rm = linear_model.LinearRegression().fit(X_train, y_train)

# Predict the targets of the held-out test samples
y_pred = rm.predict(X_test)

In [5]:
# Inspect the fitted regression model's coefficients (one value per input feature)
print('Coefficients: ', rm.coef_)

# Disabled plotting code: scatter of each coefficient against its feature name.
# It stays commented out because the matplotlib import at the top is commented out too.
#plt.scatter(range(len(rm.coef_)), rm.coef_, marker='o', color='royalblue', label='Linear Regression')
#plt.xticks(range(len(dataset.feature_names)), dataset.feature_names, rotation=45)
#plt.xlabel('Feature')
#plt.ylabel('Coefficient')

Coefficients:  [-1.14644795e-01  3.62004052e-02  6.53873262e-03  2.19924733e+00
 -1.59109961e+01  4.26798929e+00 -1.01602089e-02 -1.34698690e+00
  2.71154731e-01 -1.16326045e-02 -1.01714981e+00  9.81293722e-03
 -4.43797298e-01]


In [6]:
# Measure the gap between predicted and actual targets with mean squared error (MSE)
print("Mean squared error: %.2f" % mean_squared_error(y_true=y_test, y_pred=y_pred))

Mean squared error: 41.72


#### Decision tree regression

In [7]:
# Split into training/test sets. This uses the same 10% hold-out and seed as the
# linear-regression baseline, so both models are scored on identical test samples.
X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.1, random_state=0)

# Build the model. random_state is fixed because the tree builder randomly permutes
# the candidate features at every split; without a seed, equally good splits can be
# resolved differently from run to run and the reported MSE is not reproducible.
rm = DecisionTreeRegressor(random_state=0)

# Fit the tree on the training data
rm.fit(X_train, y_train)

# Predict the targets of the held-out test samples
y_pred = rm.predict(X_test)

In [8]:
# Show the fitted tree's feature importances as a single-row table, one column per feature.
# reshape(1, -1) lets NumPy infer the column count, so no helper variable is needed.
# The previous version stored that count in a variable named `len`, which shadowed the
# built-in len() for every cell executed afterwards in the same kernel.
feature_df = pd.DataFrame(rm.feature_importances_.reshape(1, -1), columns=dataset.feature_names)
feature_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.012868,0.001216,0.004678,0.000858,0.062257,0.578356,0.011971,0.020699,0.000621,0.025404,0.031045,0.01125,0.23878


In [9]:
# Measure the gap between predicted and actual targets with mean squared error (MSE)
print("Mean squared error: %.2f" % mean_squared_error(y_true=y_test, y_pred=y_pred))

Mean squared error: 31.18


### Wine dataset

#### Load and examine the data

In [10]:
# Load the wine classification dataset bundled with scikit-learn
dataset = datasets.load_wine()

# Wrap the feature matrix in a DataFrame labelled with the feature names,
# then preview the first five rows.
df = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)
df.head(5)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [11]:
# Summarize the frame: index range, per-column non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 13 columns):
alcohol                         178 non-null float64
malic_acid                      178 non-null float64
ash                             178 non-null float64
alcalinity_of_ash               178 non-null float64
magnesium                       178 non-null float64
total_phenols                   178 non-null float64
flavanoids                      178 non-null float64
nonflavanoid_phenols            178 non-null float64
proanthocyanins                 178 non-null float64
color_intensity                 178 non-null float64
hue                             178 non-null float64
od280/od315_of_diluted_wines    178 non-null float64
proline                         178 non-null float64
dtypes: float64(13)
memory usage: 18.2 KB


In [12]:
# List the distinct class labels, then count how many samples fall in each class
print(f"Unique target values in the dataset = {np.unique(dataset.target)}")
np.bincount(dataset.target)

Unique target values in the dataset = [0 1 2]


array([59, 71, 48])

#### Logistic regression - baseline

In [13]:
# Split into training/test sets (10% held out).
# The first target was previously spelled with a full-width "Ｘ" (U+FF38). Python
# NFKC-normalizes identifiers, so it still bound X_train, but the name looked identical
# while being invisible to ASCII search and tooling; it is now plain ASCII.
# NOTE(review): this split uses random_state=3 while the decision-tree classification
# cell below uses random_state=0, so the two accuracies are measured on different
# test samples. Align the seeds if a like-for-like comparison is intended.
X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.1, random_state=3)

# Build the logistic regression model
rm = linear_model.LogisticRegression()

# Fit the model on the training data
rm.fit(X_train, y_train)

# Predict the class labels of the held-out test samples
y_pred = rm.predict(X_test)



In [14]:
# Inspect the fitted model's coefficients (the recorded output shows one row per class)
print('Coefficients: ', rm.coef_)

Coefficients:  [[-5.12014231e-01  6.84580582e-01  1.16616376e+00 -5.97596009e-01
  -2.43211085e-02 -1.07841972e-02  1.32476506e+00  8.66053349e-02
  -2.91220201e-01 -1.70682078e-01 -1.97444584e-01  8.05470417e-01
   1.55623997e-02]
 [ 8.81424058e-01 -1.05950703e+00 -7.57852321e-01  2.15995746e-01
   8.52369179e-03  6.76019364e-01  6.91060040e-01  4.11280806e-01
   4.92641306e-01 -1.80041467e+00  9.79871290e-01  1.90281790e-01
  -1.53550620e-02]
 [-3.28356556e-01  5.79499856e-01  5.63602744e-02  1.54213452e-01
   1.56959360e-02 -7.50364336e-01 -1.90666427e+00 -1.24139305e-01
  -7.10866352e-01  1.00409450e+00 -5.01202931e-01 -1.21907768e+00
   1.10345792e-03]]


In [15]:
# Compare predicted and actual labels: fraction of test samples classified correctly
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy: ", acc)

Accuracy:  0.9444444444444444


#### Decision tree classification

In [16]:
# Split into training/test sets (10% held out, fixed seed for a repeatable split).
# NOTE(review): the logistic-regression baseline above splits with random_state=3,
# so its accuracy is measured on a different test set than this one.
X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.1, random_state=0)

# Build the model. random_state is fixed because the tree builder randomly permutes
# the candidate features at every split; without a seed, equally good splits can be
# resolved differently from run to run, changing the importances and the accuracy.
rm = DecisionTreeClassifier(criterion='gini', random_state=0)

# Fit the tree on the training data
rm.fit(X_train, y_train)

# Predict the class labels of the held-out test samples
y_pred = rm.predict(X_test)

In [17]:
# Show the fitted tree's feature importances as a single-row table, one column per feature.
# reshape(1, -1) lets NumPy infer the column count, so no helper variable is needed.
# The previous version stored that count in a variable named `len`, which shadowed the
# built-in len() for every cell executed afterwards in the same kernel.
feature_df = pd.DataFrame(rm.feature_importances_.reshape(1, -1), columns=dataset.feature_names)
feature_df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,0.0,0.016621,0.018019,0.0,0.0,0.0,0.405133,0.0,0.0,0.389038,0.018564,0.034236,0.118389


In [18]:
# Compare predicted and actual labels: fraction of test samples classified correctly
score = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy: ", score)

Accuracy:  1.0
