In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import warnings
# NOTE(review): called with no category, this silences every warning for the
# rest of the session, including pandas / scikit-learn deprecation notices.
warnings.filterwarnings('ignore')

In [None]:
# Load the iris dataset.
iris_data = load_iris()

In [None]:
# Display the names of the iris feature columns.
iris_data.feature_names

In [None]:
# Display the names of the iris target classes.
iris_data.target_names

In [None]:
# Split into training and test sets (20% held out as test data, fixed random_state).
X_train, X_test, y_train, y_test = train_test_split(iris_data.data, iris_data.target, test_size=0.2, random_state=11)

In [None]:
# Create the DecisionTreeClassifier with a fixed random_state.
dt_clf = DecisionTreeClassifier(random_state=156)

In [None]:
# Train the DecisionTreeClassifier on the training split.
dt_clf.fit(X_train, y_train)

In [None]:
!pip install graphviz

In [None]:
from sklearn.tree import export_graphviz

# Calling export_graphviz() writes the fitted tree to the file given as out_file (tree.dot).
export_graphviz(dt_clf, out_file="tree.dot", class_names=iris_data.target_names, feature_names=iris_data.feature_names, impurity=True, filled=True)

In [None]:
import graphviz

# Read the tree.dot file generated above and visualize it with Graphviz.
with open("tree.dot") as f:
    dot_graph = f.read()
graphviz.Source(dot_graph)

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Read the feature-name file: whitespace-separated, no header row, with the
# two columns named `index` and `feature_name`.
# The separator regex is a raw string so that `\s` is not parsed as an
# (invalid) Python string escape sequence.
feature_name_df = pd.read_csv('features.txt', sep=r'\s+', header=None, names=['index', 'feature_name'], engine='python')
feature_name_df

In [None]:
# Preview the first rows of the feature-name table.
feature_name_df.head()

In [None]:
# Count how many rows share each feature name.
feature_dup_df = feature_name_df.groupby('feature_name').count()
feature_dup_df

In [None]:
# Recompute the per-name row counts, print how many feature names occur more
# than once, and display those names.
feature_dup_df = feature_name_df.groupby('feature_name').count()
print(feature_dup_df[feature_dup_df['index'] > 1].count())
feature_dup_df[feature_dup_df['index'] > 1]

In [None]:
def get_new_feature_name_df(old_feature_name_df):
    """Return a feature-name table in which duplicated names are made unique.

    The first occurrence of a name is kept unchanged; every later occurrence
    gets a ``_<n>`` suffix, where ``n`` is the number of earlier rows carrying
    the same name (``_1``, ``_2``, ...).

    Only the argument is read; the function does not depend on any
    notebook-level variable.

    Parameters
    ----------
    old_feature_name_df : pandas.DataFrame
        Table with a ``feature_name`` column. Other columns (for example
        ``index``) are not carried over. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature_name`` (the de-duplicated name) and ``dup_cnt``
        (0-based occurrence number of the original name), with a fresh
        0..n-1 index.
    """
    names = old_feature_name_df['feature_name']
    # Occurrence number of each row within its group of identical names.
    dup_cnt = old_feature_name_df.groupby('feature_name').cumcount()
    # Vectorized suffixing: keep first occurrences, append "_<n>" to the rest.
    unique_names = names.where(dup_cnt == 0, names + '_' + dup_cnt.astype(str))
    new_feature_name_df = pd.DataFrame({'feature_name': unique_names, 'dup_cnt': dup_cnt})
    return new_feature_name_df.reset_index(drop=True)

In [None]:
# Build a new feature-name DataFrame with get_new_feature_name_df(), which
# renames duplicated feature names, then count rows per resulting name.
new_feature_name_df = get_new_feature_name_df(feature_name_df)
new_feature_dup_df = new_feature_name_df.groupby('feature_name').count()
new_feature_dup_df

In [None]:
# Print how many renamed feature names still occur more than once, and
# display them (here `dup_cnt` holds the per-name row count from the groupby).
print(new_feature_dup_df[new_feature_dup_df['dup_cnt'] > 1].count())
new_feature_dup_df[new_feature_dup_df['dup_cnt'] > 1]

In [None]:
# Convert the names back to a list so they can be assigned as DataFrame column names.
feature_name = new_feature_name_df.iloc[:, 0].values.tolist()
feature_name

In [None]:
# Load the train/test feature files using the de-duplicated feature names as
# column names, and the matching label files as a single `action` column.
# Separator regexes are raw strings so that `\s` is not parsed as an
# (invalid) Python string escape sequence.
X_train = pd.read_csv('X_train.txt', sep=r'\s+', names=feature_name, engine='python')
X_test = pd.read_csv('X_test.txt', sep=r'\s+', names=feature_name, engine='python')
Y_train = pd.read_csv('y_train.txt', sep=r'\s+', header=None, names=['action'], engine='python')
Y_test = pd.read_csv('y_test.txt', sep=r'\s+', header=None, names=['action'], engine='python')

In [None]:
# Display the shapes of the train/test feature and label frames.
X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

In [None]:
# Preview the first rows of the training features.
X_train.head()

In [None]:
# Print the number of training rows per `action` value.
print(Y_train['action'].value_counts())

In [None]:
# Read the activity-label file: whitespace-separated, no header row, with the
# two columns named `index` and `label`.
# The separator regex is a raw string so that `\s` is not parsed as an
# (invalid) Python string escape sequence.
label_name_df = pd.read_csv('activity_labels.txt', sep=r'\s+', header=None, names=['index', 'label'], engine='python')

In [None]:
# Drop the index column and keep only the label names (second column) as a list.
label_name = label_name_df.iloc[:, 1].values.tolist()
label_name

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Decision tree classification: create the model.
dt_HAR = DecisionTreeClassifier(random_state=156)

In [None]:
# Decision tree classification: train the model.
# NOTE(review): Y_train is a one-column DataFrame (`action`), not a 1-D array.
dt_HAR.fit(X_train, Y_train)

In [None]:
# Decision tree classification: predict on the evaluation data and store the result in Y_predict.
Y_predict = dt_HAR.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Compute the accuracy of the test-set predictions and print it to four decimals.
accuracy = accuracy_score(Y_test, Y_predict)
print('결정 트리 예측 정확도: {0:.4f}'.format(accuracy))

In [None]:
# Print the current hyperparameters of the decision tree.
print('결정 트리의 현재 하이퍼 매개변수: \n', dt_HAR.get_params())

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate values for the tree depth.
params = {
 'max_depth' : [6, 8, 10, 12, 16, 20, 24]
}

# 5-fold grid search scored by accuracy; training scores are recorded as well.
grid_cv = GridSearchCV(dt_HAR, param_grid = params, scoring = 'accuracy', cv = 5, return_train_score = True)

grid_cv.fit(X_train, Y_train)

In [None]:
# Tabulate the mean test and train scores for each max_depth candidate.
cv_results_df = pd.DataFrame(grid_cv.cv_results_)

cv_results_df[['param_max_depth', 'mean_test_score', 'mean_train_score']]

In [None]:
# Print the best mean cross-validation accuracy and the parameters that produced it.
print('최고 평균 정확도: {0:.4f}, 최적 하이퍼 매개변수: {1}'.format(grid_cv.best_score_, grid_cv.best_params_))

In [None]:
# Second search: combine max_depth with min_samples_split candidates.
params = {
 'max_depth' : [8, 16, 20],
 'min_samples_split' : [8, 16, 24]
}
# 5-fold grid search scored by accuracy; training scores are recorded as well.
grid_cv = GridSearchCV(dt_HAR, param_grid = params, scoring = 'accuracy', cv = 5, return_train_score = True)
grid_cv.fit(X_train, Y_train)

In [None]:
# Tabulate the mean test and train scores for each parameter combination.
cv_results_df = pd.DataFrame(grid_cv.cv_results_)
cv_results_df[['param_max_depth', 'param_min_samples_split', 'mean_test_score', 'mean_train_score']]

In [None]:
# Print the best mean cross-validation accuracy and the parameters that produced it.
print('최고 평균 정확도: {0:.4f}, 최적 하이퍼 매개변수: {1}'.format(grid_cv.best_score_, grid_cv.best_params_))

In [None]:
# Take the best estimator found by the grid search, predict on the test set,
# and print its accuracy to four decimals.
best_dt_HAR = grid_cv.best_estimator_
best_Y_predict = best_dt_HAR.predict(X_test)
best_accuracy = accuracy_score(Y_test, best_Y_predict)
print('best 결정 트리 예측 정확도: {0:.4f}'.format(best_accuracy))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Feature importances of the best tree, labeled with the training column names.
feature_importance_values = best_dt_HAR.feature_importances_
feature_importance_values_s = pd.Series(feature_importance_values, index = X_train.columns)

In [None]:
# Keep the ten largest importance values.
feature_top10 = feature_importance_values_s.sort_values(ascending = False)[:10]

In [None]:
# Bar chart of the ten most important features: importance on x, feature name on y.
plt.figure(figsize = (10, 5))
plt.title('Feature Top 10')
sns.barplot(x = feature_top10, y = feature_top10.index)
plt.show()

In [None]:
from sklearn.tree import export_graphviz
# Calling export_graphviz() writes the tree to the file given as out_file (tree.dot).
# NOTE: this is the same file name used for the iris tree export, so that file is overwritten.
export_graphviz(best_dt_HAR, out_file = "tree.dot", class_names = label_name, feature_names = feature_name, impurity = True, filled = True)

In [None]:
import graphviz
# Read the tree.dot file generated above and visualize it with Graphviz.
with open("tree.dot") as f:
   dot_graph = f.read()
graphviz.Source(dot_graph)