In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt, seaborn as sns
%matplotlib inline

### (Data) 자동차 평가 데이터 베이스

- 자동차 구매 가격 (`buying`)
- 유지보수 비용 (`maint`)
- 문 개수 (`doors`)
- 탑승 인원 (`persons`)
- 트렁크 크기 (`lug_boot`)
- 안전성 (`safety`)
- 자동차 평가 등급 (`class`)

In [None]:
# Load the car-evaluation CSV from a path relative to the notebook.
# header=None: the first row is read as data; column names are assigned in a later cell.
df = pd.read_csv('./data/car_evaluation.csv', header=None)

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Preview the first rows; columns carry default labels here because the file was read with header=None.
df.head()

In [None]:
# Column labels for the header-less CSV, listed in file order; the last one is the target.
col_names = [
    'buying', 'maint', 'doors', 'persons',
    'lug_boot', 'safety', 'class',
]
df.columns = col_names

In [None]:
# Print a summary of the frame (columns, dtypes, non-null counts) after renaming.
df.info()

In [None]:
# Print the frequency of every category, one column at a time.
for column_name in col_names:
    print(df[column_name].value_counts())

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
import category_encoders as ce

# Integer-encode every column: the six features and the `class` target alike.
# NOTE(review): no explicit `mapping` is supplied, so the integer given to each
# category is chosen by the encoder — confirm it matches the intended ordering.
encoder = ce.OrdinalEncoder(cols=col_names)
df_encoder = encoder.fit_transform(df)

In [None]:
# Preview the encoded frame.
df_encoder.head()

In [None]:
# Features: every encoded column except the target.
X = df_encoder.drop(columns=['class'])
# Target: the encoded `class` column.
y = df_encoder['class']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Shapes of the training and test feature matrices.
X_train.shape, X_test.shape

In [None]:
# Preview the held-out feature rows.
X_test.head()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Random forest of 100 trees, each limited to depth 5; seeded for repeatable runs.
rfc = RandomForestClassifier(
    max_depth=5,
    n_estimators=100,
    random_state=42,
)

# Learn from the training split, then label the held-out split.
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

# Report test accuracy to four decimals.
print('Model accuracy score with decision-trees : {0:0.4f}'.format(accuracy_score(y_test, y_pred)))

In [None]:
# Print a summary of the test-set target Series.
y_test.info()

#### 시각화

In [None]:
from sklearn.decomposition import PCA
from mlxtend.plotting import plot_decision_regions

# Project the features onto 2 principal components so decision regions can be drawn.
# The projection is learned from the training features only; the test features are
# merely transformed with it, so nothing about the test split shapes the plot's axes.
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train)
X_pca = pca.transform(X_test)

# 2-D stand-in for the forest, trained on the *training* split.
# (Previously both PCA and this model were fitted on X_test / y_test, so the regions
# came from a model that had already seen every plotted point and its label.)
model_pca = RandomForestClassifier(n_estimators=100, random_state=0, max_depth=5)
model_pca.fit(X_train_pca, y_train)

# Decision regions of the 2-D model with the held-out points drawn on top.
plot_decision_regions(X_pca, y_test.to_numpy(), clf=model_pca, legend=2)
plt.xlabel("PCA 1")
plt.ylabel("PCA 2")
plt.title("Decision Regions (RandomForestClassifier)")
plt.show()

In [None]:
from sklearn.tree import plot_tree

# Render the first estimator of the fitted forest on a wide canvas.
fig, ax = plt.subplots(figsize=(20, 10))
plot_tree(rfc.estimators_[0], filled=True, ax=ax)
ax.set_title("Tree from Random Forest")
plt.show()


In [None]:
from sklearn.tree import plot_tree

# Draw the first tree of the fitted forest `rfc`, labelling split nodes with feature names.
# This cell used to reference `clf`, which is only created further down the notebook,
# so it raised NameError on a fresh Restart & Run All.
plt.figure(figsize=(20, 10))
plot_tree(
    rfc.estimators_[0],
    # Plain list rather than a pandas Index: some scikit-learn releases validate
    # `feature_names` as a list and reject an Index.
    feature_names=list(X_test.columns),
    filled=True,
)
plt.title("Tree from Random Forest")
plt.show()

#### Feature Scores

In [None]:
# Second forest, created without a max_depth argument; its feature importances
# are read in the following cells.
clf = RandomForestClassifier(
    random_state=0,
    n_estimators=100,
)

# Train on the training split (kept as the last expression so the fitted estimator is displayed).
clf.fit(X_train, y_train)

In [None]:
# Pair each importance with its feature name and rank from most to least important.
feature_scores = (
    pd.Series(clf.feature_importances_, index=X_train.columns)
    .sort_values(ascending=False)
)

feature_scores

#### Visualizing feature scores

In [None]:
# Bar chart of the ranked importance scores, one bar per feature.
ax = sns.barplot(x=feature_scores, y=feature_scores.index)
ax.set_xlabel('Feature Importance Score')
ax.set_ylabel('Features')
ax.set_title("Visualizing Important Features")
plt.show()

#### Confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Tabulate true vs. predicted classes for the test split and print the raw matrix.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

print('Confusion matrix\n\n', cm)

#### Classification Report

In [None]:
from sklearn.metrics import classification_report

# Build the text report for the test-set predictions, then print it.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

#### (실습) Select Feature Analysis
- Feature Importances 에서 상위권 5개 Feature를 활용한 선택적 분석