## **car_evaluation 데이터로 전처리 및 모델학습**

In [1]:
import pandas as pd
import os

# 데이터 불러오기
file_name = 'car_evaluation.csv'
full_file_path = os.path.join(os.getcwd() + "\\" + file_name)

# 데이터의 컬럼이 따로 없기 때문에 header = None을 이용해 임의의 컬럼명 생성
df = pd.read_csv(full_file_path, header = None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [2]:
# 데이터 정보 확인
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       1728 non-null   object
 1   1       1728 non-null   object
 2   2       1728 non-null   object
 3   3       1728 non-null   object
 4   4       1728 non-null   object
 5   5       1728 non-null   object
 6   6       1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [3]:
# 결측치 확인
df.isna().sum()

0    0
1    0
2    0
3    0
4    0
5    0
6    0
dtype: int64

In [4]:
# 컬럼 명 변경
df.columns = ['price', 'maint', 'doors', 'person', 'lug_capacity', 'safaty', 'output']
df.head()

Unnamed: 0,price,maint,doors,person,lug_capacity,safaty,output
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [None]:
# 컬럼 확인
df.columns 

Index(['price', 'maint', 'doors', 'person', 'lug_capacity', 'safaty',
       'output'],
      dtype='object')

In [6]:
from sklearn.preprocessing import LabelEncoder

# 인코딩 작업
def encode_features(dataDF) :
    features = ['price', 'maint', 'doors', 'person', 'lug_capacity', 'safaty', 'output']
    for feature in features :
        le = LabelEncoder()
        le = le.fit(dataDF[feature])
        dataDF[feature] = le.transform(dataDF[feature])

    return dataDF

df = encode_features(df)
df.head()

Unnamed: 0,price,maint,doors,person,lug_capacity,safaty,output
0,3,3,0,0,2,1,2
1,3,3,0,0,2,2,2
2,3,3,0,0,2,0,2
3,3,3,0,0,1,1,2
4,3,3,0,0,1,2,2


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [9]:
# 피처, 레이블 데이터
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# 데이터 스케일링
scaler = StandardScaler()
X = scaler.fit_transform(X)

# 훈련, 테스트 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1382, 6), (346, 6), (1382,), (346,))

In [None]:
# 모델 생성
dt_clf = DecisionTreeClassifier()
rf_clf = RandomForestClassifier()
lr_clf = LogisticRegression(max_iter = 200)
knn_clf = KNeighborsClassifier(n_neighbors = 3)

In [11]:
# 결정트리 학습 / 예측 / 평가
dt_clf.fit(X_train, y_train)
dt_pred = dt_clf.predict(X_test)
print('결정트리 정확도 {:.4f}'.format(accuracy_score(y_test, dt_pred)))
print('결정트리 오차행렬')
print(confusion_matrix(y_test, dt_pred))

결정트리 정확도 0.9682
결정트리 오차행렬
[[ 76   6   1   0]
 [  1  10   0   0]
 [  0   0 235   0]
 [  1   2   0  14]]


In [None]:
# 랜덤포레스트 학습 / 예측 / 평가
rf_clf.fit(X_train, y_train)
rf_pred = rf_clf.predict(X_test)
print('랜덤포레스트 정확도 {:.4f}'.format(accuracy_score(y_test, rf_pred)))
print('랜덤포레스트 오차행렬')
print(confusion_matrix(y_test, rf_pred))

결정트리 정확도 0.9711
결정트리 오차행렬
[[ 75   7   0   1]
 [  0  10   0   1]
 [  0   0 235   0]
 [  1   0   0  16]]


In [13]:
# 로지스틱 회귀 학습 / 예측 / 평가
lr_clf.fit(X_train, y_train)
lr_pred = lr_clf.predict(X_test)
print('로지스틱 회귀 정확도 {:.4f}'.format(accuracy_score(y_test, lr_pred)))
print('로지스틱 회귀 오차행렬')
print(confusion_matrix(y_test, lr_pred))

로지스틱 회귀 정확도 0.6618
로지스틱 회귀 오차행렬
[[ 12   0  68   3]
 [  2   0   9   0]
 [ 17   0 217   1]
 [ 12   0   5   0]]


In [15]:
# KNN 학습 / 예측 / 평가
knn_clf.fit(X_train, y_train)
knn_pred = knn_clf.predict(X_test)
print('KNN 정확도 {:.4f}'.format(accuracy_score(y_test, knn_pred)))
print('KNN 오차행렬')
print(confusion_matrix(y_test, knn_pred))

KNN 정확도 0.9191
KNN 오차행렬
[[ 73   4   5   1]
 [  4   7   0   0]
 [ 12   0 223   0]
 [  2   0   0  15]]
