In [1]:
# 필수 라이브러리
import pandas as pd
import numpy as np
import random
import tensorflow as tf

# Pin every random number generator used in this notebook (Python, NumPy,
# TensorFlow) to one shared seed so they start from the same state on each run.
SEED = 12
for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
  seed_fn(SEED)
print("시드 고정:", SEED)

시드 고정: 12


In [2]:
# Mount Google Drive at /gdrive (Colab-only API) so that the files stored on
# Drive can be read through ordinary file paths.
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [3]:
# Folder on the mounted Drive that holds the three CSV files
wine_path = '/gdrive/My Drive/Colab Notebooks/ML-Study/ML-Study/DeepLearning/Python_DeepLearning/wine/'

# Read the train, test and sample-submission tables in one pass
train, test, submission = (
  pd.read_csv(f'{wine_path}{csv_name}')
  for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Sanity check: (rows, columns) of each table
print(*(frame.shape for frame in (train, test, submission)))

(5497, 14) (1000, 13) (1000, 2)


In [4]:
# Peek at the first two training rows
train.iloc[:2]

Unnamed: 0,index,quality,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type
0,0,5,5.6,0.695,0.06,6.8,0.042,9.0,84.0,0.99432,3.44,0.44,10.2,white
1,1,5,8.8,0.61,0.14,2.4,0.067,10.0,42.0,0.9969,3.19,0.59,9.5,red


In [5]:
# Peek at the first five rows of the submission template
submission.head(n=5)

Unnamed: 0,index,quality
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [6]:
# How many training rows belong to each wine type
train.loc[:, 'type'].value_counts()

white    4159
red      1338
Name: type, dtype: int64

In [7]:
# Encode the wine type as a number: 'white' -> 1, any other value -> 0.
# The numeric-dtype guard keeps this cell idempotent: without it, running the
# cell a second time would compare the already-encoded 0/1 values with the
# string 'white' and silently overwrite the whole column with 0.
for frame in (train, test):
  if not pd.api.types.is_numeric_dtype(frame['type']):
    frame['type'] = (frame['type'] == 'white').astype(int)
train['type'].value_counts()

1    4159
0    1338
Name: type, dtype: int64

In [8]:
# Distribution of the target: number of training rows per quality grade
train.loc[:, 'quality'].value_counts()

6    2416
5    1788
7     924
4     186
8     152
3      26
9       5
Name: quality, dtype: int64

In [9]:
from tensorflow.keras.utils import to_categorical

# Shift quality so that the lowest grade seen in train (3) becomes class 0,
# then expand the class ids into one-hot rows.
quality_class_ids = train['quality'] - 3
y_train = to_categorical(quality_class_ids)
y_train

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [10]:
# Feature selection: every column from 'fixed acidity' through the last one
X_train = train.loc[:, 'fixed acidity':]
X_test = test.loc[:, 'fixed acidity':]

# Feature scaling.
# The scaler learns each column's min/max from the TRAINING data only and the
# same mapping is then applied to the test data. Calling fit_transform on the
# test set would refit the scaler there, so identical raw values would be
# mapped to different scaled values in train and test.
# Test values outside the training range can therefore fall outside [0, 1].
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(X_train_scaled.shape, y_train.shape)
print(X_test_scaled.shape)

(5497, 12) (5497, 7)
(1000, 12)


In [11]:
# 모델 설계 - Dropout 활용
# 배치 단위로 학습할 때 마다 은닉층에서 무작위로 선정된 유닛의 연결을 제거.
# 매번 다른 네트워크 구조의 모델을 사용하는 것과 같으므로 앙상블 모델과 비슷한 효과 발생.
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout

def build_model(train_data, train_target, hidden_units=(128, 64, 32), dropout_rate=0.2):
  """Build and compile a fully connected softmax classifier with dropout.

  Every hidden block is ``Dense(units, activation='tanh')`` followed by
  ``Dropout(dropout_rate)``. With the default arguments the network is
  128 -> 64 -> 32 tanh units with a dropout of 0.2 after each hidden layer.

  Args:
    train_data: Feature array; only ``shape[1]`` (number of features) is read.
    train_target: Target array; only ``shape[1]`` (number of output units)
      is read. The categorical cross-entropy loss expects one-hot rows.
    hidden_units: Width of each hidden Dense layer, in order.
    dropout_rate: Rate passed to the Dropout layer after each hidden layer.

  Returns:
    The compiled ``Sequential`` model (RMSProp optimizer, categorical
    cross-entropy loss, 'acc' and 'mae' metrics).
  """
  model = Sequential()

  # tanh outputs values in (-1, 1). It learns well while its input is near 0,
  # but its gradient approaches 0 as the input grows in magnitude, so training
  # stalls when the inputs are extreme.
  # Only the first layer added is told the input width.
  input_spec = {'input_dim': train_data.shape[1]}
  for units in hidden_units:
    model.add(Dense(units, activation='tanh', **input_spec))
    model.add(Dropout(dropout_rate))
    input_spec = {}

  # One softmax unit per target column. If no hidden layer was requested,
  # this layer is the first one and receives the input width instead.
  model.add(Dense(train_target.shape[1], activation='softmax', **input_spec))

  model.compile(optimizer='RMSProp', loss='categorical_crossentropy',
                metrics=['acc', 'mae'])

  return model

# Build the network sized to the scaled feature matrix and the one-hot targets,
# then print its layer-by-layer summary.
model = build_model(train_data=X_train_scaled, train_target=y_train)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               1664      
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 32)                2080      
                                                                 
 dropout_2 (Dropout)         (None, 32)                0         
                                                                 
 dense_3 (Dense)             (None, 7)                 2

In [12]:
# Early Stopping 기법 - 홀드아웃으로 검증데이터를 분할하고, 검증 데이터에 대한 모델의 성능이 일정 에포크 동안 좋아지지 않으면 학습종료.
# 과대적합 방지!!
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping

X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.15, shuffle=True, random_state=SEED)

early_stopping = EarlyStopping(monitor='val_loss', patience=10)
history = model.fit(X_tr, y_tr, batch_size=64, epochs=200, validation_data=(X_val, y_val),
                    callbacks=[early_stopping], verbose=2)

Epoch 1/200
73/73 - 2s - loss: 1.4327 - acc: 0.3825 - mae: 0.1969 - val_loss: 1.2414 - val_acc: 0.4570 - val_mae: 0.1887 - 2s/epoch - 33ms/step
Epoch 2/200
73/73 - 0s - loss: 1.3501 - acc: 0.4062 - mae: 0.1906 - val_loss: 1.2199 - val_acc: 0.4255 - val_mae: 0.1894 - 293ms/epoch - 4ms/step
Epoch 3/200
73/73 - 0s - loss: 1.3232 - acc: 0.4174 - mae: 0.1898 - val_loss: 1.1967 - val_acc: 0.4776 - val_mae: 0.1795 - 310ms/epoch - 4ms/step
Epoch 4/200
73/73 - 0s - loss: 1.2862 - acc: 0.4242 - mae: 0.1874 - val_loss: 1.2284 - val_acc: 0.4339 - val_mae: 0.1821 - 298ms/epoch - 4ms/step
Epoch 5/200
73/73 - 0s - loss: 1.2632 - acc: 0.4399 - mae: 0.1848 - val_loss: 1.1668 - val_acc: 0.4727 - val_mae: 0.1783 - 314ms/epoch - 4ms/step
Epoch 6/200
73/73 - 0s - loss: 1.2494 - acc: 0.4358 - mae: 0.1849 - val_loss: 1.1324 - val_acc: 0.4909 - val_mae: 0.1769 - 328ms/epoch - 4ms/step
Epoch 7/200
73/73 - 0s - loss: 1.2299 - acc: 0.4576 - mae: 0.1828 - val_loss: 1.1363 - val_acc: 0.4788 - val_mae: 0.1766 - 346

In [13]:
model.evaluate(X_val, y_val)



[1.0672101974487305, 0.521212100982666, 0.16805066168308258]

In [14]:
# test 데이터에 대한 예측값 정리
y_pred_proba = model.predict(X_test)
y_pred_proba[:5]

array([[9.0349384e-04, 5.4673322e-02, 4.2620379e-01, 4.3975431e-01,
        7.0742778e-02, 7.6310523e-03, 9.1167793e-05],
       [8.3612691e-04, 2.1512672e-02, 4.6484405e-01, 4.3667465e-01,
        7.1607701e-02, 4.4788276e-03, 4.6033845e-05],
       [7.2735804e-04, 2.3812875e-02, 4.4678798e-01, 4.4279417e-01,
        7.6696329e-02, 9.0636732e-03, 1.1746601e-04],
       [2.2626601e-03, 7.5873800e-02, 5.5353588e-01, 3.2105145e-01,
        4.1647054e-02, 5.5334745e-03, 9.5630188e-05],
       [3.5708022e-04, 9.3725324e-03, 9.3674362e-02, 5.5452627e-01,
        3.0872783e-01, 3.2905970e-02, 4.3587925e-04]], dtype=float32)

In [15]:
# Most probable class per row, shifted back by 3 onto the original quality scale
y_pred_label = y_pred_proba.argmax(axis=-1) + 3
y_pred_label[:5]

array([6, 5, 5, 5, 6])

In [16]:
# Put the predicted labels into the 'quality' column of the submission table
submission = submission.assign(quality=y_pred_label.astype(int))
submission.head()

Unnamed: 0,index,quality
0,0,6
1,1,5
2,2,5
3,3,5
4,4,6


In [17]:
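# Save the submission file (intentionally disabled; uncomment the line below
# to write the CSV next to the input data on Drive)
# submission.to_csv(wine_path + 'wine_dnn_001.csv', index=False)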