In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Load the raw training data from CSV and show the first rows plus the frame shape.
df = pd.read_csv('../data/digit-recognizer/train.csv')
display(df.head(),df.shape)

# Notes on the data specification (translated from the original author's notes):
# - each pixel value lies in 0 ~ 255; the larger the value, the darker the pixel
# - missing values / outliers were checked; per the author this data set has neither
#   (NOTE(review): no check is performed in this cell — confirm if it matters)

In [None]:
# Preview the images: every column except 'label' is pixel data, one row per image.
img_data = df.drop(columns='label').values

fig = plt.figure()
# Ten subplots laid out on a 2 x 5 grid, kept in a list.
fig_arr = [fig.add_subplot(2, 5, position) for position in range(1, 11)]

for n, axes in enumerate(fig_arr):
    # Each row is reshaped to 28 x 28 before drawing.
    axes.imshow(
        img_data[n].reshape(28, 28),
        cmap='Greys',
        interpolation='nearest',
    )
plt.tight_layout()
plt.show()

In [None]:
# Split into train / test: 30% held out, fixed random_state for a repeatable split.
x_data_train, x_data_test, t_data_train, t_data_test = train_test_split(
    df.drop(columns='label'),
    df['label'],
    test_size=0.3,
    random_state=0,
)

# Normalisation: the min-max scaler is fitted on the training features only,
# and the same fitted scaler is then applied to both splits.
scaler = MinMaxScaler().fit(x_data_train)

x_data_train_norm, x_data_test_norm = (
    scaler.transform(features) for features in (x_data_train, x_data_test)
)

In [None]:
# TensorFlow implementation of a single-layer softmax classifier.
# Uses the session/placeholder graph API (tf.Session, tf.placeholder) —
# presumably TensorFlow 1.x; confirm against the installed version.
sess = tf.Session()

# One-hot encode the integer labels into 10 classes; evaluated eagerly via the
# session so the results are plain arrays that can be sliced and fed later.
t_data_train_onehot = sess.run(tf.one_hot(t_data_train, depth=10))
t_data_test_onehot = sess.run(tf.one_hot(t_data_test, depth=10))

# Placeholders: X takes rows of 784 features (28 x 28 pixels flattened),
# T takes the matching one-hot labels (10 classes). Batch size is left open.
X = tf.placeholder(shape=[None,784], dtype=tf.float32)
T = tf.placeholder(shape=[None,10], dtype=tf.float32)

# Weight & bias, randomly initialised from a normal distribution
W = tf.Variable(tf.random.normal([784,10]), name='weight')
b = tf.Variable(tf.random.normal([10]), name='bias')

# Hypothesis: linear logits, then softmax to get class probabilities
logit = tf.matmul(X,W) + b
H = tf.nn.softmax(logit)  # softmax activation function

# Loss: mean softmax cross-entropy computed from the raw logits (not from H)
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logit,
                                                                 labels=T))

# Training op: plain gradient descent on the loss
train = tf.train.GradientDescentOptimizer(learning_rate=0.1).minimize(loss)


# Hyper-parameters read as globals by run_train
num_of_epoch = 1000
batch_size = 100

# 학습용 함수
def run_train(sess,train_x, train_t):
    """Train the softmax model with mini-batch gradient descent.

    Relies on module-level names defined in this notebook: the ``train`` and
    ``loss`` ops, the ``X`` / ``T`` placeholders, and the ``num_of_epoch`` /
    ``batch_size`` hyper-parameters.

    Args:
        sess: session used to run the initializer and the training op.
        train_x: feature rows; must expose ``.shape[0]`` and support slicing.
        train_t: one-hot labels aligned row-for-row with ``train_x``.

    Every call re-initialises all variables, so training always starts from
    fresh random weights (this is what lets k-fold reuse the same session).
    """
    print('### 학습 시작 ###')
    # Reset every variable so the model starts from scratch on each call.
    sess.run(tf.global_variables_initializer())

    num_samples = train_x.shape[0]
    # Stays None when there is no data at all, so the log line below is
    # skipped instead of raising NameError.
    loss_val = None

    for step in range(num_of_epoch):
        # Stepping by batch_size also yields the trailing partial batch, so
        # no samples are dropped when num_samples is not a multiple of
        # batch_size.
        for start in range(0, num_samples, batch_size):
            stop = start + batch_size
            _, loss_val = sess.run([train, loss],
                                   feed_dict={X: train_x[start:stop],
                                              T: train_t[start:stop]})

        # Report the loss of the last batch every 100 epochs.
        if step % 100 == 0 and loss_val is not None:
            print('Loss : {}'.format(loss_val))
    print('### 학습 종료 ###')
    
# Accuracy: the predicted class is the arg-max of the softmax output along axis 1;
# it is compared with the arg-max of the one-hot labels, and the matches are averaged.
predict = tf.argmax(H, axis=1)
correct = tf.equal(predict, tf.argmax(T, axis=1))
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

In [None]:
# Train on the normalised training features and their one-hot labels.
run_train(sess, train_x=x_data_train_norm, train_t=t_data_train_onehot)

In [None]:
# Author's note (translated): training was done in mini-batches, so the
# evaluation should be batched the same way. This cell still feeds the whole
# test split in a single run.
print('### Test Data Set을 이용하여 성능평가! ###')
test_feed = {X: x_data_test_norm, T: t_data_test_onehot}
result_t = sess.run(accuracy, feed_dict=test_feed)
print('Accuracy : {}'.format(result_t))

In [None]:
# K-fold cross-validation on the training split.
# run_train re-initialises the variables on every call, so each fold is trained
# from scratch; after this cell the session holds the weights of the LAST fold.
cv = 5
results = []

kf = KFold(n_splits=cv, shuffle=True)

for train_idx, val_idx in kf.split(x_data_train_norm):
    training_x = x_data_train_norm[train_idx]
    training_t = t_data_train_onehot[train_idx]

    # Both indices come from splitting x_data_train_norm, so the validation
    # labels must be taken from the TRAINING labels. Indexing the test labels
    # here would pair features with unrelated labels (or go out of range).
    val_x = x_data_train_norm[val_idx]
    val_t = t_data_train_onehot[val_idx]

    run_train(sess, training_x, training_t)
    # Accuracy of this fold's model on its own held-out validation rows.
    results.append(sess.run(accuracy, feed_dict={X: val_x, T: val_t}))

print('측정한 각각의 결과값 : {}'.format(results))
print('최종 k-fold 교차검증을 사용한 Accuracy : {}'.format(np.mean(results)))

In [None]:
# To get precision, recall, f1 and accuracy separately, use classification_report.
# Its first argument is the ground truth (the labels that are NOT one-hot encoded).
target_names = ['num {}'.format(digit) for digit in range(10)]
test_predictions = sess.run(predict, feed_dict={X: x_data_test_norm})
result = classification_report(
    t_data_test,
    test_predictions,
    target_names=target_names,
)
print(result)

In [None]:
# Sample code: print a confusion matrix for a toy
# multinomial classification problem with 3 labels.
y_true = [2, 0, 2, 2, 0, 1]
y_pred = [0, 0, 2, 2, 0, 2]

sample_matrix = confusion_matrix(y_true, y_pred)
print(sample_matrix)

In [None]:
# Confusion matrix of the model's predictions on the test split.
predicted_labels = sess.run(predict, feed_dict={X: x_data_test_norm})
print(confusion_matrix(t_data_test, predicted_labels))

In [None]:
# Predict with a real photo
import cv2
from PIL import Image

# Load the photo and convert it to greyscale
my_img = cv2.imread('../data/digit-recognizer/sample.jpg', cv2.IMREAD_COLOR)
if my_img is None:
    # cv2.imread does not raise on a missing or unreadable file; it returns
    # None, which would otherwise surface as an AttributeError on `.shape`.
    raise FileNotFoundError(
        'Could not read image: ../data/digit-recognizer/sample.jpg')
print(my_img.shape)
im_grey = cv2.cvtColor(my_img, cv2.COLOR_BGR2GRAY)
print(im_grey.shape)
# cv2.imwrite reports failure through its boolean return value.
if not cv2.imwrite('../data/digit-recognizer/sample_grey.jpg', im_grey):
    raise IOError(
        'Could not write image: ../data/digit-recognizer/sample_grey.jpg')

# Re-open the saved greyscale file and show it
img = Image.open('../data/digit-recognizer/sample_grey.jpg')
plt.imshow(img, cmap='Greys')
plt.show()

In [None]:
# Turn the PIL image into an array and invert every intensity (255 - value).
# NOTE(review): presumably done to match the training data's convention that
# larger values are darker — confirm against the sample photo.
pixel = np.array(img)
pixel_inverse = np.subtract(255, pixel)

plt.imshow(pixel_inverse, cmap='Greys')
plt.show()