In [38]:
from pathlib import Path

import numpy as np
import pandas as pd

from keras import models
from keras.layers import Conv2D, MaxPooling2D, Dense, Flatten
from keras.utils import to_categorical  # 원핫인코딩을 할 거다

In [39]:
# The CSV files are read through pandas first, so at this point both
# `train` and `test` are pandas DataFrames (not ndarrays yet).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/train.csv", "../input/test.csv")
)

In [40]:
# Always look at the shapes first.
# These are DataFrames rather than ndarrays, but DataFrames expose .shape too.
print(train.shape, test.shape, sep="\n")

(42000, 785)
(28000, 784)


In [41]:
# Each row holds one label column followed by 784 pixel columns
# (785 columns in total); there are 42000 rows.
train.head(n=5)

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
# No label column here: just the 784 pixel columns, for 28000 images.
test.head(n=5)

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
# Build the training image tensor in a single pipeline:
#   1. Drop the column named 'label' and take .values, which yields the
#      remaining pixel data as a plain ndarray.
#   2. Reshape to (-1, 28, 28, 1). The leading -1 tells NumPy to infer the
#      number of samples from the data, so this line does not need editing
#      when the number of rows changes. The remaining axes are the 28 x 28
#      pixel grid plus a single channel (presumably rows, columns, channel in
#      a channels-last layout -- confirm against the Keras backend config).
#   3. Cast to float32 and divide by 255 to normalise into the 0-1 range
#      (assumes the CSV stores 8-bit intensities, 0-255).
train_images = (
    train.drop(columns='label')
    .values
    .reshape(-1, 28, 28, 1)
    .astype('float32')
    / 255
)
train_images.shape

(42000, 28, 28, 1)

In [44]:
# DataFrame['column'] selects a single column; DataFrame.loc[:, 'column'] is
# equivalent. (TODO: read up on DataFrame.loc usage.)

# One-hot encode the digit labels. num_classes is pinned to 10 (digits 0-9)
# instead of being inferred from the largest label present, so the encoded
# width always matches the model's final Dense(10) layer even if this cell is
# run on a subset of the data that happens to lack the highest digit.
train_labels = to_categorical(train['label'], num_classes=10)
train_labels.shape

(42000, 10)

In [45]:
# Same preprocessing as the training images, minus the label drop (the test
# frame has no label column): raw values -> (samples, 28, 28, 1) -> float32
# scaled by 1/255.
test_images = (
    test.values
    .reshape(-1, 28, 28, 1)
    .astype('float32')
    / 255
)
test_images.shape

(28000, 28, 28, 1)

In [46]:
# Convolutional feature extractor: three 3x3 ReLU convolutions with
# 32, 64 and 64 filters, with a 2x2 max-pool after each of the first two.
# The first layer declares the per-sample input shape (28, 28, 1).
model = models.Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
])

model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_4 (Conv2D)            (None, 26, 26, 32)        320       
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 13, 13, 32)        0         
_________________________________________________________________
conv2d_5 (Conv2D)            (None, 11, 11, 64)        18496     
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 5, 5, 64)          0         
_________________________________________________________________
conv2d_6 (Conv2D)            (None, 3, 3, 64)          36928     
Total params: 55,744
Trainable params: 55,744
Non-trainable params: 0
_________________________________________________________________


In [47]:
# Classifier head appended to the convolutional stack: flatten the final
# feature map, pass it through a 64-unit ReLU layer, then a 10-way softmax
# (one output per digit class).
# NOTE: this cell mutates `model` in place, so running it more than once
# appends the head again; rebuild the model from the previous cell first.
model.add(Flatten())
model.add(Dense(units=64, activation='relu'))
model.add(Dense(units=10, activation='softmax'))

model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_4 (Conv2D)            (None, 26, 26, 32)        320       
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 13, 13, 32)        0         
_________________________________________________________________
conv2d_5 (Conv2D)            (None, 11, 11, 64)        18496     
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 5, 5, 64)          0         
_________________________________________________________________
conv2d_6 (Conv2D)            (None, 3, 3, 64)          36928     
_________________________________________________________________
flatten_2 (Flatten)          (None, 576)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 64)               

In [48]:
# Configure training: RMSprop optimizer, categorical cross-entropy loss (the
# labels are one-hot encoded), and accuracy reported as a metric.
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
# Train on the full training set for 5 epochs in mini-batches of 64.
# fit() is left as the last expression, so its History object is the cell output.
model.fit(x=train_images, y=train_labels, batch_size=64, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x1893b910308>

In [49]:
# Run the trained model on the test images. Despite the name, `test_labels`
# holds the 10 softmax outputs per image (shape (28000, 10)), not digit labels;
# the digits are extracted from it in the next step.
test_labels = model.predict(x=test_images)
test_labels.shape

(28000, 10)

In [50]:
# argmax returns the index of the largest value along the given axis.
# With axis=1 the reduction runs across the 10 class columns of each row, so
# every image is collapsed to the index of its highest-scoring class.
pred = np.argmax(test_labels, axis=1)
pred.shape

(28000,)

In [51]:
# Submission format, quoted from the competition description:
#
#   Your submission file should be in the following format: For each of the
#   28000 images in the test set, output a single line containing the ImageId
#   and the digit you predict. For example, if you predict that the first
#   image is of a 3, the second image is of a 7, and the third image is of a 8,
#   then your submission file would look like:
#
#     ImageId,Label
#     1,3
#     2,7
#     3,8
#     (27997 more lines)
#
# Assemble the predictions into a DataFrame and write it out as CSV.
# ImageId is 1-based. Its range is derived from len(pred) rather than a
# hard-coded 28001, so the cell keeps working if the number of predictions
# changes.
submission = pd.DataFrame({
    "ImageId": range(1, len(pred) + 1),
    "Label": pred,
})

# Make sure the output directory exists first: to_csv raises if the parent
# directory is missing, and that failure would only surface after training.
submission_path = Path('../output/SimpleCNN_submission.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)

# index=False: otherwise pandas writes the row index as an extra column.
submission.to_csv(submission_path, index=False)