# Kaggle competition
## Digit Recognizer

### imports

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt

In [3]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; prefer model_selection and fall back only on older installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

### Logging

### Functions

In [4]:
def output(label, num_output=10):
    """Return the one-hot encoding of `label` as a plain list of floats.

    label      -- index of the position to set to 1
    num_output -- length of the returned list (number of classes)
    """
    one_hot = np.zeros(num_output)
    one_hot.put(label, 1)
    return one_hot.tolist()
    
def outputLayer(labels, num_output=10):
    """Return a (len(labels), num_output) float array of one-hot rows.

    Row k has a 1 at column labels[k] and 0 elsewhere, i.e. the same encoding
    `output(label, num_output)` produces for a single label.

    labels     -- iterable of integer class indices
    num_output -- number of classes (width of each row); previously this
                  argument was accepted but silently ignored
    """
    # list() materialises any iterable (pandas Index, generator, ...) so the
    # result no longer depends on `map` returning a list (Python 2 only).
    label_ids = np.asarray(list(labels), dtype=np.intp).reshape(-1)
    one_hot = np.zeros((label_ids.size, num_output))
    one_hot[np.arange(label_ids.size), label_ids] = 1
    return one_hot

def nextBatch(inf, sup, step, maxSize):
    """Return the (start, stop) bounds of the batch following [inf, sup).

    The window advances by `step`; once the next window would run past
    `maxSize` it wraps around to the first batch (0, step).

    inf, sup -- bounds of the current batch
    step     -- batch size
    maxSize  -- number of rows available
    """
    next_sup = sup + step
    # Strict comparison: a window ending exactly at maxSize is still valid.
    # The previous `<=` test wrapped one step early and skipped that batch.
    if next_sup > maxSize:
        return (0, step)

    return (inf + step, next_sup)

def to_digit(df, idx, i=28, j=28):
    """Return row `idx` of `df` as an (i, j) numpy array.

    df   -- DataFrame with one flattened image per row
    idx  -- positional row index
    i, j -- target height and width; i * j must equal the number of columns
    """
    # Series.reshape was removed from pandas; reshape the underlying ndarray.
    return df.iloc[idx, :].values.reshape((i, j))

In [5]:
### tensor factories

In [6]:
# Weight Initialization

def weight_variable(shape):
    """
    Return a tf.Variable of the given shape whose initial values are
    drawn from a truncated normal distribution (stddev 0.1).
    """
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

def bias_variable(shape):
    """Return a tf.Variable of the given shape with every entry initialised to 0.1."""
    return tf.Variable(tf.constant(0.1, shape=shape))

In [7]:
# Convolution and Pooling

def conv2d(x, W):
    """Convolve `x` with filter `W` using stride 1 in every dimension and SAME padding."""
    unit_strides = [1, 1, 1, 1]
    return tf.nn.conv2d(x, W, strides=unit_strides, padding='SAME')

def max_pool_2x2(x):
    """Max-pool `x` over 2x2 windows with stride 2 and SAME padding."""
    window = [1, 2, 2, 1]
    return tf.nn.max_pool(x, ksize=window, strides=window, padding='SAME')

### Session

In [8]:
# NOTE(review): tensorflow is already imported in the first cell; this repeat is redundant.
import tensorflow as tf
# An InteractiveSession installs itself as the default session, which is what
# lets later cells call `.run()` / `.eval()` without passing a session.
sess = tf.InteractiveSession()

In [9]:
# Network inputs; the leading None leaves the batch size open.
# x  : 784 values per row (reshaped to 28x28x1 when the network is built).
# y_ : 10 values per row, fed with the one-hot labels built by outputLayer.
x = tf.placeholder(tf.float32, shape=[None, 784])
y_ = tf.placeholder(tf.float32, shape=[None, 10])

In [10]:
#W = tf.Variable(tf.zeros([784,10]))
#b = tf.Variable(tf.zeros([10]))

### the network

In [14]:
# First convolution layer parameters: 5x5 filters, 1 input channel, 32 output channels.
W_conv1 = weight_variable([5, 5, 1, 32])
b_conv1 = bias_variable([32])

In [15]:
# Turn each flat 784-value row into a 28x28 image with a single channel;
# -1 lets TensorFlow infer the batch dimension.
x_image = tf.reshape(x, [-1,28,28,1])

In [16]:
# Convolution + ReLU, then 2x2 max-pooling with stride 2 (see max_pool_2x2).
h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
h_pool1 = max_pool_2x2(h_conv1)

In [17]:
# Second convolution layer: 5x5 filters mapping the 32 channels of the first
# layer to 64 channels.
W_conv2 = weight_variable([5, 5, 32, 64])
b_conv2 = bias_variable([64])

# Same conv -> ReLU -> 2x2 max-pool pattern as the first layer.
h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
h_pool2 = max_pool_2x2(h_conv2)

In [18]:
# Fully connected layer: 7*7*64 flattened inputs -> 1024 units.
W_fc1 = weight_variable([7 * 7 * 64, 1024])
b_fc1 = bias_variable([1024])

# Flatten the pooled feature maps to one row per example before the matmul.
h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)

In [19]:
# Dropout on the fully connected layer. keep_prob is a placeholder so it can be
# fed 0.5 in the training loop and 1.0 for evaluation and prediction.
keep_prob = tf.placeholder(tf.float32)
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

In [20]:
# Output layer: 1024 units -> 10 class scores.
W_fc2 = weight_variable([1024, 10])
b_fc2 = bias_variable([10])

# y_conv holds raw scores (logits); the softmax is applied inside the loss.
y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2

### Datasets

In [21]:
# DataFrame.from_csv was removed in pandas 1.0; read_csv(index_col=0) keeps the
# same layout: the first CSV column becomes the index (later cells read the
# labels from DATA_TRAIN.index) and the remaining columns are the pixels.
# Dividing by 255 rescales the pixel values; the index is left untouched.
DATA = pd.read_csv("data/train.csv", index_col=0).astype(np.float32) / 255.0

In [22]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split the same from run to run.
DATA_TRAIN, DATA_VALID = train_test_split(DATA, test_size=0.20, random_state=42)

In [38]:
# DataFrame.from_csv was removed in pandas 1.0; use read_csv instead.
# index_col=None keeps every column as data (no column is used as the index),
# and the values get the same /255 scaling as the training data.
DATA_TEST = pd.read_csv("data/test.csv", index_col=None).astype(np.float32) / 255.0

In [24]:
# print() with a single pre-formatted string emits the same text on Python 2
# and Python 3, matching the print(...) calls used in the training cell.
print("Size of the dataset DATA_TRAIN : %s" % len(DATA_TRAIN))
print("Size of the dataset DATA_VALID : %s" % len(DATA_VALID))

Size of the dataset DATA_TRAIN : 33600
Size of the dataset DATA_VALID : 8400


In [25]:
# print() with a single pre-formatted string works on both Python 2 and 3.
print("Size of the dataset DATA_TESTS : %s" % len(DATA_TEST))

Size of the dataset DATA_TESTS : 28000


In [26]:
# Per-column NaN counts, keeping only the non-zero ones (an empty list means no NaN).
# list(...) forces evaluation on Python 3, where filter is lazy; the two spaces
# after the colon reproduce the separator the Python 2 print statement inserted.
print("List of Nan values in the training set:  %s"
      % list(filter(lambda y: y != 0, DATA.isnull().sum())))
print("List of Nan values in the testing  set:  %s"
      % list(filter(lambda y: y != 0, DATA_TEST.isnull().sum())))

List of Nan values in the training set:  []
List of Nan values in the testing  set:  []


### Training

In [27]:
# Mini-batch bookkeeping used by the training loop.
step = 50  # batch size
inf, sup = (0, step)  # bounds of the current batch, as a [inf:sup] slice of DATA_TRAIN
dataLenght = len(DATA_TRAIN)  # (sic) number of training rows, passed to nextBatch as maxSize

In [28]:
# Loss: mean softmax cross-entropy between the logits and the one-hot labels.
# Keyword arguments are required here: TensorFlow >= 1.0 rejects positional
# arguments, and keywords also work with the older (logits, labels) signature.
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(logits=y_conv, labels=y_))
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)

# Accuracy: share of rows whose highest-scoring class matches the label.
correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
sess.run(tf.initialize_all_variables())

# Build the feature matrix and one-hot labels once instead of on every step
# (.values replaces DataFrame.as_matrix, which was removed from pandas).
TRAIN_IMAGES = DATA_TRAIN.values
TRAIN_LABELS = outputLayer(DATA_TRAIN.index.tolist())

for i in range(20001):
    # One optimisation step on the current mini-batch, with dropout enabled.
    train_step.run(feed_dict={x: TRAIN_IMAGES[inf:sup],
                              y_: TRAIN_LABELS[inf:sup],
                              keep_prob: 0.5})

    if i % 2000 == 0:
        # Periodic progress report on the full training set, dropout disabled.
        print("- %s ) Train accuracy %g" % (i, accuracy.eval(
            feed_dict={
                x: TRAIN_IMAGES,
                y_: TRAIN_LABELS,
                keep_prob: 1.0}
        )))

    inf, sup = nextBatch(inf, sup, step, dataLenght)

- 0 ) Train accuracy 0.0982738
- 2000 ) Train accuracy 0.980774
- 4000 ) Train accuracy 0.991667
- 6000 ) Train accuracy 0.994881
- 8000 ) Train accuracy 0.997381
- 10000 ) Train accuracy 0.998363
- 12000 ) Train accuracy 0.999256
- 14000 ) Train accuracy 0.999673
- 16000 ) Train accuracy 0.999613
- 18000 ) Train accuracy 0.999851
- 20000 ) Train accuracy 0.999732


### Evaluation

In [32]:
# Accuracy on the held-out validation split, with dropout disabled.
# Fixes three defects: the format string had no placeholder (TypeError), the
# training data was fed instead of DATA_VALID, and the value printed is an
# accuracy, not an error.
print("Accuracy on the validation set: %g" % accuracy.eval(
        feed_dict = {
            x: DATA_VALID.values,
            y_: outputLayer(DATA_VALID.index.tolist()),
            keep_prob: 1.0}
    ))

### Prediction

In [61]:
# Predicted class for each row: index of the largest score along axis 1.
prediction = tf.argmax(y_conv,1)

In [64]:
# Predict the test set with dropout disabled (keep_prob = 1.0).
# .values replaces DataFrame.as_matrix, which was removed from pandas.
PRED_LABEL = prediction.eval(feed_dict={x: DATA_TEST.values, keep_prob: 1.0})

In [65]:
# Write the predictions as a two-column CSV: ImageId is the 1-based row
# position in the test set, Label is the predicted class for that row.
df_result = pd.DataFrame({'ImageId': range(1, len(PRED_LABEL)+1), 'Label': PRED_LABEL})
df_result.to_csv('data/prediction_three.csv', index=False)