# Digit Recognizer | Kaggle
https://www.kaggle.com/c/digit-recognizer

In [2]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from sklearn.model_selection import train_test_split
import scipy.optimize as opt

## Load data

In [3]:
# Load the Kaggle training set. As the preview below shows, the first column
# is `label` and the remaining columns are pixel0..pixel783.
df = pd.read_csv('data/train.csv')
df.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Features: every column after the first (the pixel columns).
X = df.iloc[:,1:]
# Target: the first column, sliced with `:1` so it stays a one-column DataFrame
# (its values are 2-D with a single column rather than a flat vector).
y = df.iloc[:,:1]
# Keep 70% of the rows for training; random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=0)



In [5]:
# Convert the training split to float64 NumPy arrays for the hand-written optimizer code.
initial_X = X_train.values.astype('float64')
initial_y = y_train.values.astype('float64')
# Shape check: features are (m, 784) and labels are an (m, 1) column.
print(initial_X.shape)
print(initial_y.shape)

(29399, 784)
(29399, 1)


In [6]:
# List the distinct label values present in the training split.
np.unique(initial_y)

array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9.])

In [7]:
from sklearn.preprocessing import MinMaxScaler
# Fit the min-max scaling on the training features only; the same fitted
# `scaler` is reused (transform only) for the held-out and Kaggle test data later.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(initial_X)

In [8]:
def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), evaluated without overflow.

    Parameters
    ----------
    z : scalar or ndarray
        Input value(s).

    Returns
    -------
    NumPy scalar (for scalar input) or ndarray with the same shape as `z`,
    holding values in [0, 1].
    """
    z = np.asarray(z)
    # exp(-|z|) always lies in (0, 1], so it can never overflow the way
    # exp(-z) does for large negative z.
    decay = np.exp(-np.abs(z))
    # Two algebraically identical forms; each is used on the side of zero
    # where it only needs the bounded `decay` term.
    result = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () leaves arrays unchanged and unwraps 0-d results to a
    # NumPy scalar, matching what the plain formula returns for scalar input.
    return result[()]

def compute_cost(X_values, y_values, theta, lambda_value):
    """Regularized logistic-regression cost and gradient.

    Parameters
    ----------
    X_values : ndarray, shape (m, n)
        Design matrix. Its first column is treated as the bias column: the
        corresponding parameter theta[0] is excluded from regularization.
    y_values : ndarray, shape (m, 1)
        Binary targets. Only entries equal to 1 or 0 contribute to the cost.
    theta : ndarray, shape (n, 1)
        Parameter column vector (callers in this notebook reshape to a column
        before calling).
    lambda_value : float
        L2 regularization strength.

    Returns
    -------
    cost : ndarray, shape (1, 1)
        Mean cross-entropy plus the L2 penalty.
    grad : ndarray, shape (n, 1)
        Gradient of the cost with respect to theta.
    """
    m = len(X_values)
    z = X_values.dot(theta)

    # Work in log-space: -log(h) = log(1 + exp(-z)) and -log(1 - h) = log(1 + exp(z)).
    # logaddexp evaluates these without overflow, so the cost stays finite even
    # when the sigmoid would round to exactly 0 or 1 (where log() gives -inf).
    neg_log_h = np.logaddexp(0, -z)
    neg_log_one_minus_h = np.logaddexp(0, z)
    # Recover h = sigmoid(z) from the same stable quantity for the gradient.
    h = np.exp(-neg_log_h)

    positives = y_values == 1
    negatives = y_values == 0
    cost = (neg_log_h[positives].sum() + neg_log_one_minus_h[negatives].sum()) / m

    # Copy theta and zero the bias entry so it is not penalized.
    newtheta = np.copy(theta)
    newtheta[0] = 0
    cost = cost + lambda_value * (newtheta.T.dot(newtheta)) / (2 * m)

    grad = X_values.T.dot(h - y_values) / m
    grad = grad + lambda_value * newtheta / m
    return cost, grad

def costFunction(theta, X_values, y_values, lambda_value):
    """Objective wrapper for the optimizer: return only the cost.

    `theta` arrives as a flat vector and is reshaped to a column before being
    handed to compute_cost; the gradient computed alongside is discarded.
    """
    column_theta = theta.reshape(-1, 1)
    return compute_cost(X_values, y_values, column_theta, lambda_value)[0]

def gradFunction(theta, X_values, y_values, lambda_value):
    """Gradient wrapper for the optimizer: return only the flattened gradient.

    `theta` arrives as a flat vector and is reshaped to a column before being
    handed to compute_cost; the resulting column gradient is flattened back
    to 1-D.
    """
    column_theta = theta.reshape(-1, 1)
    _, gradient = compute_cost(X_values, y_values, column_theta, lambda_value)
    return gradient.reshape(-1)

def oneVsAll(X_values, y_values, num_labels, lambda_value):
    """Train one regularized logistic-regression classifier per class.

    Parameters
    ----------
    X_values : ndarray, shape (m, n)
        Features without a bias column (one is prepended here).
    y_values : array-like with m entries
        Class labels; any shape is accepted and reshaped to an (m, 1) column.
    num_labels : int
        Number of classes; classifier i is trained on the target `y == i`.
    lambda_value : float
        L2 regularization strength passed to compute_cost.

    Returns
    -------
    ndarray, shape (num_labels, n + 1)
        Row i holds the fitted parameters (bias first) of classifier i.
    """
    n = X_values.shape[1]
    all_theta = np.zeros([num_labels, n + 1])
    X = np.insert(X_values, 0, 1, axis=1)
    # Force the labels into a column: with a flat (m,) vector the residual
    # `h - y` inside compute_cost would silently broadcast to (m, m).
    labels = np.asarray(y_values).reshape(-1, 1)

    def _cost_and_grad(theta, X_aug, y_binary, lam):
        # Single evaluation returning (scalar cost, flat gradient) so the
        # optimizer does not compute cost and gradient twice per point.
        cost, grad = compute_cost(X_aug, y_binary, theta.reshape(-1, 1), lam)
        return np.asarray(cost).item(), grad.reshape(-1)

    for i in range(num_labels):
        y = (labels == i).astype(int)
        # minimize expects a 1-D starting point (multi-dimensional x0 is
        # rejected by recent SciPy releases).
        initial_theta = np.zeros(n + 1)
        print('Traing {}-th class'.format(i))
        # jac=True tells minimize that the objective returns (cost, gradient).
        Result = opt.minimize(fun=_cost_and_grad, x0=initial_theta, args=(X, y, lambda_value),
                              method='BFGS', jac=True, options={'maxiter': 400, 'disp': True})
        all_theta[i] = Result.x

    return all_theta

def predictOneVsAll(theta, X_values):
    """Predict the class index for every row of X_values.

    Parameters
    ----------
    theta : ndarray, shape (num_labels, n + 1)
        One row of parameters per class, bias weight first (the layout
        returned by oneVsAll).
    X_values : ndarray, shape (m, n)
        Features without a bias column (one is prepended here).

    Returns
    -------
    ndarray of int, shape (m, 1)
        For each sample, the row index of `theta` with the highest score.
    """
    X = np.insert(X_values, 0, 1, axis=1)
    # Scores: (m, n + 1) x (n + 1, num_labels) -> (m, num_labels).
    z = X.dot(theta.T)
    # The sigmoid is monotonic, so the arg-max of the raw scores is the class
    # with the highest probability. Skipping the sigmoid also avoids ties
    # when several large scores would all round to exactly 1.0.
    return np.argmax(z, axis=1).reshape(-1, 1)

In [9]:
# thetas = oneVsAll(initial_X, initial_y.reshape(-1,1), 10, 1)

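### If X is not normalized, does something go wrong?

Presumably yes: unscaled pixel values (0–255) produce very large `X·theta` products, which push the sigmoid into saturation and make the optimization numerically fragile. The features are therefore scaled to [0, 1] before training.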

In [10]:
# Train 10 one-vs-all classifiers (one per digit) on the scaled training
# features with regularization strength 1; labels are passed as an (m, 1) column.
thetas = oneVsAll(X_scaled, initial_y.reshape(-1,1), 10, 1)

Traing 0-th class
Optimization terminated successfully.
         Current function value: 0.018550
         Iterations: 307
         Function evaluations: 309
         Gradient evaluations: 309
Traing 1-th class
Optimization terminated successfully.
         Current function value: 0.019926
         Iterations: 277
         Function evaluations: 278
         Gradient evaluations: 278
Traing 2-th class
Optimization terminated successfully.
         Current function value: 0.062962
         Iterations: 386
         Function evaluations: 390
         Gradient evaluations: 390
Traing 3-th class
Optimization terminated successfully.
         Current function value: 0.074756
         Iterations: 383
         Function evaluations: 385
         Gradient evaluations: 385
Traing 4-th class
Optimization terminated successfully.
         Current function value: 0.043951
         Iterations: 359
         Function evaluations: 362
         Gradient evaluations: 362
Traing 5-th class
         Current 

In [11]:
# Training-set accuracy: fraction of predictions equal to the true labels.
# Both sides are (m, 1) columns, so `==` compares element-wise.
pp = predictOneVsAll(thetas, X_scaled)
np.mean(pp==initial_y.reshape(-1,1))

0.93231062281029964

In [12]:
# Convert the held-out 30% split to float64 NumPy arrays, mirroring the training arrays.
test_X = X_test.values.astype('float64')
test_y = y_test.values.astype('float64')
print(test_X.shape)
print(test_y.shape)

(12601, 784)
(12601, 1)


In [13]:
# Scale the held-out features with the scaler fitted on the training data
# (transform only, no refit), then measure held-out accuracy.
test_X_scaled = scaler.transform(test_X)
p2 = predictOneVsAll(thetas, test_X_scaled)
np.mean(p2==test_y.reshape(-1,1))

0.91476866915324184

## Using sklearn.linear_model.LogisticRegression

In [14]:
from sklearn import linear_model
# Baseline: scikit-learn logistic regression with library-default settings.
# NOTE(review): the recorded repr below shows solver='liblinear' and
# multi_class='ovr' for the version this was run with; defaults may differ in
# other scikit-learn versions — confirm before comparing scores.
logi_reg = linear_model.LogisticRegression()
# ravel() flattens the (m, 1) label column to the 1-D vector passed to fit.
logi_reg.fit(X_scaled, initial_y.ravel())

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [15]:
# Held-out accuracy of the scikit-learn model; labels are flattened to 1-D
# before the element-wise comparison with the predictions.
predict = logi_reg.predict(test_X_scaled)
np.mean(predict==test_y.reshape(-1))

0.91500674549638916

## Result for Kaggle (my_predict = 0.91329, sk_predict = 0.91414)

In [16]:
# Load the Kaggle test set; the preview below shows only pixel columns (no label).
test_data=pd.read_csv('data/test.csv')
test_data.head()

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Convert the Kaggle test set to a float64 array; every column is a feature.
final_test_X = test_data.values.astype('float64')
final_test_X.shape

(28000, 784)

In [18]:
# Apply the scaler fitted on the training data (transform only, no refit).
final_test_X_scaled = scaler.transform(final_test_X)

In [19]:
# Predictions from the hand-written one-vs-all model; the result is an (m, 1) column.
my_predict = predictOneVsAll(thetas, final_test_X_scaled)
my_predict.shape

(28000, 1)

In [20]:
# Predictions from the scikit-learn model; the result is a flat (m,) vector.
sk_predict = logi_reg.predict(final_test_X_scaled)
sk_predict.shape

(28000,)

In [21]:
# Count the test images on which both models agree. sk_predict is reshaped to
# a column to match my_predict's (m, 1) shape; comparing (m, 1) with (m,)
# would broadcast to an (m, m) matrix instead.
np.sum(my_predict==sk_predict.reshape(-1,1))

27960

In [22]:
# Display the hand-written model's predictions (integer column vector).
my_predict

array([[2],
       [0],
       [9],
       ..., 
       [3],
       [9],
       [2]], dtype=int64)

In [23]:
# Display the scikit-learn predictions (floats, because the model was fit on float labels).
sk_predict

array([ 2.,  0.,  9., ...,  3.,  9.,  2.])

In [24]:
# Cast the float predictions to int64 so the submission holds integer labels.
int_sk_predict = sk_predict.astype('int64')

In [25]:
# Display the integer-typed predictions.
int_sk_predict

array([2, 0, 9, ..., 3, 9, 2], dtype=int64)

In [26]:
# Wrap the predictions in a single-column DataFrame for the submission file.
# NOTE(review): the name `re` would shadow the standard-library `re` module
# if that were imported in this notebook.
re = pd.DataFrame(int_sk_predict)

In [27]:
# Preview the frame before relabelling: default 0-based index and a column named 0.
re.head()

Unnamed: 0,0
0,2
1,0
2,9
3,7
4,3


In [28]:
# Use a 1-based index for the submission ids. The index is assigned outright
# (rather than incremented in place) so re-running this cell cannot shift
# the ids a second time; any existing index name is carried over.
re.index = pd.RangeIndex(start=1, stop=len(re) + 1, name=re.index.name)
re.columns=['Label']
re.head()

Unnamed: 0,Label
1,2
2,0
3,9
4,7
5,3


In [29]:
# Name the index so it appears under the `ImageId` heading (see the preview in the next cell).
re.index.name='ImageId'

In [30]:
# Preview the final submission layout: ImageId index and Label column.
re.head()

Unnamed: 0_level_0,Label
ImageId,Unnamed: 1_level_1
1,2
2,0
3,9
4,7
5,3


In [31]:
# Write the submission file, including the header row.
re.to_csv('data/results.csv', header=True)