In [1]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 200)
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
import tensorflow as tf
from tensorflow import keras

# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier

s3 = boto3.resource('s3')
bucket_name = 'analytics-data-science-competitions'
bucket = s3.Bucket(bucket_name)

## Defining files names
file_key_1 = 'Digit-Recognizer/train.csv'
file_key_2 = 'Digit-Recognizer/test.csv'
file_key_3 = 'Digit-Recognizer/sample_submission.csv'

bucket_object_1 = bucket.Object(file_key_1)
file_object_1 = bucket_object_1.get()
file_content_stream_1 = file_object_1.get('Body')

bucket_object_2 = bucket.Object(file_key_2)
file_object_2 = bucket_object_2.get()
file_content_stream_2 = file_object_2.get('Body')

bucket_object_3 = bucket.Object(file_key_3)
file_object_3 = bucket_object_3.get()
file_content_stream_3 = file_object_3.get('Body')

## Reading data-files
train = pd.read_csv(file_content_stream_1)
test = pd.read_csv(file_content_stream_2)
sample = pd.read_csv(file_content_stream_3)



# Basic Exploration

In [3]:
print('Train dataset dimensions are:', train.shape)

print('Test dataset dimensions are:', test.shape)

Train dataset dimensions are: (42000, 785)
Test dataset dimensions are: (28000, 784)


In [48]:
train['label'].value_counts() / train.shape[0]

1    0.111524
7    0.104786
3    0.103595
9    0.099714
2    0.099452
6    0.098500
0    0.098381
4    0.096952
8    0.096738
5    0.090357
Name: label, dtype: float64

In [2]:
train.columns = [i for i in range(0, 785)]
train = np.array(train)

# Putting the data in the right format

In [3]:
img_rows, img_cols = 28, 28
num_classes = 10

def prep_data(raw):
    out_y = raw[:, 0]
#     out_y = keras.utils.to_categorical(y, num_classes)
    
    x = raw[:,1:]
    num_images = raw.shape[0]
    out_x = x.reshape(num_images, img_rows, img_cols, 1)
    out_x = out_x / 255
    return out_x, out_y

# fashion_file = "../input/fashionmnist/fashion-mnist_train.csv"
# fashion_data = np.loadtxt(fashion_file, skiprows=1, delimiter=',')
X, Y = prep_data(train)

# Set up code checking
# from learntools.core import binder
# binder.bind(globals())
# from learntools.deep_learning.exercise_7 import *
# print("Setup Complete")

In [4]:
# Define the model
model = tf.keras.models.Sequential([
        
        tf.keras.layers.Conv2D(16, (3,3), activation = 'relu', input_shape = (28, 28, 1)),
        tf.keras.layers.MaxPooling2D(2, 2),
        tf.keras.layers.Conv2D(32, (3,3), activation = 'relu'),
        tf.keras.layers.MaxPooling2D(2, 2),
        tf.keras.layers.Conv2D(64, (3,3), activation = 'relu'),
        tf.keras.layers.MaxPooling2D(2, 2),
        
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(256, activation = 'relu'),
#         tf.keras.layers.Dense(256, activation = 'relu'),
        tf.keras.layers.Dense(10, activation = 'softmax')
])

# Setup training parameters
model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])

# Train the model
print(f'\nMODEL TRAINING:')
model.fit(X, Y, epochs = 30, validation_split = 0.2)


MODEL TRAINING:
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7f0e7709ec50>

In [5]:
## Predicting on test
img_rows = 28
img_cols = 28

test.columns = [i for i in range(0, 784)]
test = np.array(test)

num_images = test.shape[0]
test_images = test.reshape(num_images, img_rows, img_cols, 1)
test_images = test_images / 255

pred_labels = model.predict(test_images)

In [6]:
np.argmax(pred_labels, axis = 1)

array([2, 0, 9, ..., 3, 9, 2])

In [7]:
data_out = pd.DataFrame({'ImageId': [i for i in range(1, 28001)], 'Label': np.argmax(pred_labels, axis = 1)})
data_out.head()

Unnamed: 0,ImageId,Label
0,1,2
1,2,0
2,3,9
3,4,0
4,5,3


In [8]:
data_out.to_csv('submission_11.csv', index = False)

In [11]:
## Defining input and target
X = train.drop(columns = ['label'], axis = 1)
Y = train['label']

## Defining the hyper-parameter grid
logistic_param_grid = {'penalty': ['l1', 'l2', 'elasticnet'],
                       'C': [0.001, 0.01, 0.1, 1, 10, 100],
                       'solver': ['liblinear', 'sag', 'saga']}

## Performing grid search with 5 folds
logistic_grid_search = GridSearchCV(LogisticRegression(), logistic_param_grid, cv = 5, scoring = 'accuracy', n_jobs = -1, verbose = 3).fit(X, Y)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


KeyboardInterrupt: 

In [12]:
28*28

784