<a href="https://www.kaggle.com/code/adelinmil/road-to-the-99?scriptVersionId=140796883" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

### IMPORTS

In [None]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)

import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['figure.figsize'] = (15, 7)

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, KFold

from sklearn.decomposition import PCA

import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator

import random

# from warnings import simplefilter
# simplefilter('ignore', category = 'UserWarning')

### EDA

In [None]:
# Read the competition's training and test tables (in that order) into `tr` and `ts`.
tr, ts = map(pd.read_csv, (
    '/kaggle/input/digit-recognizer/train.csv',
    '/kaggle/input/digit-recognizer/test.csv',
))

In [None]:
# Peek at the first five training rows.
tr.head(n = 5)

In [None]:
# Target: the `label` column. Features: every remaining column
# (later reshaped to 28x28 images for display).
y = tr['label']
X = tr.drop(columns = 'label')

In [None]:
# Show a 3x3 grid of randomly chosen training images, each annotated with its label.
fig, axes = plt.subplots(nrows = 3, ncols = 3)

label_x = -0.30  # x position (axes coordinates) of the text, i.e. left of each image

for ax in axes.ravel():
    # randrange(len(X)) yields 0 .. len(X) - 1. The previous randint(0, len(X)) is
    # inclusive on both ends and could return len(X), making X.iloc[idx] raise IndexError.
    idx = random.randrange(len(X))
    ax.imshow(X.iloc[idx].to_numpy().reshape(28, 28))
    ax.text(label_x, 0.35, r"Label:", color='black', rotation='vertical', transform=ax.transAxes, fontsize = 12)
    ax.text(label_x, 0.7, y.iloc[idx], color='blue', rotation='vertical', transform=ax.transAxes, fontsize = 12)

In [None]:
# Share of each label as a percentage of all rows, rendered with a colour gradient.
y.value_counts().div(len(y)).mul(100).to_frame().style.background_gradient()

In [None]:
# Bar chart of how many training rows carry each label.
sns.countplot(x = 'label', data = tr)

## **1ST MILESTONE** : baseline model (96%)

In [None]:
# Divide every feature by 255 (presumably 8-bit pixel intensities, giving a 0-1 range —
# TODO confirm). The test set gets the same scaling as the training features.
X_normalized = X.div(255.0)
ts_normalized = ts.div(255.0)

In [None]:
# Baseline: k-nearest-neighbours with default hyper-parameters,
# scored by accuracy over 10 unshuffled folds.
kf = KFold(n_splits = 10)
clf = KNeighborsClassifier()

scores = cross_val_score(
    estimator = clf,
    X = X_normalized,
    y = y,
    cv = kf,
    scoring = 'accuracy',
)

In [None]:
# Mean cross-validated accuracy, as a percentage.
100 * np.mean(scores)

In [None]:
# Fit on the full training set, then predict that same set. These in-sample
# predictions feed the visual check in the next cell.
predictions = clf.fit(X_normalized, y).predict(X_normalized)

In [None]:
# Visualize predictions: a 3x3 grid of random training images, each annotated with its
# actual label and the label stored in `predictions`.

fig, axes = plt.subplots(nrows = 3, ncols = 3)

label_x = -0.4  # x position (axes coordinates) of the "Predicted" text; "Actual" sits 0.2 further left

for ax in axes.ravel():
    # randrange(len(X)) yields 0 .. len(X) - 1. The previous randint(0, len(X)) is
    # inclusive on both ends and could return len(X), raising IndexError below.
    idx = random.randrange(len(X))
    ax.imshow(X.iloc[idx].to_numpy().reshape(28, 28)) # display chosen image

    # Green when the prediction matches the actual value, red when it differs
    pred_color = 'green' if predictions[idx] == y.iloc[idx] else 'red'

    ax.text(label_x - 0.2, 0.35, r"Actual:", color='black', rotation='vertical', transform=ax.transAxes, fontsize = 10)
    ax.text(label_x - 0.2, 0.7, y.iloc[idx], color='blue', rotation='vertical', transform=ax.transAxes, fontsize = 10)
    ax.text(label_x, 0.3, r"Predicted:", color='black', rotation='vertical', transform=ax.transAxes, fontsize = 10)
    ax.text(label_x, 0.85, predictions[idx], color=pred_color, rotation='vertical', transform=ax.transAxes, fontsize = 10)

In [None]:
# Start from the sample submission's ImageId column and attach the kNN test predictions.
submission = pd.read_csv(
    '/kaggle/input/digit-recognizer/sample_submission.csv',
    usecols = ['ImageId'],
).assign(label = clf.predict(ts_normalized))

submission.to_csv('000_submission.csv', index = False) #LB SCORE: 0.967

## **2ND MILESTONE** : dimensionality reduction (97.5%)

In [None]:
# Work on an independent (deep) copy so the normalized frame itself stays untouched.
X_features = X_normalized.copy(deep = True)

In [None]:
# Fit a PCA that keeps every component, to inspect how variance is spread across them.
pca = PCA()
pca.fit(X_features)  # only the fitted spectrum is needed; the projected data was being discarded
# Store the per-component *fraction* of total variance so the values match the
# 'Variance ratio' axis label of the bar chart that plots pca_variance
# (explained_variance_ holds absolute variances, not ratios).
pca_variance = pca.explained_variance_ratio_

In [None]:
# Bar chart of the per-component values held in pca_variance, one bar per feature column.
# NOTE(review): the original note here says fewer than 100 components explain 99% of the
# variance — confirm against the cumulative sum before relying on it.
plt.figure(figsize = (15, 7))
plt.bar(
    np.arange(X_features.shape[1]),
    pca_variance,
    align = 'center',
    alpha = 0.5,
    label = 'individual variance',
)
plt.xlabel('Principal components')
plt.ylabel('Variance ratio')
plt.legend()
plt.show()

In [None]:
# Apply principal component analysis, keeping 40 components.
# The projection is fitted on the training features only and then reused unchanged
# for the test set, so both live in the same 40-dimensional space.
pca = PCA(n_components = 40)
X_transformed = pca.fit_transform(X_features)
ts_transformed = pca.transform(ts_normalized)

In [None]:
# Same default kNN and 10-fold setup as the baseline, now on the PCA-projected features.
kf = KFold(n_splits = 10)
clf = KNeighborsClassifier()

scores = cross_val_score(
    estimator = clf,
    X = X_transformed,
    y = y,
    cv = kf,
    scoring = 'accuracy',
)
# Mean accuracy across folds, as a percentage
100 * np.mean(scores)

In [None]:
# Refit the classifier on all PCA-projected training rows before predicting the test set.
clf.fit(X = X_transformed, y = y)

In [None]:
# Replace the label column with the kNN predictions for the projected test set, then export.
submission = submission.assign(label = clf.predict(ts_transformed))

submission.to_csv('001_submission.csv', index = False) #LB SCORE: 0.97478

## **3RD MILESTONE** : let's dive deep (97%)

In [None]:
# Plain NumPy inputs for Keras: feature matrices for train/test and one-hot encoded labels.
X_array = X_normalized.to_numpy()
ts_array = ts_normalized.to_numpy()

y_array = tf.keras.utils.to_categorical(y)

In [None]:
# Fully connected network: one hidden ReLU layer of 120 units, then a 10-way softmax output.
nn_model = tf.keras.Sequential()
nn_model.add(tf.keras.layers.Dense(120, activation = 'relu'))
nn_model.add(tf.keras.layers.Dense(10, activation = 'softmax'))

In [None]:
# Categorical cross-entropy matches the one-hot labels; Adam runs with its default settings.
nn_model.compile(
    optimizer = tf.keras.optimizers.Adam(),
    loss = 'categorical_crossentropy',
    metrics = ['accuracy'],
)

In [None]:
# Stop training once the monitored quantity (the callback's default, val_loss) has not
# improved for 3 consecutive epochs. restore_best_weights=True rolls the model back to
# its best epoch; without it the model keeps the weights from the last (worse) epochs
# and those would be used for the test predictions.
earlystopping_cb = tf.keras.callbacks.EarlyStopping(patience = 3, restore_best_weights = True)

# Train for at most 25 epochs, holding out 20% of the rows for validation.
history = nn_model.fit(
    X_array, 
    y_array, 
    validation_split = 0.2, 
    epochs = 25, 
    callbacks = [earlystopping_cb]
)

In [None]:
# Training vs. validation accuracy per epoch, read from the most recent `history`.
for metric_key in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[metric_key])
plt.title('Neural Network model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
# Training vs. validation loss per epoch, read from the most recent `history`.
for metric_key in ('loss', 'val_loss'):
    plt.plot(history.history[metric_key])
plt.title('Neural Network model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
# Predicted label = index of the largest softmax output in each test row.
predictions = nn_model.predict(ts_array).argmax(axis = 1)

In [None]:
# Replace the label column with the dense network's predictions, then export.
submission = submission.assign(label = predictions)

submission.to_csv('002_submission.csv', index = False) #LB SCORE: 0.97271

## **4TH MILESTONE** : CNNs (98%)

In [None]:
# Turn each flat row into a 28x28 image, then append a trailing channel axis of size 1
# so the arrays have shape (n, 28, 28, 1) as expected by the Conv2D input below.
X_reshaped = X_normalized.to_numpy().reshape(-1, 28, 28)[..., np.newaxis]
ts_reshaped = ts_normalized.to_numpy().reshape(-1, 28, 28)[..., np.newaxis]

In [None]:
# Small CNN: one 5x5 convolution block followed by a dense classifier head.
cnn_model = tf.keras.Sequential()

# Feature extraction: 32 filters of 5x5 on the 28x28x1 input, then 2x2 max pooling
cnn_model.add(tf.keras.layers.Conv2D(32, kernel_size = (5, 5), activation = 'relu', input_shape = (28, 28, 1)))
cnn_model.add(tf.keras.layers.MaxPooling2D(pool_size = (2, 2), strides = (2, 2)))

# Classifier head: flatten, 120 ReLU units, 10-way softmax
cnn_model.add(tf.keras.layers.Flatten())
cnn_model.add(tf.keras.layers.Dense(120, activation = 'relu'))
cnn_model.add(tf.keras.layers.Dense(10, activation = 'softmax'))

In [None]:
# Same training setup as the dense network: default Adam, categorical cross-entropy, accuracy.
cnn_model.compile(
    optimizer = tf.keras.optimizers.Adam(),
    loss = 'categorical_crossentropy',
    metrics = ['accuracy'],
)

In [None]:
# Train the CNN for up to 25 epochs with a 20% validation hold-out, reusing the
# early-stopping callback created for the dense network. `history` is overwritten.
history = cnn_model.fit(
    x = X_reshaped,
    y = y_array,
    epochs = 25,
    validation_split = 0.2,
    callbacks = [earlystopping_cb],
)

In [None]:
# Most probable class per test image according to the CNN, written out as the next submission.
submission = submission.assign(label = cnn_model.predict(ts_reshaped).argmax(axis = 1))

submission.to_csv('003_submission.csv', index = False) #LB SCORE 0.98928

### <center>Upvote if you found this notebook helpful😄</center>

## **FINAL DESTINATION** : GOING EVEN DEEPER + DATA AUGMENTATION (99%)

In [None]:
# Deeper CNN: three convolutional stages (each with batch normalization and dropout)
# followed by a dense head. All convolutions use 64 filters, 'same' padding,
# ReLU activation and He-normal initialization.
final_model = tf.keras.Sequential()

# Stage 1: two 5x5 convolutions on the 28x28x1 input, then 2x2 pooling
final_model.add(tf.keras.layers.Conv2D(64, kernel_size = (5, 5), activation = 'relu', padding = 'same', kernel_initializer = 'he_normal', input_shape = (28, 28, 1)))
final_model.add(tf.keras.layers.BatchNormalization())
final_model.add(tf.keras.layers.Conv2D(64, kernel_size = (5, 5), activation = 'relu', padding = 'same', kernel_initializer = 'he_normal'))
final_model.add(tf.keras.layers.BatchNormalization())
final_model.add(tf.keras.layers.MaxPooling2D(pool_size = (2, 2)))
final_model.add(tf.keras.layers.Dropout(0.25))

# Stage 2: two 3x3 convolutions, then 2x2 pooling
final_model.add(tf.keras.layers.Conv2D(64, kernel_size = (3, 3), activation = 'relu', padding = 'same', kernel_initializer = 'he_normal'))
final_model.add(tf.keras.layers.BatchNormalization())
final_model.add(tf.keras.layers.Conv2D(64, kernel_size = (3, 3), activation = 'relu', padding = 'same', kernel_initializer = 'he_normal'))
final_model.add(tf.keras.layers.BatchNormalization())
final_model.add(tf.keras.layers.MaxPooling2D(pool_size = (2, 2)))
final_model.add(tf.keras.layers.Dropout(0.25))

# Stage 3: a single 3x3 convolution, no pooling
final_model.add(tf.keras.layers.Conv2D(64, kernel_size = (3, 3), activation = 'relu', padding = 'same', kernel_initializer = 'he_normal'))
final_model.add(tf.keras.layers.BatchNormalization())
final_model.add(tf.keras.layers.Dropout(0.25))

# Head: flatten, 256 ReLU units with batch normalization and dropout, 10-way softmax
final_model.add(tf.keras.layers.Flatten())
final_model.add(tf.keras.layers.Dense(256, activation = 'relu'))
final_model.add(tf.keras.layers.BatchNormalization())
final_model.add(tf.keras.layers.Dropout(0.25))
final_model.add(tf.keras.layers.Dense(10, activation = 'softmax'))

In [None]:
# RMSprop with an explicit step size. The argument must be spelled `learning_rate`:
# the old `lr` alias is deprecated in Keras optimizers and rejected by newer releases.
final_model.compile(
    loss = 'categorical_crossentropy',
    optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.001, rho=0.9, epsilon=1e-08),
    metrics = ['accuracy']
)

In [None]:
# Augmentation: small random rotations, zooms and shifts; 20% of the data is reserved
# for the 'validation' subset. Centering, std-normalization, ZCA whitening and flips
# are left at their defaults (all disabled).
datagen = ImageDataGenerator(
    rotation_range = 10,
    zoom_range = 0.1,
    width_shift_range = 0.1,
    height_shift_range = 0.1,
    validation_split = 0.2,
)

In [None]:
# Let the generator compute any data-dependent statistics from the training images.
# NOTE(review): no featurewise/ZCA option is enabled on this generator, so this call
# looks redundant — confirm before removing.
datagen.fit(x = X_reshaped)

In [None]:
# Halve the learning rate when val_loss has not improved for 3 epochs, never going below 1e-5.
learning_rate_reduction = tf.keras.callbacks.ReduceLROnPlateau(
    monitor = 'val_loss',
    factor = 0.5,
    patience = 3,
    min_lr = 1e-5,
    verbose = 1,
)

In [None]:
# Validate on un-augmented images. Taking the validation batches from `datagen` would
# apply random rotations/zooms/shifts to them too, so val_loss (which drives
# ReduceLROnPlateau) would be noisy and measured on distorted digits.
# A plain generator with the same validation_split is used instead; the split fraction
# here must stay equal to the one configured on `datagen` so both subsets stay disjoint.
val_datagen = ImageDataGenerator(validation_split = 0.2)

# Train on augmented batches for 25 epochs, reducing the learning rate on plateaus.
final_model.fit(
    datagen.flow(X_reshaped, y_array, subset='training'),
    validation_data = val_datagen.flow(X_reshaped, y_array, batch_size=8, subset='validation', shuffle=False),
    epochs=25,
    callbacks = [learning_rate_reduction]
         )

In [None]:
# Most probable class per test image according to the final model, saved as the final submission.
submission = submission.assign(label = final_model.predict(ts_reshaped).argmax(axis = 1))

submission.to_csv('submission.csv', index = False) #LB SCORE 0.99389