In [None]:
import warnings
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix

from PIL import Image as im

# Machine Learning
import xgboost as xgb
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

# Silence every warning for the rest of the session.
# NOTE(review): this also hides library warnings (deprecations, search or
# convergence notices) raised by later cells — confirm that is intended.
warnings.filterwarnings("ignore")

In [None]:
# Read the labelled training frame and the test frame in one pass;
# the tuple order fixes which file lands in which variable.
data, data_test = map(
    pd.read_csv,
    ('data/digit-recognizer/train.csv', 'data/digit-recognizer/test.csv'),
)
# Preview the training frame as the cell output
data

First, we need to check whether the dataset is balanced. The plot below shows that it is indeed balanced.

In [None]:
# Histogram of the `label` column, built directly into the figure
fig = go.Figure(data=[go.Histogram(x=data.label)])

# Display the figure as the cell output
fig

In [None]:
# Render one sample (row 3) as an image: skip the first column, cast the
# remaining values to uint8 and fold them into an img_w x img_h array.
img_w, img_h = 28, 28
image_array = data.iloc[3, 1:].to_numpy(dtype=np.uint8).reshape((img_w, img_h))
image = im.fromarray(image_array)
image

It is very helpful to see what these digits look like. We can easily display a few of them.

In [None]:
# Show the first rows*columns training samples in a grid, each titled with
# its label so the picture can be checked against the target value.
fig = plt.figure(figsize=(8, 8))
columns, rows = 5, 6
img_w, img_h = 28, 28

for i in range(1, columns*rows + 1):
    # Column 0 is the label; the remaining columns hold the pixel values
    image_array = data.iloc[i-1, 1:].to_numpy(dtype=np.uint8)
    image_array = np.reshape(image_array, (img_w, img_h))
    image = im.fromarray(image_array)
    # Draw on the returned Axes explicitly instead of relying on pyplot's
    # "current axes" state
    ax = fig.add_subplot(rows, columns, i)
    ax.imshow(image)
    ax.set_title(str(data.label.iloc[i-1]))
    # Pixel-index ticks carry no information and would collide with the titles
    ax.axis("off")
fig.tight_layout()
plt.show()

In [None]:
# Check if there is any missing value.
# The per-column Series has one row per column and is truncated when
# displayed, so most columns would be hidden. Reduce it to one number
# instead: the total count of missing cells in the frame (0 means none).
data.isna().sum().sum()

# Train and test sets

In [None]:
# Divide dataset into train and test sets
X, y = data.iloc[:,1:], data.label

# Split BEFORE scaling so the scaler's min/max are learned from the training
# rows only; fitting on the full frame would leak hold-out statistics into
# the preprocessing step.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so that any code expecting the fully scaled feature matrix still
# finds it; it is produced with the train-fitted scaler.
scaled_X = scaler.transform(X)

In [None]:
# (rows, features) of the training split
np.shape(X_train)

In [None]:
# (rows, features) of the hold-out split
np.shape(X_test)

# Models Evaluation

In [None]:
# Random Forest
# `start`/`end` bracket construction, fit and predict so the next cell can
# report the elapsed wall-clock time.
start = time.time()
# Fixed seed: the forest is randomized, so without it the accuracy below
# changes from run to run. 42 matches the seed used for the train/test split.
classificator_RF = RandomForestClassifier(random_state=42)
classificator_RF.fit(X_train, y_train)
y_pred_RF = classificator_RF.predict(X_test)
end = time.time()
# Hold-out accuracy, computed outside the timed region
accuracy_RF = accuracy_score(y_test, y_pred_RF)

In [None]:
# Seconds elapsed between the `start` and `end` timestamps taken around the
# Random Forest construct/fit/predict in the cell above
end - start

In [None]:
# Random Forest accuracy on the hold-out split (y_test vs. y_pred_RF)
accuracy_RF

In [None]:
# Light Gradient Boost Classifier
# Timed region: build + fit the default model, then predict the hold-out rows.
start = time.time()
classificator_LGBMC = LGBMClassifier().fit(X_train, y_train)
y_pred_LGBMC = classificator_LGBMC.predict(X_test)
end = time.time()
# Scoring happens after `end`, so it is not part of the measured time
accuracy_LGBMC = accuracy_score(y_true=y_test, y_pred=y_pred_LGBMC)

In [None]:
# Seconds elapsed between the `start` and `end` timestamps taken around the
# LightGBM construct/fit/predict in the cell above
end - start

In [None]:
# LightGBM accuracy on the hold-out split (y_test vs. y_pred_LGBMC)
accuracy_LGBMC

After a couple of tests we can see that random forest and light gradient boosting perform best of all the models. They have good accuracy and speed even though they are just classical algorithms. From now on we will evaluate only these two.

In [None]:
# Side-by-side hold-out accuracies, one per line
print(
    "Random Forest score: {}".format(accuracy_RF),
    "Light Gradient Boost Classifier score: {}".format(accuracy_LGBMC),
    sep="\n",
)

We are going to compute the cross-validation score for our two raw (untuned) models.

In [None]:
# NOTE(review): this cross-validation cell is fully commented out, so
# scores_RF / scores_LGBMC are never computed on a fresh Run All.
# The disabled code passes the unscaled X (not X_train) to cross_val_score —
# confirm that is intended before re-enabling it.
# scores_RF = cross_val_score(classificator_RF, X, y, cv=5)
# scores_LGBMC = cross_val_score(classificator_LGBMC, X, y, cv=5)
# print("Random Forest CV score: {}".format(scores_RF.mean()))
# print("Light Gradient Boost Classifier CV score: {}".format(scores_LGBMC.mean()))

Light gradient boosting still performs better than random forest. Let's make one last test and visualize the confusion matrices.

In [None]:
# Confusion matrix for random forest
# normalize='true' divides each row by the number of samples of that true
# class, so every row sums to 1 and the diagonal reads as per-class recall.
matrix_RF = confusion_matrix(y_test, y_pred_RF, normalize='true')
plt.figure(figsize = (10, 7))
ax_RF = sns.heatmap(matrix_RF, annot=True)
# Rows of the matrix are true labels and columns are predictions; label the
# axes so the figure can be read on its own.
ax_RF.set(
    xlabel="Predicted label",
    ylabel="True label",
    title="Random Forest: confusion matrix (normalized per true label)",
);

In [None]:
# Confusion matrix for light gradient boost
# normalize='true' divides each row by the number of samples of that true
# class, so every row sums to 1 and the diagonal reads as per-class recall.
matrix_LGBMC = confusion_matrix(y_test, y_pred_LGBMC, normalize='true')
plt.figure(figsize = (10, 7))
ax_LGBMC = sns.heatmap(matrix_LGBMC, annot=True)
# Rows of the matrix are true labels and columns are predictions; label the
# axes so the figure can be read on its own.
ax_LGBMC.set(
    xlabel="Predicted label",
    ylabel="True label",
    title="LightGBM: confusion matrix (normalized per true label)",
);

Light gradient boosting wins in every test. That's why we will use this algorithm to recognize our digits.

# Parameters Tuning

To find the best hyperparameters we will use RandomizedSearchCV.

In [None]:
# Randomized hyper-parameter search over the number of boosting iterations
# for a LightGBM classifier with max_bin fixed at 235.
classificator_LGBMC_new = LGBMClassifier(max_bin=235)

param_grid = {
    'num_iterations': [200, 250, 350, 400],  
}
# Every entry of param_grid is a finite list, so there are only
# prod(len(values)) distinct candidates. Asking for more iterations than that
# cannot evaluate more settings; it only makes the reported count wrong.
# Cap the budget at the real grid size.
n_candidates = int(np.prod([len(values) for values in param_grid.values()]))
n_iter_search = min(54, n_candidates)
# random_state makes the sampled candidates reproducible across runs
random_search = RandomizedSearchCV(classificator_LGBMC_new, param_distributions=param_grid, n_iter=n_iter_search, cv=2, verbose=2, random_state=42)
start = time.time()
# NOTE(review): the search runs on the full, unscaled X / y rather than on
# X_train / y_train — confirm that is intended.
random_search.fit(X, y)

print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time.time() - start), n_iter_search))

In [None]:
# Mean cross-validated score of the best candidate found by the search
random_search.best_score_

In [None]:
# Parameter setting that produced the best cross-validated score
random_search.best_params_

Best {'num_iterations': 200, 'max_bin': 255, 'learning_rate': 0.1}

{'num_iterations': 300, 'max_bin': 235, 'learning_rate': 0.1} 0.9736666666666667

# Prediction

In [None]:
# Preview the frame loaded from data/digit-recognizer/test.csv
data_test

First we will train the algorithm with the parameters we found.

In [None]:
# NOTE(review): same preview as the earlier `data_test` cell in this
# section; this duplicate cell is a candidate for removal.
data_test

In [None]:
# Final model with the hyper-parameters chosen during tuning
clf = LGBMClassifier(num_iterations=300, max_bin=235, learning_rate=0.1)
clf.fit(X_train, y_train)
# X_train was produced by the fitted MinMaxScaler, so the test pixels must go
# through the same transform. Predicting on the raw frame would feed the model
# values on a different scale than the one its split thresholds were learned on.
y_pred = clf.predict(scaler.transform(data_test))

In [None]:
# Number of predictions produced for the test frame
np.size(y_pred)

In [None]:
# Assemble the submission table: 1-based ImageId derived from the test
# frame's index, paired with the predicted label for each row.
my_submission = pd.DataFrame(
    data={'ImageId': data_test.index+1, 'Label': y_pred},
)
# Write without the DataFrame index so only the two columns are saved
my_submission.to_csv(path_or_buf='submission.csv', index=False)