In [2]:
from pathlib import Path
from joblib import dump, load

import numpy as np
import pandas as pd

from skimage.io import imread
from skimage.color import rgb2gray
from skimage.restoration import denoise_nl_means
from skimage import img_as_float, img_as_bool, img_as_ubyte

from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.preprocessing import StandardScaler  # Нужен для LR и SG

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier  # вероятность Быстро
from sklearn.neighbors import KNeighborsClassifier  # вероятность 
from sklearn.linear_model import LogisticRegression  # вероятность Быстро
from sklearn.linear_model import SGDClassifier  # Может обучаться онлайн. Быстро
from sklearn.decomposition import PCA


IMG_SIZE = 64
DATASET_PATH = Path('ML/dataset')

In [3]:
letters = []  # np.array([], dtype=np.uint8)
flat_images = []

for folder in range(1, 34):
    path_gen = Path(Path.cwd().parent / DATASET_PATH / str(folder)).glob(
        '*.jpg')  # Создаем генератор путей картинок
    paths = [path for path in path_gen if path.is_file()]  # Записываем пути картинок
    for i in range(len(paths)):

        flat_images.append(img_as_ubyte(img_as_bool(img_as_ubyte(imread(paths[i])).ravel())))

        letters.append(folder)
            #letters = np.append(letters, folder)
            # Картинка представляется IMG_SIZE * IMG_SIZE признаками (пикселями),
            # в каждом из которых берем интенсивность белого)
print(f'Модель учится на {len(paths)} картинках 33 букв, размером {IMG_SIZE}x{IMG_SIZE}.')

Модель учится на 263 картинках 33 букв, размером 64x64.


In [4]:
X_train, X_test, y_train, y_test = train_test_split(flat_images, letters, test_size=0.3, random_state=1)
# Делим выборку на тренировочную и тестовую

In [5]:
pca = PCA(n_components=0.9)

In [7]:
dec_X_train = pca.fit_transform(X_train, y_train)

In [8]:
dec_X_test = pca.transform(X_test)

In [14]:
pca.n_components_

175

In [21]:
scaler = StandardScaler()
std_X_train = scaler.fit_transform(X_train)
std_X_test = scaler.transform(X_test)
# Стандартизируем выборки (нужно не для всех методов). Без обрезки лучше не использовать.

In [9]:
svm_clf = SVC(kernel='poly', degree=5, cache_size=1000, verbose=True) #C=0.01, kernel='linear', degree=1, cache_size=1000, verbose=True)
# лучший SVC-классификатор
# kernel='poly', degree=2, C=1, 

In [6]:
search = GridSearchCV(svm_clf, param_grid={#'C': [0.1, 1, 10],
                                          'degree': range(2,5),
                                          },
                       scoring='f1_macro',
                     cv=KFold(n_splits=5, shuffle=True),
                     n_jobs=-1, verbose=True)

In [10]:
svm_clf.fit(dec_X_train, y_train)

[LibSVM]

SVC(C=1.0, break_ties=False, cache_size=1000, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=5, gamma='scale', kernel='poly',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=True)

In [11]:
svm_clf.score(dec_X_test, y_test)

0.9984639016897081

In [60]:
search99.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   21.8s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.6min finished


GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
             error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=-1,
                                              oob_scor

In [61]:
search99.best_params_

{'n_estimators': 75}

In [62]:
search99.score(X_test, y_test)

0.9985812775160252

In [63]:
list(search99.predict_proba(X_test))

[array([0.01333333, 0.        , 0.01333333, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.09333333, 0.64      ,
        0.01333333, 0.        , 0.        , 0.05333333, 0.        ,
        0.        , 0.01333333, 0.        , 0.        , 0.01333333,
        0.        , 0.04      , 0.02666667, 0.        , 0.        ,
        0.02666667, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.02666667, 0.02666667]),
 array([0.        , 0.        , 0.        , 0.        , 0.        ,
        0.01333333, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.01333333,
        0.01333333, 0.        , 0.        , 0.        , 0.        ,
        0.01333333, 0.        , 0.        , 0.        , 0.93333333,
        0.01333333, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 

In [64]:
list(search99.predict(X_test))

[10,
 25,
 9,
 32,
 16,
 30,
 10,
 21,
 12,
 13,
 3,
 22,
 4,
 25,
 21,
 33,
 14,
 21,
 32,
 23,
 22,
 4,
 7,
 16,
 24,
 16,
 23,
 26,
 24,
 23,
 28,
 2,
 19,
 19,
 27,
 6,
 28,
 32,
 28,
 29,
 6,
 5,
 29,
 26,
 3,
 30,
 33,
 19,
 25,
 4,
 26,
 21,
 30,
 4,
 9,
 1,
 18,
 18,
 13,
 21,
 17,
 31,
 29,
 1,
 16,
 13,
 11,
 20,
 6,
 2,
 31,
 3,
 7,
 12,
 27,
 24,
 8,
 3,
 13,
 21,
 18,
 12,
 12,
 27,
 1,
 27,
 18,
 22,
 33,
 8,
 9,
 32,
 7,
 22,
 24,
 14,
 18,
 4,
 10,
 10,
 24,
 2,
 16,
 24,
 9,
 16,
 20,
 28,
 29,
 24,
 30,
 33,
 14,
 22,
 12,
 10,
 17,
 27,
 2,
 32,
 22,
 13,
 30,
 8,
 31,
 20,
 2,
 6,
 29,
 30,
 18,
 12,
 23,
 16,
 7,
 2,
 30,
 32,
 22,
 21,
 29,
 10,
 1,
 4,
 32,
 18,
 18,
 21,
 13,
 24,
 5,
 11,
 13,
 8,
 23,
 32,
 1,
 33,
 19,
 19,
 10,
 11,
 25,
 27,
 3,
 31,
 30,
 31,
 25,
 29,
 12,
 29,
 15,
 10,
 2,
 5,
 2,
 25,
 20,
 29,
 18,
 13,
 13,
 26,
 1,
 17,
 17,
 22,
 19,
 24,
 29,
 17,
 18,
 28,
 15,
 28,
 23,
 10,
 2,
 25,
 23,
 9,
 19,
 8,
 21,
 21,
 25,
 8,
 2,
 22,

In [44]:
rf_clf = RandomForestClassifier(random_state=1, n_jobs=-1)
# лучший RF-классификатор

In [45]:
search3 = GridSearchCV(rf_clf, param_grid={'n_estimators': [20, 30, 40, 50, 75, 100, 200, 500, 750, 1000],
                                          },
                       scoring='f1_macro',
                     cv=KFold(n_splits=5, shuffle=True),
                     n_jobs=-1, verbose=True)

In [46]:
search3.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   20.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.5min finished


GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
             error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=-1,
                                              oob_scor

In [47]:
search3.score(X_test, y_test)

0.9991171260850925

In [48]:
search3.best_params_

{'n_estimators': 750}

In [50]:
list(search3.predict_proba(X_test))

[array([0.008     , 0.        , 0.00133333, 0.00133333, 0.01866667,
        0.        , 0.00533333, 0.00666667, 0.13066667, 0.64      ,
        0.01066667, 0.004     , 0.00266667, 0.04      , 0.        ,
        0.01333333, 0.012     , 0.00133333, 0.        , 0.00533333,
        0.00266667, 0.00933333, 0.01066667, 0.00533333, 0.00666667,
        0.00666667, 0.        , 0.        , 0.        , 0.        ,
        0.008     , 0.044     , 0.00533333]),
 array([0.        , 0.00266667, 0.00133333, 0.        , 0.        ,
        0.004     , 0.        , 0.        , 0.00133333, 0.        ,
        0.        , 0.        , 0.00133333, 0.00133333, 0.00266667,
        0.00133333, 0.        , 0.00933333, 0.        , 0.        ,
        0.02266667, 0.        , 0.00933333, 0.        , 0.90266667,
        0.016     , 0.        , 0.        , 0.00266667, 0.00266667,
        0.01733333, 0.        , 0.00133333]),
 array([0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.  

In [51]:
y_test

[10,
 25,
 9,
 32,
 16,
 30,
 10,
 21,
 12,
 13,
 3,
 22,
 4,
 25,
 21,
 33,
 14,
 21,
 32,
 23,
 22,
 4,
 7,
 16,
 24,
 16,
 23,
 26,
 24,
 23,
 28,
 2,
 19,
 19,
 27,
 6,
 28,
 32,
 28,
 29,
 6,
 5,
 29,
 26,
 3,
 30,
 33,
 19,
 25,
 4,
 26,
 21,
 30,
 4,
 9,
 1,
 18,
 18,
 13,
 21,
 17,
 31,
 29,
 1,
 16,
 13,
 11,
 20,
 6,
 2,
 31,
 3,
 7,
 12,
 27,
 24,
 8,
 3,
 13,
 21,
 18,
 12,
 12,
 27,
 1,
 27,
 18,
 22,
 33,
 8,
 9,
 32,
 7,
 22,
 24,
 14,
 18,
 4,
 10,
 10,
 24,
 2,
 16,
 24,
 9,
 16,
 20,
 28,
 29,
 24,
 30,
 33,
 14,
 22,
 12,
 10,
 17,
 27,
 2,
 32,
 22,
 13,
 30,
 8,
 31,
 20,
 2,
 6,
 29,
 30,
 18,
 12,
 23,
 16,
 7,
 2,
 30,
 32,
 22,
 21,
 29,
 10,
 1,
 4,
 32,
 18,
 18,
 21,
 13,
 24,
 5,
 11,
 13,
 8,
 23,
 32,
 1,
 33,
 19,
 19,
 10,
 11,
 25,
 27,
 3,
 31,
 30,
 31,
 25,
 29,
 12,
 29,
 15,
 10,
 2,
 5,
 2,
 25,
 20,
 29,
 18,
 13,
 13,
 26,
 1,
 17,
 17,
 22,
 19,
 24,
 29,
 17,
 18,
 28,
 15,
 28,
 23,
 10,
 2,
 25,
 23,
 9,
 19,
 8,
 21,
 21,
 25,
 8,
 2,
 22,

In [10]:
knn_clf = KNeighborsClassifier(n_neighbors=6, weights='distance', n_jobs=-1)
# лучший kNN-классификатор

In [11]:
knn_clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=6, p=2,
                     weights='distance')

In [12]:
knn_clf.score(X_test, y_test)

0.9985935302390999

In [37]:
lr_clf = LogisticRegression(solver='lbfgs', n_jobs=-1, random_state=1)  # solver='lbfgs'
# Для логистической регрессии лучше стандартизировать выборки.

In [38]:
search4 = GridSearchCV(lr_clf, param_grid={
                                           'C': [0.000001, 0.00001, 0.0001, 0.001, 0.01, 1, 10, 100, 1000, 10000, 100000, 1000000],
                                          },
                       scoring='f1_macro',
                     cv=KFold(n_splits=5, shuffle=True),
                     n_jobs=-1, verbose=True)

In [39]:
search4.fit(std_X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   56.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  1.3min finished


GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
             error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=-1, penalty='l2',
                                          random_state=1, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [1e-06, 1e-05, 0.0001, 0.001, 0.01, 1, 10, 100,
                               1000, 10000, 100000, 1000000]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1_macro', verbose=True)

In [40]:
search4.best_params_

{'C': 10000}

In [41]:
search4.score(std_X_test, y_test)

0.9973121956196638

In [43]:
list(search4.predict_proba(X_test))

[array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.]),
 array([0.0000000e+000, 0.0000000e+000, 1.0000000e+000, 0.0000000e+000,
        0.0000000e+000, 0.0000000e+000, 0.0000000e+000, 0.0000000e+000,
        0.0000000e+000, 0.0000000e+000, 0.0000000e+000, 0.0000000e+000,
        0.0000000e+000, 0.0000000e+000, 0.0000000e+000, 2.3308727e-162,
        0.0000000e+000, 0.0000000e+000, 0.0000000e+000, 0.0000000e+000,
        0.0000000e+000, 0.0000000e+000, 

In [28]:
sg_clf = SGDClassifier(penalty='elasticnet', alpha=0.00001, random_state=42)

In [29]:
sg_clf.fit(X_train, y_train)

SGDClassifier(alpha=1e-05, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None,
              penalty='elasticnet', power_t=0.5, random_state=42, shuffle=True,
              tol=0.001, validation_fraction=0.1, verbose=0, warm_start=False)

In [30]:
sg_clf.score(X_test, y_test)

0.9963689179375453

In [56]:
best_clf = LogisticRegression(C=100, n_jobs=-1, random_state=42)

In [28]:
scaler2 = StandardScaler()
std_letters_data = scaler2.fit_transform(letters_data)
dump(scaler2, 'scaler.joblib')

['scaler.joblib']

In [57]:
best_clf.fit(std_X_train, y_train)

LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=-1, penalty='l2', random_state=42,
                   solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)

In [62]:
list(best_clf.predict_proba(X_test))[1]

array([9.17207715e-04, 9.95538174e-04, 1.53050475e-01, 1.00720897e-05,
       2.83775784e-03, 1.08727636e-03, 9.48280229e-03, 1.27757134e-03,
       4.85084003e-03, 1.07074904e-02, 6.81528607e-04, 3.38443972e-04,
       6.77978604e-01, 4.83489791e-05, 5.90951077e-04, 3.18642561e-03,
       3.29445655e-03, 2.66060215e-03, 3.27349404e-04, 1.62597701e-04,
       7.46015784e-03, 7.46607473e-03, 7.88481208e-04, 2.33002314e-02,
       7.38827600e-03, 1.91076950e-02, 4.36613709e-03, 3.45896714e-03,
       8.24042842e-04, 1.68885149e-03, 7.25749316e-03, 4.21521534e-02,
       2.55099280e-04])

In [61]:
best_clf.score(std_X_test, y_test)

0.9969390878481788

In [31]:
dump(best_clf, 'classifier.joblib')

['classifier.joblib']

In [32]:
test_array = np.zeros((15,15,28,28,3))

In [33]:
test_array.shape

(15, 15, 28, 28, 3)

In [34]:
test_array.reshape(225, 28, 28, 3).shape

(225, 28, 28, 3)