In [5]:
from pathlib import Path
from joblib import dump, load

import numpy as np
import pandas as pd

from skimage.io import imread
from skimage.color import rgb2gray
from skimage.restoration import denoise_nl_means
from skimage import img_as_float

from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.preprocessing import StandardScaler  # Нужен для LR и SG

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier  # вероятность Быстро
from sklearn.neighbors import KNeighborsClassifier  # вероятность 
from sklearn.linear_model import LogisticRegression  # вероятность Быстро
from sklearn.linear_model import SGDClassifier  # Может обучаться онлайн. Быстро

IMG_RESOLUTION = 32

In [6]:
letter_column = int(IMG_RESOLUTION * IMG_RESOLUTION)
letters_data = pd.DataFrame(columns=range(IMG_RESOLUTION * IMG_RESOLUTION + 1))
letters_data = letters_data.rename(columns={letter_column: 'letter'})

In [44]:
index = 0

for folder in range(1, 34):
    path_gen = Path(Path.cwd() / 'dataset' / str(folder)).glob('*.jpg')  # Создаем генератор путей картинок
    paths = [path for path in path_gen if path.is_file()]  # Записываем пути картинок
    for i in range(len(paths)):  
        letters_data.loc[index] = *denoise_nl_means(img_as_float(imread(paths[i]))).ravel(), folder  # denoise_nl_means
        # Картинка представляется 28*28 признаками (пикселями, в каждом из которых берем интенсивность белого)
        # Записываем информацию в датафрейм
        index += 1

In [45]:
letters_data.letter = letters_data.letter.map(int)  # Переводим номер буквы в int

In [46]:
letters_y = letters_data.pop('letter')  # Выделяем целевую переменную

In [47]:
X_train, X_test, y_train, y_test = train_test_split(letters_data, letters_y, test_size=0.3, random_state=1)
# Делим выборку на тренировочную и тестовую

In [48]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,774,775,776,777,778,779,780,781,782,783
540,0.001449,0.001522,0.001708,0.002102,0.002119,0.002124,0.00212,0.002189,0.002281,0.002339,...,0.004803,0.004724,0.004563,0.004356,0.004502,0.004766,0.004782,0.004813,0.004999,0.00507
4566,0.002759,0.002901,0.002816,0.002765,0.00263,0.002679,0.002763,0.002854,0.002972,0.003062,...,0.001445,0.001429,0.001176,0.000986,0.000868,0.000564,0.000559,0.000335,8.4e-05,0.0
1545,0.002995,0.002562,0.002849,0.003028,0.002193,0.00242,0.002827,0.003423,0.00368,0.004264,...,0.007447,0.007554,0.007482,0.007181,0.007037,0.007391,0.007367,0.007298,0.007319,0.007138
1036,0.000869,0.001389,0.001389,0.001194,0.00114,0.001245,0.001374,0.001562,0.002563,0.002658,...,0.005686,0.005842,0.005968,0.005768,0.005573,0.005843,0.000513,0.0,0.013035,0.007758
1047,0.00269,0.002939,0.002824,0.002844,0.002679,0.002705,0.002855,0.002995,0.003074,0.003334,...,0.004544,0.004543,0.004206,0.004453,0.004028,0.004516,0.004871,0.00478,0.004724,0.004721


In [49]:
scaler = StandardScaler()
std_X_train = scaler.fit_transform(X_train)
std_X_test = scaler.transform(X_test)
# Стандартизируем выборки (нужно не для всех методов). Без обрезки лучше не использовать.

In [50]:
svm_clf = SVC(kernel='poly', degree=2, C=1, cache_size=1000) #C=0.01, kernel='linear', degree=1, cache_size=1000, verbose=True)
# лучший SVC-классификатор

In [51]:
'''search = GridSearchCV(svm_clf, param_grid={'C': [0.00001, 0.001, 0.01, 0.1, 1, 100, 10000],
                                          'kernel': ['linear', 'poly'],
                                          'degree': range(1,4),
                                          },
                       scoring='f1_macro',
                     cv=KFold(n_splits=5, shuffle=True),
                     n_jobs=-1, verbose=True)'''

"search = GridSearchCV(svm_clf, param_grid={'C': [0.00001, 0.001, 0.01, 0.1, 1, 100, 10000],\n                                          'kernel': ['linear', 'poly'],\n                                          'degree': range(1,4),\n                                          },\n                       scoring='f1_macro',\n                     cv=KFold(n_splits=5, shuffle=True),\n                     n_jobs=-1, verbose=True)"

In [52]:
svm_clf.fit(X_train, y_train)

SVC(C=1, break_ties=False, cache_size=1000, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=2, gamma='scale', kernel='poly',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [53]:
svm_clf.score(X_test, y_test)

0.9978213507625272

In [45]:
rf_clf = RandomForestClassifier(random_state=1, n_jobs=-1)
# лучший RF-классификатор

In [47]:
search3 = GridSearchCV(rf_clf, param_grid={'n_estimators': [20, 30, 40, 50, 75, 100, 200, 500, 750, 1000],
                                          },
                       scoring='f1_macro',
                     cv=KFold(n_splits=5, shuffle=True),
                     n_jobs=-1, verbose=True)

In [48]:
search3.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 12.3min finished


GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
             error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=-1,
                                              oob_scor

In [21]:
search3.score(X_test, y_test)

0.9994670929922729

In [49]:
search3.best_params_

{'n_estimators': 750}

In [15]:
knn_clf = KNeighborsClassifier(n_neighbors=6, weights='distance', n_jobs=-1)
# лучший kNN-классификатор

In [16]:
knn_clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=6, p=2,
                     weights='distance')

In [17]:
knn_clf.score(X_test, y_test)

0.9963689179375453

In [7]:
lr_clf = LogisticRegression(solver='liblinear', n_jobs=-1, random_state=1)
# Для логистической регрессии лучше стандартизировать выборки.

In [8]:
search4 = GridSearchCV(lr_clf, param_grid={'penalty': ['l1', 'l2', 'elasticnet'],
                                           'C': [0.00001, 1, 10000],
                                          },
                       scoring='f1_macro',
                     cv=KFold(n_splits=5, shuffle=True),
                     n_jobs=-1, verbose=True)

In [None]:
search4.fit(std_X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


In [None]:
search4.score(std_X_test, y_test)

0.9992067634049102

In [None]:
search4.best_params_

{'C': 1, 'penalty': 'l2', 'solver': 'newton-cg'}

In [28]:
sg_clf = SGDClassifier(penalty='elasticnet', alpha=0.00001, random_state=42)

In [29]:
sg_clf.fit(X_train, y_train)

SGDClassifier(alpha=1e-05, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None,
              penalty='elasticnet', power_t=0.5, random_state=42, shuffle=True,
              tol=0.001, validation_fraction=0.1, verbose=0, warm_start=False)

In [30]:
sg_clf.score(X_test, y_test)

0.9963689179375453

In [56]:
best_clf = LogisticRegression(C=100, n_jobs=-1, random_state=42)

In [28]:
scaler2 = StandardScaler()
std_letters_data = scaler2.fit_transform(letters_data)
dump(scaler2, 'scaler.joblib')

['scaler.joblib']

In [57]:
best_clf.fit(std_X_train, y_train)

LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=-1, penalty='l2', random_state=42,
                   solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)

In [62]:
list(best_clf.predict_proba(X_test))[1]

array([9.17207715e-04, 9.95538174e-04, 1.53050475e-01, 1.00720897e-05,
       2.83775784e-03, 1.08727636e-03, 9.48280229e-03, 1.27757134e-03,
       4.85084003e-03, 1.07074904e-02, 6.81528607e-04, 3.38443972e-04,
       6.77978604e-01, 4.83489791e-05, 5.90951077e-04, 3.18642561e-03,
       3.29445655e-03, 2.66060215e-03, 3.27349404e-04, 1.62597701e-04,
       7.46015784e-03, 7.46607473e-03, 7.88481208e-04, 2.33002314e-02,
       7.38827600e-03, 1.91076950e-02, 4.36613709e-03, 3.45896714e-03,
       8.24042842e-04, 1.68885149e-03, 7.25749316e-03, 4.21521534e-02,
       2.55099280e-04])

In [61]:
best_clf.score(std_X_test, y_test)

0.9969390878481788

In [31]:
dump(best_clf, 'classifier.joblib')

['classifier.joblib']

In [32]:
test_array = np.zeros((15,15,28,28,3))

In [33]:
test_array.shape

(15, 15, 28, 28, 3)

In [34]:
test_array.reshape(225, 28, 28, 3).shape

(225, 28, 28, 3)