In [38]:
from pathlib import Path
from joblib import dump, load

import numpy as np
import pandas as pd

from skimage import img_as_float
from skimage.io import imread
from skimage.color import rgb2gray

from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.preprocessing import StandardScaler  # Нужен для LR и SG. Без обрезки не нужен (результаты хуже)

from sklearn.svm import SVC  # 0.9975 Без обрезки: 0.9951
from sklearn.ensemble import RandomForestClassifier  # 0.9972 (вероятность) Быстро
from sklearn.neighbors import KNeighborsClassifier  # 0.9975 (вероятность) 
from sklearn.linear_model import LogisticRegression  # 0.9969 (вероятность) Быстро
from sklearn.linear_model import SGDClassifier  # 0.9969 Может обучаться онлайн. Быстро

In [50]:
letters_data = pd.DataFrame(columns=range(28 * 28 + 1))
letters_data = letters_data.rename(columns={784: 'letter'})

In [51]:
index = 0

for folder in range(1, 39):
    path_gen = Path(Path.cwd() / 'color_dataset' / str(folder)).glob('*.jpg')  # Создаем генератор путей картинок
    paths = [path for path in path_gen if path.is_file()]  # Записываем пути картинок
    for i in range(len(paths)):  
        letters_data.loc[index] = *np.around(img_as_float(rgb2gray(imread(paths[i]))).ravel(), decimals=2), folder
        # Картинка представляется 28*28 признаками (пикселями, в каждом из которых берем интенсивность белого)
        # Записываем информацию в датафрейм
        index += 1

In [52]:
letters_data.letter = letters_data.letter.map(int)  # Переводим номер буквы в int

In [53]:
letters_y = letters_data.pop('letter')  # Выделяем целевую переменную

In [54]:
X_train, X_test, y_train, y_test = train_test_split(letters_data, letters_y, test_size=0.3, random_state=1)
# Делим выборку на тренировочную и тестовую

In [55]:
scaler = StandardScaler()
std_X_train = scaler.fit_transform(X_train)
std_X_test = scaler.transform(X_test)
# Стандартизируем выборки (нужно не для всех методов). Без обрезки лучше не использовать.

In [8]:
svm_clf = SVC(kernel='poly', degree=2, C=1, cache_size=1000) #C=0.01, kernel='linear', degree=1, cache_size=1000, verbose=True)
# лучший SVC-классификатор

In [9]:
'''search = GridSearchCV(svm_clf, param_grid={'C': [0.00001, 0.001, 0.01, 0.1, 1, 100, 10000],
                                          'kernel': ['linear', 'poly'],
                                          'degree': range(1,4),
                                          },
                       scoring='f1_macro',
                     cv=KFold(n_splits=5, shuffle=True),
                     n_jobs=-1, verbose=True)'''

"search = GridSearchCV(svm_clf, param_grid={'C': [0.00001, 0.001, 0.01, 0.1, 1, 100, 10000],\n                                          'kernel': ['linear', 'poly'],\n                                          'degree': range(1,4),\n                                          },\n                       scoring='f1_macro',\n                     cv=KFold(n_splits=5, shuffle=True),\n                     n_jobs=-1, verbose=True)"

In [10]:
svm_clf.fit(X_train, y_train)

SVC(C=1, break_ties=False, cache_size=1000, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=2, gamma='scale', kernel='poly',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [11]:
%%timeit
svm_clf.score(X_test, y_test)

4.78 s ± 48.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [12]:
rf_clf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
# лучший RF-классификатор

In [13]:
rf_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [14]:
%%timeit
rf_clf.score(X_test, y_test)

231 ms ± 4.38 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
knn_clf = KNeighborsClassifier(n_neighbors=6, weights='distance', n_jobs=-1)
# лучший kNN-классификатор

In [16]:
knn_clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=6, p=2,
                     weights='distance')

In [17]:
%%timeit
knn_clf.score(X_test, y_test)

6.23 s ± 88.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [18]:
lr_clf = LogisticRegression(n_jobs=-1, random_state=42)
# Для логистической регрессии лучше стандартизировать выборки.

In [19]:
search4 = GridSearchCV(lr_clf, param_grid={'C': [0.00001, 0.001, 0.01, 0.1, 1, 100, 10000]},
                       scoring='f1_macro',
                     cv=KFold(n_splits=5, shuffle=True),
                     n_jobs=-1, verbose=True)

In [20]:
search4.fit(std_X_train, y_train)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:   54.2s finished


GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
             error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=-1, penalty='l2',
                                          random_state=42, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [1e-05, 0.001, 0.01, 0.1, 1, 100, 10000]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1_macro', verbose=True)

In [21]:
search4.score(std_X_test, y_test)

0.9969688679896039

In [22]:
search4.best_params_

{'C': 100}

In [23]:
sg_clf = SGDClassifier(penalty='elasticnet', alpha=0.00001, random_state=42)

In [24]:
sg_clf.fit(std_X_train, y_train)

SGDClassifier(alpha=1e-05, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None,
              penalty='elasticnet', power_t=0.5, random_state=42, shuffle=True,
              tol=0.001, validation_fraction=0.1, verbose=0, warm_start=False)

In [25]:
sg_clf.score(std_X_test, y_test)

0.9966329966329966

In [56]:
best_clf = LogisticRegression(C=100, n_jobs=-1, random_state=42)

In [28]:
scaler2 = StandardScaler()
std_letters_data = scaler2.fit_transform(letters_data)
dump(scaler2, 'scaler.joblib')

['scaler.joblib']

In [57]:
best_clf.fit(std_X_train, y_train)

LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=-1, penalty='l2', random_state=42,
                   solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)

In [62]:
list(best_clf.predict_proba(X_test))[1]

array([9.17207715e-04, 9.95538174e-04, 1.53050475e-01, 1.00720897e-05,
       2.83775784e-03, 1.08727636e-03, 9.48280229e-03, 1.27757134e-03,
       4.85084003e-03, 1.07074904e-02, 6.81528607e-04, 3.38443972e-04,
       6.77978604e-01, 4.83489791e-05, 5.90951077e-04, 3.18642561e-03,
       3.29445655e-03, 2.66060215e-03, 3.27349404e-04, 1.62597701e-04,
       7.46015784e-03, 7.46607473e-03, 7.88481208e-04, 2.33002314e-02,
       7.38827600e-03, 1.91076950e-02, 4.36613709e-03, 3.45896714e-03,
       8.24042842e-04, 1.68885149e-03, 7.25749316e-03, 4.21521534e-02,
       2.55099280e-04])

In [61]:
best_clf.score(std_X_test, y_test)

0.9969390878481788

In [31]:
dump(best_clf, 'classifier.joblib')

['classifier.joblib']

In [32]:
test_array = np.zeros((15,15,28,28,3))

In [33]:
test_array.shape

(15, 15, 28, 28, 3)

In [34]:
test_array.reshape(225, 28, 28, 3).shape

(225, 28, 28, 3)