In [1]:
from pathlib import Path
from joblib import dump, load

import numpy as np
import pandas as pd

from skimage import img_as_float
from skimage.io import imread
from skimage.color import rgb2gray

from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.preprocessing import StandardScaler  # Нужен для LR и SG. Без обрезки не нужен (результаты хуже)

from sklearn.svm import SVC  # 0.9975 Без обрезки: 0.9951
from sklearn.ensemble import RandomForestClassifier  # 0.9972 (вероятность) Быстро
from sklearn.neighbors import KNeighborsClassifier  # 0.9975 (вероятность) 
from sklearn.linear_model import LogisticRegression  # 0.9969 (вероятность) Быстро
from sklearn.linear_model import SGDClassifier  # 0.9969 Может обучаться онлайн. Быстро

In [2]:
letters_data = pd.DataFrame(columns=range(28 * 28 + 1))
letters_data = letters_data.rename(columns={784: 'letter'})

In [3]:
index = 0

for folder in range(1, 35):
    path_gen = Path(Path.cwd() / 'color_dataset' / str(folder)).glob('*.jpg')  # Создаем генератор путей картинок
    paths = [path for path in path_gen if path.is_file()]  # Записываем пути картинок
    for i in range(len(paths)):  
        letters_data.loc[index] = *np.around(img_as_float(rgb2gray(imread(paths[i]))).ravel(), decimals=2), folder
        # Картинка представляется 28*28 признаками (пикселями, в каждом из которых берем интенсивность белого)
        # Записываем информацию в датафрейм
        index += 1

In [4]:
letters_data.letter = letters_data.letter.map(int)  # Переводим номер буквы в int

In [5]:
letters_y = letters_data.pop('letter')  # Выделяем целевую переменную

In [6]:
X_train, X_test, y_train, y_test = train_test_split(letters_data, letters_y, test_size=0.05, random_state=1)
# Делим выборку на тренировочную и тестовую

In [None]:
scaler = StandardScaler()
std_X_train = scaler.fit_transform(X_train)
std_X_test = scaler.transform(X_test)

dump(scaler, 'scaler.joblib')
# Стандартизируем выборки (нужно не для всех методов). Без обрезки лучше не использовать.

In [120]:
svm_clf = SVC(kernel='poly', degree=2, C=1, cache_size=1000) #C=0.01, kernel='linear', degree=1, cache_size=1000, verbose=True)
# лучший SVC-классификатор

TypeError: __init__() got an unexpected keyword argument 'n_jobs'

In [108]:
'''search = GridSearchCV(svm_clf, param_grid={'C': [0.00001, 0.001, 0.01, 0.1, 1, 100, 10000],
                                          'kernel': ['linear', 'poly'],
                                          'degree': range(1,4),
                                          },
                       scoring='f1_macro',
                     cv=KFold(n_splits=5, shuffle=True),
                     n_jobs=-1, verbose=True)'''

"search = GridSearchCV(svm_clf, param_grid={'C': [0.00001, 0.001, 0.01, 0.1, 1, 100, 10000],\n                                          'kernel': ['linear', 'poly'],\n                                          'degree': range(1,4),\n                                          },\n                       scoring='f1_macro',\n                     cv=KFold(n_splits=5, shuffle=True),\n                     n_jobs=-1, verbose=True)"

In [111]:
svm_clf.fit(X_train, y_train)

SVC(C=1, break_ties=False, cache_size=1000, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=2, gamma='scale', kernel='poly',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [113]:
%%timeit
svm_clf.score(X_test, y_test)

9.5 s ± 272 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [121]:
rf_clf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
# лучший RF-классификатор

In [122]:
rf_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [123]:
%%timeit
rf_clf.score(X_test, y_test)

211 ms ± 66.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [117]:
knn_clf = KNeighborsClassifier(n_neighbors=6, weights='distance', n_jobs=-1)
# лучший kNN-классификатор

In [118]:
knn_clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=6, p=2,
                     weights='distance')

In [140]:
%%timeit
knn_clf.score(X_test, y_test)

4.97 s ± 252 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [129]:
lr_clf = LogisticRegression(n_jobs=-1, random_state=42)
# Для логистической регрессии лучше стандартизировать выборки.

In [130]:
search4 = GridSearchCV(lr_clf, param_grid={'C': [0.00001, 0.001, 0.01, 0.1, 1, 100, 10000]},
                       scoring='f1_macro',
                     cv=KFold(n_splits=5, shuffle=True),
                     n_jobs=-1, verbose=True)

In [131]:
search4.fit(X_train, y_train)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:   41.6s finished


GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
             error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=-1, penalty='l2',
                                          random_state=42, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [1e-05, 0.001, 0.01, 0.1, 1, 100, 10000]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1_macro', verbose=True)

In [132]:
search4.score(X_test, y_test)

0.9970282624557019

In [133]:
search4.best_params_

{'C': 100}

In [134]:
sg_clf = SGDClassifier(penalty='elasticnet', alpha=0.00001, random_state=42)

In [135]:
sg_clf.fit(std_X_train, y_train)

11.3 s ± 167 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [142]:
sg_clf.score(std_X_test, y_test)

0.9966329966329966

In [4]:
dump(best_clf, 'classifier.joblib')

NameError: name 'best_clf' is not defined

In [3]:
test_array = np.zeros((15,15,28,28,3))

In [4]:
test_array.shape

(15, 15, 28, 28, 3)

In [6]:
test_array.reshape(225, 784, 3).shape

(225, 784, 3)