In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

from skimage import img_as_float
from skimage.io import imread

from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.preprocessing import StandardScaler  # Нужен для LR и SG. Без обрезки не нужен

from sklearn.svm import SVC  # 0.9975 Без обрезки: 0.9951
from sklearn.ensemble import RandomForestClassifier  # 0.9972 (вероятность) Без обрезки: 0.9957
from sklearn.neighbors import KNeighborsClassifier  # 0.9975 (вероятность) Нет обучения. Без обрезки: 0.996
from sklearn.linear_model import LogisticRegression  # 0.9969 (вероятность) Без обрезки: 0.9943
from sklearn.linear_model import SGDClassifier  # 0.9969 Может обучаться онлайн. Без обрезки: 0.9914

In [2]:
letters_data = pd.DataFrame(columns=range(28 * 28 + 1))
letters_data = letters_data.rename(columns={784: 'letter'})

In [3]:
index = 0

for folder in range(1, 34):
    path_gen = Path(Path.cwd() / 'threshold_dataset' / str(folder)).glob('*.jpg')  # Создаем генератор путей картинок
    paths = [path for path in path_gen if path.is_file()]  # Записываем пути картинок
    for i in range(len(paths)):  
        letters_data.loc[index] = *np.around(img_as_float(imread(paths[i])).ravel(), decimals=2), folder
        # Картинка представляется 28*28 признаками (пикселями, в каждом из которых берем интенсивность белого)
        # Записываем информацию в датафрейм
        index += 1

In [4]:
letters_data.letter = letters_data.letter.map(int)  # Переводим номер буквы в int

In [5]:
letters_y = letters_data.pop('letter')  # Выделяем целевую переменную

In [6]:
X_train, X_test, y_train, y_test = train_test_split(letters_data, letters_y, test_size=0.3, random_state=42)
# Делим выборку на тренировочную и тестовую

In [7]:
scaler = StandardScaler()
std_X_train = scaler.fit_transform(X_train)
std_X_test = scaler.transform(X_test)
# Стандартизируем выборки (нужно не для всех методов). Без обрезки лучше не использовать.

In [8]:
svm_clf = SVC(C=0.01, kernel='linear', degree=1, cache_size=1000, verbose=True)
# лучший SVC-классификатор

In [9]:
svm_clf.fit(std_X_train, y_train)

[LibSVM]

SVC(C=0.01, break_ties=False, cache_size=1000, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=1, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=True)

In [10]:
svm_clf.score(std_X_test, y_test)

0.995102540557086

In [11]:
rf_clf = RandomForestClassifier(n_estimators=100,random_state=42)
# лучший RF-классификатор

In [12]:
rf_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [13]:
rf_clf.score(X_test, y_test)

0.9957147229874502

In [14]:
knn_clf = KNeighborsClassifier(n_neighbors=6, weights='distance')
# лучший kNN-классификатор

In [15]:
knn_clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=6, p=2,
                     weights='distance')

In [16]:
knn_clf.score(X_test, y_test)

0.9960208142026323

In [17]:
lr_clf = LogisticRegression(n_jobs=-1, random_state=42)
# Для логистической регрессии лучше стандартизировать выборки.

In [18]:
search4 = GridSearchCV(lr_clf, param_grid={'C': np.arange(0.001, 1.001, 0.1)},
                       scoring='f1_macro',
                     cv=KFold(n_splits=5, shuffle=True),
                     n_jobs=-1, verbose=True)

In [19]:
search4.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   59.6s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.3min finished


GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
             error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=-1, penalty='l2',
                                          random_state=42, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': array([0.001, 0.101, 0.201, 0.301, 0.401, 0.501, 0.601, 0.701, 0.801,
       0.901])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1_macro', verbose=True)

In [20]:
search4.score(X_test, y_test)

0.9943996419962042

In [28]:
sg_clf = SGDClassifier(penalty='elasticnet', random_state=42)

In [29]:
sg_clf.fit(X_train, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None,
              penalty='elasticnet', power_t=0.5, random_state=42, shuffle=True,
              tol=0.001, validation_fraction=0.1, verbose=0, warm_start=False)

In [30]:
sg_clf.score(X_test, y_test)

0.9914294459749006