## Импорт необходимых модулей и определение констант

In [None]:
import os
import shutil

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
from skimage.io import imread

from keras.preprocessing.image import ImageDataGenerator, array_to_img
from keras.models import Sequential
from keras.layers import Dense, Flatten, BatchNormalization, Activation
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.constraints import maxnorm

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedKFold
from scikeras.wrappers import KerasClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import decomposition
from sklearn.cluster import KMeans


In [None]:
# Silence scikit-learn ConvergenceWarning messages for the rest of the notebook.
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)


In [None]:
# Seed shared by the generators, splits and models below.
SEED = 21
# Seeds NumPy's global random generator only.
np.random.seed(SEED)
# Dataset root; the loading code below treats its entries as class folders.
IMAGE_PATH = "./data/Raw/"
plt.style.use('fivethirtyeight')


## Загрузка набора данных и предварительный анализ

### Функции отрисовки

Отрисовка нескольких изображений по пути файла

In [None]:
def draw_pictures(img_df):
    """Show the images listed in ``img_df`` on a grid of at most five columns.

    Args:
        img_df: DataFrame with a ``path`` column (image file to read) and an
            ``img_label`` column (text used as the subplot title).

    Rows are taken in positional order, so the frame may carry any index
    (for example the group keys left behind by ``groupby``). An empty frame
    draws nothing.
    """
    n = len(img_df)
    if n == 0:
        # Nothing to draw; also avoids a zero column count below.
        return
    cols = min(5, n)
    rows = -(-n // cols)  # ceiling division
    # A bare figure: the grid cells below are the only axes on it.
    plt.figure(figsize=(10, 10), constrained_layout=True)
    paths = img_df['path'].tolist()
    titles = img_df['img_label'].tolist()
    for i, (path, title) in enumerate(zip(paths, titles)):
        ax = plt.subplot2grid((rows, cols), (i // cols, i % cols))
        ax.imshow(imread(path))
        # Keep style-provided grid lines from being drawn over the picture.
        ax.grid(False)
        ax.set_title(title)
    plt.show()


Отрисовка одного изображения с его цветовой гистограммой

In [None]:
def draw_image_with_hist(image):
    """Plot one image next to the histogram of its values.

    The left panel shows ``image`` converted with ``array_to_img``. The right
    panel overlays a 256-bin histogram of all values with one histogram per
    channel (indices 0, 1 and 2 of the last axis).

    Args:
        image: array indexed as ``image[:, :, channel]``.
    """
    fig, (picture_ax, hist_ax) = plt.subplots(
        1, 2, figsize=(10, 5), constrained_layout=True)
    picture_ax.imshow(array_to_img(image))
    # Histogram over every value first, then the three channels on top of it.
    hist_ax.hist(image.ravel(), bins=256, color='orange')
    for channel, channel_color in enumerate(('red', 'Green', 'Blue')):
        hist_ax.hist(image[:, :, channel].ravel(), bins=256,
                     color=channel_color, alpha=0.5)
    hist_ax.set_xlabel('Интенсивность')
    hist_ax.set_ylabel('Количество')
    hist_ax.legend(['Общая', 'Красный канал', 'Зелёный канал', 'Синий канал'])
    plt.show()


Отрисовка нескольких изображений с их цветовыми гистограммами

In [None]:
def draw_pictures_with_hists(img_df):
    """Draw every image of ``img_df`` with its colour histogram above it.

    Images are laid out on a grid of at most five columns; each image takes
    two stacked cells: the histogram (all values plus one overlay per
    channel) and, below it, the picture itself.

    Args:
        img_df: DataFrame with a ``path`` column holding image file paths.
            Rows are taken in positional order, whatever the index is.
            An empty frame draws nothing.
    """
    n = len(img_df)
    if n == 0:
        # Nothing to draw; also avoids a zero column count below.
        return
    cols = min(5, n)
    rows = -(-n // cols)  # ceiling division
    # squeeze=False keeps ``axes`` two-dimensional even with a single column.
    fig, axes = plt.subplots(
        rows*2, cols, figsize=(cols*5, rows*10), constrained_layout=True,
        squeeze=False)
    fig.suptitle('Цветовая гистограмма', fontsize=21)
    for i, path in enumerate(img_df['path'].tolist()):
        image = imread(path)
        hist_row = 2 * (i // cols)
        col = i % cols
        hist_ax = axes[hist_row, col]
        hist_ax.hist(image.ravel(), bins=256, color='orange')
        for channel, channel_color in enumerate(('red', 'green', 'blue')):
            hist_ax.hist(image[:, :, channel].ravel(), bins=256,
                         color=channel_color, alpha=0.5)
        hist_ax.legend(
            ['Общая', 'Красный канал', 'Зелёный канал', 'Синий канал'])
        hist_ax.set_xlabel('Интенсивность')
        hist_ax.set_ylabel('Количество')
        axes[hist_row + 1, col].imshow(image)
    # Hide the cells of an incomplete last row instead of showing empty axes.
    for i in range(n, rows * cols):
        axes[2 * (i // cols), i % cols].axis('off')
        axes[2 * (i // cols) + 1, i % cols].axis('off')
    plt.show()


### Загрузка данных для датасета

Функция выделения признаков из датасета

In [None]:
def img_params(img_path):
    """Read one image and return its class and pixel statistics.

    The class is the first folder of ``img_path`` below ``IMAGE_PATH``. Each
    pixel is packed into a single number ``R*65536 + G*256 + B`` before the
    statistics are taken.

    Args:
        img_path: path of an image file located under ``IMAGE_PATH``.

    Returns:
        tuple: (class number from ``classes_dict``, class folder name, path,
        height, width, channel count, max, min, mean, standard deviation and
        median of the packed pixel values).

    Raises:
        KeyError: if the lower-cased folder name is not in ``classes_dict``.
    """
    img = imread(img_path)
    # Split on the platform's own separator so the lookup works on any OS.
    relative_path = os.path.normpath(os.path.relpath(img_path, IMAGE_PATH))
    img_class = relative_path.split(os.sep)[0]
    # NOTE(review): reshape(-1, 3) assumes exactly three channels per pixel.
    img_color = np.sum(img.reshape(-1, 3)*np.asarray([65536, 256, 1]), axis=1)
    return (classes_dict[img_class.lower()], img_class, img_path,
            img.shape[0], img.shape[1], img.shape[2],
            img_color.max(), img_color.min(), img_color.mean(),
            img_color.std(), np.median(img_color))


In [None]:
image_files = []
classes_dict = {}
# Sorted class folders give a reproducible numbering that follows the
# alphanumeric class order Keras' flow_from_directory assigns; entries that
# are not folders are skipped so listing their contents cannot fail.
class_dirs = sorted(entry for entry in os.listdir(IMAGE_PATH)
                    if os.path.isdir(os.path.join(IMAGE_PATH, entry)))
for k, class_dir in enumerate(class_dirs):
    # Keys are lower-cased folder names, values are the class numbers.
    classes_dict[class_dir.lower()] = k
    image_files.extend(os.path.join(IMAGE_PATH, class_dir, file) for file in os.listdir(
        os.path.join(IMAGE_PATH, class_dir)) if file.endswith(('.JPG', '.jpg', '.png')))


Пронумерованные классы датасета

In [None]:
# List every class name (bold, via ANSI escape codes) with its number.
for class_title, class_number in classes_dict.items():
    print(f"Класс: \033[1m{class_title}\033[0m - номер {class_number}")


Создание датасета признаков

In [None]:
# One record per image file, in the field order returned by img_params.
feature_columns = ['img_class', 'img_label', 'path', 'height', 'width', 'dimension',
                   'max_rgb', 'min_rgb', 'mean_rgb', 'std_rgb', 'median_rgb']
# The records go through a single NumPy array first, so the numeric fields
# are cast to explicit dtypes afterwards.
numeric_dtypes = {'height': 'int32', 'width': 'int32', 'dimension': 'int32',
                  'max_rgb': 'float32', 'min_rgb': 'float32', 'mean_rgb': 'float32',
                  'std_rgb': 'float32', 'median_rgb': 'float32'}
images_data = np.array(
    [img_params(image_file) for image_file in np.asarray(image_files)])
df = pd.DataFrame(data=images_data, columns=feature_columns)
df = df.astype(numeric_dtypes)


Состав полей датасета:
- **img_class** - класс, к которому относится изображение
- **img_label** - название класса, к которому относится изображение
- **path** - путь к файлу изображения
- **height** - высота изображения
- **width** - ширина изображения
- **dimension** - количество цветовых каналов изображения
- **max_rgb** - максимальное значение пикселя в изображении (здесь и далее значение пикселя — это цвет, упакованный в одно число: R·65536 + G·256 + B)
- **min_rgb** - минимальное значение пикселя в изображении
- **mean_rgb** - среднее значение пикселя в изображении
- **std_rgb**  - стандартное отклонение значения пикселя в изображении
- **median_rgb** - медианное значение пикселя в изображении

In [None]:
# Row count, then the per-column non-null counts and dtypes from df.info().
print(f'Количество фотографий в датасете: {df.shape[0]}')
print('Количество отсутствующих значений в датасете и типы данных по столбцам:\n')
print(df.info())


### Несколько точек данных

In [None]:
# Show the first five rows of the feature table.
print('Первые 5 строчек датасета:')
df.head()


In [None]:
# Draw the images of the first five rows.
print('Первые 5 изображений датасета:')
draw_pictures(df[:5])


In [None]:
print('Изображения экземпляров разных классов:')
# One row per class (column-wise minimum within each class). reset_index
# restores a plain 0..k-1 index so the rows can be addressed by integer.
draw_pictures(df.groupby('img_class').min().reset_index())


## Описательный анализ данных

### Описание численных типов данных

In [None]:
# Summary statistics of the numeric columns.
df.describe()


#### Визуализация численных данных

In [None]:
# Box plots of image height (left) and width (right) over the whole dataset.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
fig.suptitle(
    'Визуализация шкалы измерения и аномальных значений высоты и ширины изображений')
sns.boxplot(ax=axes[0], y="height", data=df, color='green')
sns.boxplot(ax=axes[1], y="width", data=df)
plt.show()


In [None]:
# Same box plots, split by class label along the x axis.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
fig.suptitle(
    'Визуализация шкалы измерения и аномальных значений высоты и ширины изображений в соответствии с классом')
sns.boxplot(ax=axes[0], x="img_label", y="height", data=df)
sns.boxplot(ax=axes[1], x="img_label", y="width", data=df)
plt.show()


In [None]:
# Violin plots of the height (left) and width (right) distributions.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
fig.suptitle('Визуализация распределения высоты и ширины изображения ')
sns.violinplot(ax=axes[0], y=df['height'], color='purple')
sns.violinplot(ax=axes[1], y=df['width'])
plt.show()


In [None]:
# Per-image max / median / mean / min packed RGB value, with the min..max
# band shaded behind the lines.
f, ax = plt.subplots(figsize=(25, 7))
f.suptitle(
    'Графики максимального, минимального, медианного и среднего RGB значения изображения')
rgb_series = (('Максимальное', 'max_rgb'),
              ('Медиана', 'median_rgb'),
              ('Среднее', 'mean_rgb'),
              ('Минимальное', 'min_rgb'))
for series_label, series_column in rgb_series:
    sns.lineplot(label=series_label, data=df[series_column], lw=2)
plt.fill_between(df.index, df['min_rgb'], df['max_rgb'],
                 edgecolor='none', facecolor='blue', alpha=0.1)
plt.legend(fontsize=12, loc='upper left')
plt.xlabel('Номер изображения')
plt.ylabel('RGB значение пикселя')
plt.show()


In [None]:
# 2x2 grid of per-class histograms (with KDE) of the packed RGB statistics,
# filled row by row: max, min, mean, median.
fig, axes = plt.subplots(2, 2, figsize=(20, 10))
fig.suptitle('Визуализация распределения цвета фотографий')
hist_columns = ("max_rgb", "min_rgb", "mean_rgb", "median_rgb")
for hist_ax, hist_column in zip(axes.ravel(), hist_columns):
    sns.histplot(ax=hist_ax, data=df, x=hist_column, hue="img_label",
                 kde=True, bins=10, line_kws=dict(linewidth=2))
plt.show()


In [None]:
def number_to_color(number):
    """Convert a packed RGB number into a ``#rrggbb`` colour string.

    The value is truncated to an integer, written in lowercase hexadecimal
    and left-padded with zeros to six characters.
    """
    hex_digits = format(int(number), 'x')
    return '#' + hex_digits.rjust(6, '0')


In [None]:
# Paint one colour swatch per (statistic column, summary row) pair: the
# describe() value is turned into a colour string with number_to_color.
df_decribe = df.describe()
# Skip the first three describe() columns and the standard deviation column.
cols = df_decribe.columns[3:].drop('std_rgb')
params = ['max', '50%', 'min']
n = 2  # side of each swatch square; also scales the figure size
fig, axes = plt.subplots(len(cols), len(
    params), figsize=(n*4, n*4), constrained_layout=True)
fig.suptitle('Основные цвета фотографий')
for i, label in enumerate(cols):
    for j, param in enumerate(params):
        # Filled n x n square in the computed colour.
        axes[i, j].fill([0, 0, n, n], [0, n, n, 0],
                        number_to_color(df_decribe[label][param]))
        axes[i, j].axis('off')
        axes[i, j].set_title(f"{label} {param}", fontsize=12)
plt.show()


In [None]:
# Mean packed RGB value per image with a +/- one standard deviation band.
f, ax = plt.subplots(figsize=(15, 5))
f.suptitle('График среднего RGB значения изображения со стандартным отклонением')
sns.lineplot(label='Среднее', data=df['mean_rgb'], color='blue', lw=2)
plt.fill_between(df.index, df['mean_rgb'] - df['std_rgb'], df['mean_rgb'] + df['std_rgb'], edgecolor='none',
                 facecolor='blue', alpha=0.2)
plt.legend(fontsize=10, loc='upper left')
plt.xlabel('Номер изображения')
plt.ylabel('RGB значение пикселя')
plt.show()


In [None]:
# Changes the default figure size for this and all later figures.
plt.rcParams['figure.figsize'] = (8, 7)
plt.title("Корреляционная матрица признаков датасета")
# Correlation is only defined for numeric columns; selecting them explicitly
# keeps the call working on pandas versions that no longer drop text columns
# silently.
sns.heatmap(df.drop('dimension', axis=1).select_dtypes(include='number').corr())
plt.show()


In [None]:
# Scatter every column from the fourth one onwards against the class label,
# one panel per column on a 2x4 grid.
plt.subplots(figsize=(16, 10), constrained_layout=True)
for i, col in enumerate(df.columns[3:]):
    plt.subplot(2, 4, i+1)
    plt.scatter(df[col], df['img_label'])
    plt.title(col)


### Описание категориальных типов данных

In [None]:
# Summary of the object-typed (text) columns.
df.describe(include=['O'])


Распределение данных внутри датасета

In [None]:
# Number and share of images per class; img_class is stored as text, hence
# the comparison with str(class_number).
total_images = len(df)
for class_title, class_number in classes_dict.items():
    class_count = len(df[df['img_class'] == str(class_number)])
    print(
        f"Класс номер {class_number} {class_title}: {class_count} ({class_count/total_images*100:.2f}%)")
print(f"Всего: {total_images}")


In [None]:
# Pie chart of the number of images per class label.
plt.figure(figsize=(5, 5))
df.groupby('img_label').count()['img_class'].plot(kind='pie', autopct='%1.2f%%',
                                                  startangle=270, fontsize=12, title="Распределение данных внутри датасета")
plt.show()


### Цветовые гистограммы по трём каналам

In [None]:
# Colour histograms and pictures for the first ten rows.
draw_pictures_with_hists(df[:10])


## Предобработка данных

### Добавление экземпляров

Функция изменения размера датасета до определенного количества экземпляров каждого класса 

In [None]:
def balance_data_size(train_generator, max_items=100, suf_dir=''):
    """Copy the generator's images into a work folder with ``max_items`` per class.

    For every class of ``train_generator``: when the class has fewer than
    ``max_items`` images, all of them are copied and the shortfall is filled
    with augmented images saved next to them; otherwise the first
    ``max_items`` images are copied. An existing work folder is deleted
    first.

    Args:
        train_generator: directory iterator whose ``directory``, ``labels``,
            ``class_indices``, ``filenames`` and ``batch_size`` are read.
        max_items: number of images each class should end up with.
        suf_dir: suffix of the work folder name (``Work_<suf_dir>``, created
            next to the generator's directory). The value ``'test'`` makes
            the augmentation read the ``validation`` subset instead of
            ``training``.

    Returns:
        tuple: a new iterator over the work folder (256x256 RGB, rescaled by
        1/255, not shuffled, same batch size as the input) and the work
        folder path. The input generator's target size is not used.
    """
    working_dir = os.path.join(os.path.relpath(os.path.join(
        train_generator.directory, "../")), f'Work_{suf_dir}')
    if os.path.isdir(working_dir):
        shutil.rmtree(working_dir)
    os.mkdir(working_dir)
    labels, counts = np.unique(train_generator.labels, return_counts=True)
    count_by_class = dict(zip(labels.tolist(), counts.tolist()))
    batch_size = train_generator.batch_size
    rotation = 45
    datagen = ImageDataGenerator(rotation_range=rotation,
                                 width_shift_range=0.2,
                                 height_shift_range=0.2,
                                 shear_range=0.1,
                                 zoom_range=0.2,
                                 horizontal_flip=True,
                                 vertical_flip=True,
                                 fill_mode="constant",
                                 validation_split=0.2
                                 )
    subset = 'validation' if suf_dir == 'test' else 'training'
    for label, class_index in train_generator.class_indices.items():
        # A class without any image in the generator counts as zero.
        items_count = count_by_class.get(class_index, 0)
        label_dir = os.path.join(working_dir, label)
        # Needed both as save_to_dir and as the copy target.
        os.makedirs(label_dir, exist_ok=True)
        start_items = max_items
        if items_count < max_items:
            datagen_new = datagen.flow_from_directory(train_generator.directory,
                                                      subset=subset,
                                                      target_size=(256, 256),
                                                      batch_size=1,
                                                      seed=SEED,
                                                      class_mode='categorical',
                                                      save_to_dir=label_dir,
                                                      classes=[label],
                                                      save_prefix=f"new_{label}",
                                                      save_format="jpeg")

            # Each step writes one augmented image into save_to_dir.
            for _ in range(max_items - items_count):
                next(datagen_new)
            start_items = items_count
            print(
                f"Добавлено {max_items - items_count} элеметов в класс {label}\n")
        else:
            print(f"Выбрано {max_items} элеметов из класса {label}\n")
        # The first path component is the class folder; accept both
        # separators so the match works on every platform.
        files = [name for name in train_generator.filenames
                 if name.replace('\\', '/').split('/')[0] == label]

        for name in files[:start_items]:
            shutil.copy(os.path.join(train_generator.directory, name), label_dir)

    train_generator = ImageDataGenerator(rescale=1. / 255.).flow_from_directory(
        working_dir,
        target_size=(256, 256),
        batch_size=batch_size,
        shuffle=False,
        class_mode='categorical',
        color_mode='rgb')

    return train_generator, working_dir


Процедура добавления изменённых экземпляров

In [None]:
def add_data_transformed(image_path, classes_dict, max_add_items=None):
    """Write augmented copies of every class's images into its own folder.

    Args:
        image_path: dataset root that contains one folder per class.
        classes_dict: mapping whose keys are the class folder names.
        max_add_items: number of augmented images to add per class. With
            ``None`` the number is computed for each class separately as
            (images in the class) * round(360 / rotation range).

    The new files are saved with the prefix ``added_<label>`` into the same
    folders the source images are read from, so running the function again
    also augments the previously added files.
    """
    rotation = 30
    datagen = ImageDataGenerator(rotation_range=rotation,
                                 width_shift_range=0.2,
                                 height_shift_range=0.2,
                                 shear_range=0.1,
                                 zoom_range=0.2,
                                 horizontal_flip=True,
                                 vertical_flip=True,
                                 fill_mode="nearest"
                                 )
    # NOTE(review): labels must match the folder names exactly, including
    # letter case on case-sensitive file systems — confirm against the caller.
    for label in classes_dict:
        datagen_new = datagen.flow_from_directory(image_path,
                                                  batch_size=1,
                                                  seed=SEED,
                                                  class_mode='categorical',
                                                  save_to_dir=os.path.join(
                                                      image_path, label),
                                                  classes=[label],
                                                  save_prefix=f"added_{label}",
                                                  save_format="jpeg")

        # Keep the per-class count local so one class's size never leaks
        # into the next class.
        items_to_add = max_add_items
        if items_to_add is None:
            # With batch_size=1 the iterator length equals the image count.
            items_to_add = len(datagen_new)*round(360/rotation)
        for _ in range(items_to_add):
            next(datagen_new)


Расширение датасета 

In [None]:
# Disabled on purpose: the call writes new image files into the dataset folders.
#add_data_transformed(IMAGE_PATH, classes_dict, 250)


### Обработка и разделение данных 

In [None]:
# Base generators over the raw dataset folder: per-sample normalisation and
# an 80/20 training/validation split, read in a fixed (unshuffled) order.
img_size = (128, 128)
batch_size = 128
datagen = ImageDataGenerator(
    rescale=1. / 255.,
    samplewise_center=True,
    samplewise_std_normalization=True,
    validation_split=0.2)
train_generator = datagen.flow_from_directory(IMAGE_PATH,
                                              subset='training',
                                              target_size=img_size,
                                              batch_size=batch_size,
                                              shuffle=False,
                                              class_mode='categorical',
                                              color_mode='rgb')
# The 'validation' part of the split serves as the test set.
test_generator = datagen.flow_from_directory(IMAGE_PATH,
                                             subset='validation',
                                             target_size=img_size,
                                             batch_size=batch_size,
                                             shuffle=False,
                                             class_mode='categorical',
                                             color_mode='rgb')
classes_number = train_generator.num_classes


Балансировка количества экземпляров в каждом классе обучающей выборки

In [None]:
# Training set with 800 images per class, stored in the Work_train folder.
train_iterator, working_dir = balance_data_size(train_generator, 800, 'train')


In [None]:
# Average number of training images per class plus 50, rounded to hundreds.
samples_per_classes = int(
    round(train_generator.samples/train_generator.num_classes+50, -2))
# The printed (and later used) target is ten times that value.
print(f"Количество экземляров на каждый класс: {samples_per_classes*10}")


In [None]:
# Larger balanced training set (Work_train2 folder), used by the CNN section.
train_iterator2, working_dir = balance_data_size(
    train_generator, samples_per_classes*10, 'train2')


In [None]:
# Test set with 200 images per class, stored in the Work_test folder.
test_iterator, working_dir = balance_data_size(test_generator, 200, 'test')


Преобразование генераторов в плоские массивы

In [None]:
y_test = test_iterator.classes
y_train = train_iterator.classes

# Read the batches by index rather than by advancing the iterator: the
# result then does not depend on how far the iterator was consumed before,
# and the rows stay aligned with ``classes`` (both iterators are unshuffled).
# Every image is flattened into one feature row.
x_test = np.concatenate([test_iterator[i][0] for i in range(
    len(test_iterator))]).reshape(y_test.shape[0], -1)
x_train = np.concatenate([train_iterator[i][0] for i in range(
    len(train_iterator))]).reshape(y_train.shape[0], -1)


Разделение на обучающую и валидационную выборки

In [None]:
# Class names are taken from the iterator's own class indices instead of
# being parsed out of the file paths, so no path separator is assumed.
index_to_label = {index: label
                  for label, index in train_iterator2.class_indices.items()}
train_df = pd.DataFrame(data={'path': train_iterator2.filepaths, 'label': [
    index_to_label[class_index] for class_index in train_iterator2.classes]})
# Stratified 80/20 split into training and validation parts.
train_df, valid_df = train_test_split(
    train_df, train_size=0.8, shuffle=True, random_state=SEED, stratify=train_df['label'])
train_iterator_df = ImageDataGenerator(rescale=1. / 255.).flow_from_dataframe(train_df,
                                                                              x_col='path',
                                                                              y_col='label',
                                                                              target_size=(
                                                                                  256, 256),
                                                                              class_mode='categorical',
                                                                              batch_size=128,
                                                                              shuffle=False,
                                                                              color_mode='rgb')
valididation_iterator_df = ImageDataGenerator(rescale=1. / 255.).flow_from_dataframe(valid_df,
                                                                                     x_col='path',
                                                                                     y_col='label',
                                                                                     target_size=(
                                                                                         256, 256),
                                                                                     class_mode='categorical',
                                                                                     batch_size=128,
                                                                                     shuffle=False,
                                                                                     color_mode='rgb')


Пример изображения и его гистограммы после обработки

In [None]:
# First image of the first training batch, with its histogram.
draw_image_with_hist(train_iterator[0][0][0])


### Понижение размерности

In [None]:
# Project the flattened training images onto two principal components and
# colour the points by class.
pca = decomposition.PCA(n_components=2)
x_reduced = pca.fit_transform(x_train)

print('Projecting %d-dimensional data to 2D' % x_train.shape[1])

plt.figure(figsize=(8, 6))
# plt.get_cmap replaces plt.cm.get_cmap, which newer matplotlib no longer has.
plt.scatter(x_reduced[:, 0], x_reduced[:, 1], c=y_train,
            edgecolor='none', alpha=0.7, s=40,
            cmap=plt.get_cmap('nipy_spectral', 10))
plt.colorbar()
plt.title('PCA projection')
plt.show()


In [None]:
# PCA without a component limit, used for the explained-variance curve below.
pca = decomposition.PCA().fit(x_train)


In [None]:
# Cumulative explained variance against the number of components, with
# guide lines at 300 components and at 0.95.
plt.figure(figsize=(8, 6))
plt.plot(np.cumsum(pca.explained_variance_ratio_), color='k', lw=2)
plt.xlabel('Количество признаков')
plt.ylabel('Итоговая дисперсия')
plt.xlim(0, 800)
plt.yticks(np.arange(0, 1.1, 0.1))
plt.axvline(300, c='b')
plt.axhline(0.95, c='r')
plt.show()


In [None]:
# Fit on the training data only (a fractional n_components of 0.95), then
# apply the same projection to both the training and the test features.
pca = decomposition.PCA(0.95).fit(x_train)
x_train_reduced = pca.transform(x_train)
x_test_reduced = pca.transform(x_test)


## Моделирование

Разбиение данных для кросс-валидации

In [None]:
# Five folds, a single repetition, fixed seed; shared by every grid search.
kfold = RepeatedKFold(n_splits=5, n_repeats=1,  random_state=SEED)


### Функции для моделирования

Отрисовка матрицы неточностей

In [None]:
def draw_heat(y_test, y_pred):
    """Draw the confusion matrix of ``y_pred`` against ``y_test`` as a heatmap.

    Rows are the true labels and columns the predicted ones; both axes are
    ticked with the distinct labels that occur in either argument.

    Args:
        y_test: true labels.
        y_pred: predicted labels.
    """
    # One tick per distinct label, not one per sample.
    class_names = np.unique(
        np.concatenate([np.ravel(y_test), np.ravel(y_pred)]))
    matrix = pd.DataFrame(
        confusion_matrix(y_test, y_pred, labels=class_names),
        index=class_names, columns=class_names)
    fig, ax = plt.subplots(figsize=(10, 8))
    # fmt='g' prints the counts as plain numbers; the frame's index and
    # columns supply the tick labels.
    sns.heatmap(matrix, annot=True, fmt='g', ax=ax)
    ax.set_title('Матрица неточностей')
    ax.set_ylabel('Действительные значения')
    ax.set_xlabel('Предсказанные значения')


In [None]:
def print_report(y_test, y_pred, classes_dict):
    """Show the confusion matrix and print the classification report.

    Args:
        y_test: true class numbers.
        y_pred: predicted class numbers.
        classes_dict: mapping of class name to class number; it defines
            which classes appear in the matrix and the report and how they
            are named.
    """
    # Order the names by class number so tick i always names class i,
    # whatever the insertion order of the mapping.
    classes = sorted(classes_dict, key=classes_dict.get)
    labels = [classes_dict[name] for name in classes]
    length = len(classes)
    # Never let the figure get smaller than the default 8x8.
    fig_side = max(8, int(length * .5))
    plt.style.use('fivethirtyeight')
    plt.figure(figsize=(fig_side, fig_side))
    # Explicit labels keep a row and column for every class even when one is
    # absent from both y_test and y_pred.
    sns.heatmap(confusion_matrix(y_test, y_pred, labels=labels), annot=True,
                vmin=0, fmt='g', cmap='RdPu', cbar=True)
    plt.xticks(np.arange(length)+.5, classes, rotation=90)
    plt.yticks(np.arange(length)+.5, classes, rotation=0)
    plt.xlabel("Предсказанные значения")
    plt.ylabel("Действительные значения")
    plt.title("Матрица неточностей")
    plt.show()
    class_rep = classification_report(
        y_test, y_pred, labels=labels, target_names=classes)
    print("Classification Report:\n----------------------\n", class_rep)


Отрисовка графика с ошибкой

In [None]:
def plot_with_err(x, data, **kwargs):
    """Plot the row-wise mean of ``data`` with a one-standard-deviation band.

    Args:
        x: x coordinates, one per row of ``data``.
        data: 2-D array; mean and standard deviation are taken along axis 1.
        **kwargs: forwarded to ``plt.plot`` for the mean line.
    """
    center = data.mean(axis=1)
    spread = data.std(axis=1)
    mean_line = plt.plot(x, center, '-', **kwargs)[0]
    # The band reuses the colour matplotlib picked for the mean line.
    plt.fill_between(x, center - spread, center + spread, edgecolor='none',
                     facecolor=mean_line.get_color(), alpha=0.2)


Отрисовка обучающих кривых

In [None]:
def pooled_variance(stds, n=5):
    """Pool per-group standard deviations into one standard deviation.

    Computes ``sqrt(sum((n - 1) * s**2) / (k * (n - 1)))`` for ``k`` groups
    of ``n`` observations each. Despite the name, the result is a standard
    deviation (the square root of the pooled variance).

    Args:
        stds: standard deviations, one per group (any array-like).
        n: number of observations behind each deviation; must be above 1.

    Returns:
        The pooled standard deviation as a float.
    """
    variances = np.asarray(stds, dtype=float) ** 2
    # Parenthesise the denominator: dividing by k and then multiplying by
    # (n - 1) would scale the result up instead of normalising it.
    return np.sqrt(np.sum((n - 1) * variances) / (variances.size * (n - 1)))


def print_learning_curve(grid_result, parameters):
    """Plot mean train and cross-validation scores against each parameter.

    One panel is drawn per searched parameter. The grid-search results are
    grouped by the parameter's value; means are averaged and standard
    deviations pooled with ``pooled_variance``. Parameters whose first value
    is a string are drawn as grouped bars with error bars, all others as
    lines with a one-deviation band.

    Args:
        grid_result: fitted search object exposing ``cv_results_`` with
            train scores included.
        parameters: mapping of parameter name to the searched values, as
            passed to the search. An empty mapping draws nothing.
    """
    if not parameters:
        return
    df = pd.DataFrame(grid_result.cv_results_)
    results = ['mean_test_score',
               'mean_train_score',
               'std_test_score',
               'std_train_score']
    # squeeze=False keeps an array of axes even for a single parameter.
    fig, axes = plt.subplots(1, len(parameters), figsize=(
        5*len(parameters), 7), squeeze=False)
    axes = axes[0]
    axes[0].set_ylabel("Score")
    # (score kind, legend label, colour, bar alpha, error-bar cap, bar side)
    curves = (
        ('train', "Training score", "red", None, 6, -1),
        ('test', "Cross-validation score", "green", 0.6, 10, 1),
    )
    width = 0.35

    for i, (param_name, param_range) in enumerate(parameters.items()):
        grouped_df = df.groupby(f'param_{param_name}')[results]\
            .agg({'mean_train_score': 'mean',
                  'mean_test_score': 'mean',
                  'std_train_score': pooled_variance,
                  'std_test_score': pooled_variance})
        # The x values come from the grouped index, so each score is drawn
        # at the parameter value it was actually computed for.
        values = grouped_df.index.tolist()
        positions = np.arange(len(values))
        is_categorical = isinstance(param_range[0], str)
        axes[i].set_xlabel(param_name)
        for kind, label, color, bar_alpha, capsize, side in curves:
            mean = grouped_df[f'mean_{kind}_score'].to_numpy(dtype=float)
            std = grouped_df[f'std_{kind}_score'].to_numpy(dtype=float)
            if is_categorical:
                # yerr takes the size of the error, not absolute bounds.
                axes[i].bar(positions + side*width/2,
                            mean,
                            width,
                            yerr=std,
                            color=color,
                            alpha=bar_alpha,
                            error_kw={'elinewidth': 1, 'capsize': capsize},
                            label=label)
            else:
                axes[i].plot(values, mean, label=label, color=color, lw=2)
                axes[i].fill_between(values, mean - std, mean + std,
                                     alpha=0.2, color=color, lw=2)
        if is_categorical:
            axes[i].set_xticks(positions)
            axes[i].set_xticklabels(values)

    handles, labels = axes[0].get_legend_handles_labels()
    fig.suptitle('Learning curves', fontsize=25)
    fig.legend(handles, labels, loc=8, ncol=2, fontsize=15)

    fig.subplots_adjust(bottom=0.25, top=0.85)
    plt.show()


Функция отчёта grid search

In [None]:
def print_grid_report(grid_result):
    """Print the best grid-search score and the score of every candidate.

    The first line shows ``best_score_`` and ``best_params_``; one line per
    candidate follows with its mean test score, the standard deviation in
    parentheses and the parameter set.
    """
    print(
        f"Best: {grid_result.best_score_:.4f} using { grid_result.best_params_}")
    candidates = zip(grid_result.cv_results_['mean_test_score'],
                     grid_result.cv_results_['std_test_score'],
                     grid_result.cv_results_['params'])
    for mean, stdev, param in candidates:
        print(f"{mean:.4f} ({stdev:.4f}) with: {param}")


### Обучение без учителя

In [None]:
% % time
kmeans = KMeans(n_clusters=3, random_state=2).fit(x_train_reduced)
kmeans_pred = kmeans.predict(x_test_reduced)


In [None]:
# Confusion matrix and report for the cluster assignments.
print_report(y_test, kmeans_pred, classes_dict)


### Логистическая регрессия 

In [None]:
% % time
params_logreg = {'solver': ['lbfgs', 'sag',
                            'saga'], 'C': np.logspace(-3, 3, 7)}
logreg = LogisticRegression()
grid_search_logreg = GridSearchCV(estimator=logreg,
                                  param_grid=params_logreg,
                                  scoring='f1_weighted',
                                  cv=kfold,
                                  return_train_score=True,
                                  verbose=0)

grid_result_logreg = grid_search_logreg.fit(x_train_reduced, y_train)


In [None]:
# Scores of every logistic-regression candidate.
print_grid_report(grid_search_logreg)


In [None]:
# Predict the test set with the fitted search object.
y_pred_logreg = grid_result_logreg.predict(x_test_reduced)


In [None]:
# Confusion matrix and report for logistic regression.
print_report(y_test, y_pred_logreg, classes_dict)


Обучающие кривые

In [None]:
# Train and cross-validation scores per logistic-regression parameter.
print_learning_curve(grid_result_logreg, params_logreg)


### Метод опорных векторов 

In [None]:
% % time
params_svc = {'gamma': [0.01, 0.001], 'kernel': [
    'rbf', 'poly', 'sigmoid'], 'C': [1, 10, 100, 1000]}
svc = SVC()
grid_search_svc = GridSearchCV(estimator=svc,
                               param_grid=params_svc,
                               scoring='f1_weighted',
                               cv=kfold,
                               return_train_score=True,
                               verbose=0)

grid_result_svc = grid_search_svc.fit(x_train_reduced, y_train)


In [None]:
# Scores of every SVC candidate.
print_grid_report(grid_result_svc)


In [None]:
# Predict the test set with the fitted search object.
y_pred_svc = grid_result_svc.predict(x_test_reduced)


In [None]:
# Confusion matrix and report for the SVC.
print_report(y_test, y_pred_svc, classes_dict)


Обучающие кривые

In [None]:
# Train and cross-validation scores per SVC parameter.
print_learning_curve(grid_result_svc, params_svc)


### Метод k ближайших соседей 

In [None]:
% % time
params_kneighbors = {'n_neighbors': list(range(1, round(
    x_train_reduced.shape[0]**(1/2)), 4)), 'weights': ['uniform', 'distance']}
kneighbors = KNeighborsClassifier(n_neighbors=5)
grid_search_kneighbors = GridSearchCV(estimator=kneighbors,
                                      param_grid=params_kneighbors,
                                      scoring='f1_weighted',
                                      cv=kfold,
                                      return_train_score=True,
                                      verbose=0)

grid_result_kneighbors = grid_search_kneighbors.fit(x_train_reduced, y_train)


In [None]:
# Scores of every k-nearest-neighbours candidate.
print_grid_report(grid_result_kneighbors)


In [None]:
# Predict the test set with the fitted search object.
y_pred_kneighbors = grid_result_kneighbors.predict(x_test_reduced)


In [None]:
# Confusion matrix and report for k-nearest neighbours.
print_report(y_test, y_pred_kneighbors, classes_dict)


In [None]:
# Train and cross-validation scores per k-nearest-neighbours parameter.
print_learning_curve(grid_result_kneighbors, params_kneighbors)


### Свёрточная нейросеть

#### Grid search

In [None]:
def create_model(neurons):
    """Build and compile the small convolutional classifier.

    Three 3x3 convolutions (2, 4 and ``neurons`` filters) with batch
    normalisation, max pooling after the second and third convolution, a
    16-unit dense layer and a softmax output with ``classes_number`` units.

    Args:
        neurons: number of filters of the third convolution.

    Returns:
        The model compiled with the Adam optimizer, categorical
        cross-entropy loss and the accuracy metric.
    """
    # Input shape is taken from one batch of the balanced training iterator.
    image_shape = train_iterator2[0][0].shape[1:]
    layers = [
        Conv2D(2, (3, 3), input_shape=image_shape,
               padding='same', activation='relu'),
        BatchNormalization(),
        Conv2D(4, (3, 3), padding='same', activation='relu'),
        MaxPooling2D(pool_size=(2, 2)),
        BatchNormalization(),
        Conv2D(neurons, (3, 3), padding='same', activation='relu'),
        MaxPooling2D(pool_size=(2, 2)),
        BatchNormalization(),
        Flatten(),
        Dense(16, kernel_constraint=maxnorm(3),
              activation='relu', kernel_regularizer='l2'),
        BatchNormalization(),
        Dense(classes_number),
        Activation('softmax'),
    ]
    model = Sequential(layers)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam', metrics=['accuracy'])
    return model


In [None]:
model = KerasClassifier(model=create_model, neurons=2, verbose=0)

# Candidate values for further searches; only ``neurons`` is searched below.
batch_size = [10, 20, 40, 60, 80, 100]
epochs = [5, 10, 20, 25]
optimizer = ['SGD', 'RMSprop', 'Adagrad',
             'Adadelta', 'Adam', 'Adamax', 'Nadam']
init_mode = ['uniform', 'lecun_uniform', 'normal', 'zero',
             'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform']
activation = ['softmax', 'softplus', 'softsign',
              'relu', 'tanh', 'sigmoid', 'hard_sigmoid', 'linear']
neurons = [2, 4, 8, 16, 32, 64, 128]
weight_constraint = [1, 2, 3, 4, 5]
dropout_rate = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

param_grid = dict(neurons=neurons)
grid = GridSearchCV(model, param_grid=param_grid, cv=kfold)
# x_train holds one flattened row per image; the convolutional network needs
# the (height, width, channels) layout back.
x_train_images = x_train.reshape((-1, *train_iterator[0][0].shape[1:]))
# NOTE(review): y_train holds integer class numbers while create_model
# compiles with categorical cross-entropy — confirm the wrapper encodes the
# targets accordingly.
grid_result = grid.fit(x_train_images, y_train, epochs=40, batch_size=40)

print(f"Best: {grid_result.best_score_} using { grid_result.best_params_}")
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"{mean} ({stdev}) with: {param}")


#### Модель

In [None]:
# Final network: the same stack as in the grid search, with 8 filters in the
# third convolution.
epochs = 16
batch_size = 5
model = Sequential([
    Conv2D(2, (3, 3), input_shape=train_iterator2[0][0].shape[1:],
           padding='same', activation='relu'),
    BatchNormalization(),
    Conv2D(4, (3, 3), padding='same', activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    BatchNormalization(),
    Conv2D(8, (3, 3), padding='same', activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    BatchNormalization(),
    Flatten(),
    Dense(16, kernel_constraint=maxnorm(3),
          activation='relu', kernel_regularizer='l2'),
    BatchNormalization(),
    Dense(classes_number),
    Activation('softmax'),
])
model.compile(loss='categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])


In [None]:
% % time
history = model.fit(train_iterator_df, validation_data=valididation_iterator_df,
                    epochs=epochs, batch_size=batch_size)


In [None]:
# Class probabilities for every test image.
prediction = model.predict(test_iterator)


In [None]:
# The predicted class is the index of the highest probability.
print_report(test_iterator.classes, prediction.argmax(axis=1), classes_dict)


In [None]:
# Training history: accuracy (left) and loss (right) per epoch, for the
# training and the validation data.
history_dict = history.history
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']
accuracy = history_dict['accuracy']
val_accuracy = history_dict['val_accuracy']

# Note: rebinds ``epochs`` from the epoch count to the 1-based epoch numbers.
epochs = range(1, len(loss_values) + 1)
fig, ax = plt.subplots(1, 2, figsize=(14, 6))

ax[0].plot(epochs, accuracy, 'g', label='Training accuracy')
ax[0].plot(epochs, val_accuracy, 'b', label='Validation accuracy')
ax[0].set_title('Training & Validation Accuracy', fontsize=16)
ax[0].set_xlabel('Epochs', fontsize=16)
ax[0].set_ylabel('Accuracy', fontsize=16)
ax[0].legend()

ax[1].plot(epochs, loss_values, 'g', label='Training loss')
ax[1].plot(epochs, val_loss_values, 'b', label='Validation loss')
ax[1].set_title('Training & Validation Loss', fontsize=16)
ax[1].set_xlabel('Epochs', fontsize=16)
ax[1].set_ylabel('Loss', fontsize=16)
ax[1].legend()
plt.show()
