# [Уменьшение количества цветов изображения](https://www.coursera.org/learn/vvedenie-mashinnoe-obuchenie/programming/yWVid/umien-shieniie-kolichiestva-tsvietov-izobrazhieniia)

## Введение

Самый распространенный тип задач машинного обучения — это задачи обучения с учителем. В них имеется обучающая выборка, для каждого объекта которой есть ответ, и нужно научиться предсказывать эти ответы для новых объектов. В такой постановке можно строго определить критерии качества.

Если же имеются лишь объекты, а ответов для них нет, то все равно можно пытаться найти в данных некую структуру. Задачи, которые ищут закономерности в неразмеченных выборках, называют задачами обучения без учителя. Типичный пример такой задачи — кластеризация, где требуется найти группы похожих объектов.

Кластеризация может использоваться для самых разных целей. В этом задании мы попробуем группировать схожие пиксели на изображении. Такой подход позволяет переходить к суперпиксельному представлению изображений, которое является более компактным и лучше подходит для решения ряда задач компьютерного зрения.

In [91]:
import pandas as pd
import numpy as np
from skimage.io import imread
import skimage
import pylab
from itertools import chain
from sklearn.cluster import KMeans
import math

%matplotlib inline

## 1. Загрузите картинку parrots.jpg. 
Преобразуйте изображение, приведя все значения в интервал от 0 до 1. Для этого можно воспользоваться функцией img_as_float из модуля skimage. Обратите внимание на этот шаг, так как при работе с исходным изображением вы получите некорректный результат.

In [92]:
# Load the picture and rescale its values to floats in [0, 1]; the task text
# warns that clustering the raw image gives an incorrect result.
image = imread('./data/parrots.jpg')
scale_image = skimage.img_as_float64(image)

# The first two axes of the array are rows (height) and columns (width).
# Reading them from .shape avoids indexing a specific row, which would fail
# on an image that has only one row.
height, width = scale_image.shape[:2]

## 2. Создайте матрицу объекты-признаки: характеризуйте каждый пиксель тремя координатами - значениями интенсивности в пространстве RGB.

In [93]:
# Object-feature matrix: one row per pixel, one column per RGB channel.
# A single reshape keeps the pixels in the same row-major order as the image
# and avoids pushing every channel value through Python-level iterators.
pixels = scale_image.reshape(-1, 3)
data = pd.DataFrame(pixels, columns=['R', 'G', 'B'])
data.head()

Unnamed: 0,R,G,B
0,0.015686,0.494118,0.019608
1,0.007843,0.494118,0.007843
2,0.007843,0.494118,0.007843
3,0.007843,0.494118,0.007843
4,0.007843,0.501961,0.011765


## 3. Запустите алгоритм K-Means с параметрами init='k-means++' и random_state=241. 
После выделения кластеров все пиксели, отнесенные в один кластер, попробуйте заполнить двумя способами: медианным и средним цветом по кластеру.

In [94]:
# Earlier variant that additionally requested all CPU cores via n_jobs=-1:
# kmeans = KMeans(init='k-means++', random_state=241, n_jobs=-1)
# Estimator configured as the task requires (k-means++ seeding, fixed seed).
# n_clusters is not passed, so the library default is used.
kmeans = KMeans(init='k-means++', random_state=241)
kmeans

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=8, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=241, tol=0.0001, verbose=0)

In [95]:
# Cluster the pixels; labels_ holds one cluster index per row of `data`.
kmeans8 = kmeans.fit(data)
# The column name 'lable' (sic) is referenced by the later cells, so it must
# stay spelled this way.
cluster = pd.Series(kmeans8.labels_, name='lable')
# Attach the cluster index to every pixel (joined on the row index).
data_c = data.join(cluster)
data_c.head()

Unnamed: 0,R,G,B,lable
0,0.015686,0.494118,0.019608,4
1,0.007843,0.494118,0.007843,4
2,0.007843,0.494118,0.007843,4
3,0.007843,0.494118,0.007843,4
4,0.007843,0.501961,0.011765,4


In [96]:
# Replace every pixel by the mean colour of its cluster.
# Grouping by the label column handles however many clusters were fitted,
# so no cluster count has to be hard-coded here, and each channel mean is
# computed once per cluster instead of re-masking the frame repeatedly.
rgb = ['R', 'G', 'B']
data_mean = data_c.copy()
data_mean[rgb] = data_c.groupby('lable')[rgb].transform('mean')

data_mean.head()

Unnamed: 0,R,G,B,lable
0,0.095647,0.635567,0.066917,4
1,0.095647,0.635567,0.066917,4
2,0.095647,0.635567,0.066917,4
3,0.095647,0.635567,0.066917,4
4,0.095647,0.635567,0.066917,4


In [97]:
# Replace every pixel by the per-channel median colour of its cluster.
# Grouping by the label column handles however many clusters were fitted,
# so no cluster count has to be hard-coded here.
rgb = ['R', 'G', 'B']
data_median = data_c.copy()
data_median[rgb] = data_c.groupby('lable')[rgb].transform('median')

data_median.head()

Unnamed: 0,R,G,B,lable
0,0.054902,0.647059,0.023529,4
1,0.054902,0.647059,0.023529,4
2,0.054902,0.647059,0.023529,4
3,0.054902,0.647059,0.023529,4
4,0.054902,0.647059,0.023529,4


In [98]:
# _ = np.median(data_c.loc[data_c['lable'] == i, ['R', 'G', 'B']].to_numpy())
# data_median.loc[data_median['lable'] == 3, ['R', 'G', 'B']] = _
# # data_median.loc[data_median['lable'] == 3, 'R'] = _
# # data_median.loc[data_median['lable'] == 3, 'G'] = _
# # data_median.loc[data_median['lable'] == 3, 'B'] = _
# data_median.loc[data_median['lable'] == 3]

## 4. Измерьте качество получившейся сегментации с помощью метрики PSNR. 
Эту метрику нужно реализовать самостоятельно ([см. определение](https://ru.wikipedia.org/wiki/%D0%9F%D0%B8%D0%BA%D0%BE%D0%B2%D0%BE%D0%B5_%D0%BE%D1%82%D0%BD%D0%BE%D1%88%D0%B5%D0%BD%D0%B8%D0%B5_%D1%81%D0%B8%D0%B3%D0%BD%D0%B0%D0%BB%D0%B0_%D0%BA_%D1%88%D1%83%D0%BC%D1%83)).

In [99]:
# MSE = lambda I, K, width, height: 1 / (3 * width * height) * sum([(i['R'] - j['R'])**2 + (i['G'] - j['G'])**2 + (i['B'] - j['B'])**2 for (_, i), (_, j)  in zip(I.iterrows(), K.iterrows())])
# mse = MSE(data_c, data_mean, width, height)
# PSNR = lambda MAX, MSE: 20 * math.log(MAX / math.sqrt(MSE))
# psnr_ = PSNR(1, mse)
# psnr_

In [100]:
# MSE = lambda I, K, width, height: 1 / (3 * width * height) * sum([(i['R'] - j['R'])**2 + (i['G'] - j['G'])**2 + (i['B'] - j['B'])**2 for (_, i), (_, j)  in zip(I.iterrows(), K.iterrows())])
# MSE = lambda I, K, width, height: 1 / (3 * width * height) * sum([(i['R'] - j['R'] + i['G'] - j['G'] + i['B'] - j['B'])**2 for (_, i), (_, j)  in zip(I.iterrows(), K.iterrows())])
# PSNR = lambda I, K, width, height, MAX: 20 * math.log(MAX / math.sqrt(MSE(I, K, width, height)))
# 42.452351239513895

In [101]:
def MSE(I, K, width, height):
    """Return the mean squared error between two RGB pixel tables.

    I, K   -- tables with 'R', 'G' and 'B' columns, one row per pixel,
              aligned row by row.
    width, height -- image size in pixels; the squared error is averaged
              over 3 * width * height channel values.
    """
    squared_error = sum((I[channel] - K[channel]) ** 2 for channel in ('R', 'G', 'B'))
    return squared_error.sum() / (3 * width * height)


def PSNR(I, K, width, height, MAX):
    """Return the peak signal-to-noise ratio, in decibels, between I and K.

    MAX is the largest value a channel can take (1 for images scaled to [0, 1]).
    Identical images have zero error; the ratio is then unbounded, so
    math.inf is returned instead of failing on a division by zero.
    """
    mse = MSE(I, K, width, height)
    if mse == 0:
        return math.inf
    return 20 * math.log10(MAX / math.sqrt(mse))

In [102]:
# PSNR of the mean-filled image against the original pixels; MAX is 1
# because the image was rescaled to the [0, 1] range when it was loaded.
psnr_mean = PSNR(data_c, data_mean, width, height, 1)
psnr_mean

18.436829014063576

In [103]:
# PSNR of the median-filled image against the original pixels; MAX is 1
# because the image was rescaled to the [0, 1] range when it was loaded.
psnr_median = PSNR(data_c, data_median, width, height, 1)
psnr_median

18.1379714433961

## 5. Найдите минимальное количество кластеров, при котором значение PSNR выше 20 (можно рассмотреть не более 20 кластеров, но не забудьте рассмотреть оба способа заполнения пикселей одного кластера). 
Это число и будет ответом в данной задаче.

In [104]:
# means = []
# medians = []
# clustes = []
# for count_cluster in range(1, 20):
#     kmeans = KMeans(n_clusters=count_cluster, init='k-means++', random_state=241, n_jobs=-1).fit(data)
#     cluster = pd.Series(kmeans.labels_, name='lable')
#     data_c = data.join(cluster)
    
#     data_mean = data_c.copy()
#     for i in range(count_cluster):
#         data_mean.loc[data_mean['lable'] == i, 'R'] = data_c[data_c['lable'] == i]['R'].mean()
#         data_mean.loc[data_mean['lable'] == i, 'G'] = data_c[data_c['lable'] == i]['G'].mean()
#         data_mean.loc[data_mean['lable'] == i, 'B'] = data_c[data_c['lable'] == i]['B'].mean()
    
#     psnr_mean = PSNR(data_c, data_mean, width, height, 1)

#     data_median = data_c.copy()
#     for i in range(count_cluster):
#         data_median.loc[data_median['lable'] == i, 'R'] = data_c[data_c['lable'] == i]['R'].median()
#         data_median.loc[data_median['lable'] == i, 'G'] = data_c[data_c['lable'] == i]['G'].median()
#         data_median.loc[data_median['lable'] == i, 'B'] = data_c[data_c['lable'] == i]['B'].median()
    
#     psnr_median = PSNR(data_c, data_median, width, height, 1)
    
#     clustes.append(count_cluster)
#     means.append(psnr_mean)
#     medians.append(psnr_median)

In [105]:
def compute_psnr(count_cluster):
    """Cluster the pixels into `count_cluster` groups and score both fills.

    Reads the notebook-level `data` (pixel table with 'R', 'G', 'B' columns),
    `width`, `height` and `PSNR`.

    Returns a tuple (count_cluster, psnr_mean, psnr_median): the PSNR of the
    image whose pixels were replaced by their cluster's mean colour, and by
    their cluster's per-channel median colour.
    """
    rgb = ['R', 'G', 'B']

    kmeans = KMeans(n_clusters=count_cluster, init='k-means++', random_state=241).fit(data)
    data_c = data.join(pd.Series(kmeans.labels_, name='lable'))

    # One grouping serves both fills; transform() broadcasts each cluster's
    # statistic back onto the rows that belong to that cluster.
    by_cluster = data_c.groupby('lable')[rgb]

    data_mean = data_c.copy()
    data_mean[rgb] = by_cluster.transform('mean')
    psnr_mean = PSNR(data_c, data_mean, width, height, 1)

    data_median = data_c.copy()
    data_median[rgb] = by_cluster.transform('median')
    psnr_median = PSNR(data_c, data_median, width, height, 1)

    return (count_cluster, psnr_mean, psnr_median)

In [106]:
# def compute_psnr(count_cluster):

#     kmeans = KMeans(n_clusters=count_cluster, init='k-means++', random_state=241).fit(data)
#     cluster = pd.Series(kmeans.labels_, name='lable')
#     data_c = data.join(cluster)
    
#     data_mean = data_c.copy()
#     for i in range(count_cluster):
#         data_mean.loc[data_mean['lable'] == i, ['R', 'G', 'B']] = np.mean(data_c.loc[data_c['lable'] == i, ['R', 'G', 'B']].to_numpy())
    
#     psnr_mean = PSNR(data_c, data_mean, width, height, 1)

#     data_median = data_c.copy()
#     for i in range(count_cluster):
#         data_median.loc[data_median['lable'] == i, ['R', 'G', 'B']] = np.median(data_c.loc[data_c['lable'] == i, ['R', 'G', 'B']].to_numpy())
    
#     psnr_median = PSNR(data_c, data_median, width, height, 1)
    
#     return (count_cluster, psnr_mean, psnr_median)

In [107]:
# def compute_psnr(count_cluster):

#     kmeans = KMeans(n_clusters=count_cluster, init='k-means++', random_state=241).fit(data)
#     cluster = pd.Series(kmeans.labels_, name='lable')
#     data_c = data.join(cluster)
    
#     data_mean = pd.concat([pd.Series(np.zeros(len(data_c)), name='mean'), data_c['lable']], axis=1)
#     for i in range(count_cluster):
#         data_mean.loc[data_mean['lable'] == i, 'mean'] = np.mean(data_c.loc[data_c['lable'] == i, ['R', 'G', 'B']].to_numpy())
    
#     psnr_mean = PSNR(data_c, data_mean, width, height, 1)

#     data_median = pd.concat([pd.Series(np.zeros(len(data_c)), name='median'), data_c['lable']], axis=1)
#     for i in range(count_cluster):
#         data_median.loc[data_median['lable'] == i, 'median'] = np.median(data_c.loc[data_c['lable'] == i, ['R', 'G', 'B']].to_numpy())
    
#     psnr_median = PSNR(data_c, data_median, width, height, 1)
    
#     return (count_cluster, psnr_mean, psnr_median)

In [108]:
# %%time
from multiprocessing import Pool

# Evaluate every cluster count in parallel on 4 worker processes.
# The context manager shuts the workers down once map() has returned;
# previously the pool was left open for the rest of the session.
# NOTE(review): range(1, 20) stops at 19 clusters, while the task allows up
# to 20 - confirm whether 20 should be included.
with Pool(4) as pool:
    out = pool.map(compute_psnr, range(1, 20))

In [109]:
# Each entry is (cluster count, PSNR with mean fill, PSNR with median fill).
out

[(1, 9.818835544771291, 9.4278404605074),
 (2, 12.080177084931085, 11.655077266907853),
 (3, 13.153487044607246, 12.774948175099809),
 (4, 14.3678179408688, 14.005928721924676),
 (5, 15.52714619730599, 15.177909077346127),
 (6, 16.543994096630183, 16.051176778469276),
 (7, 17.640992508636522, 17.339368491425027),
 (8, 18.436829014063576, 18.1379714433961),
 (9, 19.112672735788138, 18.81388735617062),
 (10, 19.64075663591531, 19.417472515521055),
 (11, 20.127588746514583, 19.84128407389954),
 (12, 20.5925163195166, 20.304411065558547),
 (13, 21.02137967959436, 20.792764442541802),
 (14, 21.29684656191451, 21.041187112763318),
 (15, 21.60717171908997, 21.40644006041166),
 (16, 21.863109812962833, 21.676272681421985),
 (17, 22.11980599277126, 21.919033084798265),
 (18, 22.335498740502754, 22.129324222239887),
 (19, 22.522097823218104, 22.36097494475569)]