
# Kernel Density Estimation

This example shows how kernel density estimation (KDE), a powerful
non-parametric density estimation technique, can be used to learn
a generative model for a dataset.  With this generative model in place,
new samples can be drawn.  These new samples reflect the underlying model
of the data.


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy
import seaborn as sns

from sklearn.datasets import load_digits
from sklearn.neighbors import KernelDensity
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV

In [None]:
import tensorflow as tf

## Load data

In [None]:
# UCI ML digits: feature matrix and integer class labels.
digits = load_digits()
x_train, y_train = digits.data, digits.target
# Side length of one image, assuming each flattened row is a square image.
size = int(x_train.shape[1] ** 0.5)

# MNIST (alternative dataset, disabled)
#(x_train, y_train),(x_test, y_test) = tf.keras.datasets.mnist.load_data()
#size = x_train.shape[1]

In [None]:
# Scale every pixel by the global maximum, then lay each sample out as a
# (size, size) image.
x_train = (x_train / x_train.max()).reshape(-1, size, size)
x_train.shape, y_train.shape

In [None]:
# Show the mean image of each digit class 0-9 on a 2x5 grid (row-major order).
fig, axs = plt.subplots(2, 5, figsize=[10,3])
for digit, ax in enumerate(axs.flat):
    mean_img = np.mean(x_train[y_train == digit], axis=0)
    ax.imshow(mean_img, vmin=0, vmax=1)
plt.tight_layout()

## PCA

In [None]:
# A fractional n_components asks PCA to keep as many components as are needed
# to explain that share of the variance (see the scikit-learn PCA docs).
pca = PCA(n_components=0.9, whiten=False)
# PCA expects one flat feature vector per sample, so flatten the images again.
pca_train = pca.fit_transform(x_train.reshape(len(x_train), size * size))

In [None]:
# (n_samples, n_retained_components) of the PCA-projected training set
pca_train.shape

In [None]:
# Scatter the first two principal components, one series per digit label.
for digit in range(10):
    points = pca_train[y_train == digit]
    plt.scatter(points[:, 0], points[:, 1], label=digit)
plt.legend(ncol=2)
plt.show()

In [None]:
# One column per principal component (pca0, pca1, ...) plus the digit label,
# so seaborn can address the components by name.
col = [f'pca{k}' for k in range(pca_train.shape[1])]
df = pd.DataFrame(pca_train, columns=col)
df['label'] = y_train

## Plot KDE

In [None]:
# 1-D Gaussian KDE of the first principal component.
pca_train_0 = pca_train[:, 0]
kde_g = scipy.stats.gaussian_kde(pca_train_0)
# Bandwidth factor picked by gaussian_kde's default rule; it is a multiplier
# on the data spread, not a width in data units.
bw = kde_g.covariance_factor()
# Evaluate the estimated density on a dense grid spanning the observed range.
pca_train_0_lin = np.linspace(pca_train_0.min(), pca_train_0.max(), num=10000)
pca_train_0_pdf = kde_g.evaluate(pca_train_0_lin)

In [None]:
# Compare seaborn's default KDE with the scipy estimate (same bandwidth factor)
# and with three fixed bandwidth factors.
fig, ax = plt.subplots()
sns.kdeplot(data=df, x='pca0', ax=ax, label=f'seaborn: bw={bw:.3f}')
ax.plot(pca_train_0_lin, pca_train_0_pdf, label=f'scipy: bw={bw:.3f}', ls='--')
for fixed_bw in (0.1, 0.5, 1):
    sns.kdeplot(data=df, x='pca0', ax=ax, label=f'seaborn: bw={fixed_bw}', bw_method=fixed_bw)
ax.legend()

## Plot KDE (by digit)

In [None]:
# 1-D KDE of the first principal component, one curve per digit label.
fig, ax = plt.subplots()
sns.kdeplot(
    data=df,
    x='pca0',
    hue='label',
    palette='tab10',
    ax=ax,
)

In [None]:
# Joint density of the first two principal components (filled contours with a
# colour bar).
# PCA scores are mean-centred and therefore include negative values. seaborn's
# log_scale option log-transforms the *data axes*, which cannot represent
# non-positive observations, so the axes are kept linear here.
fig, ax = plt.subplots()
sns.kdeplot(data=df, x='pca0', y='pca1', ax=ax, cbar=True, fill=True)

In [None]:
# 2-D KDE contours of the first two principal components, one contour set
# (5 levels) per digit label.
fig, ax = plt.subplots()
sns.kdeplot(
    data=df,
    x='pca0',
    y='pca1',
    hue='label',
    palette='tab10',
    levels=5,
    ax=ax,
)

## Fit KDE and generate digits

In [None]:
# use grid search cross-validation to optimize the bandwidth
#params = {"bandwidth": np.logspace(-1, 1, 20)}
#grid = GridSearchCV(KernelDensity(), params, verbose=True)
#grid.fit(pca_train)
#print(f"best bandwidth: {grid.best_estimator_.bandwidth}")

# use the best estimator to compute the kernel density estimate
#kde = grid.best_estimator_

In [None]:
# Fit the generative KDE on all PCA-projected digits.
# NOTE(review): bw is the scipy bandwidth factor computed for the first
# component only; here it is used directly as KernelDensity's bandwidth for
# every component - confirm this is the intended width.
# To model a single digit instead, fit on pca_train[y_train==9].
kde = KernelDensity(bandwidth=bw)
kde.fit(pca_train)

In [None]:
# draw n new points from the fitted KDE (in PCA space) ...
n = 16
kde_pca = kde.sample(n_samples=n)
# ... and project them back to pixel space as (n, size, size) images
new_data = pca.inverse_transform(kde_pca).reshape((n, size, size))

In [None]:
# Show the first 16 generated digits on a 4x4 grid (row-major order).
fig, axs = plt.subplots(4,4,figsize=[10,10])
for idx, ax in enumerate(axs.flat):
    ax.imshow(new_data[idx])
plt.tight_layout()

[Source - scikit learn](https://scikit-learn.org/stable/auto_examples/neighbors/plot_digits_kde_sampling.html#sphx-glr-auto-examples-neighbors-plot-digits-kde-sampling-py)