In [None]:
import os, sys
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.patches as patches
import hdbscan

# for splines
from __future__ import division
from scipy import interpolate

# for coloured density plots
import matplotlib.colors
from scipy.stats import kde

# for density / contours
import seaborn as sns

# for clustering text
from sklearn.cluster import AffinityPropagation
import distance

In [None]:
# Load the sample table into the notebook-global `df`. Later cells read the
# columns 'x', 'y', 'algo' and 'line' from it, plus one column per board field
# ('a1' ... 'h8'); a 'cluster' column is added after clustering.
df = pd.read_csv("data/lichess_umap_seed0_no_duplicate_projection_eco_games.csv")
# df = df.fillna('')
# df.head()

In [None]:
# Fill colours of the two kinds of board tiles.
chessboard_black = '#EDEEEF'
chessboard_white = '#FFFFFF'

# Text colours of the piece glyphs (alternative for white pieces: '#FFFFFF').
piece_black = '#000000'
piece_white = '#FF0000'

# Both sides share the solid glyph set and are told apart by colour only
# (the outline glyphs ♖ ♘ ♗ ♔ ♕ ♙ are an unused alternative for white).
_solid_glyphs = {'r': '♜', 'n': '♞', 'b': '♝', 'k': '♚', 'q': '♛', 'p': '♟'}
pieces_dict = {
    side + kind: glyph
    for side in ('w', 'b')
    for kind, glyph in _solid_glyphs.items()
}
pieces_dict[''] = ''  # empty field -> nothing to draw

# Board coordinates: files a-h and ranks 1-8.
x_labels = list('abcdefgh')
y_labels = [str(rank) for rank in range(1, 9)]

# All 64 field names, file-major: 'a1', 'a2', ..., 'a8', 'b1', ..., 'h8'.
features = [file_label + rank_label for file_label in x_labels for rank_label in y_labels]

# Categorical palette indexed by the integer codes from get_colour_indices.
colours = ['#1B9E77', '#D95F02', '#E7298A', '#7570B3', '#66A61E']


def get_piece_colour(piece):
    """Return the text colour for a piece code.

    An empty code (empty field) maps to 'white'; codes starting with 'b'
    map to ``piece_black`` and every other code maps to ``piece_white``.
    """
    if piece == '':
        return 'white'
    if piece[0] == 'b':
        return piece_black
    return piece_white

def get_colour_indices(c):
    """Encode the values of ``c`` as integer codes.

    Codes are assigned in sorted order of the distinct values.

    Returns
    -------
    (codes, c_dict) : the list of codes aligned with ``c`` and the
        value -> code mapping that produced it.
    """
    c_dict = {value: code for code, value in enumerate(sorted(set(c)))}
    return [c_dict[value] for value in c], c_dict

def plot_df(ax, df, c_dict, alpha=1.0, zorder=0, linewidth=1):
    """Draw the 'x'/'y' samples of ``df`` as one connected line on ``ax``.

    The line colour is ``colours[c_dict[algo]]`` where ``algo`` is the 'algo'
    value of the first row of ``df``.
    """
    xs, ys = df['x'].values, df['y'].values
    line_colour = colours[c_dict[df['algo'].values[0]]]
    ax.plot(xs, ys, c=line_colour, alpha=alpha, zorder=zorder, linewidth=linewidth)
    

def plot_df_splines(ax, df, c_dict, alpha=1.0, zorder=0, linewidth=1, spline_smoothing=0):
    """Draw the unique 'x'/'y' samples of ``df`` as markers joined by a spline.

    Rows with a repeated (x, y) pair are dropped first. The colour is
    ``colours[c_dict[algo]]`` where ``algo`` is the 'algo' value of the first
    remaining row. The spline degree is 3, reduced when fewer than four points
    are available.

    Parameters
    ----------
    ax : matplotlib axes to draw on.
    df : DataFrame with 'x', 'y' and 'algo' columns.
    c_dict : mapping from 'algo' value to an index into ``colours``.
    alpha, zorder, linewidth : forwarded to ``ax.plot``.
    spline_smoothing : smoothing factor ``s`` passed to ``splprep``.

    Returns
    -------
    None. An empty frame draws nothing; a single unique point is drawn as a
    marker only, because no spline can be fitted through it.
    """
    df = df.drop_duplicates(subset=['x', 'y'])
    x = df['x'].values
    y = df['y'].values
    if len(x) == 0:
        # Nothing to draw, and there is no first row to take the colour from.
        return

    c = colours[c_dict[df['algo'].values[0]]]

    if len(x) < 2:
        # splprep needs a spline degree of at least 1, i.e. two or more points.
        ax.plot(x, y, 'o', c=c, alpha=alpha, zorder=zorder, linewidth=linewidth)
        return

    tck, _ = interpolate.splprep([x, y], s=spline_smoothing, k=min(3, len(x) - 1))
    # Evaluate the curve at ten times as many parameter values as input points.
    xnew, ynew = interpolate.splev(np.linspace(0, 1, len(x) * 10), tck, der=0)
    ax.plot(x, y, 'o', xnew, ynew, c=c, alpha=alpha, zorder=zorder, linewidth=linewidth)

    
def get_board_colour(i,j):
    """Return the fill colour of the tile with indices ``i`` and ``j``.

    Tiles whose two indices are both even or both odd get
    ``chessboard_black``; all other tiles get ``chessboard_white``.
    """
    same_parity = (i % 2 == 0) == (j % 2 == 0)
    return chessboard_black if same_parity else chessboard_white

def get_inverse_board_colour(i, j):
    """Return the tile colour that ``get_board_colour(i, j)`` does NOT return."""
    if get_board_colour(i, j) == chessboard_white:
        return chessboard_black
    return chessboard_white
        
            
def render_piece(piece, ax, x, y, tile_size, alpha, zorder):
    """Draw the glyph of ``piece`` as text at data position (x, y) on ``ax``.

    Parameters
    ----------
    piece : key into ``pieces_dict`` ('' for an empty field).
    ax : matplotlib axes to draw on; its current y-limits and on-screen
        height determine the font size.
    x, y : text anchor in data coordinates.
    tile_size : height of one board tile in data units.
    alpha, zorder : forwarded to ``ax.text``.

    Raises
    ------
    KeyError if ``piece`` is not a key of ``pieces_dict``.
    """
    glyph = pieces_dict[piece]
    if glyph == '':
        # Empty field: an empty string draws nothing, so skip creating the artist.
        return

    # Measure the axes that was passed in, not pyplot's "current" axes, so the
    # size stays correct when `ax` is not the active one.
    y1, y2 = ax.get_window_extent().get_points()[:, 1]
    ymin, ymax = ax.get_ylim()
    yscale = (y2 - y1) / (ymax - ymin)  # display units per data unit along y
    # NOTE(review): this is a length in display units, while matplotlib reads
    # `size` in points - confirm whether a dpi conversion is intended.
    fontsize = tile_size * yscale
    ax.text(x, y, glyph, size=fontsize, color=get_piece_colour(piece), alpha=alpha, zorder=zorder)
            
            
def render_chessboard(ax, x, y, size, zorder, chessboard_dict):
    """Draw an 8x8 board of side length ``size`` centred on (x, y).

    A black square extending ``size/32`` beyond each edge is drawn first as a
    frame, then every tile is filled and its piece rendered.

    ``chessboard_dict`` maps each field name (file label + rank label, e.g.
    'a1') to a ``(piece, alpha)`` pair.
    """
    tile = size/8
    left = x-size/2
    bottom = y-size/2
    margin = size/32

    frame = patches.Rectangle(
        (left-margin, bottom-margin), size*34/32, size*34/32,
        linewidth=1, edgecolor='black', facecolor='black', zorder=zorder,
    )
    ax.add_patch(frame)

    for col in range(8):
        tile_x = left + col * tile
        for row in range(8):
            tile_y = bottom + row * tile
            fill = get_board_colour(col, row)
            ax.add_patch(patches.Rectangle(
                (tile_x, tile_y), tile, tile,
                linewidth=1, edgecolor=fill, facecolor=fill, zorder=zorder,
            ))
            piece, alpha = chessboard_dict[x_labels[col]+y_labels[row]]
            render_piece(piece, ax, tile_x, tile_y, tile, alpha, zorder)

            
def cluster_chessboard_dict(cluster_i):
    """Summarise cluster ``cluster_i`` as a board description.

    Returns a dict mapping every field name in ``features`` to the
    ``(piece, frequency)`` pair reported by
    ``get_piece_frequency_from_cluster_field`` for that field.
    """
    per_field = (
        (field, get_piece_frequency_from_cluster_field(cluster_i, field))
        for field in features
    )
    return {field: (piece, frequency) for field, (piece, frequency) in per_field}
            
            
def get_cluster_positions(df, clusterer, drop_noise=True):
    """Compute the centroid and the sample count of every cluster.

    Parameters
    ----------
    df : DataFrame with numeric 'x' and 'y' columns, one row per sample, in
        the same order as ``clusterer.labels_``.
    clusterer : fitted object exposing ``labels_``. Labels ``0..k-1`` identify
        clusters; negative labels are treated as noise.
    drop_noise : if True (default) only the ``k`` real clusters are returned.
        If False, one extra last row holds the centroid and count of the noise
        samples, so index ``-1`` addresses noise (NaN centroid and count 0
        when there is no noise).

    Returns
    -------
    clusters : float array of shape (k, 2), or (k + 1, 2) with the noise row;
        row ``i`` is the mean (x, y) of the samples labelled ``i``.
    counts : float array with the number of samples per returned row.

    Raises
    ------
    ValueError if ``df`` and ``clusterer.labels_`` differ in length.
    """
    labels = np.asarray(clusterer.labels_)
    xy = np.column_stack((df['x'].values, df['y'].values)).astype(float)
    if len(xy) != len(labels):
        raise ValueError(
            "df has %d rows but clusterer.labels_ has %d entries" % (len(xy), len(labels))
        )

    # Size the arrays from the largest label rather than the number of distinct
    # labels, so the result does not depend on whether any noise is present.
    n_clusters = max(int(labels.max()) + 1, 0) if labels.size else 0
    noise_row = n_clusters
    rows = np.where(labels < 0, noise_row, labels).astype(int)

    sums = np.zeros(shape=(n_clusters + 1, 2))
    np.add.at(sums, rows, xy)  # unbuffered: repeated row indices accumulate
    counts = np.bincount(rows, minlength=n_clusters + 1).astype(float)

    # Rows without samples (e.g. the noise row when there is no noise) become NaN.
    with np.errstate(divide='ignore', invalid='ignore'):
        clusters = sums / counts[:, None]

    if drop_noise:
        return clusters[:-1], counts[:-1]
    return clusters, counts


# Most frequent piece on one board field within one cluster, with its relative frequency.
def get_piece_frequency_from_cluster_field(cluster_i, field, ignore_empty_fields=True):
    """Return ``(piece, frequency)`` for column ``field`` in cluster ``cluster_i``.

    Reads the notebook-global ``df``: rows whose 'cluster' equals ``cluster_i``
    are selected, missing values in ``field`` are treated as '' (empty field),
    and relative frequencies are computed over all those rows.

    With ``ignore_empty_fields`` the '' entry is removed before picking the
    winner, unless it is the only entry. The returned frequency is still
    relative to all rows of the cluster, empty ones included.
    """
    in_cluster = df['cluster']==cluster_i
    shares = df.loc[in_cluster][field].fillna('').value_counts(normalize=True)

    only_one_value = len(shares) == 1
    if ignore_empty_fields and '' in shares and not only_one_value:
        shares = shares.drop('')

    top_piece = shares.idxmax()
    return top_piece, shares[top_piece]


# Map the relative frequency of a cluster linearly onto [min_scale, max_scale].
def get_cluster_scale(cluster_i, min_scale, max_scale, drop_noise=True):
    """Return the display scale of cluster ``cluster_i``.

    The cluster's relative frequency is rescaled so that the least frequent
    cluster maps to ``min_scale`` and the most frequent one to ``max_scale``.

    Parameters
    ----------
    cluster_i : key of the cluster in the value-count series.
    min_scale, max_scale : bounds of the output range.
    drop_noise : if True use the global ``cluster_value_counts_without_noise``,
        otherwise the global ``cluster_value_counts``.

    Returns
    -------
    The rescaled value. When all clusters have the same frequency (including
    the single-cluster case) the linear map is undefined and ``max_scale`` is
    returned instead of a NaN produced by a division by zero.
    """
    counts = cluster_value_counts_without_noise if drop_noise else cluster_value_counts
    lowest, highest = counts.min(), counts.max()
    value = counts[cluster_i]  # looked up first so an unknown cluster still raises
    if highest == lowest:
        return max_scale
    return (value - lowest) / (highest - lowest) * (max_scale - min_scale) + min_scale


# https://www.python-graph-gallery.com/85-density-plot-with-matplotlib
def render_density(ax, df):
    """Shade ``ax`` with a Gaussian KDE of the 'x'/'y' columns of ``df``.

    The density is evaluated on a regular 100 x 100 grid spanning the data
    extents and drawn with a white-to-grey colour map at zorder -1.
    """
    xs = df['x'].values
    ys = df['y'].values

    nbins = 100
    density = kde.gaussian_kde([xs, ys])
    # A complex step in mgrid means "this many points, end point included".
    grid_x, grid_y = np.mgrid[xs.min():xs.max():nbins*1j, ys.min():ys.max():nbins*1j]
    grid_points = np.vstack([grid_x.flatten(), grid_y.flatten()])
    grid_z = density(grid_points).reshape(grid_x.shape)

    white_to_grey = matplotlib.colors.LinearSegmentedColormap.from_list("", ["white","grey"])
    ax.pcolormesh(grid_x, grid_y, grid_z, shading='auto', cmap=white_to_grey, alpha=1.0, zorder=-1)
    
    
# https://seaborn.pydata.org/generated/seaborn.kdeplot.html
def render_contours(ax, df, fill=False, alpha=0.5):
    """Draw seaborn KDE contours of ``df`` on ``ax``.

    Calls ``sns.kdeplot`` on the 'x' and 'y' columns with ``hue='cluster'``,
    so ``df`` must also contain a 'cluster' column. ``fill`` and ``alpha`` are
    forwarded unchanged; the plot is drawn at a fixed zorder of -2.
    """
    sns.kdeplot(data=df, x='x', y='y', hue='cluster', fill=fill, ax=ax, alpha=alpha, zorder=-2)
    
    
def get_text_clusters(words: np.ndarray):
    """Group strings by edit distance using affinity propagation.

    The pairwise similarity is the negated Levenshtein distance, passed to
    ``AffinityPropagation`` as a precomputed affinity matrix.

    Parameters
    ----------
    words : array (or any sequence) of strings.

    Returns
    -------
    dict mapping each cluster's exemplar word to the sorted array of unique
    words in that cluster. The dict is empty when ``words`` is empty or when
    the algorithm reports no clusters.
    """
    # Accept plain lists too: the mask/index lookups below need an array.
    words = np.asarray(words)
    if words.size == 0:
        return {}

    lev_similarity = -1*np.array([[distance.levenshtein(w1,w2) for w1 in words] for w2 in words])

    affprop = AffinityPropagation(affinity="precomputed", damping=0.5)
    affprop.fit(lev_similarity)

    cluster_dict = {}
    for cluster_id in np.unique(affprop.labels_):
        if cluster_id < 0:
            # Label -1 is what scikit-learn assigns when the fit did not
            # converge; there is no exemplar to look up for it.
            continue
        exemplar = words[affprop.cluster_centers_indices_[cluster_id]]
        cluster_dict[exemplar] = np.unique(words[np.nonzero(affprop.labels_==cluster_id)])
    return cluster_dict

In [None]:
# Pull the plotting inputs out of `df` as notebook globals used by later cells.
x = df['x'].values
y = df['y'].values
c = df['algo'].values

# `cs` holds one integer code per sample; `c_dict` maps each 'algo' value to its code.
cs, c_dict = get_colour_indices(c)
# From here on `c` is rebound: it no longer holds the raw 'algo' values but one
# colour string per sample, looked up in `colours` by code.
c = [colours[i] for i in cs]
# Show which 'algo' value got which code, and the palette the codes index into.
print(c_dict)
print(colours)


In [None]:
# Stack the coordinates into an (n_samples, 2) array for the clusterer.
X = np.array([x,y]).T

# TODO clustering based on ECO codes or opening names (get sorted list of most common substrings and then k biggest clusters)

# Cluster the 2-D positions with HDBSCAN; `min_cluster_size` is also used later
# in the name of the saved figure.
min_cluster_size = 50
clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size)
clusterer.fit(X)
# Per-cluster centroid coordinates and sample counts.
clusters, counts = get_cluster_positions(df, clusterer, drop_noise=True)
# Store each sample's cluster label on the frame (mutates the global `df`).
df['cluster'] = clusterer.labels_
# Relative cluster sizes, with and without the samples labelled -1 (noise).
# Both are read as globals by get_cluster_scale.
cluster_value_counts = df[['cluster']].value_counts(normalize=True)
cluster_value_counts_without_noise = df[df['cluster']!=-1][['cluster']].value_counts(normalize=True)

In [None]:
# Base drawing order for the scatter and the lines; boards are drawn above it.
zorder = 0
# `size` is both the figure width/height passed to matplotlib and the factor
# that marker sizes and line widths are derived from.
size = 200
fig, axes = plt.subplots(1,1)
ax = axes
fig.set_figheight(size)
fig.set_figwidth(size)
ax.axis('equal')
# All samples, coloured per sample by `c`.
ax.scatter(x,y,c=c,zorder=zorder,s=size*10,alpha=0.5)
# ax.scatter(clusters[:,0], clusters[:,1], s=size*50, zorder=zorder)
# One sub-frame per value of the 'line' column; each is drawn as its own spline.
dfs = dict(tuple(df.groupby('line')))
for key in dfs:
    # TODO consider catmull rom splines - code : https://en.wikipedia.org/wiki/Centripetal_Catmull%E2%80%93Rom_spline
    plot_df_splines(ax, dfs[key], c_dict, alpha=0.1, zorder=zorder, linewidth=size/20, spline_smoothing=0.01)
# Fix the view to the data extents plus a margin of 1 data unit on every side.
ax.axis( [ x.min() - 1 , x.max() + 1 , y.min() - 1 , y.max() + 1 ] )

# TODO also consider physics to stop them from overlapping, and maybe a line that attaches them to their original position
# NOTE(review): zip(range(n)) yields 1-tuples, so `cluster_i` is (0,), (1,), ...
# rather than an int, and `i` is unused. Presumably the tuple form is needed to
# index the value-count series inside get_cluster_scale - confirm before
# simplifying this loop.
for (i, cluster_i) in enumerate(zip(range(len(clusters)))):
#     render_density(ax, df[df['cluster']==cluster_i])
#     render_contours(ax, df[df['cluster']==cluster_i], fill=True)
    # Board side length between 1/4 and 1 data unit, by relative cluster size.
    scale = get_cluster_scale(cluster_i, 1.0/4, 1.0)
    # Draw the summary board at the cluster centroid; the zorder grows with the
    # scale, so larger boards get a higher zorder than smaller ones.
    render_chessboard(ax, clusters[cluster_i,0], clusters[cluster_i,1], scale, zorder+scale+1, cluster_chessboard_dict(cluster_i))

# Write the figure to 'cluster_size_<min_cluster_size>.png' in the working
# directory, then close it.
fig.savefig('cluster_size_'+str(min_cluster_size)+'.png', bbox_inches='tight')
plt.close()

In [None]:
# TODO once clusters are connected consider animated change of summary visualization versus difference visualization as labels, given that we do not have actions between samples in the data
# TODO consider difference visualization where 2 chessboards are overlaid and slightly juxtaposed
# such that within each field we see previous and following piece, encoded by different colours