In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [3]:
# Load the crime records from the pickle file 'cleaned_crime_dump', resolved
# relative to the current working directory.
# Presumably this file is the output of an earlier cleaning step — confirm.
# NOTE(review): unpickling can execute arbitrary code; only load a file that
# you produced yourself or otherwise trust.
crimes = pd.read_pickle('cleaned_crime_dump')

In [4]:
# Keep only the three categorical columns used for the analysis. The .copy()
# makes `filtered` independent of the `crimes` frame it was taken from.
selected_columns = ['Victim Sex', 'Victim Descent', 'Weapon Description']
filtered = crimes[selected_columns].copy()

# One-hot encode those categoricals. The columns are handed to get_dummies in
# Descent, Sex, Weapon order, which differs from the order used for `filtered`.
encoding_columns = ['Victim Descent', 'Victim Sex', 'Weapon Description']
filteredOneHot = pd.get_dummies(filtered[encoding_columns])

In [5]:
# Project the one-hot encoded records onto their first two principal
# components so they can be drawn on a 2-D scatter plot.
# Only the first 100,000 rows are fitted/transformed (presumably to bound run
# time and memory — confirm).
# random_state pins the randomized SVD solver that PCA's automatic solver
# selection can pick for an input of this size, so the projection is the same
# after every Restart & Run All.
pca = PCA(n_components = 2, random_state = 0)
pcaFitted = pca.fit_transform(filteredOneHot[0:100000])

In [6]:
# Split the 2-D projection into per-axis coordinate lists for plotting:
# column 0 of pcaFitted becomes x, column 1 becomes y.
# Slicing the whole column and converting it once avoids a Python-level loop
# over every row index; the results are still plain lists of floats.
x = pcaFitted[:, 0].tolist()
y = pcaFitted[:, 1].tolist()

In [7]:
# Partition the projected points into two clusters.
# KMeans starts from randomly chosen centroids, so random_state is fixed to
# make the resulting cluster assignment reproducible between runs.
kmeans = KMeans(n_clusters = 2, random_state = 0).fit(pcaFitted)

In [18]:
#plt.plot(x[0:10000], y[0:10000], 'ro')
#plt.show()

In [8]:
from random import random
from bokeh.models import CustomJS, ColumnDataSource
from bokeh.plotting import figure, output_file, show
from bokeh.layouts import row

#x = [random() for x in range(500)]
#y = [random() for y in range(500)]

# Interactive linked-selection view of the 2-D projection held in `x` / `y`.
# Left plot (p1): every point. Right plot (p2): only the points currently
# box-selected on the left, copied across by the JavaScript callback below.
s1 = ColumnDataSource(data=dict(x=x, y=y))
p1 = figure(plot_width=400, plot_height=400, tools="pan, box_select, wheel_zoom", title="Cluster 1")
p1.circle('x', 'y', source=s1, alpha=0.6, color = 'black')

# The right-hand source starts empty and is filled by the callback.
# Its axes span the full extent of the plotted data rather than a fixed
# (0, 1) window: the coordinates are not confined to [0, 1], so a fixed unit
# window would leave many selected points outside the visible area.
s2 = ColumnDataSource(data=dict(x=[], y=[]))
p2 = figure(plot_width=400, plot_height=400,
            x_range=(float(min(x)), float(max(x))),
            y_range=(float(min(y)), float(max(y))),
            tools="pan, wheel_zoom", title="Even more clustering")
p2.circle('x', 'y', source=s2, alpha=0.6, color = 'red')

# Browser-side callback: whenever the selection on s1 changes, rebuild s2's
# columns from the selected indices and ask s2 to redraw.
# The loop counter is declared with `var` so it does not leak into (or fail
# on) the global scope.
# NOTE(review): `s1.callback`, `.get(...)`, `['1d'].indices` and
# `trigger('change')` are legacy Bokeh APIs; newer releases use
# `s1.selected.js_on_change('indices', ...)`, `s1.selected.indices` and
# `s2.change.emit()`. Confirm the installed Bokeh version before migrating.
s1.callback = CustomJS(args=dict(s2=s2), code="""
        var inds = cb_obj.get('selected')['1d'].indices;
        var d1 = cb_obj.get('data');
        var d2 = s2.get('data');
        d2['x'] = []
        d2['y'] = []
        for (var i = 0; i < inds.length; i++) {
            d2['x'].push(d1['x'][inds[i]])
            d2['y'].push(d1['y'][inds[i]])
        }
        s2.trigger('change');
    """)

# Place the two plots side by side and render them.
layout = row(p1, p2)

show(layout)