# Distributed Nearest Neighbors with cuML

In [1]:
from dask_cuda import LocalCUDACluster
from dask.distributed import Client

# Start a local Dask-CUDA cluster with default settings and connect a client
# to it. `client` is reused later by BlazingContext and by the cuML model.
cluster = LocalCUDACluster()
client = Client(cluster)

In [2]:
from blazingsql import BlazingContext

# Create the BlazingSQL context on top of the existing Dask client.
# NOTE(review): 'lo' is the loopback interface name on Linux; presumably this
# only works because the cluster is local -- confirm before going multi-node.
bc = BlazingContext(dask_client=client, network_interface='lo')

BlazingContext ready


In [3]:
import dask_cudf

# Read the iris CSV directly from GitHub with dask_cudf (needs network access).
df = dask_cudf.read_csv('https://github.com/gumdropsteve/datasets/raw/master/iris.csv')

In [4]:
# Register the DataFrame with BlazingSQL as table 'iris' so that later cells
# can query it through bc.sql().
bc.create_table('iris', df)

In [5]:
# df.tail()

In [6]:
# df.compute().to_pandas().plot(kind='scatter', x='sepal_length', y='petal_width', c='target', cmap=('spring'), sharex=False)

In [7]:
# df.species.compute().unique()

## Nearest Neighbors
Nearest Neighbors enables the query of the k-nearest neighbors from a set of input samples.

In [8]:
from cuml.dask.neighbors import NearestNeighbors

# Multi-node, multi-GPU NearestNeighbors model. It is handed the Dask client
# created at the top of the notebook via the `client` keyword.
nn = NearestNeighbors(client=client)

In [9]:
# Pull the four measurement columns out of the 'iris' table with BlazingSQL.
# Alternative kept for reference, selecting the same column names from df:
# X = df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
X = bc.sql(
    'select sepal_length, sepal_width, petal_length, petal_width '
    'from iris'
)

# Show the last three rows as a quick check of the query result.
X.tail(3)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3
149,5.9,3.0,5.1,1.8


In [10]:
# Fit the model on the feature columns only; no labels are passed.
nn.fit(X)

<cuml.dask.neighbors.nearest_neighbors.NearestNeighbors at 0x7fdbc80549d0>

`NearestNeighbors.kneighbors` returns a tuple of distances and indices.

distances: cuDF DataFrame or numpy ndarray
    The distances of the k-nearest neighbors for each column vector
    in X

indices: cuDF DataFrame or numpy ndarray
    The indices of the k-nearest neighbors for each column vector in X

In [11]:
# Query the 3 nearest neighbors of every row of X and unpack the result
# into a (distances, indices) pair.
# NOTE(review): `indicies` is a misspelling of "indices"; renaming it would
# also require updating the later cell that refers to it.
distances, indicies = nn.kneighbors(X, n_neighbors=3)

In [12]:
# Preview the last three rows of the distance table. Column 0 is expected to
# be 0.0 because the queried rows are the same rows the model was fitted on.
distances.tail(3)

Unnamed: 0,0,1,2
147,0.0,0.223631,0.346403
148,0.0,0.244975,0.300019
149,0.0,0.282832,0.316213


In [13]:
# indicies.tail(3)