# Rapids KMeans

### Loaded with the MNIST dataset, with TensorFlow and Keras.

In [1]:
from cuml.dask.cluster import KMeans
from keras.datasets import mnist
from sklearn.decomposition import PCA

import numpy as np
import pandas as pd
import time
import dask_cudf
import cudf
import cupy

#Load Data
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train_reshaped = x_train.reshape((60000, 784))
pca = PCA(2)

print("Number of training images: " + str(len(x_train)))
print("Number of test images: " + str(len(x_test)))
print("Number of unique classes: " + str(len(np.unique(y_train))))

#Transform the data
pca_transformed = pca.fit_transform(x_train_reshaped)
et_eller_andet = pca_transformed

def np2cudf(df):
    # convert numpy array to cuDF dataframe
    df = pd.DataFrame({'fea%d'%i:df[:,i] for i in range(df.shape[1])})
    pdf = cudf.DataFrame()
    for c,column in enumerate(df):
      pdf[str(c)] = df[column]
    return pdf

b = np2cudf(pca_transformed)
csv_cuda_splitted = dask_cudf.from_cudf(b, npartitions=10)

Number of training images: 60000
Number of test images: 10000
Number of unique classes: 10


In [2]:
# Dask cuda imports
from dask_cuda import LocalCUDACluster
from dask.distributed import Client
cluster = LocalCUDACluster()
cuda_client = Client(cluster)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 42977 instead
  f"Port {expected} is already in use.\n"


In [3]:
t_start = time.time()

#Initialize the class object
kmeans = KMeans(n_clusters=10, client=cuda_client, random_state=0)

#predict the labels of clusters.
label = kmeans.fit_predict(csv_cuda_splitted)

t_stop = time.time()

t_final = t_stop - t_start
t_final

5.879716157913208

In [4]:
print("Labels: " + str(label))

Labels: <dask_cudf.Series | 31 tasks | 10 npartitions>


### Plotting dataimport matplotlib.pyplot as plt

In [5]:
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

figure(figsize=(8, 6), dpi=80)

#filter rows of original data
#label = label.to_array()
print(kmeans.labels_)
filtered_label0 = et_eller_andet[label == 0]
print(len(filtered_label0))

#Getting unique labels
u_labels = np.unique(label)

0       8
1       3
2       1
3       7
4       1
       ..
5995    5
5996    8
5997    6
5998    8
5999    5
Length: 6000, dtype: int32


TypeError: Implicit conversion to a host NumPy array via __array__ is not allowed, To explicitly construct a GPU array, consider using cupy.asarray(...)
To explicitly construct a host array, consider using .to_array()

<Figure size 640x480 with 0 Axes>