In [None]:
%matplotlib inline

import os
import struct
import urllib.request

import cudf
import cuml
import matplotlib.pyplot as plt
import numba
import numpy as np
import pandas as pd
import sklearn.cluster
import sklearn.metrics

from pml_utils import get_mnist, show_clusters

In [None]:
def load_not_mnist(directory, filename):
    """Load a ``.npy`` array from ``directory``, downloading it first if needed.

    If ``directory/filename`` is not already present, the file is fetched from
    ``https://a3s.fi/mldata/<filename>`` and stored under that path.

    Args:
        directory: Local directory used as the download cache. It is created
            (including missing parent directories) when a download is needed.
        filename: Name of the ``.npy`` file, both remotely and locally.

    Returns:
        The array returned by ``np.load`` for the cached file.

    Raises:
        urllib.error.URLError: If the download fails.
        OSError: If the cache directory or file cannot be written.
    """
    filepath = os.path.join(directory, filename)
    if os.path.isfile(filepath):
        print('Not downloading, file already exists:', filepath)
    else:
        if directory:
            # makedirs (not mkdir) so nested cache paths such as ~/data/notMNIST/
            # work even when the parent directory does not exist yet.
            os.makedirs(directory, exist_ok=True)
        url_base = 'https://a3s.fi/mldata/'
        url = url_base + filename
        print('Downloading {} to {}'.format(url, filepath))
        # Download under a temporary name and rename on success, so that an
        # interrupted transfer never leaves a truncated file that a later run
        # would mistake for a complete cached copy.
        partial_path = filepath + '.part'
        try:
            urllib.request.urlretrieve(url, partial_path)
            os.replace(partial_path, filepath)
        finally:
            if os.path.exists(partial_path):
                os.remove(partial_path)
    return np.load(filepath)

In [None]:
# notMNIST: fetch (or reuse the local copies of) the image and label arrays.
# Image arrays are flattened so that each row holds one 28*28 image.
DATA_DIR = os.path.expanduser('~/data/notMNIST/')

X_train = load_not_mnist(DATA_DIR, 'notMNIST_large_images.npy')
X_train = X_train.reshape(-1, 28*28)
y_train = load_not_mnist(DATA_DIR, 'notMNIST_large_labels.npy')

X_test = load_not_mnist(DATA_DIR, 'notMNIST_small_images.npy')
X_test = X_test.reshape(-1, 28*28)
y_test = load_not_mnist(DATA_DIR, 'notMNIST_small_labels.npy')

In [None]:
# Load MNIST
#X_train, y_train, X_test, y_test = get_mnist('MNIST')

In [None]:
# Summarise whichever dataset the loading cells above put into X_*/y_*.
# The banner is dataset-neutral: the active loader is notMNIST, so the former
# "MNIST data loaded" label misreported what was actually in memory.
print()
print('Data loaded: train:',len(X_train),'test:',len(X_test))
print('X_train:', type(X_train), 'shape:', X_train.shape)
print('y_train:', type(y_train), 'shape:', y_train.shape)
print('X_test:', type(X_test), 'shape:', X_test.shape)
print('y_test:', type(y_test), 'shape:', y_test.shape)

In [None]:
# Show the first ten training images side by side, each titled with its label.
pltsize=1
fig, axes = plt.subplots(1, 10, figsize=(10*pltsize, pltsize))

for i in range(10):
    ax = axes[i]
    ax.axis('off')
    ax.imshow(X_train[i,:].reshape(28, 28), cmap="gray")
    ax.set_title('Class: '+str(y_train[i]))

In [None]:
def np2cudf(df):
    """Convert a 2-D NumPy array into a cuDF DataFrame.

    The array is first wrapped in a pandas DataFrame (one column per array
    column), then copied column by column into a new cuDF DataFrame whose
    columns are named '0', '1', ... in the original column order.
    """
    n_columns = df.shape[1]
    host_frame = pd.DataFrame({'fea%d'%j: df[:,j] for j in range(n_columns)})

    gpu_frame = cudf.DataFrame()
    # Iterating a pandas DataFrame yields its column labels.
    for position, label in enumerate(host_frame):
        gpu_frame[str(position)] = host_frame[label]
    return gpu_frame

In [None]:
#%%time
#cu_X_train = cudf.DataFrame.from_pandas(pd.DataFrame(X_train))

In [None]:
#%%time
#cu_X_train = np2cudf(X_train)

In [None]:
# Keep only the first N training samples (and their labels).
N=50000
X_train, y_train = X_train[:N], y_train[:N]

In [None]:
%%time
# Copy the data to the GPU with numba; the feature arrays are cast to float32 first.
X_train_cuda = numba.cuda.to_device(X_train.astype(np.float32))
# Labels: reinterpret the label buffer as int32 and subtract ord('A'), so 'A' becomes 0.
# NOTE(review): assumes each label is a single-character unicode string (4 bytes) — confirm.
y_train_cuda = numba.cuda.to_device(y_train.view(np.int32)-ord('A'))
X_test_cuda = numba.cuda.to_device(X_test.astype(np.float32))


In [None]:
# Integer-encode the test labels the same way as the training labels
# (int32 view minus ord('A')); this array is not copied to the GPU.
y_test_int = y_test.view(np.int32)-ord('A')

In [None]:
#%%time
#k=10
#kmeans = cuml.KMeans(n_clusters=k)
#kmeans.fit(X_train_cuda)

In [None]:
#show_clusters(kmeans.labels_.copy_to_host(), k, X_train)

In [None]:
#%%time
#kmeans = sklearn.cluster.KMeans(n_clusters=k)
#kmeans.fit(X_train)

In [None]:
%%time
# Train a cuML random forest classifier on the GPU-resident training data.
# NOTE(review): max_features=1.0 presumably means every feature is considered
# at each split — confirm against the cuML documentation for the installed version.
rf_clf = cuml.ensemble.RandomForestClassifier(max_features=1.0, n_estimators=100, max_depth=12)
rf_clf.fit(X_train_cuda, y_train_cuda)

In [None]:
# Predict labels for the test features that were copied to the GPU.
preds = rf_clf.predict(X_test_cuda)

In [None]:
# Score against the integer-encoded test labels: the forest was trained on
# labels offset by ord('A'), so the raw y_test values are not comparable.
# predict() may return a numba device array; copy it to host memory first.
preds_host = preds.copy_to_host() if hasattr(preds, 'copy_to_host') else np.asarray(preds)
sklearn.metrics.accuracy_score(y_test_int, preds_host)