# cuML Cheat Sheets sample code

(c) 2020 NVIDIA, Blazing SQL

Distributed under Apache License 2.0

## Imports

In [1]:
import cudf
import cuml
import numpy as np
import cupy as cp

## Create classification dataset

In [2]:
X, y = cuml.make_classification(
    n_samples=10000
    , n_classes=2
    , n_features=4
    , n_informative=2
    , flip_y=0.05
    , shift=cp.random.rand(4)
    , scale=cp.random.rand(4)
    , random_state=np.random.randint(1e9)
)

df_class = cudf.DataFrame(X, columns=[f'feat_{i}' for i in range(4)])
df_class['label'] = cudf.Series(y)
df_class.head()

Unnamed: 0,feat_0,feat_1,feat_2,feat_3,label
0,-0.036177,-0.127668,0.078793,0.05146,0
1,-0.092427,0.158353,0.011045,0.007617,0
2,0.578539,0.314366,0.057801,0.046949,1
3,-0.237328,0.049713,0.044973,-0.036342,1
4,-0.386364,-0.04849,0.030925,-0.008429,0


In [3]:
X_train, X_test, y_train, y_test = cuml.preprocessing.train_test_split(df_class, 'label', train_size=.8)

---

# Classification models

---

#### LogisticRegression()

In [4]:
log_reg = cuml.linear_model.LogisticRegression()

In [5]:
log_reg = cuml.linear_model.LogisticRegression(
    fit_intercept=True
    , max_iter=1000
    , tol=1e-5
)

In [6]:
log_reg.fit(X_train, y_train)

LogisticRegression(penalty='l2', tol=1e-05, C=1.0, fit_intercept=True, max_iter=1000, linesearch_max_iter=50, verbose=4, l1_ratio=None, solver='qn', handle=<cuml.raft.common.handle.Handle object at 0x7f03937d3c30>, output_type='input')

In [7]:
print(f'Coefficients: \n{log_reg.coef_}')

Coefficients: 
0    -1.363609
1    15.584369
2    15.744004
3    -1.601119
dtype: float32


In [8]:
print(f'Intercept: \n{log_reg.intercept_}')

Intercept: 
0   -2.166005
dtype: float32


In [9]:
log_reg.predict(X_test)

0       1
1       1
2       1
3       0
4       0
       ..
1995    0
1996    0
1997    1
1998    0
1999    0
Length: 2000, dtype: int64

In [10]:
log_reg.predict_proba(X_test)

Unnamed: 0,0,1
0,0.084432,0.915568
1,0.117180,0.882820
2,0.116625,0.883375
3,0.949007,0.050993
4,0.967065,0.032935
...,...,...
1995,0.987002,0.012998
1996,0.918191,0.081809
1997,0.047790,0.952210
1998,0.780962,0.219038


#### MBSGDClassifier()

In [11]:
mbsgd_c = cuml.MBSGDClassifier()

In [12]:
mbsgd_c = cuml.MBSGDClassifier(
    penalty='elasticnet'
    , alpha=0.001
    , batch_size=64
    , fit_intercept=True
    , eta0=0.002
    , learning_rate='adaptive'
)

In [13]:
mbsgd_c.fit(X_train, y_train)

MBSGDClassifier(loss='hinge', penalty='elasticnet', alpha=0.001, l1_ratio=0.15, fit_intercept=True, epochs=1000, tol=0.001, shuffle=True, learning_rate='adaptive', eta0=0.002, power_t=0.5, batch_size=64, n_iter_no_change=5, handle=<cuml.raft.common.handle.Handle object at 0x7f0391c28c90>, verbose=4, output_type='input')

In [14]:
mbsgd_c.predict(X_test)

0       1
1       1
2       1
3       0
4       0
       ..
1995    0
1996    0
1997    1
1998    0
1999    0
Length: 2000, dtype: int64

#### MultinomialNB()

In [15]:
df_nb = cudf.DataFrame([
      ('a', 1)
    , ('b', 1)
    , ('a', 1)
    , ('b', 1)
    , ('c', 0)
    , ('b', 1)
    , ('b', 1)
    , ('c', 0)
    , ('a', 1)
    , ('b', 1)
], columns=['f_0', 'label'])

In [16]:
from cuml.feature_extraction.text import HashingVectorizer
hv = HashingVectorizer()

X = hv.fit_transform(df_nb['f_0'])
y = df_nb['label']

nb = cuml.MultinomialNB()
nb.fit(X, y)

MultinomialNB(alpha=1.0, fit_prior=True, output_type='input', handle=None, verbose=4)

In [17]:
nb.predict(X)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

In [18]:
nb.predict_proba(X)

array([[0.2, 0.8],
       [0.2, 0.8],
       [0.2, 0.8],
       [0.2, 0.8],
       [0.2, 0.8],
       [0.2, 0.8],
       [0.2, 0.8],
       [0.2, 0.8],
       [0.2, 0.8],
       [0.2, 0.8]], dtype=float32)

In [19]:
nb.score(X, y)

0.800000011920929

#### ensemble.RandomForestClassifier()

In [20]:
rf = cuml.ensemble.RandomForestClassifier()

In [21]:
rf = cuml.ensemble.RandomForestClassifier(
    n_estimators=40
    , n_bins=8
    , max_depth=10
    , max_features=1.0
    , min_rows_per_node=10
    , split_criterion=1
)

  rf = cuml.ensemble.RandomForestClassifier(


In [22]:
rf.fit(X_train, y_train)

RandomForestClassifier(split_criterion=1, handle=<cuml.raft.common.handle.Handle object at 0x7f0391b7d210>, verbose=4, output_type='input')

In [23]:
rf.predict(X_test)

0       1.0
1       1.0
2       1.0
3       0.0
4       0.0
       ... 
1995    0.0
1996    0.0
1997    1.0
1998    0.0
1999    0.0
Length: 2000, dtype: float32

In [24]:
rf.predict_proba(X_test)

Unnamed: 0,0,1
0,0.0,1.0
1,0.0,1.0
2,0.0,1.0
3,1.0,0.0
4,1.0,0.0
...,...,...
1995,1.0,0.0
1996,1.0,0.0
1997,0.0,1.0
1998,1.0,0.0


In [25]:
rf.score(X_test, y_test)

0.9695000052452087

#### ForestInference()

In [None]:
from cuml import ForestInference

model_path = 'xgb.model'
fm = ForestInference.load(model_path, output_class=True)
fm.predict(X_test)

#### svm.SVC()

In [26]:
svc = cuml.svm.SVC(
    kernel='poly'
    , degree=2
    , gamma='scale'
    , probability=True
)

In [27]:
svc.fit(X_train, y_train)

SVC(handle=<cuml.raft.common.handle.Handle object at 0x7f0391b7d710>, C=1, kernel='poly', degree=2, gamma='scale', coef0=0.0, tol=0.001, cache_size=1024.0, max_iter=-1, nochange_steps=1000, verbose=4, output_type='input', probability=True, random_state=None, class_weight=None, multiclass_strategy='ovo')

In [28]:
svc.predict(X_test)

0       1
1       1
2       1
3       0
4       0
       ..
1995    0
1996    0
1997    1
1998    0
1999    0
Length: 2000, dtype: int64

In [29]:
svc.predict_proba(X_test)

Unnamed: 0,0,1
0,0.008688,0.991312
1,0.015431,0.984569
2,0.018290,0.981710
3,0.918325,0.081675
4,0.980212,0.019788
...,...,...
1995,0.995938,0.004062
1996,0.965259,0.034741
1997,0.003363,0.996637
1998,0.970907,0.029093


#### neighbors.KNeighborsClassifier()

In [30]:
knn_c = cuml.neighbors.KNeighborsClassifier()

In [31]:
knn_c = cuml.neighbors.KNeighborsClassifier(
    n_neighbors = 5
)

In [32]:
knn_c.fit(X_train, y_train)

KNeighborsClassifier(weights='uniform')

In [33]:
knn_c.predict(X_test)

0       1
1       1
2       1
3       0
4       0
       ..
1995    0
1996    0
1997    1
1998    0
1999    0
Length: 2000, dtype: int64

In [34]:
knn_c.predict_proba(X_test)

Unnamed: 0,0,1
0,0.2,0.8
1,0.0,1.0
2,0.2,0.8
3,1.0,0.0
4,1.0,0.0
...,...,...
1995,1.0,0.0
1996,1.0,0.0
1997,0.0,1.0
1998,1.0,0.0


---

# Classification metrics

---

#### metrics.accuracy.accuracy_score()

In [35]:
cuml.metrics.accuracy.accuracy_score(y_test, knn_c.predict(X_test))

0.9679999947547913

#### metrics.confusion_matrix()

In [36]:
cuml.metrics.confusion_matrix(y_test, knn_c.predict(X_test))

Unnamed: 0,0,1
0,969,29
1,35,967


#### metrics.roc_auc_score()

In [37]:
cuml.metrics.roc_auc_score(y_test, knn_c.predict(X_test))

0.968005895614624

#### metrics.precision_recall_curve()

In [38]:
cuml.metrics.precision_recall_curve(y_test, knn_c.predict(X_test))

(0    0.501000
 1    0.970884
 2    1.000000
 dtype: float64,
 0    1.00000
 1    0.96507
 2    0.00000
 dtype: float64,
 0    0
 1    1
 dtype: int64)

#### metrics.pairwise_distances.pairwise_distances()

In [39]:
cuml.metrics.pairwise_distances(y_test.astype('float32'), knn_c.predict(X_test))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0
1,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0
2,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0
3,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1996,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0
1997,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0
1998,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
