# cuML Cheat Sheets sample code

(c) 2020 NVIDIA, Blazing SQL

Distributed under Apache License 2.0

## Imports

In [1]:
import cudf
import cuml
import numpy as np
import cupy as cp

## Create regression dataset

In [2]:
# A fixed seed keeps the generated data -- and therefore every output that
# depends on it -- identical across Restart & Run All. Drawing a fresh random
# seed on each run made the saved outputs impossible to reproduce.
RANDOM_SEED = 42
# Single source of truth for the feature count, shared by the generator call
# and the column names built for the DataFrame.
N_FEATURES = 4

X, y, c = cuml.make_regression(
    n_samples=10000,
    n_targets=1,
    n_features=N_FEATURES,
    n_informative=2,
    bias=-3.4,
    noise=0.2,
    coef=True,  # also hand back the generating coefficients as `c`
    random_state=RANDOM_SEED,
)

print(f'coefficients:\n {cp.array(c)}')

# Wrap the features in a cuDF frame with named columns and attach the target.
df_reg = cudf.DataFrame(X, columns=[f'feat_{i}' for i in range(N_FEATURES)])
df_reg['target'] = cudf.Series(y)
df_reg.head()

coefficients:
 [[ 0.      ]
 [ 0.      ]
 [21.947212]
 [82.2553  ]]


Unnamed: 0,feat_0,feat_1,feat_2,feat_3,target
0,-1.056957,1.879557,0.886296,-1.907221,-140.787689
1,-0.57225,-1.620277,-0.018567,-0.740552,-64.641685
2,-1.42094,-0.015144,-1.60533,0.854558,31.677322
3,0.718283,-0.616024,-2.313588,-1.976617,-216.65036
4,0.161272,-0.925325,-1.330421,-1.242758,-134.817703


In [3]:
# Keep 80% of the rows for training and hold out the rest for evaluation.
# `random_state` pins the split so predictions and metrics further down are
# repeatable from run to run.
X_train, X_test, y_train, y_test = cuml.preprocessing.train_test_split(
    df_reg,
    'target',
    train_size=.8,
    random_state=42,
)

---

# Regression models

---

#### LinearRegression()

In [4]:
# Linear regression estimator using the 'svd' algorithm, with an intercept
# term and the `normalize` flag enabled.
lr = cuml.LinearRegression(
    algorithm='svd',
    fit_intercept=True,
    normalize=True,
)

In [5]:
# Train the linear model on the training split; the cell displays the fitted estimator.
lr.fit(X_train, y_train)

LinearRegression(algorithm='svd', fit_intercept=True, normalize=True, handle=<cuml.raft.common.handle.Handle object at 0x7f66503bf7d0>, verbose=4, output_type='input')

In [6]:
# Predict the target for the held-out rows; as the last expression it is displayed.
lr.predict(X_test)

0        76.993301
1      -109.247971
2      -117.588669
3        11.311052
4       -96.926559
           ...    
1995    -11.950909
1996    135.429581
1997    -17.160917
1998     48.706272
1999    -86.134995
Length: 2000, dtype: float32

#### Ridge()

In [7]:
# Ridge regression estimator: alpha=1.0, 'svd' solver, `normalize` enabled.
# NOTE(review): no intercept is fitted although the dataset was generated with
# bias=-3.4 -- confirm that this is the intended demonstration.
ridge = cuml.Ridge(
    alpha=1.0,
    solver='svd',
    fit_intercept=False,
    normalize=True,
)

In [8]:
# Fit the ridge model on the training split; the fitted estimator is displayed.
ridge.fit(X_train, y_train)

Ridge(alpha=1.0, solver='svd', fit_intercept=False, normalize=True, handle=<cuml.raft.common.handle.Handle object at 0x7f664e80a590>, output_type='input', verbose=4)

In [9]:
# Ridge predictions for the held-out rows.
ridge.predict(X_test)

0        80.423904
1      -105.882156
2      -114.224747
3        14.723981
4       -93.547882
           ...    
1995     -8.551248
1996    138.870865
1997    -13.768234
1998     52.128941
1999    -82.764587
Length: 2000, dtype: float32

#### Lasso()

In [10]:
# Lasso estimator: alpha=1.0, no intercept, `normalize` enabled.
# NOTE(review): the saved output of the predict cell for this model is 0.0 for
# every row -- presumably these settings shrink all coefficients to zero;
# consider a smaller alpha if a non-trivial fit is the goal.
lasso = cuml.Lasso(
    alpha=1.0,
    fit_intercept=False,
    normalize=True,
)

In [11]:
# Fit the lasso model on the training split; the fitted estimator is displayed.
lasso.fit(X_train, y_train)

Lasso(alpha=1.0, fit_intercept=False, normalize=True, max_iter=1000, tol=0.001, selection='cyclic', handle=<cuml.raft.common.handle.Handle object at 0x7f664e80a7d0>, output_type='input', verbose=4)

In [12]:
# Lasso predictions for the held-out rows.
lasso.predict(X_test)

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
1995    0.0
1996    0.0
1997    0.0
1998    0.0
1999    0.0
Length: 2000, dtype: float32

#### ElasticNet()

In [13]:
# Elastic-net estimator built with default arguments.
# NOTE(review): this instance is never used -- the next cell rebinds `elastic`
# to a configured estimator before anything is fitted.
elastic = cuml.ElasticNet()

In [14]:
# Elastic-net estimator: alpha=1.0 with l1_ratio=0.05, no intercept,
# `normalize` enabled.
elastic = cuml.ElasticNet(
    alpha=1.0,
    l1_ratio=0.05,
    fit_intercept=False,
    normalize=True,
)

In [15]:
# Fit the elastic-net model on the training split; the fitted estimator is displayed.
elastic.fit(X_train, y_train)

ElasticNet(alpha=1.0, l1_ratio=0.05, fit_intercept=False, normalize=True, max_iter=1000, tol=0.001, selection='cyclic', handle=<cuml.raft.common.handle.Handle object at 0x7f664e80a570>, output_type='input', verbose=4)

In [16]:
# Elastic-net predictions for the held-out rows.
elastic.predict(X_test)

0        83.945335
1      -110.601067
2      -119.428703
3        15.206996
4       -97.838127
           ...    
1995     -8.840338
1996    145.374237
1997    -14.234616
1998     54.425571
1999    -86.336205
Length: 2000, dtype: float32

#### ensemble.RandomForestRegressor()

In [17]:
# Random-forest regressor: 40 trees, depth capped at 10, 8 bins, and
# max_features=1.0.
# NOTE(review): the saved output of this cell echoes the constructor line, which
# looks like the tail of a warning -- check whether one of these arguments is
# deprecated in the installed cuML version.
rf_reg = cuml.ensemble.RandomForestRegressor(
    n_estimators=40,
    n_bins=8,
    max_depth=10,
    max_features=1.0,
    min_rows_per_node=10,
    split_criterion=2,  # numeric criterion code; presumably MSE -- confirm against the cuML docs
)

  rf_reg = cuml.ensemble.RandomForestRegressor(


In [18]:
# Fit the random forest on the training split; the fitted estimator is displayed.
rf_reg.fit(X_train, y_train)

RandomForestRegressor(split_criterion=2, accuracy_metric='r2', handle=<cuml.raft.common.handle.Handle object at 0x7f664e80add0>, verbose=4, output_type='input')

In [19]:
# Random-forest predictions for the held-out rows.
rf_reg.predict(X_test)

0        61.459919
1      -123.958580
2      -131.862335
3        15.058478
4       -84.684525
           ...    
1995     -7.288130
1996    176.835785
1997     -8.890825
1998     65.810875
1999   -100.987976
Length: 2000, dtype: float32

#### svm.SVR()

In [20]:
# Support-vector regressor with a linear kernel; all other settings are left
# at their defaults.
svr = cuml.svm.SVR(kernel='linear')

In [21]:
# Fit the SVR model on the training split; the fitted estimator is displayed.
svr.fit(X_train, y_train)

SVR(handle=<cuml.raft.common.handle.Handle object at 0x7f664e80a8d0>, C=1, kernel='linear', degree=3, gamma='scale', coef0=0.0, tol=0.001, epsilon=0.1, cache_size=1024.0, max_iter=-1, nochange_steps=1000, verbose=4, output_type='input')

In [22]:
# SVR predictions for the held-out rows.
svr.predict(X_test)

0        76.990593
1      -109.241234
2      -117.584633
3        11.310186
4       -96.922218
           ...    
1995    -11.945328
1996    135.423691
1997    -17.157360
1998     48.707714
1999    -86.131752
Length: 2000, dtype: float32

#### neighbors.KNeighborsRegressor()

In [23]:
# k-nearest-neighbours regressor that uses 5 neighbours per query point.
knn_r = cuml.neighbors.KNeighborsRegressor(n_neighbors=5)

In [24]:
# Fit the k-NN regressor on the training split; the fitted estimator is displayed.
knn_r.fit(X_train, y_train)

KNeighborsRegressor(weights='uniform')

In [25]:
# k-NN predictions for the held-out rows.
knn_r.predict(X_test)

0        75.997147
1      -113.730713
2      -116.470947
3        18.328432
4       -98.607513
           ...    
1995    -19.172421
1996    128.574951
1997    -20.096853
1998     42.203133
1999    -80.536674
Length: 2000, dtype: float32

---

# Regression metrics

---

#### metrics.regression.mean_absolute_error()

In [26]:
# Mean absolute error of the linear-regression predictions on the held-out split.
cuml.metrics.regression.mean_absolute_error(y_test, lr.predict(X_test))

array(0.16377503, dtype=float32)

#### metrics.regression.mean_squared_error()

In [27]:
# Squared-error metric for the linear-regression predictions on the held-out split.
# squared=False is passed, so this presumably reports the root of the MSE (RMSE) -- confirm against the cuML docs.
cuml.metrics.regression.mean_squared_error(y_test, lr.predict(X_test), squared=False)

array(0.20261249, dtype=float32)

#### metrics.regression.mean_squared_log_error()

In [None]:
# The logarithmic error is only defined for non-negative values, but this
# dataset's targets and the model's predictions include negatives (see the
# prediction outputs above), so passing them directly makes the metric fail --
# this cell had been saved unexecuted. To keep the API demonstration runnable
# on the same data, the metric is applied to the magnitudes of both series.
# This is illustrative only: MSLE is not a meaningful score for signed targets.
cuml.metrics.regression.mean_squared_log_error(
    y_test.abs(),
    lr.predict(X_test).abs(),
    squared=False,
)

#### metrics.regression.r2_score()

In [28]:
# R^2 score of the random-forest predictions on the held-out split.
# NOTE(review): the other metric cells score `lr`; this one scores `rf_reg` -- confirm that is intended.
cuml.metrics.regression.r2_score(y_test, rf_reg.predict(X_test))

0.9430750608444214