# Saving and loading a model

In [2]:
# Turn on cuML's accelerator here, ahead of the scikit-learn imports in the
# next cell, so those imports happen after the accelerator is installed.
from cuml import accel
accel.install()
# Last expression of the cell: reports whether the accelerator is active.
accel.enabled()

True

In [3]:
import polars as pl
import cupy as cp

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)

from cuml.model_selection import train_test_split
import pickle
import joblib

In [4]:
from collections.abc import Mapping

def eval_classifier_preds(y_true, y_preds) -> Mapping[str, float]:
    """
    Score classifier predictions against the ground-truth labels.

    Accuracy, precision, recall and F1 are computed with scikit-learn's
    metric functions using their default settings. Each score is printed
    (accuracy as a percentage, the rest as fractions, all with two decimals)
    and then returned.

    Args:
        y_true: Ground-truth labels.
        y_preds: Labels predicted by the model, aligned with ``y_true``.

    Returns:
        A mapping with the keys ``'accuracy'``, ``'precision'``, ``'recall'``
        and ``'f1_score'``, each rounded to two decimal places.
    """
    # Insertion order matters: it fixes both the call order of the metric
    # functions and the key order of the returned mapping.
    scorers = {
        'accuracy': accuracy_score,
        'precision': precision_score,
        'recall': recall_score,
        'f1_score': f1_score,
    }
    scores = {name: scorer(y_true, y_preds) for name, scorer in scorers.items()}

    # Accuracy is reported on a 0-100 scale; the other metrics stay in 0-1.
    print(f"Accuracy: {scores['accuracy'] * 100:.2f}")
    print(f"Precision: {scores['precision']:.2f}")
    print(f"Recall: {scores['recall']:.2f}")
    print(f"F1: {scores['f1_score']:.2f}")

    return {name: round(value, 2) for name, value in scores.items()}

    

In [5]:
# Load the heart-disease table from the notebook's working directory.
hd = pl.read_csv('./heart-disease.csv')
# Shuffle all rows (fraction=1 keeps the whole dataset). The fixed seed makes
# the row order repeatable under Restart Kernel -> Run All; without it every
# run feeds a differently ordered frame to the steps below.
hds = hd.sample(shuffle=True, fraction=1, seed=42)

In [6]:
# Features: every column except the label. strict=True makes the drop fail
# loudly if the 'target' column is missing instead of silently doing nothing.
X = hds.drop('target', strict=True).to_pandas()
# Labels: the 'target' column on its own, converted to pandas like X.
y = hds.get_column('target').to_pandas()

In [7]:
# Hold out 20% of the rows for evaluation. random_state pins which rows land
# in each partition, so metrics from different runs are comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Base random forest with default hyperparameters (only n_jobs is set); this
# is the estimator handed to the hyperparameter search further down.
clf = RandomForestClassifier(n_jobs=10)
# Display the full parameter set so the defaults being tuned are visible.
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': 10,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [9]:
# Candidate values for each random-forest hyperparameter explored by the
# search (3 * 2 * 2 * 2 * 3 = 72 possible combinations).
grid = dict(
    n_estimators=[100, 200, 500],
    max_depth=[None, 10],
    max_features=["log2", "sqrt"],
    min_samples_split=[2, 4],
    min_samples_leaf=[1, 2, 4],
)



In [10]:
# Randomized hyperparameter search over `grid`, wrapping the base forest.
rs_clf = RandomizedSearchCV(
    estimator=clf,
    param_distributions=grid,
    n_iter=5,          # evaluate 5 sampled parameter combinations
    cv=10,             # 10-fold cross-validation per combination (50 fits total)
    verbose=2,         # log one line per completed fit
    random_state=42,   # fix which combinations are sampled so reruns try the same candidates
)

In [11]:
# Inputs MUST be converted to NumPy! The split above returns cuDF objects, so
# .to_cupy() produces a CuPy array and .get() copies it into a NumPy array.
rs_clf.fit(X_train.to_cupy().get(), y_train.to_cupy().get())

Fitting 10 folds for each of 5 candidates, totalling 50 fits
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=200; total time=   0.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=200; total time=   0.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=200; total time=   0.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=200; total time=   0.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=200; total time=   0.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=200; total time=   0.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=200; total time=   0.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, 

In [12]:
# Hyperparameter combination that scored best during the cross-validated search.
rs_clf.best_params_

{'n_estimators': 200,
 'min_samples_split': 4,
 'min_samples_leaf': 2,
 'max_features': 'log2',
 'max_depth': None}

In [14]:
# Predict on the held-out rows with the fitted search object, then score the
# predictions. Features and labels are converted to NumPy the same way as for
# the fit call.
tuned_predictions = rs_clf.predict(X_test.to_cupy().get())
eval_classifier_preds(y_test.to_cupy().get(), tuned_predictions)
# Result noted from an earlier run (training is randomized, so the numbers
# shift between runs):
# {'accuracy': 0.8, 'precision': 0.81, 'recall': 0.81, 'f1_score': 0.81}    # Not quite better actually!
# NOTE(review): "better" presumably compares against an untuned baseline that
# is not shown in this notebook — confirm.

Accuracy: 76.67
Precision: 0.69
Recall: 0.93
F1: 0.79


{'accuracy': 0.77, 'precision': 0.69, 'recall': 0.93, 'f1_score': 0.79}

In [None]:
# Persist the fitted search object to disk with the standard-library pickle
# module; the context manager closes the file even if dumping fails.
with open('./heart_disease_random_forest_tuned.pkl', 'wb') as model_file:
    pickle.dump(rs_clf, model_file)

In [None]:
# Reload the pickled search object and check that it still predicts.
# The file is opened in a context manager so the handle is closed right after
# loading (passing a bare open(...) to pickle.load leaves it open).
# NOTE: unpickling can execute arbitrary code — only load files you created
# yourself or otherwise trust.
with open('./heart_disease_random_forest_tuned.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
model.predict(X_test.to_cupy().get())

In [15]:
# Alternative to pickle: persist the same fitted search object with joblib.
# The call returns the list of file names it wrote, shown as the cell output.
joblib.dump(rs_clf, './heart_disease_random_forest_tuned.joblib')

['./heart_disease_random_forest_tuned.joblib']

In [16]:
# Read the joblib file back into a new object to verify the round trip.
# NOTE: like pickle, joblib files should only be loaded from trusted sources.
reloaded_model = joblib.load('./heart_disease_random_forest_tuned.joblib')

In [18]:
# Sanity check: the reloaded model predicts on the held-out rows (converted
# to NumPy as in the earlier predict call).
reloaded_model.predict(X_test.to_cupy().get())

array([0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1])

In [19]:
# Inspect the container type of the test split — this is why the
# .to_cupy().get() conversion is applied before fit/predict above.
type(X_test)

cudf.core.dataframe.DataFrame