# Improving a model

In [1]:
from cuml import accel
accel.install()
accel.enabled()

True

In [47]:
import polars as pl
import cupy as cp

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import (
    r2_score,
    mean_absolute_error,
    mean_squared_error,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    accuracy_score,
    roc_curve,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score
)

from sklearn.datasets import fetch_california_housing

from cuml.ensemble import RandomForestClassifier as cuRFC, RandomForestRegressor as cuRFR
from cuml.model_selection import train_test_split


In [4]:
housing = fetch_california_housing()
housing_df = pl.DataFrame(data=housing['data'], schema=housing['feature_names'])
housing_df = housing_df.with_columns(target=housing['target'])

In [5]:
X = housing_df.select(pl.all().exclude('target')).to_pandas()
y = housing_df.get_column('target').to_pandas()

In [6]:
cp.random.seed(0)

clf = RandomForestRegressor()         # from sklearn
cval_mse = cp.mean(cross_val_score(clf, X, y, cv=5, scoring='neg_mean_squared_error'))

cval_mse

np.float64(-0.4357816650794407)

In [71]:
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

## Hyperparameter Tuning/Optimization (HPO)

1. Manually
2. Randomly (RandomSearchCV)
3. Exhaustively (GridSearchCV)

### Manual Hyperparameter Tuning

This requires a **validation set**.

In [3]:
hd = pl.read_csv('./heart-disease.csv')
hds = hd.sample(shuffle=True, fraction=1)        # Shuffles data 

In [4]:
X = hds.drop('target', strict=True).to_pandas()
y = hds.get_column('target').to_pandas()

In [5]:
clf = RandomForestClassifier()
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

Adjusting:
* `max_depth`
* `max_features`
* `min_samples_leaf`
* `min_samples_split`
* `n_estimators`

In [6]:
from collections.abc import Mapping

def eval_classifier_preds(y_true, y_preds) -> Mapping[str, float]:
    """
    Performs an evaluation of a classification model based on true and predicted values.
    """
    accuracy = accuracy_score(y_true, y_preds)
    precision = precision_score(y_true, y_preds)
    recall = recall_score(y_true, y_preds)
    f1 = f1_score(y_true, y_preds)

    print(f'Accuracy: {accuracy * 100 :.2f}')
    print(f'Precision: {precision:.2f}')
    print(f'Recall: {recall:.2f}')
    print(f'F1: {f1:.2f}')
    
    return {
        'accuracy': round(accuracy, 2),
        'precision': round(precision, 2),
        'recall': round(recall, 2),
        'f1_score': round(f1, 2),
    }

    

In [9]:
# Indexes to split the data
train_split = round(0.7 * len(hd))
validation_split = round(train_split + 0.15 * len(hd))

train_split, validation_split

(212, 257)

In [10]:
X_train, y_train = X[:train_split], y[:train_split]
X_valid, y_valid = X[train_split:validation_split], y[train_split:validation_split]
X_test, y_test = X[validation_split:], y[validation_split:]

len(X_train), len(X_valid), len(X_test)

(212, 45, 46)

In [11]:
clf_base = RandomForestClassifier()
clf_base.fit(X_train, y_train)
clf_base_preds = clf_base.predict(X_valid)
clf_base_metrics = eval_classifier_preds(y_valid, clf_base_preds)

Accuracy: 88.89
Precision: 0.89
Recall: 0.85
F1: 0.87


In [12]:
# higher n_estimators 1
clf_1 = RandomForestClassifier(n_estimators=200)
clf_1.fit(X_train, y_train)
clf_1_preds = clf_1.predict(X_valid)
clf_1_metrics = eval_classifier_preds(y_valid, clf_1_preds)

Accuracy: 88.89
Precision: 0.89
Recall: 0.85
F1: 0.87


In [13]:
# higher n_estimators 2
clf_2 = RandomForestClassifier(n_estimators=300)
clf_2.fit(X_train, y_train)
clf_2_preds = clf_2.predict(X_valid)
clf_2_metrics = eval_classifier_preds(y_valid, clf_2_preds)

Accuracy: 91.11
Precision: 0.94
Recall: 0.85
F1: 0.89


In [15]:
# higher n_estimators 3
clf_3 = RandomForestClassifier(n_estimators=400)
clf_3.fit(X_train, y_train)
clf_3_preds = clf_3.predict(X_valid)
clf_3_metrics = eval_classifier_preds(y_valid, clf_3_preds)

Accuracy: 91.11
Precision: 0.94
Recall: 0.85
F1: 0.89


In [20]:
# higher max_depth
clf_4 = RandomForestClassifier(n_estimators=300, max_depth=10)
clf_4.fit(X_train, y_train)
clf_4_preds = clf_4.predict(X_valid)
clf_4_metrics = eval_classifier_preds(y_valid, clf_4_preds)

Accuracy: 91.11
Precision: 0.94
Recall: 0.85
F1: 0.89


### Randomized Hyperparameter Tuning
Using **RandomizedSearchCV** (which uses **cross-validation**)

In [38]:
grid = {"n_estimators": [10, 100, 200, 500, 1000, 1200],
        "max_depth": [None, 5, 10, 20, 30],
        "max_features": ["log2", "sqrt"],
        "min_samples_split": [2, 4, 6],
        "min_samples_leaf": [1, 2, 4]}



In [32]:
hd = pl.read_csv('./heart-disease.csv')
hds = hd.sample(shuffle=True, fraction=1)        # Shuffles data 

In [33]:
X = hds.drop('target', strict=True).to_pandas()
y = hds.get_column('target').to_pandas()

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [39]:
clf = RandomForestClassifier()
rs_clf = RandomizedSearchCV(
    estimator=clf,
    param_distributions=grid,
    n_iter=20,
    cv=10,
    verbose=2
)

In [None]:
# Inputs MUST be converted to numpy!
rs_clf.fit(X_train.to_cupy().get(), y_train.to_cupy().get())

In [42]:
rs_clf.best_params_
# {'n_estimators': 100,
# 'min_samples_split': 2,
# 'min_samples_leaf': 2,
# 'max_features': 'sqrt',
# 'max_depth': 30}

{'n_estimators': 100,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 30}

In [46]:
tuned_predictions = rs_clf.predict(X_test)
eval_classifier_preds(y_test.to_cupy().get(), tuned_predictions)
# {'accuracy': 0.8, 'precision': 0.81, 'recall': 0.81, 'f1_score': 0.81}    # Not quite better actually!

Accuracy: 80.00
Precision: 0.81
Recall: 0.81
F1: 0.81


{'accuracy': 0.8, 'precision': 0.81, 'recall': 0.81, 'f1_score': 0.81}

### Exhaustive tuning
with **GridSearchCV**.
Differently from RanzomizedSearchCV it goes though **all of the combinations** with cross-validation.

In [48]:
hd = pl.read_csv('./heart-disease.csv')
hds = hd.sample(shuffle=True, fraction=1)        # Shuffles data 

In [49]:
X = hds.drop('target', strict=True).to_pandas()
y = hds.get_column('target').to_pandas()

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [64]:
clf = RandomForestClassifier()
gs_clf = GridSearchCV(   # tries 5400 fits
    estimator=clf,
    param_grid=grid,
    cv=10,
    verbose=2
)

In [65]:
gs_clf.fit(X_train.to_cupy().get(), y_train.to_cupy().get())

Fitting 10 folds for each of 540 candidates, totalling 5400 fits
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=   0.0s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=   0.0s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=   0.0s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=   0.0s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=   0.0s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=   0.0s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=   0.0s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_e

In [90]:
best_params = gs_clf.best_params_

In [67]:
gs_predictions = gs_clf.predict(X_test)
eval_classifier_preds(y_test.to_cupy().get(), gs_predictions)

Accuracy: 73.33
Precision: 0.85
Recall: 0.72
F1: 0.78


{'accuracy': 0.73, 'precision': 0.85, 'recall': 0.72, 'f1_score': 0.78}

In [94]:
best_params['n_estimators'] = 200
tuned_rf = RandomForestClassifier(**best_params)
tuned_rf.fit(X_train, y_train)

In [95]:
y_preds = tuned_rf.predict(X_test)

In [96]:
tuned_metrics = eval_classifier_preds(y_test.to_cupy().get(), y_preds)
tuned_metrics

Accuracy: 80.00
Precision: 0.89
Recall: 0.80
F1: 0.84


{'accuracy': 0.8, 'precision': 0.89, 'recall': 0.8, 'f1_score': 0.84}