In [1]:
# RandomForest optimisation test using gridsearchcv
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report

In [2]:
from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()
X = cancer.data
y = cancer.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [29]:
from sklearn.model_selection import cross_val_score, StratifiedKFold

model = RandomForestClassifier(random_state=1)
model.fit(X_train, y_train)

predictions = model.predict(X_test)
print(cross_val_score(model, X, y, cv=StratifiedKFold(n_splits=5, shuffle=False)))
#print(classification_report(y_test, predictions))

[0.92982456 0.94736842 0.98245614 0.97368421 0.96460177]


In [30]:
print(cross_val_score(model, X, y, cv=5))

[0.92982456 0.94736842 0.98245614 0.97368421 0.96460177]


In [7]:
param_grid = {
    'n_estimators': range(50, 101, 25),
    'criterion': ['gini','entropy'],
    'min_samples_split': range(1, 11, 1),
    'min_samples_leaf': range(1, 11, 1),
    'max_features': ['sqrt', 'log2'],
    'random_state': [1]
}

grid = GridSearchCV(model, param_grid, refit=True, verbose=3, n_jobs=-1)
grid.fit(X_train, y_train)
print(grid.best_params_)

grid_pred = grid.predict(X_test)
print(classification_report(y_test, grid_pred))

Fitting 5 folds for each of 1200 candidates, totalling 6000 fits
{'criterion': 'gini', 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 50, 'random_state': 1}
              precision    recall  f1-score   support

           0       1.00      0.88      0.94        42
           1       0.94      1.00      0.97        72

    accuracy                           0.96       114
   macro avg       0.97      0.94      0.95       114
weighted avg       0.96      0.96      0.96       114



