### Random Forest Exercise

------------------

In [1]:
# import pandas
import pandas as pd

In [2]:
# list for column headers
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

# load data
df = pd.read_csv("https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv", names=names)

Spend some time to explore the dataset.
- head
- shape

In [3]:
# Display the first few rows of the dataset
print(df.head())

# Check the shape of the dataset (number of rows and columns)
print(df.shape)

   preg  plas  pres  skin  test  mass   pedi  age  class
0     6   148    72    35     0  33.6  0.627   50      1
1     1    85    66    29     0  26.6  0.351   31      0
2     8   183    64     0     0  23.3  0.672   32      1
3     1    89    66    23    94  28.1  0.167   21      0
4     0   137    40    35   168  43.1  2.288   33      1
(768, 9)


* create the X and y (the goal is to predict column **class** based on other variables)

In [4]:
# Features (all columns except 'class')
X = df.drop(columns=['class'])

# Target (the 'class' column)
y = df['class']


* split data set into a train set and test set

In [5]:
from sklearn.model_selection import train_test_split

# Split the dataset into a training set and a testing set
# Typically, 70-80% of the data is used for training, and the rest for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shape of the resulting datasets
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(614, 8) (154, 8)
(614,) (154,)


------------------------
#### Part 1: Setting up the Random Forest Classifier
* import RandomForestClassifier from sklearn. It is suggested to spend some time on the doccumentation of this classifier to get familiar with the available parameters.

In [6]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the Random Forest Classifier
# For now, let's use default parameters
rf_classifier = RandomForestClassifier(random_state=42)

* create model

In [7]:
# Create the Random Forest Classifier model
rf_classifier = RandomForestClassifier(random_state=42)

# Fit the model using the training data
rf_classifier.fit(X_train, y_train)

* fit training set with default parameters

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the Random Forest Classifier with default parameters
rf_classifier = RandomForestClassifier(random_state=42)

# Fit the model to the training data
rf_classifier.fit(X_train, y_train)

* predict X_test

In [9]:
# Predict the labels for the test set
y_pred = rf_classifier.predict(X_test)

# Display the predicted values
print(y_pred)

[0 0 0 0 0 1 0 1 1 1 0 1 0 0 0 0 0 0 1 1 0 0 0 0 1 1 0 0 0 0 1 1 1 1 1 1 1
 0 0 1 0 0 1 0 0 1 0 0 0 1 0 1 1 0 0 0 1 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 0 0
 0 1 0 0 0 0 1 0 0 1 0 1 1 0 0 0 0 0 0 1 0 1 1 0 1 0 1 0 0 1 1 0 0 1 0 1 0
 1 0 1 0 1 1 0 0 1 0 0 0 0 0 0 0 1 1 1 1 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0]


* import roc_auc_score and confusion_matrix from sklearn

In [10]:
from sklearn.metrics import roc_auc_score, confusion_matrix

* print confusion matrix

In [11]:
# Generate the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Print the confusion matrix
print(cm)

[[77 22]
 [21 34]]


* print AUC

In [12]:
# Calculate the AUC score
auc = roc_auc_score(y_test, y_pred)

# Print the AUC score
print("AUC Score:", auc)

AUC Score: 0.697979797979798


----------------------------------
#### Part 2: Using a Grid Search
- import GridSearchCV from sklearn

In [13]:
from sklearn.model_selection import GridSearchCV

* create grid (optimize for number of trees and max depth in one tree)

In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Define the parameter grid to search
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30]  # Maximum depth of each tree
}

# Initialize the Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Set up GridSearchCV with the Random Forest Classifier and the parameter grid
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit GridSearchCV to the training data
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Parameters: {'max_depth': None, 'n_estimators': 50}
Best Score: 0.7769025723044115


* fit training data with grid search

In [15]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Define the parameter grid to search
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30]  # Maximum depth of each tree
}

# Initialize the Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Set up GridSearchCV with the Random Forest Classifier and the parameter grid
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit GridSearchCV to the training data
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)


Best Parameters: {'max_depth': None, 'n_estimators': 50}
Best Score: 0.7769025723044115


* print confusion matrix with the best model

In [16]:
from sklearn.metrics import confusion_matrix

# Retrieve the best model from the grid search
best_model = grid_search.best_estimator_

# Predict the labels for the test set using the best model
y_pred_best = best_model.predict(X_test)

# Generate the confusion matrix
cm_best = confusion_matrix(y_test, y_pred_best)

# Print the confusion matrix
print("Confusion Matrix with Best Model:")
print(cm_best)

Confusion Matrix with Best Model:
[[74 25]
 [20 35]]


* print AUC with the best model

In [17]:
from sklearn.metrics import roc_auc_score

# Retrieve the best model from the grid search
best_model = grid_search.best_estimator_

# Get the probability estimates for the positive class
y_prob_best = best_model.predict_proba(X_test)[:, 1]

# Calculate the AUC score
auc_best = roc_auc_score(y_test, y_prob_best)

# Print the AUC score
print("AUC Score with Best Model:", auc_best)

AUC Score with Best Model: 0.7981634527089072


- is the model better than default?

In [18]:
# Initialize and fit the default Random Forest Classifier
default_model = RandomForestClassifier(random_state=42)
default_model.fit(X_train, y_train)

# Predict using the default model
y_pred_default = default_model.predict(X_test)
y_prob_default = default_model.predict_proba(X_test)[:, 1]

# Compute metrics for the default model
cm_default = confusion_matrix(y_test, y_pred_default)
auc_default = roc_auc_score(y_test, y_prob_default)

# Print metrics for the default model
print("Confusion Matrix with Default Model:")
print(cm_default)
print("AUC Score with Default Model:", auc_default)

# Retrieve the best model from the grid search
best_model = grid_search.best_estimator_

# Predict using the best model
y_pred_best = best_model.predict(X_test)
y_prob_best = best_model.predict_proba(X_test)[:, 1]

# Compute metrics for the best model
cm_best = confusion_matrix(y_test, y_pred_best)
auc_best = roc_auc_score(y_test, y_prob_best)

# Print metrics for the best model
print("Confusion Matrix with Best Model:")
print(cm_best)
print("AUC Score with Best Model:", auc_best)

# Compare AUC scores
if auc_best > auc_default:
    print("The best model has a higher AUC score than the default model.")
else:
    print("The default model has a higher AUC score than the best model.")

Confusion Matrix with Default Model:
[[77 22]
 [21 34]]
AUC Score with Default Model: 0.8120293847566575
Confusion Matrix with Best Model:
[[74 25]
 [20 35]]
AUC Score with Best Model: 0.7981634527089072
The default model has a higher AUC score than the best model.
