# Random Forest Classification

## Importing the libraries

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from scipy.stats import randint, uniform

In [3]:
# Load the dataset
data = pd.read_csv('cancer_data.csv')

In [5]:
# Step 1: Remove the ID column
#data.drop(columns=['ID'], inplace=True)

# Step 2: Encode the Diagnosis column (M = 1, B = 0)
label_encoder = LabelEncoder()
data['diagnosis'] = label_encoder.fit_transform(data['diagnosis'])

In [7]:
# Step 3: Choose Diagnosis as the target variable
X = data.drop(columns=['diagnosis'])  # Features
y = data['diagnosis']  # Target

In [9]:
# Step 4: Split the data into train (80%), test (15%), and validation (5%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)  # 0.25 * 20% = 5%

In [11]:
# Step 5: Hyperparameter Tuning for Random Forest
# Define the parameter distribution for Randomized Search
param_dist = {
    'n_estimators': randint(50, 500),       # Number of trees in the forest
    'max_depth': [None] + list(range(5, 50)),  # Maximum depth of the tree
    'min_samples_split': randint(2, 20),   # Minimum samples required to split a node
    'min_samples_leaf': randint(1, 20),   # Minimum samples required at a leaf node
    'max_features': ['sqrt', 'log2'],      # Number of features to consider at each split
    'bootstrap': [True, False]             # Whether to use bootstrap sampling
}

In [13]:
# Initialize the Random Forest model
rf_model = RandomForestClassifier(random_state=42)

In [15]:
# Set up Randomized Search with Cross-Validation
random_search = RandomizedSearchCV(
    estimator=rf_model,              # Base model
    param_distributions=param_dist,  # Parameter distribution to sample from
    n_iter=100,                      # Number of parameter settings to sample
    scoring='accuracy',              # Evaluation metric
    cv=10,                          # 10-fold cross-validation
    verbose=1,                      # Print progress
    n_jobs=-1,                      # Use all available CPU cores
    random_state=42                 # Seed for reproducibility
)

In [17]:
# Fit Randomized Search on the training data
random_search.fit(X_train, y_train)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


In [18]:
# Retrieve the best parameters and best score
best_params = random_search.best_params_
best_score = random_search.best_score_

print("Best Parameters:", best_params)
print("Best Cross-Validation Accuracy:", best_score)

Best Parameters: {'bootstrap': True, 'max_depth': 38, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 305}
Best Cross-Validation Accuracy: 0.9670048309178745


In [19]:
# Step 6: Train the final model with the best parameters
final_model = random_search.best_estimator_
final_model.fit(X_train, y_train)

In [20]:
# Step 7: Evaluate the model on the test set
y_test_pred = final_model.predict(X_test)
y_test_pred_prob = final_model.predict_proba(X_test)[:, 1]  # Probabilities for ROC-AUC

In [21]:
test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)
test_roc_auc = roc_auc_score(y_test, y_test_pred_prob)

print("\nTest Set Performance:")
print(f"Accuracy: {test_accuracy:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall: {test_recall:.4f}")
print(f"F1-Score: {test_f1:.4f}")
print(f"ROC-AUC: {test_roc_auc:.4f}")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))


Test Set Performance:
Accuracy: 0.9529
Precision: 0.9688
Recall: 0.9118
F1-Score: 0.9394
ROC-AUC: 0.9971
Confusion Matrix:
 [[50  1]
 [ 3 31]]


In [22]:
# Step 8: Predict on the unseen validation set
y_val_pred = final_model.predict(X_val)
y_val_pred_prob = final_model.predict_proba(X_val)[:, 1]

val_accuracy = accuracy_score(y_val, y_val_pred)
val_precision = precision_score(y_val, y_val_pred)
val_recall = recall_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)
val_roc_auc = roc_auc_score(y_val, y_val_pred_prob)

print("\nValidation Set Performance:")
print(f"Accuracy: {val_accuracy:.4f}")
print(f"Precision: {val_precision:.4f}")
print(f"Recall: {val_recall:.4f}")
print(f"F1-Score: {val_f1:.4f}")
print(f"ROC-AUC: {val_roc_auc:.4f}")



Validation Set Performance:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1-Score: 1.0000
ROC-AUC: 1.0000
Confusion Matrix:
 [[20  0]
 [ 0  9]]


In [23]:
print("Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))

Confusion Matrix:
 [[20  0]
 [ 0  9]]
