In [None]:
# Pandas is used for data manipulation
import pandas as pd
import numpy as np

In [None]:
# Read the semicolon-separated CSV (first row is the header, first column is the
# index) and display the first 5 rows.
# NOTE(review): the absolute Windows path below only resolves on the original
# author's machine — consider a configurable, relative data directory.
data = pd.read_csv(r'C:\Users\Gebruiker\Desktop\Desktop_Ghent\Notebooks\datasets\BC_data.csv',sep=";",header=0, index_col=0)
data.head()

In [None]:
# Report the dimensions of the loaded DataFrame (label column still included)
print('The shape of our data is:', data.shape)


In [None]:
# Summary statistics of the loaded DataFrame, as computed by pandas `describe`
data.describe()

In [None]:
# Labels are the values we want to predict: the `Class` column of the data.
# Shown here as a pandas Series summary before conversion to an array.
labels = data.Class
labels.describe()

In [None]:
# Rebind `labels` to a NumPy array holding the `Class` column; this is the form
# passed to train_test_split further down.
labels = np.array(data.Class)
labels

In [None]:
# Remove the labels from the data; axis 1 refers to the columns.
# After this cell `data` holds only the feature columns.
data = data.drop('Class', axis = 1)

In [None]:
# Save the feature (column) names now; they are used later to label the
# feature importances.
feature_list = list(data.columns)
feature_list

In [None]:
# Convert the feature DataFrame to a NumPy array, rebinding `data`.
# NOTE(review): from here on `data` is an ndarray, so re-running the cell that
# calls `data.drop(...)` fails until the CSV is loaded again.
data = np.array(data)
data

In [None]:
# Using Scikit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; random_state fixes the shuffle so the
# split is the same on every run.
train_data, test_data, train_labels, test_labels = train_test_split(data, labels, test_size = 0.25, random_state = 42)

In [None]:
# Sanity check: print the shapes of the four arrays produced by the split
print('Training Data Shape:', train_data.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Data Shape:', test_data.shape)
print('Testing Labels Shape:', test_labels.shape)

In [None]:
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
# Instantiate the model with 1000 decision trees; random_state is fixed so the
# fitted forest is reproducible.
clf = RandomForestClassifier(n_estimators = 1000, random_state = 42)

In [None]:
# Train the model using the training set
clf.fit(train_data,train_labels)

In [None]:
# Predict class labels for the held-out test set
y_pred=clf.predict(test_data)

In [None]:
# Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Model accuracy: the fraction of test samples whose predicted label matches
# the true label.
print("Accuracy:",metrics.accuracy_score(test_labels, y_pred))

Finding Important Features in Scikit-learn

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Create a random forest classifier with 100 trees. random_state is fixed so
# that the feature importances derived from this model are reproducible from
# run to run (an unseeded forest yields a different ranking on every execution).
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model using the training set
clf.fit(train_data, train_labels)

In [None]:
import pandas as pd
# Pair each importance score of the fitted forest with its feature name and
# sort from most to least important.
feature_imp = pd.Series(clf.feature_importances_,index=feature_list).sort_values(ascending=False)
feature_imp

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Horizontal bar plot: one bar per feature, length = importance score.
# The explicit Axes interface keeps all labelling attached to this figure.
fig, ax = plt.subplots()
sns.barplot(x=feature_imp, y=feature_imp.index, ax=ax)
# Add labels to the graph
ax.set_xlabel('Feature Importance Score')
ax.set_ylabel('Features')
ax.set_title("Visualizing Important Features")
# No legend is drawn: the plot contains no labelled artists, so calling
# plt.legend() would only emit a warning and add nothing to the figure.
plt.show()

Generating the Model on Selected Features

In [None]:
# Re-display the sorted importances to choose which features to keep
feature_imp

In [None]:
# Re-display the full list of feature names for reference
feature_list

In [None]:
# Reload the raw data: `data` was rebound to a NumPy array earlier, so the
# column-based selection below needs the original DataFrame again.
data = pd.read_csv(r'C:\Users\Gebruiker\Desktop\Desktop_Ghent\Notebooks\datasets\BC_data.csv',sep=";",header=0, index_col=0)

# Import train_test_split function
from sklearn.model_selection import train_test_split

# Split dataset into the six selected features and the labels
X = data[['16-epiestriol', 'estriol', '4-methoxy_estradiol', '4-methoxy_estrone', 'testosterone', 'DHEA_sulphate']]
y = data['Class']

# Split dataset into training set and test set: 70% training and 30% test.
# test_size is the fraction held out for TESTING, so it must be 0.30 here
# (0.70 would train on only 30% of the rows).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=5)

In [None]:
# After splitting, generate a model on the selected training-set features,
# perform predictions on the selected test-set features, and compare actual
# and predicted values.
from sklearn.ensemble import RandomForestClassifier

# Create a random forest classifier with 100 trees; random_state is fixed so
# the reported accuracy is reproducible from run to run.
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model using the training set
clf.fit(X_train, y_train)

# Prediction on the test set
y_pred = clf.predict(X_test)

# Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Model accuracy: the fraction of test samples classified correctly
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

#### Hyperparameter tuning

In [None]:
from pprint import pprint

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Create a random forest classifier with default hyperparameters and a fixed
# seed; its parameters are inspected in the next cell.
clf=RandomForestClassifier(random_state = 42)

In [None]:
# Look at the parameters used by our current forest (pretty-printed dict
# returned by get_params)
print('Parameters currently in use:\n')
pprint(clf.get_params())

The most important settings are:
 - the number of trees in the forest (n_estimators);
 - the number of features considered when splitting a node (max_features).

Btw, the available hyperparameters are:
 - n_estimators = number of trees in the forest
 - max_features = max number of features considered for splitting a node
 - max_depth = max number of levels in each decision tree
 - min_samples_split = min number of data points placed in a node before the node is split
 - min_samples_leaf = min number of data points allowed in a leaf node
 - bootstrap = method for sampling data points (with or without replacement)

In [None]:
#Random Hyperparameter Grid
# To use RandomizedSearchCV, we first need to create a parameter grid to sample from during fitting:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest: 10 evenly spaced values from 200 to 2000
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 'auto' is intentionally not listed: it was deprecated and later removed from
# scikit-learn, and for classifiers it meant the same as 'sqrt'.
# None means "consider all features".
max_features = ['sqrt', 'log2', None]
# Maximum number of levels in tree: 10, 20, ..., 110, plus None (unlimited depth)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree (with/without bootstrap)
bootstrap = [True, False]

In [None]:
# Assemble the search space sampled by the randomized search: one entry per
# tunable hyperparameter, each mapping to its list of candidate values.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)
pprint(random_grid)

In [None]:
# Random Search Training (APPROX 2-5 MINUTES)
# Now, we instantiate the random search and fit it like any Scikit-Learn model:

# Use the random grid to search for best hyperparameters
# First create the base model to tune
clf = RandomForestClassifier(random_state=42)
# Random search of parameters, using 3-fold cross validation (cv = 3),
# search across 100 different combinations, and use all available cores
clf_random = RandomizedSearchCV(estimator = clf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=1, random_state=42, n_jobs = -1)
# Fit the random search model
clf_random.fit(X_train, y_train)

The most important arguments in RandomizedSearchCV are *n_iter*, which controls *the number of different combinations to try*, and *cv* which is the number of *folds to use for cross validation* (we use 100 and 3 respectively). 

More iterations cover a wider search space, and more cv folds reduce the chance of overfitting, but raising either one increases the run time. Machine learning is a field of trade-offs, and performance vs time is one of the most fundamental.

In [None]:
# View the best hyperparameter combination found by the random search
clf_random.best_params_

In [None]:
#Evaluate Random Search
#To determine if random search yielded a better model, we compare the base model with the best random search model.
def evaluate(model, training, y_training, testing, y_testing):
    """Fit `model` on the training data and report its accuracy on the test data.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        The model is (re)fitted in place on the training data.
    training, y_training : training features and labels passed to ``fit``.
    testing, y_testing : test features passed to ``predict`` and the true
        labels the predictions are compared against.

    Returns
    -------
    The accuracy as returned by ``metrics.accuracy_score`` (a fraction, not a
    percentage), so callers can compute relative improvements from it.
    """
    model.fit(training, y_training)
    y_pred = model.predict(testing)
    accuracy = metrics.accuracy_score(y_testing, y_pred)
    # accuracy is a fraction; scale by 100 so the printed number matches the
    # '%' sign that follows it.
    print('Accuracy = {:0.2f}%.'.format(100 * accuracy))

    return accuracy

In [None]:
# Baseline for comparison: a forest consisting of a single tree
# (n_estimators = 1), fitted and scored on the same train/test split.
base_model = RandomForestClassifier(n_estimators = 1, random_state = 42)
base_accuracy = evaluate(base_model, X_train, y_train, X_test, y_test)

In [None]:
# Take the best estimator found by the random search and score it on the test
# split (evaluate fits the model on X_train again before predicting).
best_random = clf_random.best_estimator_
random_accuracy = evaluate(best_random, X_train, y_train, X_test, y_test)

In [None]:
# Print the improvement of the random-search model over the baseline, as a
# percentage relative to the baseline accuracy
print('Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))

#### Grid Search with Cross Validation

Random search allowed us to narrow down the range for each hyperparameter - remember to view the optimal results!

Now that we know where to concentrate our search, we can explicitly specify every combination of settings to try. We do this with GridSearchCV, a method that, instead of sampling randomly from a distribution, evaluates all combinations we define.

In [None]:
# Display the best estimator from the random search to guide the grid below
clf_random.best_estimator_

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for the exhaustive search, chosen around the results of the
# random search; every combination of these lists is evaluated.
param_grid = dict(
    bootstrap=[True],
    max_depth=[10, 20, 30, 40],
    max_features=['sqrt'],
    min_samples_split=[8, 10, 12],
    n_estimators=[10, 100, 1000, 2000, 4000],
)
# Base model whose hyperparameters are tuned
clf = RandomForestClassifier(random_state=42)

# Grid search with 3-fold cross validation, using all available cores
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Fit the grid search to the training data (approx 2-4 minutes), then show the
# best hyperparameter combination it found
grid_search.fit(X_train, y_train)
grid_search.best_params_

In [None]:
# Score the best grid-search estimator on the test split and report its
# improvement relative to the single-tree baseline accuracy
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid,X_train, y_train, X_test, y_test)
print('Improvement of {:0.2f}%.'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy))

Build the final random forest model

In [None]:
# Display the estimator selected by the grid search
grid_search.best_estimator_

In [None]:
# NOTE(review): `model_selection` is not referenced in the cells visible here.
from sklearn import model_selection
# Final random forest: the best estimator from the grid search, fitted on the
# training split
clf = grid_search.best_estimator_
clf.fit(X_train,y_train)
# Predictions for the test split, used for the confusion matrix and report below
clf_predict = clf.predict(X_test)

In [None]:
# Evaluate the performance: 10-fold cross-validated ROC AUC of the final model,
# computed on the training split only
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
clf_cv_score = cross_val_score(clf, X_train, y_train, cv=10, scoring='roc_auc')

In [None]:
# Print the results: confusion matrix and classification report compare the
# test-set predictions with the true test labels; the AUC scores are the
# per-fold cross-validation values computed above, followed by their mean.
print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, clf_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test, clf_predict))
print('\n')
print("=== All AUC Scores ===")
print(clf_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", clf_cv_score.mean())

In [None]:
# Extract a single tree (index 5) from the fitted forest.
# NOTE(review): `estimator` is not referenced in the later visible cells; they
# index clf.estimators_[5] directly.
estimator = clf.estimators_[5]

In [None]:
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

# Draw one tree (index 5) of the fitted forest.
fig = plt.figure(figsize=(15, 10))
plot_tree(
    clf.estimators_[5],
    # Use the training frame's own column names so the labels always match the
    # features the forest was fitted on.
    feature_names=list(X_train.columns),
    # class_names must be one name per class, in ascending class order — not
    # the per-sample training labels. clf.classes_ holds exactly that list.
    class_names=[str(label) for label in clf.classes_],
    filled=True,
    impurity=True,
    rounded=True,
)

In [None]:
#pip install dtreeviz

In [None]:
from dtreeviz.trees import dtreeviz

In [None]:
from sklearn.tree import export_text

# Print the decision rules of tree number 5 as indented plain text
print(
    export_text(
        clf.estimators_[5],
        feature_names=[
            '16-epiestriol',
            'estriol',
            '4-methoxy_estradiol',
            '4-methoxy_estrone',
            'testosterone',
            'DHEA_sulphate',
        ],
        decimals=3,
        spacing=3,
    )
)