In [4]:
import numpy as np
from numpy import mean, std
import pandas as pd
from sklearn.metrics import accuracy_score, make_scorer, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.metrics import mean_squared_error as MSE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor, XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.svm import SVC # for Support Vector Classification model
import plotly.express as px  # for data visualization
import plotly.graph_objects as go # for data visualization


import matplotlib.pyplot as plt
import seaborn as sns


# Tensorflow
import tensorflow as tf
import tensorflow_datasets as tfds



print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
import time

  from .autonotebook import tqdm as notebook_tqdm


Num GPUs Available:  1


In [1]:
def convert_sample(image):
    #image = tf.image.rgb_to_grayscale(image)
    image = tf.image.resize(image,[64,64]).numpy()
    image = image.reshape(1,-1)
    return image
    24

In [2]:
data_dir = r'C:\Users\andly'
print('Current data dir '+data_dir)

Current data dir C:\Users\andly


In [7]:
tf.random.set_seed(88)
ds1,ds2,ds3 = tfds.load('patch_camelyon',
                    split=['train[:5%]','test[:5%]','validation[:5%]'],
                    data_dir = data_dir,
                    download=False,
                    batch_size=-1, # All data...no batches needed 
                    as_supervised=True, # So that we easily can transform data to numpy format
                    shuffle_files=True)
print('Done Loading Data')



Done Loading Data


In [8]:

train_dataset = tfds.as_numpy(ds1) # FULL DATA
train_dataset_image = np.vstack(list(map(convert_sample,train_dataset[0]))) # <-- This is the X
train_dataset_image_Scaled = StandardScaler(with_mean=0, with_std=1).fit_transform(train_dataset_image)
train_dataset_label = train_dataset[1].reshape(-1,) # <-- This is y   
print(f'Shape of training data features (observations,features): {train_dataset_image_Scaled.shape}')
print(f'Shape of training data labels (observations,): {train_dataset_label.shape}')

validation_dataset = tfds.as_numpy(ds3)
validation_dataset_image = np.vstack(list(map(convert_sample,validation_dataset[0])))
validation_dataset_image_Scaled = StandardScaler(with_mean=0, with_std=1).fit_transform(validation_dataset_image)
validation_dataset_label = validation_dataset[1].reshape(-1,) 

test_dataset = tfds.as_numpy(ds2)
test_dataset_image = np.vstack(list(map(convert_sample,test_dataset[0]))) # <-- X_test
test_dataset_image_Scaled = StandardScaler(with_mean=0, with_std=1).fit_transform(test_dataset_image)
test_dataset_label = test_dataset[1].reshape(-1,)
print("Done spliting data")



Shape of training data features (observations,features): (13107, 12288)
Shape of training data labels (observations,): (13107,)




Done spliting data


In [8]:



# SVM Estimator

# initialize the SVM model
svm = SVC()

# define the parameter grid for grid search
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
# perform grid search using the SVM model and the parameter grid
grid_search = GridSearchCV(svm, param_grid, cv=5, verbose=4)
#grid_search.fit(train_dataset_image_Scaled, train_dataset_label)

# print the best parameters
#print("Best parameters: ", grid_search.best_params_)
  



In [17]:
svm_tuned = SVC(C=0.1, kernel='rbf')

svm_tuned.fit(train_dataset_image_Scaled, train_dataset_label)

# make predictions on the training set
y_train_pred = svm_tuned.predict(train_dataset_image_Scaled)

# Compute the accuracy on the training set
train_accuracy = accuracy_score(train_dataset_label, y_train_pred)
print("Training Accuracy:", train_accuracy)

# Compute the precision on the training set
train_precision = precision_score(train_dataset_label, y_train_pred)
print("Training Precision:", train_precision)

# Compute the recall on the training set
train_recall = recall_score(train_dataset_label, y_train_pred)
print("Training Recall:", train_recall)

# Compute the F1-score on the training set
train_f1 = f1_score(train_dataset_label, y_train_pred)
print("Training F1-score:", train_f1)





# Make predictions on the validation set
y_val_pred = svm_tuned.predict(validation_dataset_image_Scaled)

# Compute the accuracy on the validation set
val_accuracy = accuracy_score(validation_dataset_label, y_val_pred)
print("Validation Accuracy:", val_accuracy)

# Compute the precision on the validation set
val_precision = precision_score(validation_dataset_label, y_val_pred)
print("Validation Precision:", val_precision)

# Compute the recall on the validation set
val_recall = recall_score(validation_dataset_label, y_val_pred)
print("Validation Recall:", val_recall)

# Compute the F1-score on the validation set
val_f1 = f1_score(validation_dataset_label, y_val_pred)
print("Validation F1-score:", val_f1)

# Make predictions on the test set
y_test_pred = svm_tuned.predict(test_dataset_image_Scaled)

# Compute the accuracy on the test set
test_accuracy = accuracy_score(test_dataset_label, y_test_pred)
print("Test Accuracy:", test_accuracy)

# Compute the precision on the test set
test_precision = precision_score(test_dataset_label, y_test_pred)
print("Test Precision:", test_precision)

# Compute the recall on the test set
test_recall = recall_score(test_dataset_label, y_test_pred)
print("Test Recall:", test_recall)

# Compute the F1-score on the test set
test_f1 = f1_score(test_dataset_label, y_test_pred)
print("Test F1-score:", test_f1)

Training Accuracy: 0.7878995956359197
Training Precision: 0.7892714417460805
Training Recall: 0.7846882640586798
Training F1-score: 0.7869731800766284
Validation Accuracy: 0.7545787545787546
Validation Precision: 0.7783641160949868
Validation Recall: 0.7160194174757282
Validation F1-score: 0.7458912768647282
Test Accuracy: 0.7527472527472527
Test Precision: 0.7759103641456583
Test Recall: 0.6933667083854819
Test F1-score: 0.7323198942498348


In [23]:
# Get the training and validation accuracy
train_acc = svm_tuned.score(train_dataset_image_Scaled, train_dataset_label)
val_acc = svm_tuned.score(validation_dataset_label, validation_dataset_label)


KeyboardInterrupt: 

In [22]:
train_acc.reshape(1, -1)
val_acc.reshape(1, -1)

# Plot the training and validation accuracy
plt.plot([0, 1], [train_acc, val_acc], label=['Training Accuracy', 'Validation Accuracy'])
plt.xlabel('Data')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

NameError: name 'val_acc' is not defined

In [9]:
# XGBOOST Estimator

xgb_tuned = XGBClassifier(tree_method="gpu_hist", gpu_id=0)

# Define the parameter grid for the grid search
param_grid = {
    'max_depth': [1, 2, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 500],
    
}

# Initialize the XGBClassifier


#Initialize the GridSearchCV
grid_search = GridSearchCV(estimator=xgb_tuned, param_grid=param_grid, cv=3, verbose=4, return_train_score=True)

#Fit the GridSearchCV to the training data
grid_search.fit(train_dataset_image_Scaled, train_dataset_label)


# xgb_hyper = XGBClassifier(tree_method="gpu_hist", gpu_id=0, learning_rate=0.1, max_depth=5, n_estimators=300)

# xgb_hyper.fit(train_dataset_image_Scaled, train_dataset_label)
# Print the best parameters
#print("Best parameters:", grid_search.best_params_)

# Make predictions on the validation set
y_val_pred = xgb_hyper.predict(validation_dataset_image_Scaled)

# Compute the accuracy on the validation set
val_accuracy = accuracy_score(validation_dataset_label, y_val_pred)
print("Validation Accuracy:", val_accuracy)

# Compute the precision on the validation set
val_precision = precision_score(validation_dataset_label, y_val_pred)
print("Validation Precision:", val_precision)

# Compute the recall on the validation set
val_recall = recall_score(validation_dataset_label, y_val_pred)
print("Validation Recall:", val_recall)

# Compute the F1-score on the validation set
val_f1 = f1_score(validation_dataset_label, y_val_pred)
print("Validation F1-score:", val_f1)

# Make predictions on the test set
y_test_pred = xgb_hyper.predict(test_dataset_image_Scaled)

# Compute the accuracy on the test set
test_accuracy = accuracy_score(test_dataset_label, y_test_pred)
print("Test Accuracy:", test_accuracy)

# Compute the precision on the test set
test_precision = precision_score(test_dataset_label, y_test_pred)
print("Test Precision:", test_precision)

# Compute the recall on the test set
test_recall = recall_score(test_dataset_label, y_test_pred)
print("Test Recall:", test_recall)

# Compute the F1-score on the test set
test_f1 = f1_score(test_dataset_label, y_test_pred)
print("Test F1-score:", test_f1)



NameError: name 'XGBClassifier' is not defined

In [12]:
# Make predictions on the training set
y_train_pred = xgb_hyper.predict(train_dataset_image_Scaled)
# Compute the accuracy on the training set
train_accuracy = accuracy_score(train_dataset_label, y_train_pred)
print("Training Accuracy:", train_accuracy)

# Compute the precision on the training set
train_precision = precision_score(train_dataset_label, y_train_pred)
print("Training Precision:", train_precision)

# Compute the recall on the training set
train_recall = recall_score(train_dataset_label, y_train_pred)
print("Training Recall:", train_recall)

# Compute the F1-score on the training set
train_f1 = f1_score(train_dataset_label, y_train_pred)
print("Training F1-score:", train_f1)

Training Accuracy: 0.9965667200732433
Training Precision: 0.9964858670741024
Training Recall: 0.996638141809291
Training F1-score: 0.9965619986247994


In [18]:
# Ensemble Learning with non-deep learning

import numpy as np
from sklearn.ensemble import StackingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

# # Initialize the classifiers
# svm = SVC(kernel='rbf', C=1)
# rf = RandomForestClassifier(n_estimators=300)
# xgb = XGBClassifier(tree_method="gpu_hist", gpu_id=0, max_depth=5, objective='binary:logistic', n_estimators=300, eta=0.05)
# gb = GradientBoostingClassifier(n_estimators=300)

# Initialize the classifiers
rf = RandomForestClassifier(n_estimators=300)
xgb = XGBClassifier(tree_method="gpu_hist", gpu_id=0, max_depth=5, n_estimators=360, eta=0.05)
gb = GradientBoostingClassifier(n_estimators=300)


# Create the ensemble model
ensemble = StackingClassifier(estimators=[('rf', rf), ('xgb', xgb), ('gb',gb)], final_estimator=LogisticRegression())

# Fit the ensemble model to the training data
ensemble.fit(train_dataset_image_Scaled, train_dataset_label)





In [19]:
# Make predictions on the test data
y_train_pred = ensemble.predict(train_dataset_image_Scaled)
# Compute the accuracy on the training set
train_accuracy = accuracy_score(train_dataset_label, y_train_pred)
print("Training Accuracy:", train_accuracy)

# Compute the precision on the training set
train_precision = precision_score(train_dataset_label, y_train_pred)
print("Training Precision:", train_precision)

# Compute the recall on the training set
train_recall = recall_score(train_dataset_label, y_train_pred)
print("Training Recall:", train_recall)

# Compute the F1-score on the training set
train_f1 = f1_score(train_dataset_label, y_train_pred)
print("Training F1-score:", train_f1)


# Make predictions on the validation set
y_val_pred = ensemble.predict(validation_dataset_image_Scaled)

# Compute the accuracy on the validation set
val_accuracy = accuracy_score(validation_dataset_label, y_val_pred)
print("Validation Accuracy:", val_accuracy)

# Compute the precision on the validation set
val_precision = precision_score(validation_dataset_label, y_val_pred)
print("Validation Precision:", val_precision)

# Compute the recall on the validation set
val_recall = recall_score(validation_dataset_label, y_val_pred)
print("Validation Recall:", val_recall)

# Compute the F1-score on the validation set
val_f1 = f1_score(validation_dataset_label, y_val_pred)
print("Validation F1-score:", val_f1)

# Make predictions on the test set
y_test_pred = ensemble.predict(test_dataset_image_Scaled)

# Compute the accuracy on the test set
test_accuracy = accuracy_score(test_dataset_label, y_test_pred)
print("Test Accuracy:", test_accuracy)

# Compute the precision on the test set
test_precision = precision_score(test_dataset_label, y_test_pred)
print("Test Precision:", test_precision)

# Compute the recall on the test set
test_recall = recall_score(test_dataset_label, y_test_pred)
print("Test Recall:", test_recall)

# Compute the F1-score on the test set
test_f1 = f1_score(test_dataset_label, y_test_pred)
print("Test F1-score:", test_f1)

Training Accuracy: 0.9770351720454719
Training Precision: 0.9721675994554531
Training Recall: 0.9821210268948656
Training F1-score: 0.9771189661725579
Validation Accuracy: 0.7741147741147741
Validation Precision: 0.8026666666666666
Validation Recall: 0.7305825242718447
Validation F1-score: 0.7649301143583227
Test Accuracy: 0.7667887667887668
Test Precision: 0.7697283311772316
Test Recall: 0.7446808510638298
Test F1-score: 0.7569974554707378


In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Define the parameter grid for the grid search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Initialize the RandomForestClassifier
rf = RandomForestClassifier()

# Initialize the GridSearchCV
grid_search2 = GridSearchCV(rf, param_grid, cv=3)

# Fit the GridSearchCV to the training data
#grid_search2.fit(train_dataset_image_Scaled, train_dataset_label)

# Print the best parameters
#print("Best parameters:", grid_search2.best_params_)



In [16]:
# New rf with hyperparameters found
rf_hyper = RandomForestClassifier(max_depth=20, min_samples_leaf=2, min_samples_split=10, n_estimators=200)


rf.fit(train_dataset_image_Scaled, train_dataset_label)
# Make predictions on the training set
y_train_pred = rf.predict(train_dataset_image_Scaled)

# Compute the accuracy on the training set
train_accuracy = accuracy_score(train_dataset_label, y_train_pred)
print("Training Accuracy:", train_accuracy)

# Compute the precision on the training set
train_precision = precision_score(train_dataset_label, y_train_pred)
print("Training Precision:", train_precision)

# Compute the recall on the training set
train_recall = recall_score(train_dataset_label, y_train_pred)
print("Training Recall:", train_recall)

# Compute the F1-score on the training set
train_f1 = f1_score(train_dataset_label, y_train_pred)
print("Training F1-score:", train_f1)

# Make predictions on the validation set
y_val_pred = rf.predict(validation_dataset_image_Scaled)

# Compute the accuracy on the validation set
val_accuracy = accuracy_score(validation_dataset_label, y_val_pred)
print("Validation Accuracy:", val_accuracy)

# Compute the precision on the validation set
val_precision = precision_score(validation_dataset_label, y_val_pred)
print("Validation Precision:", val_precision)

# Compute the recall on the validation set
val_recall = recall_score(validation_dataset_label, y_val_pred)
print("Validation Recall:", val_recall)

# Compute the F1-score on the test set
val_f1 = f1_score(validation_dataset_label, y_test_pred)
print("Val F1-score:", val_f1)

# Make predictions on the test set
y_test_pred = rf.predict(test_dataset_image_Scaled)

# Compute the accuracy on the test set
test_accuracy = accuracy_score(test_dataset_label, y_test_pred)
print("Test Accuracy:", test_accuracy)

# Compute the precision on the test set
test_precision = precision_score(test_dataset_label, y_test_pred)
print("Test Precision:", test_precision)

# Compute the recall on the test set
test_recall = recall_score(test_dataset_label, y_test_pred)
print("Test Recall:", test_recall)

# Compute the F1-score on the test set
test_f1 = f1_score(test_dataset_label, y_test_pred)
print("Test F1-score:", test_f1)

Training Accuracy: 1.0
Training Precision: 1.0
Training Recall: 1.0
Training F1-score: 1.0
Validation Accuracy: 0.7509157509157509
Validation Precision: 0.7849315068493151
Validation Recall: 0.6953883495145631
Val F1-score: 0.48966408268733846
Test Accuracy: 0.7527472527472527
Test Precision: 0.7743732590529248
Test Recall: 0.6958698372966208
Test F1-score: 0.7330257086354647
