In [11]:
#!pip install xgboost

In [2]:
import os
import torch


descriptor_dir = "data/ICIAR2018_BACH_Challenge/Photos/A1_brightness_aug_prepost_norm_3norm_effnet_descriptors"

def check_for_nans(directory):
    """Report every file under ``directory`` whose loaded tensor contains a NaN.

    The directory tree is walked recursively and each file found is loaded
    with ``torch.load``. No extension filter is applied, so every file is
    loaded (assumes each one deserializes to a tensor -- TODO confirm for
    directories that hold other file types).

    Args:
        directory: Root directory to scan.

    Returns:
        None. One line is printed per file that contains at least one NaN.
    """
    for current_dir, _subdirs, filenames in os.walk(directory):
        for filename in filenames:
            # Build the full path once and reuse it for loading and reporting.
            full_path = os.path.join(current_dir, filename)
            loaded = torch.load(full_path)
            has_nan = torch.isnan(loaded).any()
            if has_nan:
                print(f"File {full_path} contains NaN values")
                
                
# check_for_nans(f"{descriptor_dir}/Normal")

In [4]:
import random

# Seed the RNG so the same image numbers are drawn on every run.
random.seed(666)

# Draw 25 integers from 1..100 (inclusive). randint samples with replacement,
# so the same number may be drawn more than once.
numbers = [random.randint(1, 100) for _draw in range(25)]

# Render each number as a zero-padded, three-character string (e.g. 7 -> "007")
# so it can be matched against the numeric part of the image file names.
test_numbers = [f"{value:03d}" for value in numbers]

print(test_numbers)

['059', '049', '056', '037', '065', '002', '071', '071', '100', '092', '043', '016', '007', '019', '050', '081', '060', '075', '032', '003', '015', '035', '013', '039', '088']


In [12]:
# /data/ICIAR2018_BACH_Challenge/Photos/A1_brightness_aug_prepost_norm_3norm_effnet_descriptors
import numpy as np 
import matplotlib.pyplot as plt
import glob
import cv2
import torch
from pathlib import Path

import os
import seaborn as sns


# Build the train / test descriptor sets.
#
# Every descriptor file is loaded and flattened. A file belongs to the test
# split when the leading part of its name (everything before the first "_")
# contains one of the zero-padded numbers in `test_numbers`; everything else
# goes to the training split.
train_images = []
train_labels = []
test_images = []
test_labels = []

# Binary target: Invasive / InSitu -> 1, Benign / Normal -> 0.
labels = {"Invasive":1, "Benign": 0, "Normal": 0, "InSitu":1}

directory_paths = ["data/ICIAR2018_BACH_CHALLENGE/Photos/Normal_descriptors", "data/ICIAR2018_BACH_CHALLENGE/Photos/Benign_descriptors", "data/ICIAR2018_BACH_CHALLENGE/Photos/InSitu_descriptors", "data/ICIAR2018_BACH_CHALLENGE/Photos/Invasive_descriptors"]
directory_paths = [str(Path(p)) for p in directory_paths]
directory_labels = ["Normal", "Benign", "InSitu", "Invasive"]

for directory_path, directory_label in zip(directory_paths, directory_labels):
    # glob returns paths in arbitrary, filesystem-dependent order. Sorting makes
    # the unshuffled tensors deterministic and keeps files that share a name
    # prefix (the augmentations of one image) next to each other.
    descriptor_paths = sorted(glob.glob(os.path.join(directory_path, "*")))
    if not descriptor_paths:
        # Fail loudly instead of silently building a dataset with a missing class
        # (e.g. because of a typo or a case mismatch in the directory name).
        raise FileNotFoundError(f"No descriptor files found in {directory_path}")

    for img_path in descriptor_paths:
        descriptor_tensor = torch.load(img_path).flatten()
        orig_file_name = Path(img_path).name.split("_")[0]
        # Substring match against the test numbers. Done on the file-name prefix
        # because there are 10 augmentations of each image.
        if any(num in orig_file_name for num in test_numbers):
            # Only one augmentation per test image is kept; the remaining
            # augmentations of a test image are used in neither split.
            # This should just be a brightness factor of 1, but the encodings do
            # not include it, so this factor is used instead.
            if "1.1033333539962769" in img_path:
                test_images.append(descriptor_tensor)
                test_labels.append(labels[directory_label])
        else:  # not part of the test set
            train_images.append(descriptor_tensor)
            train_labels.append(labels[directory_label])

# Convert lists to tensors
train_images_non_shuffled = torch.stack(train_images, dim=0)
train_labels_non_shuffled = torch.tensor(train_labels)

test_images = torch.stack(test_images, dim=0)
test_labels = torch.tensor(test_labels)


# Shuffle images and labels with the same permutation. A dedicated, seeded
# generator keeps the order reproducible without touching torch's global RNG.
num_samples = len(train_images_non_shuffled)
shuffle_generator = torch.Generator().manual_seed(666)
perm = torch.randperm(num_samples, generator=shuffle_generator)

train_images = train_images_non_shuffled[perm]
train_labels = train_labels_non_shuffled[perm]

In [39]:
# Class balance of the training split: number of positive labels / total samples.
positive_count = train_labels.sum()
total_count = len(train_labels)
print(f"{positive_count}/{total_count}")

1520/3040


In [40]:
# XGBoost trained on the full (shuffled) training set.
import xgboost as xgb
from sklearn import metrics
from sklearn.metrics import confusion_matrix

# The sklearn-style estimator takes integer class labels directly,
# so no one-hot encoding is needed.
model = xgb.XGBClassifier()
model.fit(train_images, train_labels)

# Predict the test descriptors with the fitted XGBoost model.
prediction = model.predict(test_images)

# Overall accuracy, F1 score and confusion matrix on the test set.
print ("Accuracy = ", metrics.accuracy_score(y_true=test_labels, y_pred=prediction))
print ("F1 = ", metrics.f1_score(y_true=test_labels, y_pred=prediction))
print ("CM: ")
print(confusion_matrix(y_true=test_labels, y_pred=prediction))

Accuracy =  0.9375
F1 =  0.9387755102040817
CM: 
[[44  4]
 [ 2 46]]


# Halving (and then quartering) the positive class in the training set to see how test performance changes

In [33]:
# Split the unshuffled training set by class. Boolean masks select the rows by
# label value, so the split does not depend on how the classes are ordered
# inside the unshuffled tensors.
positive_mask = train_labels_non_shuffled == 1
negative_mask = train_labels_non_shuffled == 0

# Positive samples (label 1)
train_images_non_shuffled_onlypos = train_images_non_shuffled[positive_mask]
train_labels_non_shuffled_onlypos = train_labels_non_shuffled[positive_mask]

# Negative samples (label 0)
train_images_non_shuffled_onlyneg = train_images_non_shuffled[negative_mask]
train_labels_non_shuffled_onlyneg = train_labels_non_shuffled[negative_mask]

def get_halved(train_images_non_shuffled_onlypos, train_labels_non_shuffled_onlypos, group_size=10):
    """Keep every other group of ``group_size`` consecutive samples.

    The inputs are treated as consecutive groups of ``group_size`` rows
    (group 0 is rows ``[0:group_size]``, group 1 the next ``group_size`` rows,
    and so on). Groups with an even index are kept and groups with an odd
    index are dropped, which roughly halves the data. A trailing group that is
    shorter than ``group_size`` is kept when its index is even.

    Args:
        train_images_non_shuffled_onlypos: Tensor of samples, indexed along
            dimension 0.
        train_labels_non_shuffled_onlypos: Tensor of labels with the same
            length as the samples.
        group_size: Number of consecutive rows that form one group
            (defaults to 10).

    Returns:
        Tuple ``(images, labels)`` holding the kept rows in their original
        order. Empty inputs yield empty outputs.

    Raises:
        ValueError: If ``group_size`` is not positive or the two inputs have
            different lengths.
    """
    if group_size <= 0:
        raise ValueError("group_size must be a positive integer")

    num_samples = len(train_images_non_shuffled_onlypos)
    if len(train_labels_non_shuffled_onlypos) != num_samples:
        raise ValueError("images and labels must have the same length")

    # Start offsets of the even-indexed groups: 0, 2*group_size, 4*group_size, ...
    group_starts = range(0, num_samples, 2 * group_size)
    if not group_starts:
        # Nothing to keep; return empty slices that preserve dtype and trailing shape.
        return train_images_non_shuffled_onlypos[:0], train_labels_non_shuffled_onlypos[:0]

    # Slicing past the end is clipped, so a short trailing group is handled naturally.
    new_train_images = torch.cat(
        [train_images_non_shuffled_onlypos[start:start + group_size] for start in group_starts],
        dim=0,
    )
    new_train_labels = torch.cat(
        [train_labels_non_shuffled_onlypos[start:start + group_size] for start in group_starts],
        dim=0,
    )

    return new_train_images, new_train_labels

# Seeded generator so both shuffles below are reproducible across runs
# without modifying torch's global RNG state.
shuffle_generator = torch.Generator().manual_seed(666)

# Keep every other group of positive samples (about half of the positives).
a, b = get_halved(train_images_non_shuffled_onlypos, train_labels_non_shuffled_onlypos)

# Halved positives + all negatives
train_images_pos_halved = torch.cat((a, train_images_non_shuffled_onlyneg), dim=0)
train_labels_pos_halved = torch.cat((b, train_labels_non_shuffled_onlyneg), dim=0)

# shuffle images and labels with the same permutation
num_samples = len(train_images_pos_halved)
perm = torch.randperm(num_samples, generator=shuffle_generator)

train_images_pos_halved = train_images_pos_halved[perm]
train_labels_pos_halved = train_labels_pos_halved[perm]

# Halve the already-halved positives again (about a quarter of the positives),
# then combine with all negatives.
a, b = get_halved(a, b)
train_images_pos_quartered = torch.cat((a, train_images_non_shuffled_onlyneg), dim=0)
train_labels_pos_quartered = torch.cat((b, train_labels_non_shuffled_onlyneg), dim=0)

# Positives / total for each reduced training set
print(f"{train_labels_pos_halved.sum()}/{len(train_labels_pos_halved)}")     
print(f"{train_labels_pos_quartered.sum()}/{len(train_labels_pos_quartered)}")     

# shuffle images and labels with the same permutation
num_samples = len(train_images_pos_quartered)
perm = torch.randperm(num_samples, generator=shuffle_generator)

train_images_pos_quartered = train_images_pos_quartered[perm]
train_labels_pos_quartered = train_labels_pos_quartered[perm]


760/2280
380/1900


In [34]:
# XGBoost trained on the set whose positive class was halved.
import xgboost as xgb
from sklearn import metrics
from sklearn.metrics import confusion_matrix

# Integer class labels are passed as-is (sklearn-style API, no one-hot encoding).
model = xgb.XGBClassifier()
model.fit(train_images_pos_halved, train_labels_pos_halved)

# Predict the test descriptors with the fitted XGBoost model.
prediction = model.predict(test_images)

# Overall accuracy, F1 score and confusion matrix on the test set.
print ("Accuracy = ", metrics.accuracy_score(y_true=test_labels, y_pred=prediction))
print ("F1 = ", metrics.f1_score(y_true=test_labels, y_pred=prediction))
print ("CM: ")
print(confusion_matrix(y_true=test_labels, y_pred=prediction))

Accuracy =  0.8958333333333334
F1 =  0.8913043478260869
CM: 
[[45  3]
 [ 7 41]]


In [35]:
# XGBoost trained on the set whose positive class was quartered.
import xgboost as xgb
from sklearn import metrics
from sklearn.metrics import confusion_matrix

# Integer class labels are passed as-is (sklearn-style API, no one-hot encoding).
model = xgb.XGBClassifier()
model.fit(train_images_pos_quartered, train_labels_pos_quartered)

# Predict the test descriptors with the fitted XGBoost model.
prediction = model.predict(test_images)

# Overall accuracy, F1 score and confusion matrix on the test set.
print ("Accuracy = ", metrics.accuracy_score(y_true=test_labels, y_pred=prediction))
print ("F1 = ", metrics.f1_score(y_true=test_labels, y_pred=prediction))
print ("CM: ")
print(confusion_matrix(y_true=test_labels, y_pred=prediction))

Accuracy =  0.8229166666666666
F1 =  0.7951807228915663
CM: 
[[46  2]
 [15 33]]
