In [3]:
# directory handling 
import os
import glob
import shutil
import pathlib

# assuming the data is in a subdirectory called "data" and fully unzipped
# unzip the data.zip file in the data folder
# if you are using the data.zip file from the moodle, you can use the following code to unzip it

# import zipfile
# with zipfile.ZipFile('../data/data.zip', 'r') as zip_ref:
#     zip_ref.extractall('../data')

# create a directory for the data if it does not exist
pathlib.Path('data').mkdir(parents=True, exist_ok=True)

# create folder for ussage in the notebook
pathlib.Path('data/abgabe_dir').mkdir(parents=True, exist_ok=True)

train_dir = '../data/abgabe_dir/train'
validation_dir = '../data/abgabe_dir/validation'
test_dir = '../data/abgabe_dir/test'

all_data_in_one_folder = '../data/abgabe_dir/all_pictures'

# copy the data to the new folder if data is not already there
if(os.path.exists(all_data_in_one_folder) == False):
    shutil.copytree('../data/train', all_data_in_one_folder, dirs_exist_ok=True)
    shutil.copytree('../data/test', all_data_in_one_folder, dirs_exist_ok=True)

# if(os.path.exists(train_dir) == False):
#     shutil.copytree('../data/train', train_dir)

# if(os.path.exists(validation_dir) == False):
#     shutil.copytree('../data/test', validation_dir)

# if(os.path.exists(test_dir) == False):
#     shutil.copytree('../data/test', test_dir)

# copy 1000 from all_data_in_one_folder/bengin to validation_dir/bengin


def create_abgabe_path(label, size_of_train, size_of_validation, size_of_test):
    train_dir_label = f"../data/abgabe_dir/train/{label}"
    validation_dir_label = f"../data/abgabe_dir/validation/{label}"
    test_dir_label = f"../data/abgabe_dir/test/{label}"

    src_dir = f"../data/abgabe_dir/all_pictures/{label}"

    # if not existst, create the folder
    pathlib.Path(train_dir_label).mkdir(parents=True, exist_ok=True)
    pathlib.Path(validation_dir_label).mkdir(parents=True, exist_ok=True)
    pathlib.Path(test_dir_label).mkdir(parents=True, exist_ok=True)


    fnames = ['{}.jpg'.format(i) for i in range(1,size_of_train)]
    for fname in fnames:
        src = src_dir + "/" + fname
        dst = train_dir_label + "/" + fname
    
        try:
            shutil.copyfile(src, dst)
        except:
            print("File not found: " + src)
            

    fnames = ['{}.jpg'.format(i) for i in range(size_of_train, size_of_train+size_of_validation)]
    for fname in fnames:
        src = src_dir + "/" + fname
        dst = validation_dir_label + "/" + fname    
        
        try:
            shutil.copyfile(src, dst)
        except:
            print("File not found: " + src)
            

    fnames = ['{}.jpg'.format(i) for i in range(size_of_train + size_of_validation, size_of_train + size_of_validation + size_of_test)]
    for fname in fnames:
        src = src_dir + "/" + fname
        dst = test_dir_label + "/" + fname

        try:
            shutil.copyfile(src, dst)
        except:
            print("File not found: " + src)
            

create_abgabe_path("benign", size_of_train=1000, size_of_validation=250, size_of_test=250)
create_abgabe_path("malignant", size_of_train=1000, size_of_validation=250, size_of_test=250)






File not found: ../data/abgabe_dir/all_pictures/malignant/767.jpg
File not found: ../data/abgabe_dir/all_pictures/malignant/776.jpg
File not found: ../data/abgabe_dir/all_pictures/malignant/788.jpg


In [4]:
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow.keras import optimizers
from tensorflow.keras import regularizers
from tensorflow.keras.applications import VGG16


def build_basic_model(dropout=0):
    model = models.Sequential()
    model.add(layers.Conv2D(32, (3, 3), activation='relu',
                            input_shape=(224, 224, 3)))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Conv2D(64, (3, 3), activation='relu'))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Conv2D(128, (3, 3), activation='relu'))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Conv2D(128, (3, 3), activation='relu'))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Flatten())
    # dropout 
    if dropout != 0:
        model.add(layers.Dropout(dropout))
    model.add(layers.Dense(512, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))


    model.compile(loss='binary_crossentropy',
                  optimizer=optimizers.RMSprop(lr=1e-4),
                  metrics=['acc'])
    return model

# test_plan2[7] = {"learning rate": 2e-5, "dropout": 0, "weight regularization": 1e-1}

def build_model_pretrained(n_units, learning_rate, weights=0, dropout=0):
    

    conv_base = VGG16(weights='imagenet',
                      include_top=False,
                      input_shape=(224, 224, 3))
    conv_base.trainable = False

    model = models.Sequential()
    model.add(conv_base)
    model.add(layers.Flatten())
    
    if(weights > 0):
        model.add(layers.Dense(n_units, activation='relu', kernel_regularizer=regularizers.l1_l2(l1=weights, l2=weights)))
    else:
        model.add(layers.Dense(n_units, activation='relu'))
    
    if(dropout > 0):
        model.add(layers.Dropout(dropout))
    
    model.add(layers.Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer=optimizers.RMSprop(lr=learning_rate),
                  metrics=['acc'])
            
    return model

In [8]:
# richtiges coding


from tensorflow.keras import layers
from tensorflow.keras import models

from tensorflow.keras import optimizers
from tensorflow.keras.preprocessing.image import ImageDataGenerator


model = build_basic_model()

# All images will be [0,1] standardized
train_datagen = ImageDataGenerator(rescale=1.0/255)

k_fold_dir = "../data/abgabe_dir/all_pictures/"
training_generator = train_datagen.flow_from_directory(
    # This is the target directory
    k_fold_dir,
    # All images will be resized to 150x150
    target_size=(224, 224),
    batch_size=1,
    # Since binary_crossentropy loss is used, binary labels are needed
    class_mode='binary',
    subset='training')


training_data = []

# malignant
max_value = (len(os.listdir(k_fold_dir+"/benign")) + len(os.listdir(k_fold_dir+"/malignant")))
print("the number of pictures in this dic is:",max_value)
for index, x in enumerate(training_generator):
    
    # safe label info as integer
    if(1 in x[1] ):
        temp_to_add = 1
    else:
        temp_to_add = 0

    # ingore the batch size and only take the first element of a list with one element
    for i in x[0]:
        training_data.append([i, temp_to_add])

    if(index % max_value == 0 and index != 0):
        break
    if(index % 100 == 0):
        print(index)

Found 3297 images belonging to 2 classes.
the number of pictures in this dic is: 3297
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200


2

In [12]:
import matplotlib.pyplot as plt

len(training_data[3200])

# ebene 1 is the number of pictures 
# ebene 2 is malignant or benign
# ebene 3 is the batch size - this makes problems
# ebene 4 is the picture


# plt.imshow(training_data[1997])

# shows if malignant or benign
# is one
print(training_data[121][1])
# is zero
print(training_data[0][1])


1
0


In [13]:
import pandas as pd

df = []

for i in training_data:
    dict_before_df = {}
    dict_before_df["label"] = i[1]
    dict_before_df["image"] = i[0]
    df.append(dict_before_df)
df = pd.DataFrame(df)

In [15]:
from sklearn.model_selection import KFold
import numpy as np

# Define the number of folds
k = 10

# Create a KFold object
kf = KFold(n_splits=k)
scores = []



# Loop over the folds
for train_index, test_index in kf.split(training_data):
    # Get the training and test sets for this fold
    
    temp_train = np.array(training_data)[train_index]

    temp_validate = np.array(training_data)[test_index]
    # y_train, y_test = training_data[train_index][0], training_data[test_index][0]
    
    # temp_train_dir_x = os.listdir(train_dir+"/benign")[train_index[0]:train_index[-1]]
    # temp_train_dir_x.extend(os.listdir(train_dir+"/malignant")[train_index[0]:train_index[-1]])
    # temp_validation_dir_x = os.listdir(validation_dir)[train_index[0]:train_index[-1]]
    # temp_validation_dir_x.extend(os.listdir(validation_dir)[train_index[0]:train_index[-1]])
   
    labels = np.array([x[1] for x in temp_train])
    images = np.array([x[0] for x in temp_train])
    print(len(labels[~np.isnan(labels)]), len(labels))
    print(len(images[~np.isnan(images)]), len(images))

    # images = images[~np.isnan(images)]

    # Train the model on the training set
    model.fit(images ,labels , epochs=10, verbose=1, batch_size=20)

    # Evaluate the model on the test set
    score = model.score(temp_validate, verbose=0)

    # Append the score to a list
    scores.append(score)
    print(score)

# Calculate the mean score
mean_score = np.mean(scores)

  temp_train = np.array(training_data)[train_index]
  temp_validate = np.array(training_data)[test_index]


2968 2968
446767104 2968
Epoch 1/10
  3/149 [..............................] - ETA: 3:37 - loss: 1.9828 - acc: 0.4500

KeyboardInterrupt: 