In [None]:
from azureml.core import Workspace, Dataset

# Coordinates of the Azure ML workspace that holds the registered datasets.
subscription_id = 'ff71bc5a-d809-4062-bd54-01d3ea83e738'
resource_group = '1'
workspace_name = 'nn1'

# Connect to that workspace.
workspace = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)

# Look up the dataset registered as "data2" and copy its files into the
# current directory; overwrite=False leaves files that already exist untouched.
dataset = Dataset.get_by_name(workspace, name='data2')
dataset.download(target_path='.', overwrite=False)

In [None]:

# azureml-core of version 1.0.72 or higher is required
from azureml.core import Workspace, Dataset

# Coordinates of the Azure ML workspace that holds the registered datasets.
subscription_id = 'ff71bc5a-d809-4062-bd54-01d3ea83e738'
resource_group = '1'
workspace_name = 'nn1'

# Connect to that workspace.
workspace = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)

# Look up the dataset registered as "data3" and copy its files into the
# current directory; overwrite=False leaves files that already exist untouched.
dataset = Dataset.get_by_name(workspace, name='data3')
dataset.download(target_path='.', overwrite=False)

In [5]:
import os
import zipfile
# Unpack the processed-image archive next to the notebook.
local_zip = 'processed_images.zip'

# The context manager closes the archive even when extraction raises, which the
# previous manual open/close pair did not guarantee.
with zipfile.ZipFile(local_zip, 'r') as zip_ref:
    # An empty target path extracts relative to the current working directory.
    zip_ref.extractall('')

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler
from tensorflow.keras.layers import Dropout, GlobalAveragePooling2D, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from tensorflow.keras.regularizers import l1_l2

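# NOTE(review): an ImageDataGeneratorWithPaths class is assumed to be defined earlier, but nothing
# visible below uses it; training relies on Keras' ImageDataGenerator. Confirm whether it is still needed.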

def prepare_data(processed_images_dir, csv_dir='csv', n_samples=1696, random_state=42):
    """Build the shuffled mass-case table with binary labels and image paths.

    Reads ``mass_case_description_train_set.csv`` and
    ``mass_case_description_test_set.csv`` from ``csv_dir``, stacks them, draws
    a random sample without replacement and adds two columns:

    * ``labels`` - 1 for ``MALIGNANT``, 0 for ``BENIGN`` and
      ``BENIGN_WITHOUT_CALLBACK``, stored as int64.
    * ``processed_image`` - ``<processed_images_dir>/processed_<index>.jpg``,
      where ``<index>`` is the row's index label.

    The two CSVs are concatenated without resetting the index, so the same
    index label can occur once per source file.
    NOTE(review): confirm this matches how the processed images were numbered.

    Args:
        processed_images_dir: Directory that contains the processed JPEGs.
        csv_dir: Directory that contains the two case-description CSVs.
        n_samples: Number of rows to draw. Values larger than the combined
            table are clamped to its length; ``None`` keeps every row.
        random_state: Seed forwarded to ``DataFrame.sample``.

    Returns:
        pandas.DataFrame: The sampled rows with the two extra columns.

    Raises:
        ValueError: If ``pathology`` holds a value (or a missing entry) that
            has no binary label.
    """
    # Load CSV files
    mass_train = pd.read_csv(os.path.join(csv_dir, 'mass_case_description_train_set.csv'))
    mass_test = pd.read_csv(os.path.join(csv_dir, 'mass_case_description_test_set.csv'))

    # Combine train and test datasets
    full_mass = pd.concat([mass_train, mass_test], axis=0)

    # Never request more rows than exist; sampling without replacement would raise.
    if n_samples is None:
        sample_size = len(full_mass)
    else:
        sample_size = min(n_samples, len(full_mass))
    full_mass_sample = full_mass.sample(n=sample_size, random_state=random_state)

    # Map pathology to binary labels. ``map`` is used instead of ``replace`` so
    # the result does not depend on pandas' deprecated silent downcasting of
    # object columns, and unmapped values surface as NaN rather than slipping
    # through as strings.
    class_mapper = {'MALIGNANT': 1, 'BENIGN': 0, 'BENIGN_WITHOUT_CALLBACK': 0}
    labels = full_mass_sample['pathology'].map(class_mapper)
    unmapped = labels.isna().to_numpy()
    if unmapped.any():
        unknown = sorted({str(value) for value in full_mass_sample['pathology'].to_numpy()[unmapped]})
        raise ValueError(f"Unrecognised pathology value(s): {unknown}")
    full_mass_sample['labels'] = labels.astype('int64')

    # Point each row at its processed image; a plain list avoids any index
    # alignment on assignment.
    full_mass_sample['processed_image'] = [
        os.path.join(processed_images_dir, f'processed_{i}.jpg')
        for i in full_mass_sample.index
    ]

    return full_mass_sample

def build_model():
    """Create and compile the InceptionV3-based binary classifier.

    An ImageNet-initialised InceptionV3 without its top layers is followed by
    global average pooling, a 1024-unit ReLU layer with L1/L2 kernel
    regularisation, dropout, and a single sigmoid output unit. No input shape
    is passed to InceptionV3.

    Returns:
        The Keras ``Model``, compiled with Adam (learning rate 0.0001),
        binary cross-entropy loss and the accuracy metric.
    """
    backbone = InceptionV3(weights='imagenet', include_top=False)

    # Classification head stacked on the convolutional feature maps.
    pooled = GlobalAveragePooling2D()(backbone.output)
    hidden = Dense(
        1024,
        activation='relu',
        kernel_regularizer=l1_l2(l1=0.001, l2=0.001),
    )(pooled)
    hidden = Dropout(0.3)(hidden)
    # One sigmoid unit, paired with the binary cross-entropy loss below.
    probability = Dense(1, activation='sigmoid')(hidden)

    classifier = Model(inputs=backbone.input, outputs=probability)
    classifier.compile(
        optimizer=Adam(learning_rate=0.0001),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return classifier

def _load_images(image_paths, target_size):
    """Load every image, resize it to ``target_size`` and divide pixel values by 255."""
    return np.array([
        img_to_array(load_img(img_path, target_size=target_size)) / 255.0
        for img_path in image_paths
    ])


def train_model(full_mass_sample, target_size=(224, 224), batch_size=32, epochs=10, n_splits=5):
    """Run K-fold cross-validated training and save the best fold's model.

    A fresh model is built for every fold, so no fold is validated on images
    the same weights were already trained on. Whenever a fold reaches a new
    overall best validation accuracy, its best-epoch weights are reloaded and
    the full model is written to ``final_model.h5``.

    Args:
        full_mass_sample: DataFrame with a ``processed_image`` column (image
            paths) and a ``labels`` column (binary 0/1 targets).
        target_size: ``(height, width)`` every image is resized to on load.
        batch_size: Batch size of the augmented training generator.
        epochs: Maximum number of epochs per fold.
        n_splits: Number of cross-validation folds.

    Returns:
        None. Side effects: writes ``model-best.weights.h5`` and
        ``final_model.h5`` to the current directory and prints progress.
    """
    # Local import: only this function needs the backend module.
    from tensorflow.keras import backend as keras_backend

    initial_lr = 0.0001  # same value build_model() passes to Adam

    # Step schedule relative to the fixed initial rate. Keras calls the schedule
    # as schedule(epoch, current_lr); the second argument is accepted but ignored
    # on purpose. Feeding it into the computation made the decay compound on
    # every epoch after the threshold.
    def lr_schedule(epoch, current_lr=None):
        if epoch > 10:
            return initial_lr * 0.1
        if epoch > 5:
            return initial_lr * 0.5
        return initial_lr

    # Checkpoint callback to save model weights. One instance serves every fold.
    # NOTE(review): whether its running best carries over between fit() calls
    # depends on the installed Keras version; final_model.h5 does not rely on it.
    weights_path = 'model-best.weights.h5'
    model_checkpoint = ModelCheckpoint(
        filepath=weights_path,
        save_weights_only=True,
        monitor='val_accuracy',
        mode='max',
        save_best_only=True,
        verbose=1
    )

    early_stopping = EarlyStopping(monitor='val_loss', patience=10, verbose=1)

    # No featurewise/ZCA statistics are enabled, so datagen.fit() is not needed.
    datagen = ImageDataGenerator(
        rotation_range=15,
        width_shift_range=0.1,
        height_shift_range=0.1,
        shear_range=0.1,
        zoom_range=0.1,
        horizontal_flip=True,
        fill_mode='nearest')

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Decode every image once instead of re-reading it from disk in each fold.
    images = _load_images(full_mass_sample['processed_image'], target_size)
    labels = full_mass_sample['labels'].to_numpy(dtype='float32')

    best_val_accuracy = -np.inf
    saved_final = False

    for fold, (train_idx, test_idx) in enumerate(kf.split(full_mass_sample)):
        print(f"Training fold {fold + 1}/{n_splits}")

        # Drop the previous fold's graph state, then start from fresh weights so
        # this fold's validation images were never seen by the model.
        keras_backend.clear_session()
        model = build_model()

        history = model.fit(
            datagen.flow(images[train_idx], labels[train_idx], batch_size=batch_size),
            validation_data=(images[test_idx], labels[test_idx]),
            epochs=epochs,
            callbacks=[LearningRateScheduler(lr_schedule), early_stopping, model_checkpoint]
        )

        val_accuracy = max(history.history['val_accuracy'])
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            # A new overall best means the checkpoint has just written this
            # fold's best epoch to weights_path; restore it so the saved model
            # holds those weights rather than the last epoch's.
            model.load_weights(weights_path)
            model.save('final_model.h5')
            saved_final = True

    if saved_final:
        print("Best model saved to final_model.h5")

# Build the labelled table whose image paths point into the extracted archive.
full_mass_sample = prepare_data(processed_images_dir='processed_images/sorted_files')

# Run cross-validated training with the default hyper-parameters.
train_model(full_mass_sample=full_mass_sample)
