In [1]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import numpy as np
import random
from sklearn.ensemble import IsolationForest
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
import os
import warnings
warnings.filterwarnings('ignore')

# Reproducibility: the notebook draws random poison indices (random.sample),
# random mini-batches (np.random.choice) and random initial weights, so all
# three RNGs are seeded before anything stochastic runs.
# NOTE(review): seeding does not by itself guarantee bit-identical GPU results.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Dataset Paths
train_dir = 'xray_dataset_covid19//train'
test_dir = 'xray_dataset_covid19//test'

# Image Preprocessing
img_height, img_width = 150, 150
batch_size = 32

# Only rescaling is applied: pixel values are multiplied by 1/255.
data_generator = ImageDataGenerator(rescale=1./255)

# shuffle=False keeps the batch order fixed so that position i in
# `train_data.classes` can be used as a stable sample index later on.
train_data = data_generator.flow_from_directory(
    train_dir,
    target_size=(img_height, img_width),
    batch_size=batch_size,
    class_mode='binary',
    shuffle=False
)

# Held-out images, used as validation data during fit and for final evaluation.
test_data = data_generator.flow_from_directory(
    test_dir,
    target_size=(img_height, img_width),
    batch_size=batch_size,
    class_mode='binary'
)

# Untouched copy of the clean labels, taken before any poisoning is applied.
original_labels = train_data.classes.copy()

# Data Poisoning
def apply_label_flip(labels, flip_rate=0.05):
    """Return a copy of ``labels`` with a random fraction of binary labels inverted.

    Args:
        labels: Sequence of 0/1 labels that supports ``len()``, ``.copy()`` and
            item assignment (e.g. a NumPy array or a list).
        flip_rate: Fraction of entries to flip; the count is truncated with
            ``int(len(labels) * flip_rate)``.

    Returns:
        Tuple ``(flipped_labels, chosen_positions)`` where ``chosen_positions``
        is the list of indices drawn (without replacement) by ``random.sample``.
        The input ``labels`` is not modified.
    """
    flipped = labels.copy()
    total = len(labels)
    chosen = random.sample(range(total), int(total * flip_rate))
    for position in chosen:
        # Binary inversion: 0 -> 1 and 1 -> 0.
        flipped[position] = 1 - flipped[position]
    return flipped, chosen

def apply_feature_flip(images, flip_rate=0.05):
    """Return a copy of ``images`` in which a random fraction of samples is inverted.

    Each selected sample ``x`` is replaced by ``1 - x``.

    Args:
        images: Collection of samples supporting ``len()``, ``.copy()`` and
            item assignment. Presumably pixel values are scaled to [0, 1] so
            that ``1 - x`` is a photographic negative -- confirm with caller.
        flip_rate: Fraction of samples to invert; the count is truncated with
            ``int(len(images) * flip_rate)``.

    Returns:
        Tuple ``(inverted_images, chosen_positions)``; ``chosen_positions`` is
        the list of indices drawn without replacement by ``random.sample``.
    """
    inverted = images.copy()
    sample_count = len(images)
    chosen = random.sample(range(sample_count), int(sample_count * flip_rate))
    for position in chosen:
        # Simple feature flip: invert every value of the chosen sample.
        inverted[position] = 1 - inverted[position]
    return inverted, chosen

def apply_backdoor(images, labels, target_label=1, backdoor_pattern=(0.8, 0.8, 0.8), position=(0.1, 0.1), size=(0.05, 0.05), backdoor_rate=0.05):
    """Stamp a rectangular trigger patch onto random samples and relabel them.

    Args:
        images: Array-like of samples; each sample must expose a 3-element
            ``.shape`` (read as height, width, channels) and support 2-D slice
            assignment. Copied with ``.copy()`` before modification.
        labels: Labels aligned with ``images``; copied with ``.copy()``.
        target_label: Label written to every stamped sample.
        backdoor_pattern: Value assigned to the patch region (default is the
            triple ``(0.8, 0.8, 0.8)``).
        position: ``(x, y)`` of the patch's top-left corner as fractions of the
            image width and height.
        size: ``(width, height)`` of the patch as fractions of the image width
            and height; pixel sizes are truncated with ``int``.
        backdoor_rate: Fraction of samples to stamp, truncated with ``int``.

    Returns:
        Tuple ``(stamped_images, relabelled_labels, chosen_positions)`` where
        ``chosen_positions`` is the list drawn by ``random.sample``.
    """
    stamped = images.copy()
    relabelled = labels.copy()
    victims = random.sample(range(len(images)), int(len(images) * backdoor_rate))

    for victim in victims:
        picture = stamped[victim]
        rows, cols, _ = picture.shape

        # Convert the fractional corner and extent into pixel bounds.
        left = int(position[0] * cols)
        top = int(position[1] * rows)
        right = left + int(size[0] * cols)
        bottom = top + int(size[1] * rows)

        picture[top:bottom, left:right] = backdoor_pattern
        relabelled[victim] = target_label  # Change label to the target

    return stamped, relabelled, victims

# Poison the clean training labels using apply_label_flip's default flip_rate
# (0.05). `flipped_indices` records which positions were inverted; it is not
# read again in the cells visible here.
poisoned_labels, flipped_indices = apply_label_flip(original_labels)

# Baseline Model Construction and Training
def build_cnn(input_shape=(img_height, img_width, 3)):
    """Build and compile a small binary-classification CNN.

    The network is three Conv2D(3x3, ReLU) + MaxPooling2D(2, 2) stages with
    32, 64 and 128 filters, followed by Flatten, Dense(128, ReLU) and a single
    sigmoid output unit. It is compiled with the Adam optimizer, binary
    cross-entropy loss and the accuracy metric.

    Args:
        input_shape: Shape of one input sample. The default is evaluated once,
            when this function is defined, from the module-level
            ``img_height`` / ``img_width``.

    Returns:
        The compiled ``Sequential`` model (untrained).
    """
    network = models.Sequential()
    network.add(layers.Input(shape=input_shape))

    # Convolutional feature extractor: filter count doubles at each stage.
    for filter_count in (32, 64, 128):
        network.add(layers.Conv2D(filter_count, (3, 3), activation='relu'))
        network.add(layers.MaxPooling2D(2, 2))

    # Classifier head ending in one sigmoid unit for the binary label.
    network.add(layers.Flatten())
    network.add(layers.Dense(128, activation='relu'))
    network.add(layers.Dense(1, activation='sigmoid'))

    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return network
    
# Baseline: a freshly built CNN that is trained directly on the poisoned labels.
baseline_model = build_cnn()

# Generator with poisoned labels: a second, unshuffled iterator over train_dir
# (same rescaling and target size as train_data) whose `classes` attribute is
# overwritten below.
poisoned_train_generator = ImageDataGenerator(rescale=1./255).flow_from_directory(
    train_dir,
    target_size=(img_height, img_width),
    batch_size=batch_size,
    class_mode='binary',
    shuffle=False
)

# NOTE(review): this presumably relies on the iterator reading batch labels from
# `.classes` at batch-creation time -- confirm for the installed Keras version.
poisoned_train_generator.classes = poisoned_labels

# One epoch on the poisoned generator, validated against the test set.
baseline_model.fit(
    poisoned_train_generator,
    epochs=1,
    validation_data=test_data
)

def detect_anomalies_batched(generator, anomaly_detector):
    """Run an anomaly detector batch by batch and return global sample indices.

    One full pass of ``len(generator)`` batches is pulled with ``next()``. Each
    batch is flattened to 2-D and handed to the detector; rows predicted as
    ``-1`` are reported as anomalies.

    Args:
        generator: Batch source supporting ``len()`` and ``next()`` that yields
            ``(features, labels)`` pairs. If it exposes ``reset()``, it is
            rewound first so that batch 0 corresponds to sample index 0.
        anomaly_detector: Object with ``fit_predict`` (preferred) or
            ``predict``. Because ``fit_predict`` is preferred, such a detector
            is re-fitted independently on every batch. Estimators whose
            predictions never contain ``-1`` (for example plain cluster labels)
            will report no anomalies.

    Returns:
        1-D integer NumPy array of the positions (in generator order) of all
        samples predicted as ``-1``. Empty, but still integer-typed, when
        nothing is flagged.

    Raises:
        ValueError: If the detector has neither ``fit_predict`` nor ``predict``.
    """
    # Resolve the scoring method once; it does not change between batches.
    if hasattr(anomaly_detector, 'fit_predict'):
        score_batch = anomaly_detector.fit_predict
    elif hasattr(anomaly_detector, 'predict'):
        score_batch = anomaly_detector.predict
    else:
        raise ValueError("Anomaly detector must have 'fit_predict' or 'predict' method")

    # Start from the first batch: a generator left mid-epoch by an earlier
    # consumer would otherwise make the offsets below point at the wrong samples.
    rewind = getattr(generator, 'reset', None)
    if callable(rewind):
        rewind()

    anomaly_indices_list = []
    offset = 0  # number of samples seen before the current batch
    for _ in range(len(generator)):
        batch_features, _batch_labels = next(generator)
        batch_len = batch_features.shape[0]
        flattened_features = batch_features.reshape(batch_len, -1)

        anomalies = score_batch(flattened_features)
        local_indices = np.where(anomalies == -1)[0]
        anomaly_indices_list.extend((local_indices + offset).tolist())

        # Advance by the real batch length so a short final batch is handled
        # without relying on a `batch_size` attribute.
        offset += batch_len

    # An explicit integer dtype keeps an empty result usable as an index array
    # (np.array([]) would default to float64).
    return np.array(anomaly_indices_list, dtype=np.intp)

# Perform anomaly detection in a memory-efficient way: each detector is applied
# one batch at a time by detect_anomalies_batched rather than on the whole set.
# contamination=0.05 is applied per call, i.e. per batch here.
anomaly_detector_if = IsolationForest(contamination=0.05)
anomaly_indices_if = detect_anomalies_batched(poisoned_train_generator, anomaly_detector_if)

# NOTE(review): detect_anomalies_batched only flags predictions equal to -1.
# KMeans.fit_predict presumably returns cluster indices (0 or 1 for
# n_clusters=2), so this pass likely contributes no indices -- confirm.
anomaly_detector_km = KMeans(n_clusters=2, random_state=42)
anomaly_indices_km = detect_anomalies_batched(poisoned_train_generator, anomaly_detector_km)

# Merge the indices flagged by either detector.
combined_anomaly_indices = np.union1d(anomaly_indices_if, anomaly_indices_km)

print(f"Number of detected anomalies: {len(combined_anomaly_indices)}")

# Robust Model Training
# A set gives constant-time membership tests while filtering below.
combined_anomaly_set = set(combined_anomaly_indices)
# Every training-sample position that was not flagged as anomalous.
refined_indices = [i for i in range(len(original_labels)) if i not in combined_anomaly_set]

print(f"Number of refined indices: {len(refined_indices)}")

# Share of flagged samples among all samples (flagged + kept), in percent.
print(f"Percentage of anomalies detected: {len(combined_anomaly_indices)*100/(len(combined_anomaly_indices)+len(refined_indices))}")


def refined_generator(generator, indices, batch_size):
    """Yield endless random mini-batches drawn only from the samples in ``indices``.

    On the first ``next()`` call the source generator is rewound and exactly
    one pass of ``len(generator)`` batches is loaded into memory. After that,
    every iteration samples positions from ``indices`` without replacement and
    yields the matching features and labels.

    Args:
        generator: Batch source supporting ``reset()``, ``len()`` and
            ``next()``, yielding ``(features, labels)`` NumPy arrays.
        indices: Positions (in generator order) of the samples that may be
            drawn.
        batch_size: Requested number of samples per yielded batch. If fewer
            than ``batch_size`` indices are available, each batch contains all
            of them instead of raising.

    Yields:
        Tuples ``(batch_x, batch_y)`` of NumPy arrays.

    Raises:
        ValueError: If ``indices`` is empty.
    """
    # Keras directory iterators never raise StopIteration, so a plain
    # `for ... in generator` would loop forever; bound the pass explicitly.
    generator.reset()
    feature_chunks = []
    label_chunks = []
    for _ in range(len(generator)):
        features, labels = next(generator)
        feature_chunks.append(features)
        label_chunks.append(labels)
    all_features = np.concatenate(feature_chunks)
    all_labels = np.concatenate(label_chunks)

    kept = np.asarray(indices, dtype=np.intp)
    if kept.size == 0:
        raise ValueError("refined_generator needs at least one index to sample from")

    # Sampling without replacement cannot return more items than exist.
    draw_size = min(batch_size, kept.size)

    while True:
        picked = kept[np.random.choice(kept.size, draw_size, replace=False)]
        yield all_features[picked], all_labels[picked]

# Sampling generator restricted to the samples that were not flagged as
# anomalous. refined_generator is a generator function, so nothing is loaded
# from poisoned_train_generator until the first next() call.
refined_train_generator = refined_generator(poisoned_train_generator, refined_indices, batch_size)


Found 148 images belonging to 2 classes.
Found 40 images belonging to 2 classes.
Found 148 images belonging to 2 classes.
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2s/step - accuracy: 0.4473 - loss: 2.0629 - val_accuracy: 0.5000 - val_loss: 0.7179
Number of detected anomalies: 9
Number of refined indices: 139
Percentage of anomalies detected: 6.081081081081081


In [None]:
# Second, independent CNN with the same architecture; trained only on the
# samples that survived anomaly filtering.
robust_model = build_cnn()

# Training with refined data
epochs = 1
# Integer division: any remainder smaller than one batch is not counted as a step.
steps_per_epoch = len(refined_indices) // batch_size
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    for step in range(steps_per_epoch):
        # The first next() call triggers refined_generator's one-off data load.
        batch_features, batch_labels = next(refined_train_generator)
        print(f"Batch features shape: {batch_features.shape}")
        print(f"Batch labels shape: {batch_labels.shape}")
        # Manual training loop: one gradient update per sampled batch.
        robust_model.train_on_batch(batch_features, batch_labels)

# Evaluation
# Both models are compiled with loss plus one metric (accuracy), so evaluate()
# is unpacked into two values and the loss is discarded.
_, baseline_accuracy = baseline_model.evaluate(test_data)
_, robust_accuracy = robust_model.evaluate(test_data)

print(f"Baseline Accuracy (Poisoned): {baseline_accuracy}")
print(f"Robust Model Accuracy: {robust_accuracy}")

Epoch 1/1
