In [None]:
# Using Google Colab
# Install necessary packages
!pip install pyspark medmnist tensorflow findspark scikit-learn matplotlib

# Initialize findspark
import findspark
findspark.init()

# Import necessary libraries
from pyspark.sql import SparkSession
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from medmnist import INFO, ChestMNIST
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.utils import resample
from sklearn.preprocessing import label_binarize


# Initialize Spark session
# getOrCreate() hands back the active session when one exists, otherwise it
# builds a new one under the application name given here.
spark = (
    SparkSession.builder
    .appName("Final Project Team 19")
    .getOrCreate()
)

# Dataset Configuration
data_flag = 'chestmnist'
info = INFO[data_flag]

# Load the train / validation / test splits through the MedMNIST library
# (download=True is passed for every split).
train_dataset, val_dataset, test_dataset = (
    ChestMNIST(split=split_name, download=True)
    for split_name in ('train', 'val', 'test')
)

# Image arrays of each split, in the same train / val / test order
X_train, X_val, X_test = (
    dataset.imgs for dataset in (train_dataset, val_dataset, test_dataset)
)

# Convert multi-label data to binary (0 or 1)
def convert2Binary(labels):
    """Collapse each row of a multi-label array into a single 0/1 label.

    Args:
        labels: NumPy array that is summed along axis 1
            (presumably shaped (n_samples, n_labels) -- confirm against caller).

    Returns:
        Integer array holding 1 wherever the row sum is greater than zero
        and 0 everywhere else.
    """
    row_totals = labels.sum(axis=1)
    return (row_totals > 0).astype(int)

# Apply binary conversion to labels
# (each split's multi-label array becomes one 0/1 label per sample)
y_train = convert2Binary(train_dataset.labels)
y_val = convert2Binary(val_dataset.labels)
y_test = convert2Binary(test_dataset.labels)

# Normalize pixel values by dividing by 255 (assumes 8-bit intensities).
# Casting to float32 first keeps the result in float32; dividing the raw
# integer arrays directly would promote them to float64 and double the memory
# footprint, while Keras trains in float32 by default anyway.
X_train, X_val, X_test = (
    images.astype(np.float32) / 255.0 for images in (X_train, X_val, X_test)
)

# Add channel dimension for CNN: append a trailing axis of length 1 so each
# sample matches the (28, 28, 1) input the model below declares.
X_train = X_train[..., np.newaxis]
X_val = X_val[..., np.newaxis]
X_test = X_test[..., np.newaxis]

# Rebalance the Dataset (Over-sampling minority class)
def rebalance_data(X, y, random_state=42):
    """Balance a binary-labelled dataset by randomly over-sampling the smaller class.

    Every sample labelled 0 or 1 is kept. Additional rows are drawn with
    replacement from whichever of the two classes has fewer samples until
    both classes have the same count; the combined set is then shuffled.
    Samples carrying any other label are not included in the output.

    Args:
        X: Array of samples, indexed along axis 0.
        y: 1-D array of labels (0 = negative, 1 = positive) aligned with X.
        random_state: Seed used for both the over-sampling draw and the final
            shuffle, so repeated calls give the same result. Pass None for a
            non-deterministic result.

    Returns:
        Tuple ``(X_balanced, y_balanced)``: shuffled arrays in which classes
        0 and 1 occur equally often.

    Raises:
        ValueError: If either class has no samples, since there is then
            nothing to draw from (or nothing to balance against).
    """
    X = np.asarray(X)
    y = np.asarray(y)

    negative_idx = np.flatnonzero(y == 0)
    positive_idx = np.flatnonzero(y == 1)
    if negative_idx.size == 0 or positive_idx.size == 0:
        raise ValueError(
            "rebalance_data requires at least one sample of each class (0 and 1)"
        )

    # Over-sample whichever class is actually smaller instead of assuming it
    # is always the positive one.
    if positive_idx.size < negative_idx.size:
        minority_idx, majority_idx = positive_idx, negative_idx
    else:
        minority_idx, majority_idx = negative_idx, positive_idx

    # One seeded generator drives both the draw and the shuffle, which makes
    # the whole function reproducible.
    rng = np.random.default_rng(random_state)

    # Only the deficit is drawn, so no original minority sample is lost.
    deficit = majority_idx.size - minority_idx.size
    extra_idx = rng.choice(minority_idx, size=deficit, replace=True)

    # Work on indices and gather once at the end: this copies X a single time.
    selected = np.concatenate((negative_idx, positive_idx, extra_idx))
    rng.shuffle(selected)
    return X[selected], y[selected]

# Rebalance training data only; the validation and test splits are passed
# to fit/evaluate below without rebalancing.
X_train_balanced, y_train_balanced = rebalance_data(X_train, y_train)

# Visualize Class Distribution
def plot_class_distribution(labels, title="Class Distribution"):
    """Show a bar chart with the number of negative (0) and positive (1) labels.

    Both bars are always drawn: a class that does not occur in ``labels`` is
    shown with height 0, so the two tick labels always line up with two ticks.

    Args:
        labels: Array-like of binary labels (0 or 1).
        title: Title displayed above the chart.
    """
    labels = np.asarray(labels)

    # Count against the fixed class list rather than the values that happen
    # to be present; with np.unique a single-class input yields one tick and
    # two tick labels, which matplotlib rejects.
    classes = [0, 1]
    counts = [int(np.count_nonzero(labels == cls)) for cls in classes]

    plt.bar(classes, counts, color=['blue', 'orange'])
    plt.xlabel("Class")
    plt.ylabel("Frequency")
    plt.title(title)
    plt.xticks(classes, ['Negative', 'Positive'])
    plt.show()

# Plot the label counts of the rebalanced training set
plot_class_distribution(y_train_balanced, title="Balanced Class Distribution")

# Define the CNN model using keras
# The input shape is declared with an explicit Input entry at the head of the
# Sequential list rather than via input_shape= on the first Conv2D; newer
# Keras releases emit a warning for the latter form.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(28, 28, 1)),
    # Stage 1: 32 filters of 3x3, then 2x2 max-pooling and batch normalization
    tf.keras.layers.Conv2D(32, kernel_size=(3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.BatchNormalization(),
    # Stage 2: 64 filters of 3x3, then 2x2 max-pooling and batch normalization
    tf.keras.layers.Conv2D(64, kernel_size=(3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.BatchNormalization(),
    # Classifier head: flatten, 128-unit dense layer with 50% dropout, then a
    # single sigmoid unit for the binary output
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Binary cross-entropy pairs with the single sigmoid output; accuracy is
# tracked as the reporting metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the Model using Keras
# Fit on the rebalanced training set for 20 epochs in batches of 32, passing
# the (unbalanced) validation split as validation data.
history = model.fit(
    x=X_train_balanced,
    y=y_train_balanced,
    batch_size=32,
    epochs=20,
    validation_data=(X_val, y_val),
)

# Evaluate the Model
# With the metrics compiled above, evaluate yields the loss and the accuracy.
test_loss, test_acc = model.evaluate(x=X_test, y=y_test, verbose=0)
print(f"Test Accuracy: {test_acc:.2f}")

# Confusion Matrix
# Threshold the model outputs at 0.5 to obtain hard 0/1 predictions
y_pred = model.predict(X_test)
y_pred_binary = np.where(y_pred > 0.5, 1, 0)
ConfusionMatrixDisplay.from_predictions(
    y_true=y_test,
    y_pred=y_pred_binary,
    display_labels=['Negative', 'Positive'],
)
plt.show()

# Classification Report
print("Classification Report:")
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred_binary,
        target_names=['Negative', 'Positive'],
    )
)

# Training History Visualization
# One figure with two side-by-side panels: accuracy on the left, loss on the
# right, each showing the training and validation curves per epoch.
fig, (ax_accuracy, ax_loss) = plt.subplots(1, 2, figsize=(12, 4))

ax_accuracy.plot(history.history['accuracy'], label='Train Accuracy')
ax_accuracy.plot(history.history['val_accuracy'], label='Validation Accuracy')
ax_accuracy.set_title('Model Accuracy')
ax_accuracy.set_xlabel('Epochs')
ax_accuracy.set_ylabel('Accuracy')
ax_accuracy.legend()

ax_loss.plot(history.history['loss'], label='Train Loss')
ax_loss.plot(history.history['val_loss'], label='Validation Loss')
ax_loss.set_title('Model Loss')
ax_loss.set_xlabel('Epochs')
ax_loss.set_ylabel('Loss')
ax_loss.legend()

plt.tight_layout()
plt.show()
