# Classifying Chest X-ray Pneumonia Images with BigDL Orca

## Install BigDL

In [None]:
# uninstall pyspark, we install it with bigdl because of compatibility
!pip uninstall pyspark

Found existing installation: pyspark 3.5.5
Uninstalling pyspark-3.5.5:
  Would remove:
    /usr/local/bin/beeline
    /usr/local/bin/beeline.cmd
    /usr/local/bin/docker-image-tool.sh
    /usr/local/bin/find-spark-home
    /usr/local/bin/find-spark-home.cmd
    /usr/local/bin/find_spark_home.py
    /usr/local/bin/load-spark-env.cmd
    /usr/local/bin/load-spark-env.sh
    /usr/local/bin/pyspark
    /usr/local/bin/pyspark.cmd
    /usr/local/bin/pyspark2.cmd
    /usr/local/bin/run-example
    /usr/local/bin/run-example.cmd
    /usr/local/bin/spark-class
    /usr/local/bin/spark-class.cmd
    /usr/local/bin/spark-class2.cmd
    /usr/local/bin/spark-connect-shell
    /usr/local/bin/spark-shell
    /usr/local/bin/spark-shell.cmd
    /usr/local/bin/spark-shell2.cmd
    /usr/local/bin/spark-sql
    /usr/local/bin/spark-sql.cmd
    /usr/local/bin/spark-sql2.cmd
    /usr/local/bin/spark-submit
    /usr/local/bin/spark-submit.cmd
    /usr/local/bin/spark-submit2.cmd
    /usr/local/bin/sparkR
  

In [None]:
pip install bigdl-orca-spark3

Collecting bigdl-orca-spark3
  Downloading bigdl_orca_spark3-2.4.0-py3-none-manylinux1_x86_64.whl.metadata (2.2 kB)
Collecting bigdl-tf==2.4.0.dev0 (from bigdl-orca-spark3)
  Downloading bigdl_tf-2.4.0.dev0-py3-none-manylinux2010_x86_64.whl.metadata (299 bytes)
Collecting bigdl-math==2.4.0.dev0 (from bigdl-orca-spark3)
  Downloading bigdl_math-2.4.0.dev0-py3-none-manylinux2010_x86_64.whl.metadata (295 bytes)
Collecting bigdl-dllib-spark3==2.4.0 (from bigdl-orca-spark3)
  Downloading bigdl_dllib_spark3-2.4.0-py3-none-manylinux1_x86_64.whl.metadata (1.0 kB)
Collecting pyspark==3.1.3 (from bigdl-dllib-spark3==2.4.0->bigdl-orca-spark3)
  Downloading pyspark-3.1.3.tar.gz (214.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m214.0/214.0 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting conda-pack==0.3.1 (from bigdl-dllib-spark3==2.4.0->bigdl-orca-spark3)
  Downloading conda_pack-0.3.1-py2.py3-none-any.whl.

## Start SparkContext

In [None]:
# Orca entry point used to create the context the rest of the notebook runs in.
from bigdl.orca import init_orca_context
# Start Orca in local cluster mode, requesting 8g of memory.
init_orca_context(cluster_mode="local", memory="8g")

Initializing orca context
For Spark local mode, default to use all the cores on the node.
Current pyspark location is : /usr/local/lib/python3.11/dist-packages/pyspark/__init__.py
Start to getOrCreate SparkContext
pyspark_submit_args is:  --driver-class-path /usr/local/lib/python3.11/dist-packages/bigdl/share/dllib/lib/bigdl-dllib-spark_3.1.3-2.4.0-jar-with-dependencies.jar:/usr/local/lib/python3.11/dist-packages/bigdl/share/core/lib/all-2.4.0-20230420.050641-1.jar:/usr/local/lib/python3.11/dist-packages/bigdl/share/orca/lib/bigdl-orca-spark_3.1.3-2.4.0-jar-with-dependencies.jar pyspark-shell 
Successfully got a SparkContext


## Import Packages and Modules

In [None]:
import os
import shutil
import zipfile

import tensorflow as tf
from tensorflow import keras


## Download and Extract Dataset

In [None]:
def downlaod_extract_dataset():
  """
  This function downloads and extracts x-ray images from kaggle
  """

  # Create the .kaggle directory if it doesn't exist
  os.makedirs("/root/.kaggle", exist_ok=True)

  # Move the kaggle.json file
  shutil.move("kaggle.json", "/root/.kaggle/")

  # Set permissions
  os.chmod("/root/.kaggle/kaggle.json", 600)

  # Download kaggle dataset
  !kaggle datasets download -d paultimothymooney/chest-xray-pneumonia

  # Unzip the dataset
  with zipfile.ZipFile("/content/chest-xray-pneumonia.zip", 'r') as zip_ref:
    zip_ref.extractall("dataset")

  print("Dataset extracted successfully!")


In [None]:
# Fetch and unpack the dataset; a Kaggle API token (kaggle.json) must have been uploaded beforehand.
downlaod_extract_dataset()

Dataset URL: https://www.kaggle.com/datasets/paultimothymooney/chest-xray-pneumonia
License(s): other
Downloading chest-xray-pneumonia.zip to /content
100% 2.29G/2.29G [00:21<00:00, 242MB/s]
100% 2.29G/2.29G [00:21<00:00, 117MB/s]
Dataset extracted successfully!


## Load the Train, Test, and Validation Splits

In [None]:
# Root of the extracted archive; every split sits in its own sub-folder.
DATASET_ROOT = "/content/dataset/chest_xray"

# One directory per split (trailing slash kept).
train_dir, val_dir, test_dir = (
    f"{DATASET_ROOT}/{split}/" for split in ("train", "val", "test")
)

In [None]:
BATCH_SIZE = 32
IMG_SIZE = 224
# NOTE(review): CHANNEL is not referenced by any cell shown here; the loader
# below is called without a colour-mode argument, so its default applies.
CHANNEL = 1


def _load_split(split_dir, **loader_kwargs):
    """Read one split directory as batches of IMG_SIZE x IMG_SIZE images with binary labels."""
    return keras.preprocessing.image_dataset_from_directory(
        directory=split_dir,
        image_size=(IMG_SIZE, IMG_SIZE),
        label_mode='binary',
        batch_size=BATCH_SIZE,
        **loader_kwargs,
    )


# Training split: cache the decoded batches, then shuffle with a
# 1000-element buffer and prefetch.
train_raw = _load_split(train_dir, shuffle=True)
train_data = train_raw.cache().shuffle(1000).prefetch(buffer_size=tf.data.AUTOTUNE)

# Test split: cache and prefetch only.
test_raw = _load_split(test_dir)
test_data = test_raw.cache().prefetch(buffer_size=tf.data.AUTOTUNE)

# Validation split: read class_names from the loader's dataset before it is
# wrapped with cache()/prefetch().
val_raw = _load_split(val_dir)
class_names = val_raw.class_names
val_data = val_raw.cache().prefetch(buffer_size=tf.data.AUTOTUNE)

print(f"Class Names: {class_names}")

Found 5216 files belonging to 2 classes.
Found 624 files belonging to 2 classes.
Found 16 files belonging to 2 classes.
Class Names: ['NORMAL', 'PNEUMONIA']


## Define Model

In [None]:
# Preprocessing applied inside the model: resize every image to
# IMG_SIZE x IMG_SIZE and multiply pixel values by 1/255.
# (Despite the variable/layer name, no random augmentation happens here.)
data_augmentation = keras.Sequential([
  keras.layers.Resizing(height=IMG_SIZE, width=IMG_SIZE),
  keras.layers.Rescaling(1./255)
], name ="data_augmentation")


# Two convolution + max-pooling stages, then a 500-unit dense layer and a
# single sigmoid output unit.
model = keras.Sequential(
    [data_augmentation,
     keras.layers.Conv2D(20, kernel_size=(5, 5), strides=(1, 1), activation='relu',
                         padding='valid'),
     keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='valid'),
     keras.layers.Conv2D(50, kernel_size=(5, 5), strides=(1, 1), activation='relu',
                         padding='valid'),
     keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='valid'),
     keras.layers.Flatten(),
     keras.layers.Dense(500, activation='relu'),
     keras.layers.Dense(1, activation='sigmoid'),
     ]
)

# The datasets are loaded with label_mode='binary' and the network ends in one
# sigmoid unit, so the matching loss is binary cross-entropy.
# sparse_categorical_crossentropy expects one output per class and integer
# class indices, which a single-unit output cannot provide.
model.compile(optimizer=keras.optimizers.RMSprop(),
              loss='binary_crossentropy',
              metrics=['accuracy'])

## Orca Estimator

In [None]:
from bigdl.orca.learn.tf2.estimator import Estimator

In [None]:
!pip install ray[default]

Collecting ray[default]
  Downloading ray-2.43.0-cp311-cp311-manylinux2014_x86_64.whl.metadata (19 kB)
Collecting aiohttp_cors (from ray[default])
  Downloading aiohttp_cors-0.8.0-py3-none-any.whl.metadata (20 kB)
Collecting colorful (from ray[default])
  Downloading colorful-0.5.6-py2.py3-none-any.whl.metadata (16 kB)
Collecting py-spy>=0.2.0 (from ray[default])
  Downloading py_spy-0.4.0-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl.metadata (16 kB)
Collecting opencensus (from ray[default])
  Downloading opencensus-0.11.4-py2.py3-none-any.whl.metadata (12 kB)
Collecting virtualenv!=20.21.1,>=20.0.24 (from ray[default])
  Downloading virtualenv-20.29.3-py3-none-any.whl.metadata (4.5 kB)
Collecting distlib<1,>=0.3.7 (from virtualenv!=20.21.1,>=20.0.24->ray[default])
  Downloading distlib-0.3.9-py2.py3-none-any.whl.metadata (5.2 kB)
Collecting opencensus-context>=0.1.3 (from opencensus->ray[default])
  Downloading opencensus_context-0.1.3-py2.py3-none-any.whl.metadata (3.3 kB)

In [None]:
# Build an Orca Estimator from the compiled Keras model (the recorded output
# shows a local Ray instance being started by this call).
# NOTE(review): this Estimator is imported from bigdl.orca.learn.tf2, whose
# from_keras is documented as taking a `model_creator` callable rather than
# `keras_model` — confirm that the model passed here is actually used.
est = Estimator.from_keras(keras_model=model)

2025-03-19 10:16:08,040	INFO worker.py:1832 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://172.28.0.12:8265 [39m[22m


RayContext(dashboard_url='172.28.0.12:8265', python_version='3.11.11', ray_version='2.43.0', ray_commit='ecdcdc6a6e63dc4bcd6ea16aae256ce4d32a7e2c')


[36m(Worker pid=2139)[0m 2025-03-19 10:16:14.488769: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(Worker pid=2139)[0m E0000 00:00:1742379374.525150    2139 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(Worker pid=2139)[0m E0000 00:00:1742379374.538242    2139 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(Worker pid=2139)[0m Instructions for updating:
[36m(Worker pid=2139)[0m use distribute.MultiWorkerMirroredStrategy instead
[36m(Worker pid=2139)[0m 2025-03-19 10:16:18.982909: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capab

## Fit Estimator on train data

In [None]:
# Train for 30 epochs on the training split, validating on the validation split.
# NOTE(review): train_data/val_data were already batched with BATCH_SIZE when
# they were loaded, and batch_size is passed again here — confirm this does not
# batch twice. Also confirm the TF2 Estimator accepts a tf.data.Dataset
# directly; its docs describe `data` as a creator function, XShards or a
# Spark DataFrame.
est.fit(data=train_data,
        batch_size=BATCH_SIZE,
        epochs=30,
        validation_data=val_data)

## Evaluate the Estimator

In [None]:
# Evaluate on the test split and print whatever evaluate() returns.
result = est.evaluate(test_data)
print(f"The Performance of the Model is: {result}")

## Stop SparkContext

In [18]:
from bigdl.orca import stop_orca_context

In [None]:
# Stop the Orca context (counterpart of the init_orca_context() call near the top).
stop_orca_context()

-----------------