# Imports and Dependencies
Start your notebook by importing any necessary libraries and dependencies that you'll need throughout your code. This could include things like NumPy, Pandas, scikit-learn, or TensorFlow.

In [2]:
import numpy as np
import pandas as pd
from sklearn import datasets
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import LinearSVC
from sklearn import metrics, model_selection, tree
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

import random
import torch
from torch import nn, optim
import math
from IPython import display
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import StratifiedShuffleSplit

# Data Preparation
Load your dataset into memory, and perform any necessary preprocessing steps. This might include tasks like cleaning the data, handling missing values, feature scaling, encoding categorical variables, and so on.

In [5]:
# Loading the dataset into a Pandas Dataframe - train, test and validate 
    # the data is already divided into train test and validate    
from PIL import Image
import os

# Specify the directory where your images are located
data_dir = '/Users/mac/Desktop/Jupyter/BumbleKite/chest_xray'

In [62]:
from PIL import Image
import os

# Get the list of subdirectories (train, test, validate)
subdirectories = ['train', 'test', 'val']

# Get the list of classes
classes = ['NORMAL', 'PNEUMONIA']

# Create an empty dictionary to store the loaded images
loaded_images = {}

# Iterate over the subdirectories
for subdirectory in subdirectories:
    # Create an empty dictionary for the current subdirectory
    subdirectory_images = {}
    
    # Get the path to the current subdirectory
    subdirectory_path = os.path.join(data_dir, subdirectory)
    
    # Iterate over the classes
    for class_name in classes:
        # Create a list to store images from the current class
        image_list = []
        
        # Get the path to the current class subfolder
        class_dir = os.path.join(subdirectory_path, class_name)
        
        # Iterate over the files in the class subfolder
        for filename in os.listdir(class_dir):
            # Get the path to the current image file
            image_path = os.path.join(class_dir, filename)
            
            # Load the image using PIL
            image = Image.open(image_path)
            
            # Append the loaded image to the image_list
            image_list.append(image)
        
        # Close the images after appending them to the list
        for image in image_list:
            image.close()
        
        # Add the image_list to the subdirectory_images dictionary with the class name as the key
        subdirectory_images[class_name] = image_list
    
    # Add the subdirectory_images dictionary to the loaded_images dictionary with the subdirectory name as the key
    loaded_images[subdirectory] = subdirectory_images


## Data distribution 

In [63]:

# Calculate the percentage of normal and pneumonia images
for subdirectory in loaded_images:
    subdirectory_images = loaded_images[subdirectory]
    total_images = sum(len(images) for images in subdirectory_images.values())
    
    for class_name in classes:
        class_images = subdirectory_images[class_name]
        class_count = len(class_images)
        class_percentage = (class_count / total_images) * 100
        print(f"Percentage of {class_name} images in {subdirectory}: {class_percentage:.2f}%")

Percentage of NORMAL images in train: 25.71%
Percentage of PNEUMONIA images in train: 74.29%
Percentage of NORMAL images in test: 37.50%
Percentage of PNEUMONIA images in test: 62.50%
Percentage of NORMAL images in val: 50.00%
Percentage of PNEUMONIA images in val: 50.00%


In [49]:
loaded_images['train']['PNEUMONIA']

[<PIL.JpegImagePlugin.JpegImageFile image mode=L size=1048x736>,
 <PIL.JpegImagePlugin.JpegImageFile image mode=L size=984x672>,
 <PIL.JpegImagePlugin.JpegImageFile image mode=L size=992x712>,
 <PIL.JpegImagePlugin.JpegImageFile image mode=L size=1224x888>,
 <PIL.JpegImagePlugin.JpegImageFile image mode=L size=864x480>,
 <PIL.JpegImagePlugin.JpegImageFile image mode=L size=944x584>,
 <PIL.JpegImagePlugin.JpegImageFile image mode=L size=1816x1110>,
 <PIL.JpegImagePlugin.JpegImageFile image mode=L size=1408x1024>,
 <PIL.JpegImagePlugin.JpegImageFile image mode=L size=1200x552>,
 <PIL.JpegImagePlugin.JpegImageFile image mode=L size=1336x1256>,
 <PIL.JpegImagePlugin.JpegImageFile image mode=L size=1400x1040>,
 <PIL.JpegImagePlugin.JpegImageFile image mode=L size=1016x608>,
 <PIL.JpegImagePlugin.JpegImageFile image mode=L size=1360x936>,
 <PIL.JpegImagePlugin.JpegImageFile image mode=L size=1576x1056>,
 <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=850x560>,
 <PIL.JpegImagePlugin.J

In [72]:

# from PIL import Image
# import matplotlib.pyplot as plt

# # Assuming you have loaded a single PIL image and stored it in the variable 'image'
# image = loaded_images['train']['PNEUMONIA'][0][0]

# # Display the image
# plt.imshow(image)
# plt.axis('off')  # Optional: turn off the axis
# plt.show()

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models

# Create the CNN architecture
model = models.Sequential()

# Convolutional layers
model.add(layers.Conv2D(64, (3, 3), activation='relu', padding='same', input_shape=(image_height, image_width, 3)))
model.add(layers.Conv2D(64, (3, 3), activation='relu', padding='same'))
model.add(layers.MaxPooling2D((2, 2)))

# Fully connected layers
model.add(layers.Flatten())
model.add(layers.Dense(4096, activation='relu'))
model.add(layers.Dense(4096, activation='relu'))

# Output layer
model.add(layers.Dense(2, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()
