Train Test Split


In [1]:
import os
import numpy as np
import shutil
from sklearn.model_selection import train_test_split

# Define paths
input_dir = "Preprocessed_Images"
train_dir = "Dataset/Train"
test_dir = "Dataset/Test"

# Create train/test directories
os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

# Loop through each disease category. The listing is sorted so the traversal
# order does not depend on the filesystem.
for disease in sorted(os.listdir(input_dir)):
    disease_path = os.path.join(input_dir, disease)

    # Skip stray entries (e.g. .DS_Store) that are not class folders;
    # os.listdir() on them below would raise NotADirectoryError.
    if not os.path.isdir(disease_path):
        continue

    # os.listdir() returns names in arbitrary order, so the input must be
    # sorted for random_state=42 to reproduce the same split on every machine.
    images = sorted(
        name for name in os.listdir(disease_path)
        if os.path.isfile(os.path.join(disease_path, name))
    )

    # train_test_split raises ValueError when either side would be empty
    if len(images) < 2:
        print(f"⚠️ Skipping '{disease}': need at least 2 images to split, found {len(images)}")
        continue

    # Split into train (80%) and test (20%)
    train_images, test_images = train_test_split(images, test_size=0.2, random_state=42)

    # Copy files to their split folder (the originals stay in input_dir)
    for split_dir, split_images in ((train_dir, train_images), (test_dir, test_images)):
        # Create the disease subfolder inside this split
        target_dir = os.path.join(split_dir, disease)
        os.makedirs(target_dir, exist_ok=True)

        for img in split_images:
            shutil.copy(os.path.join(disease_path, img), os.path.join(target_dir, img))

print("✅ Train-Test Split Complete!")


✅ Train-Test Split Complete!


In [2]:
# Report how many images each class folder holds, first for the training
# split and then for the test split.
for heading, split_dir in (("Train Set:", train_dir), ("\nTest Set:", test_dir)):
    print(heading)
    for disease in os.listdir(split_dir):
        n_images = len(os.listdir(os.path.join(split_dir, disease)))
        print(f"{disease}: {n_images} images")


Train Set:
Atelectasis: 368 images
Cardiomegaly: 156 images
Consolidation: 164 images
Edema: 72 images
Effusion: 389 images
Emphysema: 100 images
Fibrosis: 137 images
Hernia: 21 images
Infiltration: 664 images
Mass: 128 images
No Finding: 2203 images
Nodule: 171 images
Pleural_Thickening: 132 images
Pneumonia: 52 images
Pneumothorax: 159 images

Test Set:
Atelectasis: 92 images
Cardiomegaly: 40 images
Consolidation: 41 images
Edema: 18 images
Effusion: 98 images
Emphysema: 25 images
Fibrosis: 35 images
Hernia: 6 images
Infiltration: 166 images
Mass: 33 images
No Finding: 551 images
Nodule: 43 images
Pleural_Thickening: 33 images
Pneumonia: 13 images
Pneumothorax: 40 images


Training a CNN Model Using PyTorch


Step 1: Load the Dataset


In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
import os
import numpy as np

# Define dataset class
class XRayDataset(Dataset):
    """Images stored as NumPy files, organised as one sub-folder per class.

    Expected layout: ``root_dir/<class_name>/<file>``. Every sub-directory of
    ``root_dir`` is one class; non-directory entries in ``root_dir`` are ignored.

    Args:
        root_dir: Directory containing one folder per class.
        transform: Optional callable applied to each image tensor after the
            channel dimension has been added.

    Attributes:
        classes: Class folder names in sorted order. A sample's label is the
            position of its folder name in this list.
        files: List of ``(file_path, label_index)`` tuples.
    """

    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        # Sorted so the class -> index mapping is the same for every dataset
        # built over the same class names (e.g. Train and Test);
        # os.listdir() alone gives no ordering guarantee.
        self.classes = sorted(
            entry for entry in os.listdir(root_dir)
            if os.path.isdir(os.path.join(root_dir, entry))
        )
        self.files = []

        for class_idx, label in enumerate(self.classes):
            class_path = os.path.join(root_dir, label)
            # Sorted so sample indices are stable between runs
            for file in sorted(os.listdir(class_path)):
                self.files.append((os.path.join(class_path, file), class_idx))

    def __len__(self):
        """Return the total number of samples across all classes."""
        return len(self.files)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(image_tensor, label_index)``.

        The array read from disk is converted to float32 and gets a new
        leading dimension (used as the channel axis).
        """
        img_path, label = self.files[idx]
        image = np.load(img_path)  # Load .npy image
        # assumes each file holds a 2-D (H, W) array, giving (1, H, W) — TODO confirm
        image = torch.tensor(image, dtype=torch.float32).unsqueeze(0)  # Add channel dimension

        if self.transform:
            image = self.transform(image)

        return image, label

# Normalisation applied to every image tensor: (x - 0.5) / 0.5 on the single channel
transform = transforms.Normalize(mean=[0.5], std=[0.5])

# Build one dataset per split; both share the same transform
train_dataset, test_dataset = (
    XRayDataset(split_root, transform=transform)
    for split_root in ("Dataset/Train", "Dataset/Test")
)

# Wrap the datasets in batched loaders; only the training data is shuffled
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

print("✅ Dataset Loaded Successfully!")


✅ Dataset Loaded Successfully!


Step 2: Setting Up the GPU for Training


In [2]:
import torch
# Query CUDA once and report both its availability and the name of device 0
cuda_ok = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if cuda_ok else "No GPU"
print("CUDA Available:", cuda_ok)
print("GPU Name:", gpu_name)

CUDA Available: True
GPU Name: NVIDIA GeForce MX550


Step 3: Defining the CNN Model


In [3]:
import torch.nn as nn
import torch.optim as optim

In [5]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Define CNN Model
class CNNModel(nn.Module):
    """Three-stage convolutional classifier for single-channel images.

    Each convolutional stage is Conv(3x3, padding=1) -> BatchNorm -> ReLU ->
    MaxPool(2), so the spatial size is halved three times (divided by 8
    overall). The feature map is then pooled to a fixed
    ``pooled_size x pooled_size`` grid before the fully connected head, which
    lets the network accept inputs of varying height/width.

    With the defaults, a 224x224 input already yields a 28x28 feature map, so
    the adaptive pool is an identity there. The pool has no parameters, so
    state_dict keys are the same as for a model without it.

    Args:
        num_classes: Number of output logits.
        hidden_dim: Width of the hidden fully connected layer (default 512).
        pooled_size: Side length of the pooled feature map fed to the
            classifier (default 28).
    """

    def __init__(self, num_classes, hidden_dim=512, pooled_size=28):
        super().__init__()
        self.conv_layers = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )

        # Fixes the spatial size seen by the first Linear layer regardless of
        # the input resolution; a no-op when the map is already pooled_size^2.
        self.adaptive_pool = nn.AdaptiveAvgPool2d((pooled_size, pooled_size))

        self.fc_layers = nn.Sequential(
            nn.Linear(128 * pooled_size * pooled_size, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(hidden_dim, num_classes)
        )

    def forward(self, x):
        """Return raw class logits of shape ``(batch, num_classes)``.

        ``x`` must have one channel, i.e. shape ``(batch, 1, H, W)``, as
        required by the first convolution.
        """
        x = self.conv_layers(x)
        x = self.adaptive_pool(x)
        x = x.flatten(1)  # Flatten everything except the batch dimension
        return self.fc_layers(x)

# Initialize model
# One output per disease category. Only sub-directories are counted, so a
# stray file in the training folder does not add a phantom class.
train_root = "Dataset/Train"
num_classes = sum(
    os.path.isdir(os.path.join(train_root, entry)) for entry in os.listdir(train_root)
)
model = CNNModel(num_classes).to(device)

# Report the device actually in use instead of always claiming "GPU"
device_label = "GPU" if device.type == "cuda" else "CPU"
print(f"✅ CNN Model Defined & Moved to {device_label}!")

✅ CNN Model Defined & Moved to GPU!


Step 4: Using CrossEntropyLoss and the Adam Optimizer


In [6]:
# Loss: multi-class cross-entropy computed on the model's outputs
criterion = nn.CrossEntropyLoss()

# Optimizer: Adam over all model parameters with a learning rate of 1e-3
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [8]:
from tqdm import tqdm
import torch.cuda

num_epochs = 10

# Only query CUDA memory and mention the GPU when training really runs on one
on_gpu = device.type == "cuda"
device_label = "GPU" if on_gpu else "CPU"

for epoch in range(num_epochs):
    model.train()  # enable Dropout and BatchNorm statistic updates
    running_loss = 0.0

    # Track progress with tqdm. The description is constant within an epoch,
    # so it is set once here rather than on every batch.
    loop = tqdm(train_loader, desc=f"Epoch [{epoch+1}/{num_epochs}]", leave=True)

    for images, labels in loop:
        images, labels = images.to(device), labels.to(device)  # Move to the training device

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # .item() forces a device-to-host sync, so read it once and reuse it
        batch_loss = loss.item()
        running_loss += batch_loss

        # Update tqdm progress bar with loss (and GPU memory when on CUDA)
        postfix = {"loss": batch_loss}
        if on_gpu:
            postfix["gpu_mem"] = f"{torch.cuda.memory_allocated(device) / 1e9:.2f} GB"
        loop.set_postfix(**postfix)

    # max(..., 1) avoids a ZeroDivisionError if the loader yields no batches
    avg_loss = running_loss / max(len(train_loader), 1)
    print(f"Epoch [{epoch+1}/{num_epochs}], Avg Loss: {avg_loss:.4f}")

print(f"✅ Model Training Complete on {device_label}!")

  0%|          | 0/154 [00:00<?, ?it/s]

Epoch [1/10]: 100%|██████████| 154/154 [20:32<00:00,  8.00s/it, gpu_mem=0.84 GB, loss=1.79]


Epoch [1/10], Avg Loss: 4.0140


Epoch [2/10]: 100%|██████████| 154/154 [18:33<00:00,  7.23s/it, gpu_mem=0.84 GB, loss=2.34]


Epoch [2/10], Avg Loss: 2.1656


Epoch [3/10]: 100%|██████████| 154/154 [18:33<00:00,  7.23s/it, gpu_mem=0.84 GB, loss=2.07]


Epoch [3/10], Avg Loss: 2.1170


Epoch [4/10]: 100%|██████████| 154/154 [18:32<00:00,  7.23s/it, gpu_mem=0.84 GB, loss=1.49]


Epoch [4/10], Avg Loss: 2.0678


Epoch [5/10]: 100%|██████████| 154/154 [18:32<00:00,  7.22s/it, gpu_mem=0.84 GB, loss=2.07]


Epoch [5/10], Avg Loss: 2.0542


Epoch [6/10]: 100%|██████████| 154/154 [18:34<00:00,  7.24s/it, gpu_mem=0.84 GB, loss=1.94]


Epoch [6/10], Avg Loss: 2.0658


Epoch [7/10]: 100%|██████████| 154/154 [18:38<00:00,  7.26s/it, gpu_mem=0.84 GB, loss=2.06]


Epoch [7/10], Avg Loss: 2.0270


Epoch [8/10]: 100%|██████████| 154/154 [18:38<00:00,  7.26s/it, gpu_mem=0.84 GB, loss=1.88]


Epoch [8/10], Avg Loss: 1.9915


Epoch [9/10]: 100%|██████████| 154/154 [18:41<00:00,  7.28s/it, gpu_mem=0.84 GB, loss=2.04]


Epoch [9/10], Avg Loss: 1.9897


Epoch [10/10]: 100%|██████████| 154/154 [18:33<00:00,  7.23s/it, gpu_mem=0.84 GB, loss=1.84]

Epoch [10/10], Avg Loss: 1.9610
✅ Model Training Complete on GPU!





In [9]:
# Destination file for the trained weights
model_save_path = "Trained_Model"

# Persist only the parameters/buffers (state dict), not the whole module object
trained_state = model.state_dict()
torch.save(trained_state, model_save_path)

print(f"✅ Model saved as {model_save_path}!")


✅ Model saved as Trained_Model!
