In [1]:
import os
import logging
import time
import sys
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import seaborn as sns
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from PIL import Image

# Configure logging: INFO and above is written both to a log file and to stdout.
# The file handler opens the log with mode='w', so the file is truncated each
# time this cell runs.
log_file_handler = logging.FileHandler('kidney_tumor_training.log', mode='w')
log_stream_handler = logging.StreamHandler(sys.stdout)

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s: %(message)s',
    handlers=[log_file_handler, log_stream_handler],
)

In [2]:
def load_kidney_tumor_data(excel_path, base_scan_path):
    """Collect scan image paths and binary tumor labels for every patient.

    The spreadsheet's ``Situation`` column is mapped to a binary label
    (``'Tumor'`` -> 1, ``'Normal case with cyst'`` / ``'Normal case'`` -> 0).
    For each row, the folder ``<base_scan_path>/<row index, zero-padded to 2>``
    is scanned one level deep: every ``.jpg`` file inside each of its
    sub-directories is recorded with the patient's label.

    Args:
        excel_path: Path of the Excel sheet read with ``pd.read_excel``.
        base_scan_path: Directory containing one folder per patient.

    Returns:
        tuple[list[str], list[int]]: image paths and their labels (0 or 1),
        index-aligned.
    """
    logging.info(f"Starting data loading from {excel_path}")
    start_time = time.time()

    df = pd.read_excel(excel_path)
    df['binary_label'] = df['Situation'].map({
        'Tumor': 1,
        'Normal case with cyst': 0,
        'Normal case': 0
    })

    image_paths = []
    labels = []
    skipped_patients = 0
    unlabeled_patients = 0

    # NOTE(review): the DataFrame's row index is used as the patient folder
    # number -- confirm it matches the folder naming of the scan archive.
    for patient_id in df.index:
        patient_folder = os.path.join(base_scan_path, f"{patient_id:02d}")

        if not os.path.exists(patient_folder):
            skipped_patients += 1
            continue

        patient_label = df.loc[patient_id, 'binary_label']
        if pd.isna(patient_label):
            # A 'Situation' value missing from the mapping yields NaN; such a
            # label must not reach the loss function, so the patient is dropped.
            unlabeled_patients += 1
            logging.warning(
                f"Patient {patient_id}: unrecognised Situation "
                f"{df.loc[patient_id, 'Situation']!r}; images ignored"
            )
            continue
        patient_label = int(patient_label)

        # Sorted listings keep the sample order (and therefore the seeded
        # train/validation split downstream) independent of the filesystem.
        for subfolder in sorted(os.listdir(patient_folder)):
            subfolder_path = os.path.join(patient_folder, subfolder)
            if not os.path.isdir(subfolder_path):
                continue

            for img_file in sorted(os.listdir(subfolder_path)):
                if img_file.endswith('.jpg'):
                    image_paths.append(os.path.join(subfolder_path, img_file))
                    labels.append(patient_label)

    total_images = len(labels)
    logging.info(f"Data Loading Time: {time.time() - start_time:.2f} seconds")
    logging.info(f"Total Images: {len(image_paths)}")
    logging.info(f"Skipped Patients: {skipped_patients}")
    if unlabeled_patients:
        logging.info(f"Unlabeled Patients: {unlabeled_patients}")

    if total_images:
        negatives = labels.count(0)
        positives = labels.count(1)
        logging.info(f"Class Distribution:")
        logging.info(f"  Negative Class: {negatives} ({negatives/total_images*100:.2f}%)")
        logging.info(f"  Positive Class: {positives} ({positives/total_images*100:.2f}%)")
    else:
        # Percentages are undefined without samples; report instead of dividing by zero.
        logging.warning(f"No images found under {base_scan_path}")

    return image_paths, labels

In [3]:
class KidneyTumorDataset(Dataset):
    """Dataset yielding ``(image_tensor, label_tensor)`` pairs from image files.

    Args:
        image_paths: Sequence of image file paths.
        labels: Sequence of labels, index-aligned with ``image_paths``.
        transform: Optional callable applied to the RGB PIL image. When omitted,
            the image is resized to 128x128, converted to grayscale replicated
            over 3 channels, and turned into a tensor.

    Raises:
        ValueError: If ``image_paths`` and ``labels`` differ in length.
    """

    def __init__(self, image_paths, labels, transform=None):
        if len(image_paths) != len(labels):
            raise ValueError(
                f"image_paths ({len(image_paths)}) and labels ({len(labels)}) "
                "must have the same length"
            )
        self.image_paths = image_paths
        self.labels = labels

        if transform is None:
            transform = transforms.Compose([
                transforms.Resize((128, 128)),
                transforms.Grayscale(num_output_channels=3),
                transforms.ToTensor(),
            ])
        self.transform = transform

    def __len__(self):
        """Return the number of samples."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load sample ``idx``.

        Returns:
            ``(image, label)`` where ``label`` is a float32 scalar tensor, or
            ``(None, None)`` if the image could not be loaded or transformed
            (the failure is logged). Consumers must filter out such samples
            before batching.
        """
        try:
            # The context manager releases the file handle as soon as the pixel
            # data has been copied by convert(), instead of waiting for GC.
            with Image.open(self.image_paths[idx]) as raw_image:
                rgb_image = raw_image.convert('RGB')
            image = self.transform(rgb_image)
            label = torch.tensor(self.labels[idx], dtype=torch.float32)
            return image, label
        except Exception as e:
            # Deliberate best-effort: one unreadable file must not abort a run.
            logging.error(f"Error processing image {self.image_paths[idx]}: {e}")
            return None, None

In [4]:
def _collate_skip_failed(batch):
    """Collate a batch after dropping samples that failed to load.

    The dataset signals a load failure with ``(None, None)``; the default
    collate function cannot batch ``None``. Returns ``(None, None)`` when no
    sample of the batch is usable so the training/validation loops can skip it.
    """
    from torch.utils.data.dataloader import default_collate

    usable = [
        sample for sample in batch
        if sample[0] is not None and sample[1] is not None
    ]
    if not usable:
        return None, None
    return default_collate(usable)


def create_data_loaders(image_paths, labels, batch_size=32):
    """Split the samples 80/20 (stratified, seed 42) and wrap them in loaders.

    Args:
        image_paths: Sequence of image file paths.
        labels: Sequence of labels, index-aligned with ``image_paths``; also
            used as the stratification target.
        batch_size: Number of samples per batch for both loaders.

    Returns:
        ``(train_loader, val_loader)``. The training loader shuffles, the
        validation loader does not. Both drop samples the dataset could not
        load; a batch with no usable sample is yielded as ``(None, None)``.
    """
    logging.info("Creating train-validation split")
    start_time = time.time()

    X_train, X_val, y_train, y_val = train_test_split(
        image_paths, labels,
        test_size=0.2,
        stratify=labels,
        random_state=42
    )

    train_dataset = KidneyTumorDataset(X_train, y_train)
    val_dataset = KidneyTumorDataset(X_val, y_val)

    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=0,
        collate_fn=_collate_skip_failed
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=0,
        collate_fn=_collate_skip_failed
    )

    logging.info(f"Data Loader Creation Time: {time.time() - start_time:.2f} seconds")
    logging.info(f"Train Batches: {len(train_loader)}")
    logging.info(f"Validation Batches: {len(val_loader)}")

    return train_loader, val_loader

In [5]:
class SimpleTumorClassifier(nn.Module):
    """Two-block CNN producing one sigmoid probability per image.

    Each convolutional block is Conv(3x3, padding 1) -> ReLU -> MaxPool(2),
    so the spatial size is halved twice. The classifier head is
    Linear -> ReLU -> Dropout(0.5) -> Linear -> Sigmoid.

    Args:
        input_size: Height and width of the (square) input images. The default
            of 128 gives the flattened feature size ``32 * 32 * 32``.
        in_channels: Number of input channels.

    Raises:
        ValueError: If ``input_size`` is too small to survive both poolings.
    """

    def __init__(self, input_size=128, in_channels=3):
        super().__init__()
        # Two stride-2 poolings with floor rounding shrink each side to size // 4.
        pooled_side = input_size // 4
        if pooled_side < 1:
            raise ValueError(f"input_size must be at least 4, got {input_size}")

        self.conv_layers = nn.Sequential(
            nn.Conv2d(in_channels, 16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )

        self.fc_layers = nn.Sequential(
            nn.Linear(32 * pooled_side * pooled_side, 128),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Map a ``(batch, in_channels, H, W)`` tensor to ``(batch, 1)`` probabilities."""
        x = self.conv_layers(x)
        # Keep the batch dimension, flatten channels and spatial dims together.
        x = torch.flatten(x, 1)
        return self.fc_layers(x)

In [6]:
def _mean_or_nan(total, count):
    """Return ``total / count``, or NaN when ``count`` is zero."""
    return total / count if count else float('nan')


def train_model(model, train_loader, val_loader, epochs=5):
    """Train ``model`` with Adam (lr=0.001) and binary cross-entropy on the CPU.

    After every epoch the model is evaluated on ``val_loader`` and the loss and
    accuracy of both phases are logged. Batches delivered as ``(None, None)``
    are skipped and excluded from the averages.

    Args:
        model: Module whose output is one probability per sample.
        train_loader: Iterable of ``(images, labels)`` training batches.
        val_loader: Iterable of ``(images, labels)`` validation batches.
        epochs: Number of passes over ``train_loader``.

    Returns:
        The same ``model`` instance, trained in place.
    """
    logging.info("Starting Model Training")
    total_start_time = time.time()

    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    device = torch.device('cpu')
    model.to(device)

    for epoch in range(epochs):
        epoch_start_time = time.time()
        model.train()
        total_train_loss = 0.0
        total_correct = 0
        total_samples = 0
        train_batches = 0

        for batch_idx, (images, labels) in enumerate(train_loader):
            if images is None or labels is None:
                continue

            images = images.to(device)
            labels = labels.to(device).float().reshape(-1)

            optimizer.zero_grad()
            # reshape(-1) keeps the batch dimension even for a batch of one; a
            # bare squeeze() would yield a 0-dim tensor that BCELoss rejects.
            outputs = model(images).reshape(-1)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()
            predictions = (outputs > 0.5).float()
            total_correct += (predictions == labels).float().sum().item()
            total_samples += labels.size(0)
            train_batches += 1

            if batch_idx % 10 == 0:
                logging.info(f"Epoch {epoch+1}, Batch {batch_idx}: Loss = {loss.item():.4f}")

        model.eval()
        total_val_loss = 0.0
        total_val_correct = 0
        total_val_samples = 0
        val_batches = 0

        with torch.no_grad():
            for images, labels in val_loader:
                if images is None or labels is None:
                    continue

                images = images.to(device)
                labels = labels.to(device).float().reshape(-1)
                outputs = model(images).reshape(-1)
                val_loss = criterion(outputs, labels)

                total_val_loss += val_loss.item()
                predictions = (outputs > 0.5).float()
                total_val_correct += (predictions == labels).float().sum().item()
                total_val_samples += labels.size(0)
                val_batches += 1

        # Averages use the batches/samples actually processed, so skipped
        # batches do not dilute them and an empty loader yields NaN, not a crash.
        train_accuracy = _mean_or_nan(total_correct, total_samples)
        val_accuracy = _mean_or_nan(total_val_correct, total_val_samples)

        logging.info(f"Epoch {epoch+1} Summary:")
        logging.info(f"  Train Loss: {_mean_or_nan(total_train_loss, train_batches):.4f}")
        logging.info(f"  Train Accuracy: {train_accuracy:.4f}")
        logging.info(f"  Validation Loss: {_mean_or_nan(total_val_loss, val_batches):.4f}")
        logging.info(f"  Validation Accuracy: {val_accuracy:.4f}")
        logging.info(f"  Epoch Time: {time.time() - epoch_start_time:.2f} seconds")

    logging.info(f"Total Training Time: {time.time() - total_start_time:.2f} seconds")
    return model

In [7]:
# NOTE(review): a later cell defines evaluate_model again (without the plot);
# after a top-to-bottom run that later definition is the one in effect.
def evaluate_model(model, val_loader):
    """Evaluate ``model`` on ``val_loader`` and save a confusion-matrix plot.

    Predictions are thresholded at 0.5. The classification report and the
    confusion matrix are logged, and the matrix is drawn as a heatmap written
    to ``confusion_matrix.png``. Batches delivered as ``(None, None)`` are
    skipped.

    Args:
        model: Module whose output is one probability per sample.
        val_loader: Iterable of ``(images, labels)`` batches.

    Returns:
        ``(report, cm)``: the text classification report and the confusion
        matrix.
    """
    logging.info("Starting Model Evaluation")
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for images, labels in val_loader:
            if images is None or labels is None:
                continue

            # reshape(-1) keeps a 1-D result for a batch of one; a bare
            # squeeze() would give a 0-dim array that extend() cannot iterate.
            outputs = model(images).reshape(-1)
            predictions = (outputs > 0.5).float()

            all_preds.extend(predictions.cpu().numpy())
            all_labels.extend(labels.reshape(-1).cpu().numpy())

    # Classification Report
    report = classification_report(all_labels, all_preds)
    logging.info("\nClassification Report:\n" + report)

    # Confusion Matrix
    cm = confusion_matrix(all_labels, all_preds)
    logging.info("\nConfusion Matrix:\n" + str(cm))

    # Visualization
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title('Confusion Matrix')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    fig.savefig('confusion_matrix.png')
    logging.info("Confusion matrix plot saved")

    return report, cm


In [8]:
# NOTE(review): this redefines evaluate_model from the previous cell and
# replaces it; unlike that version it does not save a confusion-matrix plot.
def evaluate_model(model, val_loader):
    """Evaluate ``model`` on ``val_loader`` and log the metrics.

    Predictions are thresholded at 0.5. The classification report and the
    confusion matrix are logged. Batches delivered as ``(None, None)`` are
    skipped.

    Args:
        model: Module whose output is one probability per sample.
        val_loader: Iterable of ``(images, labels)`` batches.

    Returns:
        ``(report, cm)``: the text classification report and the confusion
        matrix.
    """
    logging.info("Starting Model Evaluation")
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for images, labels in val_loader:
            if images is None or labels is None:
                continue

            # reshape(-1) keeps a 1-D result for a batch of one; a bare
            # squeeze() would give a 0-dim array that extend() cannot iterate.
            outputs = model(images).reshape(-1)
            predictions = (outputs > 0.5).float()

            all_preds.extend(predictions.cpu().numpy())
            all_labels.extend(labels.reshape(-1).cpu().numpy())

    # Classification Report
    report = classification_report(all_labels, all_preds)
    logging.info("\nClassification Report:\n" + report)

    # Confusion Matrix
    cm = confusion_matrix(all_labels, all_preds)
    logging.info("\nConfusion Matrix:\n" + str(cm))

    return report, cm

In [9]:
def save_model(model, filepath='kidney_tumor_model.pth'):
    """Write the model's ``state_dict`` to ``filepath`` with ``torch.save``.

    Only the state dict is stored, not the module object itself. Progress is
    logged before and after the write.
    """
    logging.info(f"Saving model to {filepath}")
    weights = model.state_dict()
    torch.save(weights, filepath)
    logging.info("Model saved successfully")

def main(excel_path='Dataset/00Kidney_Patients.xlsx',
         base_scan_path='unzipped_scans',
         epochs=5,
         batch_size=32,
         seed=42):
    """Run the full pipeline: load data, train, evaluate and save the model.

    Args:
        excel_path: Spreadsheet with the per-patient ``Situation`` column.
        base_scan_path: Directory containing one scan folder per patient.
        epochs: Number of training epochs.
        batch_size: Batch size for the training and validation loaders.
        seed: Seed for PyTorch's global RNG, which drives weight
            initialisation, dropout and the shuffling of the training loader.
    """
    # Without a fixed seed two runs of the notebook give different results.
    torch.manual_seed(seed)

    # Load data
    image_paths, labels = load_kidney_tumor_data(excel_path, base_scan_path)

    # Create data loaders
    train_loader, val_loader = create_data_loaders(
        image_paths, labels, batch_size=batch_size
    )

    # Initialize model
    model = SimpleTumorClassifier()

    # Train model
    trained_model = train_model(model, train_loader, val_loader, epochs=epochs)

    # Evaluate model
    eval_report, conf_matrix = evaluate_model(trained_model, val_loader)

    # Save model
    save_model(trained_model)

if __name__ == '__main__':
    main()

2025-01-25 20:29:05,571 - INFO: Starting data loading from Dataset/00Kidney_Patients.xlsx
2025-01-25 20:29:05,904 - INFO: Data Loading Time: 0.33 seconds
2025-01-25 20:29:05,904 - INFO: Total Images: 7701
2025-01-25 20:29:05,904 - INFO: Skipped Patients: 10
2025-01-25 20:29:05,904 - INFO: Class Distribution:
2025-01-25 20:29:05,919 - INFO:   Negative Class: 3570 (46.36%)
2025-01-25 20:29:05,920 - INFO:   Positive Class: 4131 (53.64%)
2025-01-25 20:29:05,922 - INFO: Creating train-validation split
2025-01-25 20:29:05,925 - INFO: Data Loader Creation Time: 0.00 seconds
2025-01-25 20:29:05,925 - INFO: Train Batches: 193
2025-01-25 20:29:05,925 - INFO: Validation Batches: 49
2025-01-25 20:29:05,958 - INFO: Starting Model Training
2025-01-25 20:29:06,422 - INFO: Epoch 1, Batch 0: Loss = 0.7009
2025-01-25 20:29:10,789 - INFO: Epoch 1, Batch 10: Loss = 0.6947
2025-01-25 20:29:15,177 - INFO: Epoch 1, Batch 20: Loss = 0.6529
2025-01-25 20:29:19,946 - INFO: Epoch 1, Batch 30: Loss = 0.6705
2025-