In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import cv2
import networkx as nx
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision.models import resnet50
from PIL import Image
import pickle
from torch.utils.data import Dataset, DataLoader
import gc

In [2]:
class ImageDataset(Dataset):
    """Map-style dataset that loads images listed in a DataFrame.

    The frame must provide a ``'FilePath'`` column (where to read the image
    from) and a ``'Label'`` column. Rows are addressed by *position*
    (``iloc``), not by the frame's index labels.
    """

    def __init__(self, df, transform=None):
        """Keep a reference to the frame and the optional image transform."""
        self.df = df
        self.transform = transform

    def __len__(self):
        """Number of rows (images) in the underlying frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, label, idx)`` for the row at position ``idx``.

        The image is opened from disk and converted to RGB; if a truthy
        ``transform`` was supplied it is applied to the converted image.
        ``idx`` is echoed back so callers can tie outputs to row positions.
        """
        record = self.df.iloc[idx]
        path, target = record['FilePath'], record['Label']

        picture = Image.open(path).convert('RGB')
        if self.transform:
            picture = self.transform(picture)

        return picture, target, idx

In [3]:
def scan_folder_to_dataframe(base_folder, extensions=None):
    """Index the files under ``base_folder`` as ``(FilePath, Label)`` rows.

    The label of a file is the name of the directory that directly contains
    it. Directories and file names are visited in sorted order so the row
    order does not depend on the filesystem's listing order; a seeded
    train/test split of the result is therefore reproducible.

    Parameters
    ----------
    base_folder : str
        Root directory to walk recursively.
    extensions : str or iterable of str, optional
        If given, only files whose name ends with one of these extensions
        are kept (case-insensitive, leading dot optional, e.g.
        ``['.png', 'jpg']``). ``None`` (the default) keeps every file.

    Returns
    -------
    pandas.DataFrame
        Columns ``'FilePath'`` and ``'Label'``; empty (but with both
        columns) when no file matches.
    """
    allowed = None
    if extensions is not None:
        if isinstance(extensions, str):
            extensions = [extensions]
        allowed = tuple(
            ext.lower() if ext.startswith('.') else '.' + ext.lower()
            for ext in extensions
        )

    rows = []
    for root, dirs, files in os.walk(base_folder):
        # Sorting in place makes os.walk descend in a deterministic order.
        dirs.sort()
        label = os.path.basename(root)
        for name in sorted(files):
            if allowed is not None and not name.lower().endswith(allowed):
                continue
            rows.append((os.path.join(root, name), label))

    return pd.DataFrame(rows, columns=['FilePath', 'Label'])


In [4]:
def save_features_increment(features_dict, output_file):
    """Merge ``features_dict`` into the pickle at ``output_file``.

    If ``output_file`` already exists, its dictionary is loaded, updated with
    ``features_dict`` (new entries win on key collisions) and written back;
    otherwise ``features_dict`` is written as-is. The caller's dictionary is
    never modified.

    The result is first written to ``<output_file>.tmp`` and then moved over
    the target with ``os.replace``, so a failure while pickling leaves the
    previously saved file intact instead of a truncated one.

    Note that the whole existing file is read back on every call, so the
    cost of a call grows with the amount already saved.

    Parameters
    ----------
    features_dict : dict
        Entries to add or overwrite.
    output_file : str or os.PathLike
        Path of the pickle file to create or update.
    """
    merged = features_dict
    if os.path.exists(output_file):
        # SECURITY: unpickling can execute arbitrary code. Only point this at
        # files produced by this pipeline, never at untrusted input.
        with open(output_file, 'rb') as f:
            merged = pickle.load(f)
        merged.update(features_dict)

    tmp_file = f"{output_file}.tmp"
    try:
        with open(tmp_file, 'wb') as f:
            pickle.dump(merged, f)
        os.replace(tmp_file, output_file)
    finally:
        # Only still present when the dump or the replace failed.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)

In [5]:
def extract_features_batch(model, dataloader, device, output_file, batch_size=32,
                           flush_every=1000):
    """Run ``model`` over ``dataloader`` and persist its outputs incrementally.

    Each batch yielded by ``dataloader`` must unpack to
    ``(images, labels, indices)``. For every sample the model output row is
    stored under ``int(index)`` as ``{'label': ..., 'features': ndarray}``.
    Collected entries are handed to ``save_features_increment`` whenever at
    least ``flush_every`` of them are pending, and once more at the end for
    any remainder.

    Parameters
    ----------
    model : torch.nn.Module
        Switched to eval mode and called without gradient tracking.
    dataloader : iterable
        Yields ``(images, labels, indices)`` batches.
    device : torch.device, str or int
        Device the image batch is moved to before the forward pass.
    output_file : str
        Pickle file the features are merged into.
    batch_size : int, optional
        Accepted for backward compatibility; not used here (the dataloader
        determines the batch size).
    flush_every : int, optional
        Number of pending entries that triggers a write to disk.
    """
    model.eval()
    pending = {}
    on_cuda = torch.device(device).type == 'cuda'

    with torch.no_grad():
        for batch_imgs, batch_labels, batch_indices in dataloader:
            batch_imgs = batch_imgs.to(device)
            features = model(batch_imgs).cpu().numpy()

            for index, label, feature in zip(batch_indices, batch_labels, features):
                # Scalar tensor labels (e.g. collated ints) are stored as
                # plain Python values so the pickle does not carry tensors.
                if isinstance(label, torch.Tensor) and label.ndim == 0:
                    label = label.item()
                pending[int(index)] = {'label': label, 'features': feature}

            # Drop the batch references, then release cached GPU blocks.
            del batch_imgs, features
            if on_cuda:
                torch.cuda.empty_cache()

            # Spill to disk once enough entries have accumulated.
            if len(pending) >= flush_every:
                save_features_increment(pending, output_file)
                pending.clear()

    # Save whatever is left after the last full flush.
    if pending:
        save_features_increment(pending, output_file)


In [6]:
def _cosine_similarities(rows, row_norms, cols):
    """Cosine similarity of every row of ``rows`` with every row of ``cols``.

    ``row_norms`` are the precomputed L2 norms of ``rows``. Pairs that involve
    an all-zero vector get similarity 0.0 rather than NaN.
    """
    dots = rows @ cols.T
    denom = np.outer(row_norms, np.linalg.norm(cols, axis=1))
    return np.divide(dots, denom, out=np.zeros(dots.shape, dtype=float), where=denom > 0)


def build_graph_from_features(feature_file_path, graph_file_path, batch_size=1000,
                              threshold=0.5, include_self_loops=True):
    """Build and save a cosine-similarity graph from a pickled feature dict.

    The pickle at ``feature_file_path`` must hold a dict mapping node id to
    ``{'label': ..., 'features': 1-D array}``. Every key becomes a node with
    its ``label`` attribute; two nodes are joined by an edge (attribute
    ``weight`` = cosine similarity) when their similarity is strictly greater
    than ``threshold``. Similarities are computed block by block so that at
    most a ``batch_size`` x ``batch_size`` similarity matrix exists at a time.

    Every unordered pair of nodes is compared exactly once: within a block
    only the upper triangle is used, and between two different blocks the
    full rectangle is used.

    Parameters
    ----------
    feature_file_path : str
        Pickle file with the feature dictionary.
    graph_file_path : str
        Where the resulting graph is pickled.
    batch_size : int, optional
        Number of nodes per block; must be at least 1.
    threshold : float, optional
        Minimum (exclusive) cosine similarity for an edge.
    include_self_loops : bool, optional
        When True (default) a node is also compared with itself, which adds a
        self-loop for every non-zero feature vector if ``threshold`` is below
        1. Pass False to obtain a graph without self-loops.

    Returns
    -------
    networkx.Graph

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # SECURITY: unpickling can execute arbitrary code. Only load feature
    # files produced by this pipeline.
    with open(feature_file_path, 'rb') as f:
        features_dict = pickle.load(f)

    G = nx.Graph()
    for index, data in features_dict.items():
        G.add_node(index, label=data['label'])

    nodes = list(features_dict)
    n_nodes = len(nodes)
    diagonal_offset = 0 if include_self_loops else 1

    for i in range(0, n_nodes, batch_size):
        batch_nodes = nodes[i:i + batch_size]
        batch_features = np.array([features_dict[node]['features'] for node in batch_nodes])
        batch_norms = np.linalg.norm(batch_features, axis=1)

        # Only blocks at or after the current one: earlier blocks already
        # compared themselves against this one.
        for j in range(i, n_nodes, batch_size):
            if j == i:
                other_nodes, other_features = batch_nodes, batch_features
            else:
                other_nodes = nodes[j:j + batch_size]
                other_features = np.array(
                    [features_dict[node]['features'] for node in other_nodes])

            similarities = _cosine_similarities(batch_features, batch_norms, other_features)
            selected = similarities > threshold
            if j == i:
                # Same block on both sides: keep each unordered pair once.
                selected = np.triu(selected, k=diagonal_offset)

            for row, col in zip(*np.nonzero(selected)):
                G.add_edge(batch_nodes[row], other_nodes[col],
                           weight=float(similarities[row, col]))

    # nx.write_gpickle was removed in NetworkX 3.0; pickling the graph
    # directly produces the same kind of file.
    with open(graph_file_path, 'wb') as f:
        pickle.dump(G, f, pickle.HIGHEST_PROTOCOL)
    print(f"Graph saved to {graph_file_path}")
    return G

In [7]:
def main(base_folder, batch_size=32, feature_file="features.pkl", graph_file="graph.pkl"):
    """Run the full pipeline: index images, extract model outputs, build graph.

    Steps:
      1. Scan ``base_folder`` into a ``(FilePath, Label)`` frame and take a
         stratified 80/20 split (``random_state=42``); only the training part
         is used below.
      2. Run an ImageNet-pretrained ResNet-50 over the training images
         (resized to 224x224 and normalized with the ImageNet mean/std) and
         store its outputs in ``feature_file``.
      3. Build the similarity graph from ``feature_file`` and save it to
         ``graph_file``.

    Parameters
    ----------
    base_folder : str
        Root directory whose sub-directory names are the class labels.
    batch_size : int, optional
        Batch size of the DataLoader.
    feature_file : str, optional
        Path of the pickle holding the extracted features. An existing file
        at this path is deleted first, because the extraction step merges
        into whatever is already there and would otherwise carry entries
        from an earlier run into the new graph.
    graph_file : str, optional
        Path the graph is pickled to.

    Returns
    -------
    networkx.Graph
        The graph returned by ``build_graph_from_features``.
    """
    # Create dataset
    dataset = scan_folder_to_dataframe(base_folder)
    train_set, _ = train_test_split(
        dataset, test_size=0.2, stratify=dataset['Label'], random_state=42)

    # Setup data loading
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    train_dataset = ImageDataset(train_set, transform=transform)
    # shuffle=False: the dataset returns row positions, order is irrelevant
    # for the result but a fixed order keeps runs comparable.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)

    # Setup model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    try:
        # torchvision >= 0.13: `pretrained=` is deprecated in favour of
        # `weights=`; IMAGENET1K_V1 is what `pretrained=True` loads.
        from torchvision.models import ResNet50_Weights
    except ImportError:
        model = resnet50(pretrained=True)
    else:
        model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)
    model = model.to(device)
    # NOTE(review): the unmodified ResNet-50 returns its final classification
    # layer's output, so the stored "features" are those outputs — confirm
    # this is intended rather than the pooled backbone features.

    # Extract features (start from a clean file, see docstring)
    if os.path.exists(feature_file):
        os.remove(feature_file)
    extract_features_batch(model, train_loader, device, feature_file, batch_size)

    # Build graph
    G = build_graph_from_features(feature_file, graph_file)

    return G

In [None]:
if __name__ == "__main__":
    # Root directory of the image collection; each sub-directory name is
    # used as the class label of the files it contains.
    base_folder = "lung_image_sets"
    # Run the whole pipeline and keep the resulting graph in `G` so later
    # cells can inspect it.
    G = main(base_folder=base_folder)

