In [1]:
import os
import numpy as np
import pandas as pd
import torch
import json
from PIL import Image  
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
import torchvision.models as models
import torch.nn.functional as F
import torch.nn as nn
from fastai.learner import Learner
from fastai.data.core import DataLoaders
from fastai.metrics import accuracy
from fastai.losses import CrossEntropyLossFlat
from fastai.callback.all import SaveModelCallback, EarlyStoppingCallback
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
%matplotlib inline

In [2]:
# Read the full pancreas table, the held-out test table and the list of control IIDs.
df = pd.read_csv('Final_Datasets/pancreas.csv')
df_test = pd.read_csv('Final_Datasets/test_data_incidence.csv')
controls = pd.read_csv('Final_Datasets/imbalanced_control_iid.csv')

# Any IID that appears in the test table must stay out of the training set.
test_iids = set(df_test['IID'])
outside_test = ~df['IID'].isin(test_iids)

# Cases: rows with CAD == 1 that are not part of the test table.
cases_train = df[(df['CAD'] == 1) & outside_test]

# Controls: rows whose IID is in the control list, with CAD == 0, not in the test table.
in_control_list = df['IID'].isin(controls['IID'])
controls_train = df[in_control_list & (df['CAD'] == 0) & outside_test]

# Stack cases on top of controls with a fresh 0..n-1 index and persist the result.
df_imbalanced_train = pd.concat([cases_train, controls_train], ignore_index=True)
df_imbalanced_train.to_csv('Final_Datasets/train_imbalanced_pancreas.csv', index=False)

# Summarise the class balance of the training set that was just written.
cases_count, controls_count = len(cases_train), len(controls_train)
print(f"Number of cases: {cases_count}")
print(f"Number of controls: {controls_count}")
print(f"Imbalance ratio (controls to cases): {controls_count / cases_count:.2f}")

Number of cases: 3495
Number of controls: 26358
Imbalance ratio (controls to cases): 7.54


In [3]:
# Backbone Class for Feature Extraction
class Backbone(nn.Module):
    """Feature extractor built from the first nine child modules of ResNet-50.

    The network is created without pretrained weights; in this notebook the
    trained parameters are loaded afterwards from a checkpoint by the code
    that wraps this module. The attribute name ``backbone`` is part of the
    parameter names in the module's state dict, so it must not be renamed
    while existing checkpoints are in use.
    """

    def __init__(self):
        super().__init__()
        # `weights=None` requests random initialisation. It replaces the
        # deprecated `pretrained=False` argument, which emits a warning on
        # current torchvision releases.
        base_model = models.resnet50(weights=None)
        encoder_layers = list(base_model.children())
        # Keep the first 9 children of ResNet-50.
        # NOTE(review): for torchvision's ResNet-50 this is expected to be
        # everything except the final fully connected layer — confirm if the
        # torchvision version changes.
        self.backbone = nn.Sequential(*encoder_layers[:9])

    def forward(self, x):
        """Run ``x`` through the truncated ResNet-50 and return its output."""
        return self.backbone(x)


# Classifier Class (not used in embeddings extraction but included for completeness)
class Classifier(nn.Module):
    """Dropout followed by a single linear layer mapping 2048 features to class scores.

    Args:
        num_classes: Number of output units of the linear layer.
    """

    def __init__(self, num_classes):
        super().__init__()
        # Attribute names are kept as-is: they define the state-dict keys.
        self.drop_out = nn.Dropout()
        self.linear = nn.Linear(in_features=2048, out_features=num_classes)

    def forward(self, x):
        """Flatten each sample to one row, apply dropout, then the linear layer."""
        batch_size = x.size(0)
        flattened = x.view(batch_size, -1)
        return self.linear(self.drop_out(flattened))


# PancreasMRI Dataset Class
class PancreasMRIDataset(Dataset):
    """Dataset that yields ``(image, label, image_path)`` for each row of a DataFrame.

    Every image is opened from the path stored in ``image_column_name``,
    converted to grayscale, replicated to three channels and then passed
    through ``transform`` when one is given.

    Args:
        dataframe: Table holding one row per sample.
        image_column_name: Column containing the image file paths.
        label_column_name: Column containing the labels (converted to a
            ``torch.long`` tensor on access).
        transform: Optional callable applied to the 3-channel PIL image.
    """

    def __init__(self, dataframe, image_column_name, label_column_name, transform=None):
        self.dataframe = dataframe
        self.image_column_name = image_column_name
        self.label_column_name = label_column_name
        self.transform = transform
        # Built once here instead of on every __getitem__ call: the
        # grayscale -> 3-channel conversion does not depend on the sample.
        self._to_three_channels = transforms.Grayscale(num_output_channels=3)

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Load sample ``idx`` (positional) and return ``(image, label, img_path)``."""
        img_path = self.dataframe[self.image_column_name].iloc[idx]
        label = self.dataframe[self.label_column_name].iloc[idx]

        # Open inside a context manager so the file handle is released
        # deterministically; convert() returns an independent, fully loaded
        # grayscale image that remains valid after the file is closed.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert('L')

        # Replicate the single gray channel to 3 channels.
        image = self._to_three_channels(image)

        # Apply other transformations if specified
        if self.transform:
            image = self.transform(image)

        label = torch.tensor(label, dtype=torch.long)
        return image, label, img_path


# Pancreas Embeddings Extraction Class
class PancreasDiseaseModelEmbeddings:
    """Extracts backbone embeddings for a train and a test table and saves them as CSV.

    Constructing an instance reads both CSV files, builds the datasets and
    data loaders, assembles ``nn.Sequential(Backbone(), Classifier(2))`` and
    loads its parameters from ``f'{model_name}.pth'``.

    Args:
        train_df_path: Path of the training CSV.
        test_df_path: Path of the test CSV.
        image_column_name: Column holding the image file paths.
        label_column_name: Column holding the labels.
        batch_size: Batch size used by both data loaders.
        model_name: Checkpoint path without the ``.pth`` extension.
        num_workers: Number of worker processes used by both data loaders.
    """

    def __init__(self, train_df_path, test_df_path, image_column_name, label_column_name, batch_size=32, model_name='pancreas_mri_resnet50', num_workers=8):
        self.train_df_path = train_df_path
        self.test_df_path = test_df_path
        self.image_column_name = image_column_name
        self.label_column_name = label_column_name
        self.batch_size = batch_size
        self.model_name = model_name
        self.num_workers = num_workers
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self._prepare_data()
        self._prepare_model()

    def _prepare_data(self):
        """Read both CSVs and build the datasets and data loaders."""
        train_df = pd.read_csv(self.train_df_path)
        test_df = pd.read_csv(self.test_df_path)

        # Pancreas MRI Dataset
        self.train_dataset = PancreasMRIDataset(train_df, self.image_column_name, self.label_column_name, transform=self._get_transforms())
        self.test_dataset = PancreasMRIDataset(test_df, self.image_column_name, self.label_column_name, transform=self._get_transforms())

        # NOTE(review): shuffle=True only changes the row order of the saved
        # training embeddings (paths and labels travel with each batch), but
        # it makes that order differ between runs. Kept as-is to preserve the
        # existing output; consider shuffle=False for reproducibility.
        self.train_loader = DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=self.num_workers)
        self.test_loader = DataLoader(self.test_dataset, batch_size=self.batch_size, shuffle=False, num_workers=self.num_workers)

    def _get_transforms(self):
        """Return the preprocessing pipeline: resize to 224x224, tensor conversion, normalisation."""
        return transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            # Per-channel mean/std; presumably the statistics the checkpoint
            # was trained with — verify against the training notebook.
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

    def _prepare_model(self):
        """Build backbone + classifier, load the checkpoint and switch to eval mode."""
        backbone = Backbone()
        classifier = Classifier(num_classes=2)
        model = nn.Sequential(backbone, classifier)
        model.to(self.device)
        self.model = model

        # Load the fine-tuned model.
        # map_location: without it a checkpoint saved from a GPU cannot be
        # loaded when self.device has fallen back to CPU.
        # weights_only=True: the file is fed straight to load_state_dict, so
        # only tensors are needed; this avoids unpickling arbitrary objects.
        state_dict = torch.load(
            f'{self.model_name}.pth',
            map_location=self.device,
            weights_only=True,
        )
        self.model.load_state_dict(state_dict)
        self.model.eval()

    def extract_embeddings(self, loader):
        """Run every batch of ``loader`` through the backbone and collect the results.

        Args:
            loader: Iterable yielding ``(images, labels, paths)`` batches.

        Returns:
            tuple: ``(embeddings, labels, paths)`` where ``embeddings`` and
            ``labels`` are NumPy arrays concatenated over all batches and
            ``paths`` is a flat list. For a loader that yields no batches,
            empty arrays and an empty list are returned.
        """
        embeddings, labels, paths = [], [], []
        with torch.no_grad():
            for images, label_batch, path_batch in loader:
                images = images.to(self.device)

                # self.model[0] is the first module of the Sequential (the
                # backbone); the classifier head is deliberately skipped.
                features = self.model[0](images)
                # Collapse everything after the batch dimension into one axis.
                features = torch.flatten(features, 1)

                embeddings.append(features.cpu().numpy())
                labels.append(label_batch.cpu().numpy())
                paths.extend(path_batch)  # Collect the image paths

        if not embeddings:
            # np.concatenate raises on an empty list; return well-formed
            # empty results instead so downstream code still works.
            return np.empty((0, 0), dtype=np.float32), np.empty((0,), dtype=np.int64), paths

        return np.concatenate(embeddings), np.concatenate(labels), paths

    def generate_embeddings_dataframe(self, embeddings, labels, paths):
        """
        Creates a Pandas DataFrame from embeddings, labels, and image paths.

        Args:
            embeddings (numpy.ndarray): The extracted embeddings.
            labels (numpy.ndarray): The labels corresponding to the embeddings.
            paths (list of str): The image paths.

        Returns:
            pd.DataFrame: A DataFrame with columns ``image_path``,
            ``embedding`` (one JSON string per row) and ``label``.
        """
        # Serialize embeddings as JSON strings for safe CSV storage
        df = pd.DataFrame({
            'image_path': paths,
            'embedding': [json.dumps(emb.tolist()) for emb in embeddings],
            'label': labels
        })
        return df

    def extract_and_save_embeddings(self, train_output_path='train_embeddings_pancreas_nov_imbalanced.csv', test_output_path='test_embeddings_pancreas_nov.csv'):
        """
        Extracts embeddings for train and test datasets and saves them as CSV files.

        The embeddings are serialized as JSON strings for robust CSV storage.

        Args:
            train_output_path: Destination CSV for the training embeddings.
            test_output_path: Destination CSV for the test embeddings.
        """
        # Extract training embeddings
        train_embeddings, train_labels, train_paths = self.extract_embeddings(self.train_loader)
        train_df = self.generate_embeddings_dataframe(train_embeddings, train_labels, train_paths)

        # Extract test embeddings
        test_embeddings, test_labels, test_paths = self.extract_embeddings(self.test_loader)
        test_df = self.generate_embeddings_dataframe(test_embeddings, test_labels, test_paths)

        # Save DataFrames
        train_df.to_csv(train_output_path, index=False)
        test_df.to_csv(test_output_path, index=False)

In [4]:
if __name__ == "__main__":
    # Everything the extractor needs: input tables, column names and checkpoint.
    extractor_kwargs = {
        'train_df_path': 'Final_Datasets/train_imbalanced_pancreas.csv',   # pancreas train table
        'test_df_path': 'Final_Datasets/test_data_incidence.csv',          # pancreas test table
        'image_column_name': 'FilePath_pancreas',                          # image file paths
        'label_column_name': 'CAD',                                        # CAD labels
        'model_name': 'models/pancreas_mri_radIM_resnet50_model_nov',      # checkpoint, '.pth' is appended
    }
    model = PancreasDiseaseModelEmbeddings(**extractor_kwargs)

    # Run the extraction for both splits and write the CSV files.
    model.extract_and_save_embeddings()


  self.model.load_state_dict(torch.load(f'{self.model_name}.pth'))


In [5]:
def _decode_embedding(serialized):
    """Turn one JSON-encoded embedding string back into a NumPy array."""
    return np.array(json.loads(serialized))


# Read the saved training embeddings and decode the JSON column row by row.
emb = pd.read_csv('train_embeddings_pancreas_nov_imbalanced.csv')
emb['embedding'] = emb['embedding'].map(_decode_embedding)
emb.head()

Unnamed: 0,image_path,embedding,label
0,/corral-repl/utexas/UKB-Imaging-Genetics/Imaging_Data/processed_imaging_data/MRI/Abdominal/field_20259_processed/1278310_2.jpg,"[0.08376840502023697, 0.23182588815689087, 0.4157426655292511, 0.34511280059814453, 0.12477399408817291, 0.4130540192127228, 0.08769609034061432, 0.2838345170021057, 0.1507006287574768, 0.30079424381256104, 0.19724617898464203, 0.210137739777565, 0.331701397895813, 0.4793340563774109, 0.1509784609079361, 0.3744572103023529, 0.2231331616640091, 0.19602350890636444, 0.22350211441516876, 0.1468072384595871, 0.11169625073671341, 0.4026980698108673, 0.05223402753472328, 0.1362038254737854, 0.24696047604084015, 0.2851315140724182, 0.27347201108932495, 0.23860913515090942, 0.4485560655593872, 0.2...",0
1,/corral-repl/utexas/UKB-Imaging-Genetics/Imaging_Data/processed_imaging_data/MRI/Abdominal/field_20259_processed/1438790_2.jpg,"[0.12229117006063461, 0.16631604731082916, 0.44930070638656616, 0.1899174600839615, 0.299650639295578, 0.0752238929271698, 0.1451013833284378, 0.30577385425567627, 0.22282856702804565, 0.30110108852386475, 0.12316958606243134, 0.262411504983902, 0.27810198068618774, 0.25416266918182373, 0.30955275893211365, 0.2897599935531616, 0.4396466910839081, 0.4228130877017975, 0.30146321654319763, 0.48665428161621094, 0.1466827541589737, 0.10167908668518066, 0.03663846477866173, 0.14587271213531494, 0.3289724886417389, 0.41484707593917847, 0.24905268847942352, 0.2290511578321457, 0.1414261758327484, ...",1
2,/corral-repl/utexas/UKB-Imaging-Genetics/Imaging_Data/processed_imaging_data/MRI/Abdominal/field_20259_processed/3658572_2.jpg,"[0.1899460107088089, 0.25007355213165283, 0.39428654313087463, 0.46462199091911316, 0.34826382994651794, 0.31689319014549255, 0.34597069025039673, 0.09446204453706741, 0.11757294833660126, 0.3327792286872864, 0.1765201836824417, 0.12134110927581787, 0.17601530253887177, 0.49678441882133484, 0.2570436894893646, 0.37172985076904297, 0.31285208463668823, 0.4544526934623718, 0.3394853174686432, 0.41629713773727417, 0.3512101471424103, 0.3386614918708801, 0.22590148448944092, 0.10689388960599899, 0.3859383761882782, 0.39429768919944763, 0.30711060762405396, 0.2784479558467865, 0.343431979417800...",0
3,/corral-repl/utexas/UKB-Imaging-Genetics/Imaging_Data/processed_imaging_data/MRI/Abdominal/field_20259_processed/4630519_2.jpg,"[0.3096292316913605, 0.254027783870697, 0.3932957351207733, 0.6615026593208313, 0.7755213975906372, 0.2595810294151306, 0.5829742550849915, 0.19204969704151154, 0.3379214406013489, 0.5515918731689453, 0.3503524661064148, 0.45829078555107117, 0.07491203397512436, 0.4145667850971222, 0.4413064420223236, 0.5156205892562866, 0.1698630154132843, 0.563355028629303, 0.34559428691864014, 0.44299599528312683, 0.5953839421272278, 0.7088794112205505, 0.12809227406978607, 0.15132901072502136, 0.5825838446617126, 0.37826186418533325, 0.32645073533058167, 0.5345284342765808, 0.20239204168319702, 0.19843...",0
4,/corral-repl/utexas/UKB-Imaging-Genetics/Imaging_Data/processed_imaging_data/MRI/Abdominal/field_20259_processed/2530973_2.jpg,"[0.3899383842945099, 0.5091740489006042, 0.2433774471282959, 0.5454564094543457, 0.4687931537628174, 0.3619352877140045, 0.4041050374507904, 0.12798650562763214, 0.15178386867046356, 0.3929566740989685, 0.3675544857978821, 0.14701947569847107, 0.26882582902908325, 0.28768542408943176, 0.24871031939983368, 0.35273149609565735, 0.22403603792190552, 0.2439991980791092, 0.5258687138557434, 0.6982058882713318, 0.438167005777359, 0.37069958448410034, 0.23249542713165283, 0.09709741175174713, 0.4500381350517273, 0.3395113945007324, 0.2012854963541031, 0.274767130613327, 0.19311179220676422, 0.381...",0


In [8]:
# Display the decoded embedding stored in the row labelled 0.
emb['embedding'][0]

array([0.08376841, 0.23182589, 0.41574267, ..., 0.23797742, 0.29331797,
       0.20705117])

In [9]:
# Number of values in the embedding of the row labelled 0.
len(emb['embedding'][0])

2048

In [6]:
# Count how many rows carry each label value.
emb['label'].value_counts()

label
0    26358
1     3495
Name: count, dtype: int64