In [1]:
import os
import numpy as np
import pandas as pd
import torch
import json
from PIL import Image  
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
import torchvision.models as models
import torch.nn.functional as F
import torch.nn as nn
from fastai.learner import Learner
from fastai.data.core import DataLoaders
from fastai.metrics import accuracy
from fastai.losses import CrossEntropyLossFlat
from fastai.callback.all import SaveModelCallback, EarlyStoppingCallback
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
%matplotlib inline



In [8]:
# # Load datasets
# df = pd.read_csv('Final_Datasets/liver.csv')
# df_test = pd.read_csv('Final_Datasets/test_data_incidence.csv')
# controls = pd.read_csv('Final_Datasets/imbalanced_control_iid.csv')

# # Remove test cases and controls
# test_iids = set(df_test['IID'])

# # Get all cases (CAD = 1) from liver.csv, excluding test data
# cases_train = df[(df['CAD'] == 1) & (~df['IID'].isin(test_iids))]

# # Get all controls (IID in controls.csv and CAD = 0), excluding test data
# controls_train = df[(df['IID'].isin(controls['IID'])) & (df['CAD'] == 0) & (~df['IID'].isin(test_iids))]

# # Combine the imbalanced dataset
# df_imbalanced_train = pd.concat([cases_train, controls_train]).reset_index(drop=True)

# # Save the imbalanced dataset
# df_imbalanced_train.to_csv('Final_Datasets/train_imbalanced_liver.csv', sep=',', index=False)

# # Report the dataset statistics
# cases_count = len(cases_train)
# controls_count = len(controls_train)
# print(f"Number of cases: {cases_count}")
# print(f"Number of controls: {controls_count}")
# print(f"Imbalance ratio (controls to cases): {controls_count / cases_count:.2f}")



In [10]:
# df_imbalanced_train.head()

In [11]:
# LiverMRI Dataset Class
class LiverMRIDataset(Dataset):
    def __init__(self, dataframe, image_column_name, label_column_name, transform=None):
        self.dataframe = dataframe
        self.image_column_name = image_column_name
        self.label_column_name = label_column_name
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_path = self.dataframe[self.image_column_name].iloc[idx]
        label = self.dataframe[self.label_column_name].iloc[idx]

        # Load the grayscale image
        image = Image.open(img_path).convert('L')  # Convert to grayscale

        # Convert grayscale to 3 channels
        transform_to_3_channel = transforms.Compose([
            transforms.Grayscale(num_output_channels=3)
        ])
        image = transform_to_3_channel(image)

        # Apply other transformations if specified
        if self.transform:
            image = self.transform(image)

        label = torch.tensor(label, dtype=torch.long)
        return image, label, img_path

# Backbone Class for Feature Extraction
class Backbone(nn.Module):
    def __init__(self):
        super().__init__()
        base_model = models.resnet50(pretrained=False)
        encoder_layers = list(base_model.children())
        self.backbone = nn.Sequential(*encoder_layers[:9])  # Use the first 9 layers of ResNet50

    def forward(self, x):
        return self.backbone(x)

# Classifier Class (not used in embeddings extraction but included for completeness)
class Classifier(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.drop_out = nn.Dropout()
        self.linear = nn.Linear(2048, num_classes)

    def forward(self, x):
        x = x.view(x.size(0), -1)
        x = self.drop_out(x)
        x = self.linear(x)
        return x

# Liver Embeddings Extraction Class
class LiverDiseaseModelEmbeddings:
    def __init__(self, train_df_path, test_df_path, image_column_name, label_column_name, batch_size=32, model_name='liver_mri_resnet50'):
        self.train_df_path = train_df_path
        self.test_df_path = test_df_path
        self.image_column_name = image_column_name
        self.label_column_name = label_column_name
        self.batch_size = batch_size
        self.model_name = model_name
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self._prepare_data()
        self._prepare_model()

    def _prepare_data(self):
        train_df = pd.read_csv(self.train_df_path)
        test_df = pd.read_csv(self.test_df_path)

        # Liver MRI Dataset
        self.train_dataset = LiverMRIDataset(train_df, self.image_column_name, self.label_column_name, transform=self._get_transforms())
        self.test_dataset = LiverMRIDataset(test_df, self.image_column_name, self.label_column_name, transform=self._get_transforms())

        self.train_loader = DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=8)
        self.test_loader = DataLoader(self.test_dataset, batch_size=self.batch_size, shuffle=False, num_workers=8)

    def _get_transforms(self):
        return transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

    def _prepare_model(self):
        backbone = Backbone()
        classifier = Classifier(num_classes=2)
        model = nn.Sequential(backbone, classifier)
        model.to(self.device)
        self.model = model

        # Load the fine-tuned model
        self.model.load_state_dict(torch.load(f'{self.model_name}.pth'))
        self.model.eval()

    def extract_embeddings(self, loader):
        embeddings, labels, paths = [], [], []
        with torch.no_grad():
            for images, label_batch, path_batch in loader:  # Add path_batch
                images = images.to(self.device)
            
                # Pass through the backbone only
                x = self.model[0](images)  # Extract features from the Backbone
                x = torch.flatten(x, 1)   # Flatten after global pooling
            
                embeddings.append(x.cpu().numpy())
                labels.append(label_batch.cpu().numpy())
                paths.extend(path_batch)  # Collect the image paths
        embeddings = np.concatenate(embeddings)
        labels = np.concatenate(labels)
        return embeddings, labels, paths



    # Updated generate_embeddings_dataframe function
    def generate_embeddings_dataframe(self, embeddings, labels, paths):
        """
        Creates a Pandas DataFrame from embeddings, labels, and image paths.

        Args:
            embeddings (numpy.ndarray): The extracted embeddings.
            labels (numpy.ndarray): The labels corresponding to the embeddings.
            paths (list of str): The image paths.

        Returns:
            pd.DataFrame: A DataFrame with serialized embeddings and metadata.
        """
        # Serialize embeddings as JSON strings for safe CSV storage
        df = pd.DataFrame({
            'image_path': paths,
            'embedding': [json.dumps(emb.tolist()) for emb in embeddings],
            'label': labels
        })
        return df

     #Updated extract_and_save_embeddings function
    def extract_and_save_embeddings(self):
        """
        Extracts embeddings for train and test datasets and saves them as CSV files.

        The embeddings are serialized as JSON strings for robust CSV storage.
        """
        # Extract training embeddings
        train_embeddings, train_labels, train_paths = self.extract_embeddings(self.train_loader)
        train_df = self.generate_embeddings_dataframe(train_embeddings, train_labels, train_paths)

        # Extract test embeddings
        test_embeddings, test_labels, test_paths = self.extract_embeddings(self.test_loader)
        test_df = self.generate_embeddings_dataframe(test_embeddings, test_labels, test_paths)

        # Save DataFrames to CSV with JSON-serialized embeddings
        train_df.to_csv('train_embeddings_liver_nov_imbalanced.csv', index=False)
        test_df.to_csv('test_embeddings_liver_nov.csv', index=False)

In [12]:
if __name__ == "__main__":
    model = LiverDiseaseModelEmbeddings(
        train_df_path='Final_Datasets/train_imbalanced_liver.csv',  # Path to liver train dataset
        test_df_path='Final_Datasets/test_data_incidence.csv',  # Path to liver test dataset
        image_column_name='FilePath_liver',                    # Liver image file paths
        label_column_name='CAD',                               # CAD labels
        model_name='models/liver_mri_radIM_resnet50_model_nov'        # Fine-tuned liver model name
    )
    
    # Extract embeddings and save to CSV
    model.extract_and_save_embeddings()


  self.model.load_state_dict(torch.load(f'{self.model_name}.pth'))


In [2]:
emb = pd.read_csv('train_embeddings_liver_nov_imbalanced.csv')
emb['embedding'] = emb['embedding'].apply(lambda x: np.array(json.loads(x)))
emb.head()

Unnamed: 0,image_path,embedding,label
0,/corral-repl/utexas/UKB-Imaging-Genetics/Imaging_Data/processed_imaging_data/MRI/Abdominal/field_20204_2_cropped/2283985.jpg,"[0.048257745802402496, 0.17932827770709991, 0.2461884319782257, 0.18811623752117157, 0.05363137647509575, 0.22287796437740326, 0.16087187826633453, 0.19072483479976654, 0.12066276371479034, 0.24892660975456238, 0.021761098876595497, 0.29642313718795776, 0.165064737200737, 0.4340325593948364, 0.26540088653564453, 0.2743842303752899, 0.138432577252388, 0.3450947105884552, 0.21279335021972656, 0.16314095258712769, 0.1393556147813797, 0.1841920167207718, 0.22848498821258545, 0.14593391120433807, 0.45362401008605957, 0.10949797183275223, 0.20782503485679626, 0.3812257647514343, 0.33487343788146...",0
1,/corral-repl/utexas/UKB-Imaging-Genetics/Imaging_Data/processed_imaging_data/MRI/Abdominal/field_20204_2_cropped/1414804.jpg,"[0.49649253487586975, 0.5446419715881348, 0.33405622839927673, 0.678919792175293, 0.8624733090400696, 0.3321053683757782, 0.382202684879303, 0.1793617308139801, 0.2632859945297241, 0.2910124361515045, 0.16421186923980713, 0.10440715402364731, 0.1475459635257721, 0.3331582844257355, 0.29979977011680603, 0.8486292958259583, 0.21641682088375092, 0.514647364616394, 0.4642202854156494, 0.6870560050010681, 0.24264007806777954, 0.4795040786266327, 0.05379239842295647, 0.10512242466211319, 0.3960431218147278, 0.19685517251491547, 0.5484703779220581, 0.4000292420387268, 0.11509321630001068, 0.44486...",0
2,/corral-repl/utexas/UKB-Imaging-Genetics/Imaging_Data/processed_imaging_data/MRI/Abdominal/field_20204_2_cropped/2008383.jpg,"[0.4503157436847687, 0.25143322348594666, 0.3439211845397949, 0.5088186860084534, 0.3570999503135681, 0.3991461396217346, 0.33143702149391174, 0.2714579105377197, 0.25446099042892456, 0.5045296549797058, 0.370547890663147, 0.32849961519241333, 0.12461522221565247, 0.5313747525215149, 0.46944621205329895, 0.6066666841506958, 0.3636687994003296, 0.0677860826253891, 0.28542250394821167, 0.4170055687427521, 0.3903954029083252, 0.5325046181678772, 0.21283753216266632, 0.06155705824494362, 0.5240745544433594, 0.101762555539608, 0.35842669010162354, 0.4869183301925659, 0.06481748819351196, 0.2946...",0
3,/corral-repl/utexas/UKB-Imaging-Genetics/Imaging_Data/processed_imaging_data/MRI/Abdominal/field_20204_2_cropped/4557403.jpg,"[0.454449325799942, 0.22551193833351135, 0.30657628178596497, 0.6026245355606079, 0.6241907477378845, 0.31453508138656616, 0.5085590481758118, 0.2247134894132614, 0.4495599567890167, 0.19871528446674347, 0.3039184510707855, 0.12345006316900253, 0.13531652092933655, 0.6168326139450073, 0.3546350300312042, 0.6485339999198914, 0.26137158274650574, 0.34863513708114624, 0.2726458013057709, 0.6379393339157104, 0.5042950510978699, 0.41701602935791016, 0.1594209372997284, 0.099574513733387, 0.3822794556617737, 0.18017825484275818, 0.4591946303844452, 0.32769888639450073, 0.16499248147010803, 0.139...",0
4,/corral-repl/utexas/UKB-Imaging-Genetics/Imaging_Data/processed_imaging_data/MRI/Abdominal/field_20204_2_cropped/5959204.jpg,"[0.2629716396331787, 0.30035993456840515, 0.20724470913410187, 0.49216428399086, 0.6814802289009094, 0.0695125088095665, 0.16787882149219513, 0.08988457173109055, 0.1653607338666916, 0.21452203392982483, 0.12987765669822693, 0.16204451024532318, 0.17137178778648376, 0.4457346498966217, 0.14675098657608032, 0.50738126039505, 0.3554674983024597, 0.29122301936149597, 0.22758160531520844, 0.5015839338302612, 0.323143869638443, 0.20330536365509033, 0.04844409599900246, 0.045219000428915024, 0.25817713141441345, 0.23309996724128723, 0.4273673892021179, 0.25063008069992065, 0.30794137716293335, 0...",1


In [3]:
len(emb['embedding'][0])

2048

In [16]:
emb['label'].value_counts()

label
0    26358
1     4342
Name: count, dtype: int64