In [6]:
import glob
import os
import warnings
import mlflow
import mlflow.pytorch 
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pydicom
from pydicom.data import get_testdata_files
from sklearn.model_selection import train_test_split
from torchvision import transforms
from IPython.display import Markdown, display

 # Import functions from the module
import importlib
import help_files._0_definitions 
import  help_files._1_visuals_script
# import  help_files._01_load_data
 # Reload the module to apply the changes to the script
importlib.reload(help_files._0_definitions)
importlib.reload(help_files._1_visuals_script)
# importlib.reload(help_files._01_load_data)
import  help_files._1_visuals_script  as pauls_vs
# Group by 'condition', 'level', and 'severity' and count occurrences
from help_files._0_definitions import count_severity_by_condition_level 
# Define the path
from pathlib import Path

pd.set_option("display.width", 1000)  # Set a large width to prevent line wrapping
 

In [7]:
### In definitions are all the functions that are used in the notebook and globals
with open("help_files/_0_definitions.py") as file:
    exec(file.read())
    ### In definitions are all the functions that are used in the notebook and globals
with open("help_files/_0_run_definitions.py") as file:
    exec(file.read())

In [8]:
# loading data
file_names = ["train_df_3_cat.csv", "test_df_3_cat.csv"]
# Load the data from the CSV files
dataframes = [pd.read_csv(data_path_vor / file_name) for file_name in file_names]
# Unpack the dataframes into separate variables
train_df, test_df = dataframes

print("DataFrames have been loaded successfully.")


DataFrames have been loaded successfully.


In [9]:
# Defining small sample vs. end smaple
whole_data_set = False
# end sample or small sample    
if whole_data_set == True:
    print("Using the whole data set")
else:
    train_df = train_df.sample(n=420, random_state=RSEED)
    test_df = test_df.sample(n=420, random_state=RSEED)
    display(Markdown('<span style="color:red"> this is a small sample : 48692</span>'))

<span style="color:red"> this is a small sample : 48692</span>

Calss definition dataloader (do not change over models)

Step 1: U-Net Model and Data Preparation

In [12]:
import torch
import torch.optim as optim
import torch.nn as nn
import os
import mlflow
import mlflow.pytorch
import matplotlib.pyplot as plt
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader, random_split
import numpy as np
from PIL import Image
import pydicom

class MRILocalizationDataset(Dataset):
    def __init__(self, data, transform=None):
        self.data = data
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_path = self.data.iloc[idx]['image_path']
        dicom_image = pydicom.dcmread(img_path)
        image_array = dicom_image.pixel_array
        image = Image.fromarray(image_array)

        if image.mode != 'RGB':
            image = image.convert('RGB')

        if self.transform:
            image = self.transform(image)

        x = torch.tensor(self.data.iloc[idx]['x'], dtype=torch.float32)
        y = torch.tensor(self.data.iloc[idx]['y'], dtype=torch.float32)

        return image, torch.tensor([x, y]) 

# Define transformations
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.Grayscale(num_output_channels=1),  # Ensures single channel output
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5])  # Example values for grayscale images
])

# Create the dataset
dataset = MRILocalizationDataset(data=train_df, transform=transform)

# Split the dataset into training and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

batch_size = 32
num_epochs = 10
learning_rate = 0.0001
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = UNet(in_channels=1, out_channels=2)  # Single channel input for grayscale images

model.to(device)

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

experiment_name = "MRI_Localization_UNet"
mlflow.set_experiment(experiment_name)

# Define an input example for MLflow
input_example = torch.randn(1, 3, 224, 224, device=device)  # Random tensor simulating one sample image

with mlflow.start_run():
    mlflow.log_param("learning_rate", learning_rate)
    mlflow.log_param("optimizer", "Adam")
    mlflow.log_param("batch_size", batch_size)
    mlflow.log_param("num_epochs", num_epochs)
    mlflow.log_param("model_architecture", "U-Net for Localization")
    mlflow.log_param("output_coordinates", "x, y")

    train_losses = []
    val_losses = []

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0

        for images, coordinates in train_loader:
            images, coordinates = images.to(device), coordinates.to(device)
            optimizer.zero_grad()
            outputs = model(images)

            # Flatten the output tensor to match the target shape (batch_size, 2)
            outputs = outputs.view(outputs.size(0), -1)

            # Ensure the shape of outputs matches the target (x, y)
            if outputs.size(1) != 2:
                outputs = outputs[:, :2]  # Only take the first 2 channels if the output size is incorrect

            loss = criterion(outputs, coordinates)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        train_loss /= len(train_loader)
        train_losses.append(train_loss)
        mlflow.log_metric("train_loss", train_loss, step=epoch)

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for images, coordinates in val_loader:
                images, coordinates = images.to(device), coordinates.to(device)
                outputs = model(images)

                # Flatten the output tensor to match the target shape (batch_size, 2)
                outputs = outputs.view(outputs.size(0), -1)
                if outputs.size(1) != 2:
                    outputs = outputs[:, :2]

                loss = criterion(outputs, coordinates)
                val_loss += loss.item()

        val_loss /= len(val_loader)
        val_losses.append(val_loss)
        mlflow.log_metric("val_loss", val_loss, step=epoch)

        print(f"Epoch [{epoch + 1}/{num_epochs}], Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

    # Log the model with input_example
    input_example_np = input_example.cpu().numpy()  # Convert tensor to numpy array
    mlflow.pytorch.log_model(model, "model", input_example=input_example_np)
mlflow.pytorch.log_model(model, "model", input_example=input_example)

    # Plot and save the loss curves
    plt.figure(figsize=(10, 5))
    plt.plot(range(num_epochs), train_losses, label='Train Loss')
    plt.plot(range(num_epochs), val_losses, label='Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training and Validation Loss')
    plt.legend()
    plt.savefig("loss_curve.png")
    mlflow.log_artifact("loss_curve.png")


KeyboardInterrupt: 

predictions

In [31]:
train_df

Unnamed: 0,study_id,severity,condition,level,series_id,x,y,image_path,missing_image
1935,684747623,2.0,0,l5/s1,3273038000.0,370.053272,506.149133,data/train_images_origin/684747623/774494956/7...,False
6494,2325650566,1.0,0,l5/s1,2076691000.0,293.720819,472.222032,data/train_images_origin/2325650566/20477869/5...,False
1720,624881903,2.0,0,l5/s1,1233162000.0,375.776524,475.359848,data/train_images_origin/624881903/1233161684/...,False
9120,3221995449,1.0,0,l5/s1,215697700.0,231.017145,348.249377,data/train_images_origin/3221995449/215697714/...,False
360,117720278,0.0,0,l5/s1,4077719000.0,250.7559,308.758838,data/train_images_origin/117720278/2514759683/...,False
9663,3421594215,2.0,0,l5/s1,1330621000.0,253.695447,327.937206,data/train_images_origin/3421594215/31901941/1...,False
5277,1868615696,0.0,0,l5/s1,81593100.0,308.551033,493.129771,data/train_images_origin/1868615696/81593102/5...,False
8546,3039159884,0.0,0,l5/s1,4236084000.0,186.966192,267.160142,data/train_images_origin/3039159884/4236083882...,False
2221,777619397,1.0,0,l5/s1,2434669000.0,258.468516,307.391521,data/train_images_origin/777619397/2434668903/...,False
4617,1616262169,0.0,0,l5/s1,61222130.0,265.810301,334.945509,data/train_images_origin/1616262169/61222130/5...,False


test predicion

In [40]:

test_df = test_df.drop(['severity', 'condition', 'level', 'series_id', 'missing_image'], axis=1)


KeyError: "['severity', 'condition', 'level', 'series_id', 'missing_image'] not found in axis"

In [51]:
print(test_df.to_string(index=False, header=True)) 

  study_id  severity  condition level    series_id          x          y                                            image_path  missing_image
2775207739       1.0          0 l5/s1 3249541180.0 157.538462 250.197802  data/train_images_origin/2775207739/3249541180/8.dcm          False
 664153360       2.0          0 l5/s1 1076245514.0 260.194392 306.003738  data/train_images_origin/664153360/1076245514/13.dcm          False
2273432465       1.0          0 l5/s1 1257234944.0 234.036660 287.804481   data/train_images_origin/2273432465/114306451/9.dcm          False
3777149998       0.0          0 l5/s1  735583429.0 151.528158 190.928463  data/train_images_origin/3777149998/3550597941/8.dcm          False
3967802493       0.0          0 l5/s1 1589249065.0 265.649924 385.753425 data/train_images_origin/3967802493/2054070341/11.dcm          False
2780118855       0.0          0 l5/s1 3206608810.0 189.718464 238.157221 data/train_images_origin/2780118855/2064060968/14.dcm          False
 24796

predicting

In [52]:
test_data = test_df

In [53]:
import numpy as np
import pandas as pd
import torch
import mlflow
from torch.utils.data import DataLoader

# Load the trained model from MLflow
model = mlflow.pytorch.load_model("runs:/ec2baecd43734618aedf3e465aec6693/model")
model.eval()  # Set the model to evaluation mode

# Prepare the new test dataset (without coordinates) and DataLoader
test_dataset = MRILocalizationDataset(data=test_df, transform=transform)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Device setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)  # Ensure the model is on the right device

# Model outputs a heatmap instead of coordinates
predicted_coords = []
with torch.no_grad():  # Disable gradient computation for inference
    for images, _ in test_loader:  # Unpack images and ignore labels
        images = images.to(device)
        heatmaps = model(images)  # Get model output as heatmap
        heatmaps = heatmaps.cpu().numpy()  # Move to CPU for processing
        
        for heatmap in heatmaps:
            # Find the brightest point (peak) in the heatmap
            y, x = np.unravel_index(np.argmax(heatmap, axis=None), heatmap.shape)
            predicted_coords.append((x, y))  # Store (x, y) coordinates of the peak

# Save predicted coordinates along with original test data
output_df = pd.DataFrame(predicted_coords, columns=['x', 'y'])
test_df_with_preds = pd.concat([test_df.reset_index(drop=True), output_df], axis=1)
test_df_with_preds.to_csv("test_with_predicted_coordinates.csv", index=False)

print("Inference complete. Predicted coordinates saved.")


ValueError: not enough values to unpack (expected 2, got 1)

In [54]:
import numpy as np
import pandas as pd
import torch

from torch.utils.data import DataLoader

model = mlflow.pytorch.load_model("runs:/ec2baecd43734618aedf3e465aec6693/model")
model.eval()

# Prepare the new test dataset (without coordinates) and DataLoader
test_dataset = MRILocalizationDataset(data=test_df, transform=transform)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Model outputs a heatmap instead of coordinates
predicted_coords = []
with torch.no_grad():  # Disable gradient computation for inference
    for images, _ in test_loader:  # Unpack images and ignore labels
        images = images.to(device)
        heatmaps = model(images)  # Get model output as heatmap
        heatmaps = heatmaps.cpu().numpy()  # Move to CPU for processing
        
        for heatmap in heatmaps:
            # Find the brightest point (peak) in the heatmap
            y, x = np.unravel_index(np.argmax(heatmap, axis=None), heatmap.shape)
            predicted_coords.append((x, y))

# Save predicted coordinates along with original test data
output_df = pd.DataFrame(predicted_coords, columns=['x', 'y'])
test_df_with_preds = pd.concat([test_df.reset_index(drop=True), output_df], axis=1)
test_df_with_preds.to_csv("test_with_predicted_coordinates.csv", index=False)

print("Inference complete. Predicted coordinates saved.")


ValueError: not enough values to unpack (expected 2, got 1)

In [55]:
# Load the model from MLflow
from torch.utils.data import DataLoader

model = mlflow.pytorch.load_model("runs:/ec2baecd43734618aedf3e465aec6693/model")
model.eval()

# Prepare the new test dataset (without coordinates) and DataLoader
test_dataset = MRILocalizationDataset(data=test_df, transform=transform)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Inference on new test data
predicted_coords = []
with torch.no_grad():  # Disable gradient computation for inference
    for images, _ in test_loader:  # Unpack images and ignore labels
        images = images.to(device)
        outputs = model(images)  # Get model output
        outputs = outputs.view(outputs.size(0), -1)  # Reshape to (batch_size, 2) for x, y coordinates
        predicted_coords.extend(outputs.cpu().numpy())  # Move to CPU and add to results

# Save predicted coordinates
import pandas as pd
output_df = pd.DataFrame(predicted_coords, columns=['x', 'y'])
output_df.to_csv("predicted_coordinates.csv", index=False)

print("Inference complete. Predicted coordinates saved.")


Inference complete. Predicted coordinates saved.


In [57]:
import numpy as np
import pandas as pd
import torch
import mlflow
from torch.utils.data import DataLoader

# Load the trained model from MLflow
model = mlflow.pytorch.load_model("runs:/ec2baecd43734618aedf3e465aec6693/model")
model.eval()  # Set the model to evaluation mode

# Prepare the new test dataset (without coordinates) and DataLoader
test_dataset = MRILocalizationDataset(data=test_df, transform=transform)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Device setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)  # Ensure the model is on the right device

# Inference on new test data
predicted_coords = []
with torch.no_grad():  # Disable gradient computation for inference
    for images, _ in test_loader:  # Unpack images and ignore labels
        images = images.to(device)
        outputs = model(images)  # Get model output
        outputs = outputs.view(outputs.size(0), -1)  # Reshape to (batch_size, 2) for x, y coordinates
        predicted_coords.extend(outputs.cpu().numpy())  # Move to CPU and add to results

# Convert the predicted coordinates to a DataFrame
output_df = pd.DataFrame(predicted_coords, columns=['x', 'y'])

# Combine original test_df with predicted coordinates
test_df_with_preds = pd.concat([test_df.reset_index(drop=True), output_df], axis=1)

# Save the resulting dataframe to a CSV file
test_df_with_preds.to_csv("test_with_predicted_coordinates.csv", index=False)

print("Inference complete. Predicted coordinates saved.")


Inference complete. Predicted coordinates saved.


In [58]:
test_df_with_preds

Unnamed: 0,study_id,severity,condition,level,series_id,x,y,image_path,missing_image,x.1,y.1
0,2775207739,1.0,0,l5/s1,3.249541e+09,157.538462,250.197802,data/train_images_origin/2775207739/3249541180...,False,82.500107,293.975647
1,664153360,2.0,0,l5/s1,1.076246e+09,260.194392,306.003738,data/train_images_origin/664153360/1076245514/...,False,256.228149,285.366211
2,2273432465,1.0,0,l5/s1,1.257235e+09,234.036660,287.804481,data/train_images_origin/2273432465/114306451/...,False,203.528595,278.248077
3,3777149998,0.0,0,l5/s1,7.355834e+08,151.528158,190.928463,data/train_images_origin/3777149998/3550597941...,False,250.990265,284.886017
4,3967802493,0.0,0,l5/s1,1.589249e+09,265.649924,385.753425,data/train_images_origin/3967802493/2054070341...,False,291.303436,277.414490
...,...,...,...,...,...,...,...,...,...,...,...
415,626906174,0.0,0,l5/s1,1.645382e+09,282.830769,363.815385,data/train_images_origin/626906174/3362073405/...,False,328.733063,193.168823
416,3231592574,2.0,0,l5/s1,4.096628e+07,209.657025,240.132231,data/train_images_origin/3231592574/40966279/3...,False,259.041473,302.603577
417,332284668,1.0,0,l5/s1,1.754494e+09,340.733632,503.927515,data/train_images_origin/332284668/1754494354/...,False,240.313980,163.705917
418,1874721938,2.0,0,l5/s1,6.611927e+07,152.009078,226.750524,data/train_images_origin/1874721938/66119274/5...,False,309.087158,203.663834
