In [1]:
import os
import datetime
import copy
import re
import yaml
import uuid
import warnings
import time
import inspect

import numpy as np
import pandas as pd
from functools import partial, reduce
from random import shuffle
import random

import torch
from torch import nn, optim
from torch import nn
from torch.nn import functional as F
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import DataLoader
from torchvision.models import resnet
from torchvision.transforms import Compose, ToTensor, Normalize, Resize
from torchvision.models.resnet import ResNet, BasicBlock
from torchvision.datasets import MNIST
from tqdm.autonotebook import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn import metrics as mtx
from sklearn import model_selection as ms
import os
import shutil
import warnings
import torch
import torch.nn as nn
import torchvision.models as models
from sklearn.preprocessing import StandardScaler
import numpy as np

warnings.filterwarnings("ignore")

  from tqdm.autonotebook import tqdm


In [41]:
# This function is to flatten images
source_folder = "/home/lujun/local/DLMI-Classification/data/raw/testset/"
destination_folder = "/home/lujun/local/DLMI-Classification/data/raw/testset_flattened/"
if not os.path.exists(destination_folder):
    os.makedirs(destination_folder)
    for person_folder in os.listdir(source_folder):
        person_folder_path = os.path.join(source_folder, person_folder)
        if os.path.isdir(person_folder_path):
            for file in os.listdir(person_folder_path):
                file_path = os.path.join(person_folder_path, file)
                if os.path.isfile(file_path):
                    target_file_name = f"{person_folder}_{file}"
                    target_file_path = os.path.join(
                        destination_folder, target_file_name
                    )
                    shutil.copy(file_path, target_file_path)
print("Flattened Images has benn done", destination_folder)

Flattened Images has benn done /home/lujun/local/DLMI-Classification/data/raw/testset_flattened/


In [45]:
clinical_annotation_dataframe = pd.read_csv(
    "/home/lujun/local/DLMI-Classification/data/raw/clinical_annotation.csv"
)

clinical_post_processing = pd.DataFrame()

for index, row in clinical_annotation_dataframe.iterrows():
    target_folder = (
        "/home/lujun/local/DLMI-Classification/data/raw/testset/" + row["ID"] + "/"
    )
    if os.path.exists(target_folder):
        for file in os.listdir(target_folder):
            new_row = row.copy()
            file_path = os.path.join(target_folder, file)
            if os.path.isfile(file_path):
                new_row["image_id"] = f"{row.ID}_{file[:-4]}"
                clinical_post_processing = clinical_post_processing.append(
                    new_row, ignore_index=True
                )

clinical_post_processing.drop(columns=["Unnamed: 0"], inplace=True)
clinical_post_processing.to_csv("clinical_annotation_test.csv")

In [22]:
from PIL import Image
from datetime import datetime


def calculate_age(dob_str):
    formats = ["%m/%d/%Y", "%d-%m-%Y"]  # Two different format at the same time!
    for format_string in formats:
        try:
            dob_date = datetime.strptime(dob_str, format_string)
            current_date = datetime.now()
            age = (
                current_date.year
                - dob_date.year
                - (
                    (current_date.month, current_date.day)
                    < (dob_date.month, dob_date.day)
                )
            )
            return age
        except ValueError:
            continue


def encode_gender(gender):
    if gender == "F" or gender == "f":  # Two different gender value F and f ....
        return 0
    elif gender == "M" or gender == "m":
        return 1
    else:
        raise ValueError(
            "Invalid gender value. Expected 'F' or 'M', but received: {}".format(gender)
        )


class DLMICustomDataset(Dataset):
    def __init__(self, data, transform=None, flag="trainset_flattened"):
        self.data = data
        self.transform = transform
        self.flag = flag

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_ID = self.data.iloc[idx, 6]
        img_path = "data/post-processed/" + self.flag + "/" + str(img_ID) + ".jpg"
        # Assuming the image path is in the first column
        image = Image.open(img_path).convert("RGB")
        label_value = self.data.iloc[idx, 2]
        label = torch.zeros(1, 2)
        if label_value == 0:
            label[0, 0] = 1
        else:
            label[0, 1] = 1
        # Assuming label is in the second column
        if self.transform:
            image = self.transform(image)
        gender = torch.tensor(encode_gender(self.data.iloc[idx, 3]), dtype=torch.long)
        age = torch.tensor(calculate_age(self.data.iloc[idx, 4]), dtype=torch.float32)
        lymph_count = torch.tensor(self.data.iloc[idx, 5], dtype=torch.float32)
        clinical_data = torch.stack((gender, age, lymph_count))

        return image, clinical_data, label

In [23]:
import torch.nn.init as init


class HybridModel(nn.Module):
    def __init__(self, num_classes, mlp_input_dim, mlp_hidden_dim):
        super(HybridModel, self).__init__()
        self.resnet18 = models.resnet18(pretrained=False)
        num_ftrs = self.resnet18.fc.in_features
        self.resnet18.fc = nn.Linear(num_ftrs, num_ftrs)
        self.mlp = MLP(
            input_dim=mlp_input_dim, hidden_dim=mlp_hidden_dim, output_dim=num_classes
        )

        self.linear_layers = nn.ModuleList(
            [
                nn.Linear(num_ftrs if i == 0 else num_classes, num_classes)
                for i in range(4)
            ]
        )
        for layer in self.linear_layers:
            init.xavier_uniform_(layer.weight)
            init.zeros_(layer.bias)

        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, image_data, clinical_data):
        image_output = self.resnet18(image_data)
        for layer in self.linear_layers:
            image_output = layer(image_output)
        mlp_output = self.mlp(clinical_data)
        image_output = self.sigmoid(image_output)
        mlp_output = self.sigmoid(mlp_output)
        final_output = (image_output + mlp_output) / 2
        return final_output


class MLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()

        # Initialize the weights for the linear layers
        init.xavier_uniform_(self.fc1.weight)
        init.zeros_(self.fc1.bias)
        init.xavier_uniform_(self.fc2.weight)
        init.zeros_(self.fc2.bias)
        init.xavier_uniform_(self.fc3.weight)
        init.zeros_(self.fc3.bias)

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x

Training

overfittting problem

The model structure is good or not?

Data augmentation is avaialble or not?


learning rate is good or not? 

data is imbalanced 

Is the modle should be frozen

Initiallise the weight： 

In [None]:
def calculate_accuracy(outputs, labels):
    predicted_classes = torch.argmax(outputs, dim=1)
    true_classes = torch.argmax(labels, dim=1)
    accuracy = (predicted_classes == true_classes).float().mean().item()
    return accuracy

In [33]:
import torchvision.transforms as transforms
from torchvision.datasets import DatasetFolder
from torch.utils.data import random_split


# High learning rate high accuracy but overfitting 0.1 too high
# 0.05 a good choice
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
data = pd.read_csv(
    "/home/lujun/local/DLMI-Classification/data/post-processed/clinical_annotation_train.csv"
)
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)
custom_dataset = DLMICustomDataset(data=data, transform=transform)

train_size = int(0.8 * len(custom_dataset))
val_size = len(custom_dataset) - train_size
train_dataset, val_dataset = random_split(custom_dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=12, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=12, shuffle=False)

model = HybridModel(num_classes=2, mlp_input_dim=3, mlp_hidden_dim=6).to(device)

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

num_epochs = 100
best_loss = float("inf")
unchanged_count = 0
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, clinical, labels in train_loader:
        inputs, clinical, labels = (
            inputs.to(device),
            clinical.to(device),
            labels.to(device),
        )
        optimizer.zero_grad()
        outputs = model(inputs, clinical)
        labels = labels.squeeze(1)
        labels.requires_grad = True
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * inputs.size(0)
    epoch_loss = running_loss / len(train_dataset)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")

    # Validation
    model.eval()  # Set model to evaluation mode
    val_loss = 0.0
    validation_accuracy = []
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, clinical, labels in val_loader:
            inputs, clinical, labels = (
                inputs.to(device),
                clinical.to(device),
                labels.to(device),
            )
            outputs = model(inputs, clinical)
            labels = labels.squeeze(1)
            loss = criterion(outputs, labels)
            val_loss += loss.item() * inputs.size(0)
            total += labels.size(0)
            validation_accuracy.append(calculate_accuracy(outputs, labels))
    val_loss = val_loss / len(val_loader.dataset)  # Calculate validation loss2
    val_accuracy = sum(validation_accuracy) / len(validation_accuracy)
    print(
        f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {epoch_loss:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}"
    )

    if epoch_loss < best_loss:
        best_loss = epoch_loss
        unchanged_count = 0
    else:
        unchanged_count += 1

    if unchanged_count >= 5:
        print("Loss has not changed for 5 consecutive epochs. Stopping training.")
        break

Epoch [1/100], Loss: 0.3891
Epoch [1/100], Train Loss: 0.3891, Val Loss: 0.3330, Val Accuracy: 0.8281
Epoch [2/100], Loss: 0.3349
Epoch [2/100], Train Loss: 0.3349, Val Loss: 0.2985, Val Accuracy: 0.8815
Epoch [3/100], Loss: 0.3020
Epoch [3/100], Train Loss: 0.3020, Val Loss: 0.3209, Val Accuracy: 0.8481
Epoch [4/100], Loss: 0.2741
Epoch [4/100], Train Loss: 0.2741, Val Loss: 0.2722, Val Accuracy: 0.8800
Epoch [5/100], Loss: 0.2655
Epoch [5/100], Train Loss: 0.2655, Val Loss: 0.2793, Val Accuracy: 0.8756
Epoch [6/100], Loss: 0.2619
Epoch [6/100], Train Loss: 0.2619, Val Loss: 0.2798, Val Accuracy: 0.8715
Epoch [7/100], Loss: 0.2527
Epoch [7/100], Train Loss: 0.2527, Val Loss: 0.2798, Val Accuracy: 0.8767
Epoch [8/100], Loss: 0.2467
Epoch [8/100], Train Loss: 0.2467, Val Loss: 0.2593, Val Accuracy: 0.8856
Epoch [9/100], Loss: 0.2401
Epoch [9/100], Train Loss: 0.2401, Val Loss: 0.2788, Val Accuracy: 0.8719
Epoch [10/100], Loss: 0.2317
Epoch [10/100], Train Loss: 0.2317, Val Loss: 0.2826,

In [34]:
# Save the trained model
torch.save(model.state_dict(), "trained_model_final_epoche_100_un_pretrained.pth")

In [36]:
# Evaluate the model on the test set and output predictions
model.eval()
all_predictions = []
test_data = pd.read_csv(
    "/home/lujun/local/DLMI-Classification/data/post-processed/clinical_annotation_test.csv"
)
test_dataset = DLMICustomDataset(
    data=test_data, transform=transform, flag="testset_flattened"
)

with torch.no_grad():
    all_predictions = []  # Initialize list to store predictions
    for i in range(len(test_dataset)):
        inputs, clinical, _ = test_dataset[i]  # Get inputs, clinical data, and label
        inputs, clinical = inputs.unsqueeze(0).to(device), clinical.unsqueeze(0).to(
            device
        )  # Add batch dimension and move to device
        outputs = model(inputs, clinical)  # Forward pass
        all_predictions.append(outputs)  # Append predicted class to list
        # Print example outputs for debugging
        if i == 0:
            print("Example combined outputs:", outputs)

# Output all predictions
print("All Predictions:", all_predictions)

Example combined outputs: tensor([[0.0705, 0.9178]], device='cuda:0')
All Predictions: [tensor([[0.0705, 0.9178]], device='cuda:0'), tensor([[0.5686, 0.4189]], device='cuda:0'), tensor([[0.2034, 0.7632]], device='cuda:0'), tensor([[0.0705, 0.9178]], device='cuda:0'), tensor([[0.0705, 0.9178]], device='cuda:0'), tensor([[0.0705, 0.9178]], device='cuda:0'), tensor([[0.0705, 0.9178]], device='cuda:0'), tensor([[0.5705, 0.4178]], device='cuda:0'), tensor([[0.5199, 0.4529]], device='cuda:0'), tensor([[0.5705, 0.4178]], device='cuda:0'), tensor([[0.0808, 0.9060]], device='cuda:0'), tensor([[0.2149, 0.7594]], device='cuda:0'), tensor([[0.0705, 0.9178]], device='cuda:0'), tensor([[0.5705, 0.4178]], device='cuda:0'), tensor([[0.0705, 0.9178]], device='cuda:0'), tensor([[0.0705, 0.9178]], device='cuda:0'), tensor([[0.5571, 0.4267]], device='cuda:0'), tensor([[0.0705, 0.9178]], device='cuda:0'), tensor([[0.0705, 0.9178]], device='cuda:0'), tensor([[0.0705, 0.9178]], device='cuda:0'), tensor([[0.0

In [62]:
from collections import defaultdict

# for i in range(len(all_predictions)):
#     all_predictions[i] = all_predictions[i].cpu().numpy()

person_predictions = defaultdict(list)
images_names = []
source_folder_path = (
    "/home/lujun/local/DLMI-Classification/data/post-processed/testset_flattened"
)
for person_folder in os.listdir(source_folder_path):
    images_names.append(person_folder[:-4])

for filename, prediction in zip(images_names, all_predictions):
    person_number = filename.split("_")[0]  # Extract person number from filename
    person_predictions[person_number].append(prediction)

person_labels_1 = defaultdict(int)
person_labels_0 = defaultdict(int)

for person_number, predictions in person_predictions.items():
    for prediction in predictions:
        max_index = np.argmax(prediction)
        if max_index == 1:
            person_labels_1[person_number] += 1
        elif max_index == 0:
            person_labels_0[person_number] += 1

# Output the final labels after majority voting
# for person_number, label in person_labels.items():
#     print(f"Person {person_number}: Predicted Label {label}")

In [79]:
data = {
    "person_number": list(person_labels_1.keys()),
    "person_labels_1": list(person_labels_1.values()),
    "person_labels_0": list(person_labels_0.values()),
}

df = pd.DataFrame(data)

In [81]:
df["percentage_1"] = df.person_labels_1 / (df.person_labels_1 + df.person_labels_0)
df["percentage_0"] = df.person_labels_0 / (df.person_labels_1 + df.person_labels_0)

In [91]:
df["percentage_1"] = df["percentage_1"] * 50 / 163
df["percentage_0"] = df["percentage_0"] * 113 / 163

Unnamed: 0,person_number,person_labels_1,person_labels_0,percentage_1,percentage_0
0,P152,30,11,0.22445,0.185994
1,P57,50,19,0.222281,0.190895
2,P108,69,23,0.230061,0.173313
3,P143,84,19,0.250164,0.127881
4,P49,36,15,0.216528,0.203898
5,P81,111,26,0.248533,0.131566
6,P18,43,15,0.227417,0.179289
7,P175,84,28,0.230061,0.173313
8,P69,70,15,0.252616,0.122339
9,P16,134,32,0.247616,0.133639


In [92]:
df["label"] = df["percentage_1"] > df["percentage_0"]
df

Unnamed: 0,person_number,person_labels_1,person_labels_0,percentage_1,percentage_0,label
0,P152,30,11,0.22445,0.185994,True
1,P57,50,19,0.222281,0.190895,True
2,P108,69,23,0.230061,0.173313,True
3,P143,84,19,0.250164,0.127881,True
4,P49,36,15,0.216528,0.203898,True
5,P81,111,26,0.248533,0.131566,True
6,P18,43,15,0.227417,0.179289,True
7,P175,84,28,0.230061,0.173313,True
8,P69,70,15,0.252616,0.122339,True
9,P16,134,32,0.247616,0.133639,True


In [93]:
df["label"] = df["label"].map({True: 1, False: 0})

In [96]:
output_filename = "output.csv"
df[["person_number", "label"]].to_csv(
    output_filename, index=False, header=["ID", "Predicted"]
)

In [84]:
clinical_annotation_dataframe = pd.read_csv(
    "/home/lujun/local/DLMI-Classification/data/raw/clinical_annotation.csv"
)

In [90]:
clinical_annotation_dataframe["LABEL"].value_counts()

 1    113
 0     50
-1     42
Name: LABEL, dtype: int64

In [71]:
# import pandas as pd
# from sklearn.cluster import KMeans


# num_clusters = 2
# kmeans = KMeans(n_clusters=num_clusters)
# X = df[["person_labels_1", "person_labels_0"]]
# kmeans.fit(X)

# clusters = kmeans.predict(X)

# df["cluster"] = clusters

# for cluster_num in range(num_clusters):
#     cluster_data = df[df["cluster"] == cluster_num]
#     print(f"Cluster {cluster_num + 1}:")
#     print(cluster_data)
#     print("\n")

Cluster 1:
   person_number  person_labels_1  person_labels_0  cluster
3           P143               84               19        0
5            P81              111               26        0
7           P175               84               28        0
9            P16              134               32        0
15          P172              111               34        0
16           P75              128               39        0
17          P132              108               26        0
19           P32              116               26        0
20          P120              137               27        0
25          P195              139               41        0
29           P92               91               24        0


Cluster 2:
   person_number  person_labels_1  person_labels_0  cluster
0           P152               30               11        1
1            P57               50               19        1
2           P108               69               23        1
4            P49

In [74]:
output_filename = "output.csv"
df[["person_number", "cluster"]].to_csv(
    output_filename, index=False, header=["ID", "LABEL"]
)

In [65]:
person_labels_0

defaultdict(int,
            {'P152': 11,
             'P57': 19,
             'P108': 23,
             'P143': 19,
             'P49': 15,
             'P81': 26,
             'P18': 15,
             'P175': 28,
             'P69': 15,
             'P16': 32,
             'P170': 7,
             'P119': 14,
             'P9': 15,
             'P56': 15,
             'P71': 12,
             'P172': 34,
             'P75': 39,
             'P132': 26,
             'P203': 18,
             'P32': 26,
             'P120': 27,
             'P58': 7,
             'P133': 9,
             'P7': 11,
             'P197': 12,
             'P195': 41,
             'P93': 9,
             'P138': 22,
             'P114': 9,
             'P92': 24,
             'P14': 9,
             'P148': 6,
             'P196': 11,
             'P98': 9,
             'P188': 12,
             'P4': 15,
             'P139': 20,
             'P178': 16,
             'P86': 5,
             'P24': 12,
             'P

In [42]:
images_names = []
source_folder_path = (
    "/home/lujun/local/DLMI-Classification/data/post-processed/testset_flattened"
)
for person_folder in os.listdir(source_folder_path):
    images_names.append(person_folder[:-4])

df = pd.DataFrame({"Image_Name": images_names, "Prediction": all_predictions})

In [42]:
df[["Person", "Number"]] = df["Image_Name"].str.split("_", 1, expand=True)

In [46]:
grouped_df = (
    df.groupby("Person")["Prediction"]
    .agg(["mean", "max", "min", "median"])
    .reset_index()
)

In [47]:
grouped_df

Unnamed: 0,Person,mean,max,min,median
0,P108,0.804301,1.0,0.5,1.0
1,P114,0.82644,1.0,0.5,0.999999
2,P119,0.793394,1.0,0.5,1.0
3,P120,0.799222,1.0,0.5,1.0
4,P132,0.82687,1.0,0.5,1.0
5,P133,0.851475,1.0,0.5,1.0
6,P138,0.751155,1.0,0.5,0.779365
7,P139,0.841659,1.0,0.5,1.0
8,P14,0.770479,1.0,0.5,0.956484
9,P143,0.826129,1.0,0.5,1.0


In [93]:
test_dataset

<__main__.DLMICustomDataset at 0x7f7853364220>

Pretraining the resnet model

In [None]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision.datasets import YourDataset  # 自定义数据集类，需根据实际情况修改

# 定义预处理转换
preprocess = transforms.Compose(
    [
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

# 加载数据集
dataset = YourDataset(
    root="path/to/your/dataset", transform=preprocess
)  # 修改为你的数据集路径
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# 加载预训练的ResNet模型
resnet = models.resnet18(pretrained=True)

# 冻结参数，只更新最后一层
for param in resnet.parameters():
    param.requires_grad = False

# 替换最后一层（全连接层）为自定义的嵌入层
num_features = resnet.fc.in_features
embedding_size = 128  # 嵌入向量的维度
resnet.fc = nn.Linear(num_features, embedding_size)

# 定义损失函数和优化器
criterion = nn.TripletMarginLoss()  # 使用三元组损失
optimizer = torch.optim.Adam(resnet.parameters(), lr=0.001)

# 训练模型
num_epochs = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
resnet.to(device)

for epoch in range(num_epochs):
    running_loss = 0.0
    for images, labels in dataloader:
        images = images.to(device)
        labels = labels.to(device)

        # 正向传播
        embeddings = resnet(images)

        # 计算损失
        loss = criterion(embeddings, labels)

        # 反向传播和优化
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * images.size(0)

    epoch_loss = running_loss / len(dataset)
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss:.4f}")

# 保存模型
torch.save(resnet.state_dict(), "resnet18_embedding_model.pth")