Each label column contains one of four values: 1.0, -1.0, 0.0, or missing. These labels have the following interpretation:

- 1.0 - The label was positively mentioned in the associated study, and is present in one or more of the corresponding images
e.g. "A large pleural effusion"   
- 0.0 - The label was negatively mentioned in the associated study, and therefore should not be present in any of the corresponding images
e.g. "No pneumothorax."   
- -1.0 - The label was either: 
  (1) mentioned with uncertainty in the report, and therefore may or may not be present to some degree in the corresponding image, or (2) mentioned with ambiguous language in the report and it is unclear if the pathology exists or not    
  Explicit uncertainty: "The cardiac size cannot be evaluated."  
  Ambiguous language: "The cardiac contours are stable."  
- Missing (empty element) - No mention of the label was made in the report
---
In this project, I using   
`1` represent `positive`; (same with the original category indicator)   
`0` represent `negative`; (same with the original category indicator)  
`0` represent `Nan`; Reasonably assume that `Missing` value in the original table indicates the absence of certain of disease, in this case, `Nan` is replace by `0`;  
`uncertainty` is more complicate to preprocess, and there are multiple strategies:  
- binary: categorize the `uncertainty` into no-postive case, in this it would be represented with 0
- binary_2: reference the strategies used in the paper [[1]](https://arxiv.org/pdf/1901.07031.pdf) and [[2]](https://arxiv.org/pdf/2211.14929)
  - `Atelectasis` and `Edema`: U-ones
  - `Cardiomegaly`: multi-class
  - *`rest`*: U-zeros
  - `ignore`: U-ignore, ignore the uncertainty cases and training with mask binary cross entropy
  - > Strategy_1 : `Atelectasis`, `Edema`: U-ones; and the `rest`: U-zeros.
  - > Strategy_2 : U-ignore, ignore the uncertainty cases
  
- multiple-classes: in this case, `uncertainty` will be viewed as a independent indicator, and would be represented with `2` 
  - -1 is not applicable for cross-entropy in python
 

##### import package

In [69]:
import pandas as pd
from collections import defaultdict
import clip
import torch
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
from PIL import Image
import numpy as np
from typing import Any, Dict, Optional, Tuple, Union
import open_clip
import copy
from torchvision.transforms import InterpolationMode
import os
from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
import torch.nn as nn

##### load original dateset

In [136]:
split_data = pd.read_csv("/home_data/home/v-liudsh/coding/constrastive_P/diagnosisP/exchange/Fine-Grained_Features_Alignment_via_Constrastive_Learning/data/project_using_data/mimic-cxr-2.0.0-split.csv")
original_label_data = pd.read_csv("/home_data/home/v-liudsh/coding/constrastive_P/diagnosisP/exchange/Fine-Grained_Features_Alignment_via_Constrastive_Learning/data/project_using_data/mimic-cxr-2.0.0-chexpert.csv")
original_meta_data = pd.read_csv("/home_data/home/v-liudsh/coding/constrastive_P/diagnosisP/exchange/Fine-Grained_Features_Alignment_via_Constrastive_Learning/data/project_using_data/mimic-cxr-2.0.0-metadata.csv")

In [137]:
original_label_data[original_label_data['study_id']==58235663]

split_data[split_data['study_id']==58235663]

Unnamed: 0,dicom_id,study_id,subject_id,split
58215,1a671a62-0a32dfc6-5f85029c-81c3922e-3f5a2c27,58235663,11573679,train


##### extract 14 labels

In [161]:
def extract_14_label_4_each_record(original_df = None):
  # for index, row in original_df.iterrows():
    label_dic = {}
    for column_name, column_data in original_df.items():
      if column_name in ["subject_id", 'study_id', "original_14_labels", "strategy1_14_labels"]:
        continue
      label_dic[column_name] = 0 if pd.isnull(column_data) else column_data
    return label_dic
  
original_label_data["original_14_labels"] = original_label_data.apply(extract_14_label_4_each_record, axis=1)

def extract_14_label_4_each_record_with_strategy_1(each_row):
    label_dic = {}
    for column_name, column_data in each_row.items():
      if column_name in ["subject_id", 'study_id', "original_14_labels", "strategy1_14_labels"]:
        continue
      if column_name in ['Atelectasis', 'Edema'] and column_data == -1:
        label_dic[column_name] = 1
      elif column_name not in ['Atelectasis', 'Edema'] and column_data == -1:
        label_dic[column_name] = 0
      else:
        label_dic[column_name] = 0 if pd.isnull(column_data) else column_data
    return label_dic
  
original_label_data["strategy1_14_labels"] = original_label_data.apply(extract_14_label_4_each_record_with_strategy_1, axis=1)

In [162]:
col_index = ['subject_id', 'study_id', 'original_14_labels',  'strategy1_14_labels']
process_data = original_label_data[col_index]
process_data

def get_original_14_labels_vector(row):
  keys = row['original_14_labels'].keys()
  values = row['original_14_labels'].values()
  return values

def get_strategy1_14_labels_vector(row):
  values = row['strategy1_14_labels'].values()
  return values
  
process_data.loc[:,'original_14_labels'] = process_data.apply(get_original_14_labels_vector, axis=1)
process_data.loc[:,'strategy1_14_labels'] = process_data.apply(get_strategy1_14_labels_vector, axis=1)
process_data.head(1)

Unnamed: 0,subject_id,study_id,original_14_labels,strategy1_14_labels
0,10000032,50414267,"(0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0)","(0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0)"


In [163]:
## Utility functions
def convert_dict_value2value(dict_values):
  return list(dict_values)

def get_project_3_class_labels(original_labels):
  if -1 in original_labels:
    converted_labels = [2 if i == -1  else i for i in original_labels]
    return converted_labels
  return original_labels

In [164]:
process_data.loc[:, "original_14_labels"] = process_data["original_14_labels"].apply(convert_dict_value2value)
process_data.loc[:, "strategy1_14_labels"] = process_data["strategy1_14_labels"].apply(convert_dict_value2value)


In [167]:
process_data.loc[:,"project_3_classes_14_labels"] = process_data["original_14_labels"].copy().apply(get_project_3_class_labels)

In [169]:
process_data.to_csv('/home_data/home/v-liudsh/coding/constrastive_P/diagnosisP/exchange/Fine-Grained_Features_Alignment_via_Constrastive_Learning/data/project_using_data/process_data.csv', index=False)  # index=False 表示不保存行索引
process_data

Unnamed: 0,subject_id,study_id,original_14_labels,strategy1_14_labels,project_3_classes_14_labels
0,10000032,50414267,"[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]"
1,10000032,53189527,"[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]"
2,10000032,53911762,"[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]"
3,10000032,56699142,"[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]"
4,10000764,57375967,"[0, 0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, -1.0, 0, 0]","[0, 0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0]"
...,...,...,...,...,...
227822,19999442,58708861,"[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 1.0]","[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 1.0]","[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 1.0]"
227823,19999733,57132437,"[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]"
227824,19999987,55368167,"[1.0, -1.0, 0, 0, 0, 0, 0.0, 0, 0, 0.0, 0, 0, ...","[1.0, 0, 0, 0, 0, 0, 0.0, 0, 0, 0.0, 0, 0, 0.0...","[1.0, 2, 0, 0, 0, 0, 0.0, 0, 0, 0.0, 0, 0, 0.0..."
227825,19999987,58621812,"[1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0]","[1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0]","[1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0]"


In [170]:
process_data = pd.read_csv('/home_data/home/v-liudsh/coding/constrastive_P/diagnosisP/exchange/Fine-Grained_Features_Alignment_via_Constrastive_Learning/data/project_using_data/process_data.csv',)
process_data

Unnamed: 0,subject_id,study_id,original_14_labels,strategy1_14_labels,project_3_classes_14_labels
0,10000032,50414267,"[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]"
1,10000032,53189527,"[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]"
2,10000032,53911762,"[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]"
3,10000032,56699142,"[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]"
4,10000764,57375967,"[0, 0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, -1.0, 0, 0]","[0, 0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0]"
...,...,...,...,...,...
227822,19999442,58708861,"[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 1.0]","[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 1.0]","[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 1.0]"
227823,19999733,57132437,"[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]"
227824,19999987,55368167,"[1.0, -1.0, 0, 0, 0, 0, 0.0, 0, 0, 0.0, 0, 0, ...","[1.0, 0, 0, 0, 0, 0, 0.0, 0, 0, 0.0, 0, 0, 0.0...","[1.0, 2, 0, 0, 0, 0, 0.0, 0, 0, 0.0, 0, 0, 0.0..."
227825,19999987,58621812,"[1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0]","[1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0]","[1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0]"


##### add split data indicator

split数据集中的数据量要比原始的original data要多，但是两个dataset中的study-id数量是一致的。  
同时在split数据集中不存在同一个sid用于不同的目的（train，test，validate）  
split数据的增多理解为study-id在该数据表格中的重复更多(一个study，一个label，多个views)    
在训练中多个view的图片有一个label。每张图片的label在process data中检索获得


---
构造 `program_data_set` 保存最终项目使用的数据

In [172]:
program_data_set = split_data.copy()
program_data_set.loc[:, "original_14_labels"] = None
program_data_set.loc[:, "strategy1_14_labels"] = None
program_data_set.loc[:, "ViewPosition"] = None
program_data_set.head(1)

Unnamed: 0,dicom_id,study_id,subject_id,split,original_14_labels,strategy1_14_labels,ViewPosition
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,50414267,10000032,train,,,


In [173]:
dictionary = process_data.set_index('study_id').to_dict(orient='index')

##### add labels

In [178]:
# add label
except_sid_original = []
except_sid_original_strategy1 = []

def search_project_3_label_in_process_and_fill_split_data(row):
    study_id = row.study_id

    if study_id not in dictionary:
      except_sid_original.append(study_id)
      return
      
    original_14_labels = dictionary[study_id]["project_3_classes_14_labels"]
    return original_14_labels

def search_original_label_in_process_and_fill_split_data(row):
    study_id = row.study_id

    if study_id not in dictionary:
      except_sid_original.append(study_id)
      return
      
    original_14_labels = dictionary[study_id]["original_14_labels"]
    return original_14_labels
  
def search_Strategy1_label_in_process_and_fill_split_data(row):
    study_id = row.study_id
    if study_id not in dictionary:
      except_sid_original_strategy1.append(study_id)
      return
    strategy1_14_labels = dictionary[study_id]["strategy1_14_labels"]
    return strategy1_14_labels
  
program_data_set["project_3_classes_14_labels"] = program_data_set.apply(search_project_3_label_in_process_and_fill_split_data, axis=1)
program_data_set["strategy1_14_labels"] = program_data_set.apply(search_Strategy1_label_in_process_and_fill_split_data, axis=1)
program_data_set["original_14_labels"] = program_data_set.apply(search_original_label_in_process_and_fill_split_data, axis=1)

In [180]:
condition = program_data_set['study_id'].isin(except_sid_original)  # 例如，删除满足 A 列大于 3 的行
program_data_set = program_data_set[~condition]

In [181]:
program_data_set

Unnamed: 0,dicom_id,study_id,subject_id,split,original_14_labels,strategy1_14_labels,ViewPosition,project_3_classes_14_labels
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,50414267,10000032,train,"[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]",,"[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]"
1,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,50414267,10000032,train,"[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]",,"[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]"
2,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,53189527,10000032,train,"[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]",,"[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]"
3,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,53189527,10000032,train,"[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]",,"[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]"
4,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,53911762,10000032,train,"[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]",,"[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]"
...,...,...,...,...,...,...,...,...
377105,428e2c18-5721d8f3-35a05001-36f3d080-9053b83c,57132437,19999733,train,"[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]",,"[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]"
377106,58c403aa-35ff8bd9-73e39f54-8dc9cc5d-e0ec3fa9,57132437,19999733,train,"[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]",,"[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]"
377107,58766883-376a15ce-3b323a28-6af950a0-16b793bd,55368167,19999987,train,"[1.0, -1.0, 0, 0, 0, 0, 0.0, 0, 0, 0.0, 0, 0, ...","[1.0, 0, 0, 0, 0, 0, 0.0, 0, 0, 0.0, 0, 0, 0.0...",,"[1.0, 2, 0, 0, 0, 0, 0.0, 0, 0, 0.0, 0, 0, 0.0..."
377108,7ba273af-3d290f8d-e28d0ab4-484b7a86-7fc12b08,58621812,19999987,train,"[1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0]","[1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0]",,"[1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0]"


In [182]:
program_data_set.to_csv("/home_data/home/v-liudsh/coding/constrastive_P/diagnosisP/exchange/Fine-Grained_Features_Alignment_via_Constrastive_Learning/data/project_using_data/program_data_set_3_8.csv", index=False)  # index=False 表示不保存行索引

##### add view position

In [183]:
# add view position
meta_dict = original_meta_data.set_index('dicom_id').to_dict(orient = "index")

In [184]:
all_view = []
for index, row in program_data_set.iterrows():
  dicom_id = row.dicom_id
  view = meta_dict[dicom_id]['ViewPosition']
  all_view.append(view)

In [185]:
program_data_set.loc[:, "ViewPosition"] = all_view
program_data_set.to_csv("/home_data/home/v-liudsh/coding/constrastive_P/diagnosisP/exchange/Fine-Grained_Features_Alignment_via_Constrastive_Learning/data/project_using_data/program_data_set_3_8.csv", index=False)  # index=False 表示不保存行索引

In [186]:
program_data_set.head()

Unnamed: 0,dicom_id,study_id,subject_id,split,original_14_labels,strategy1_14_labels,ViewPosition,project_3_classes_14_labels
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,50414267,10000032,train,"[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]",PA,"[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]"
1,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,50414267,10000032,train,"[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]",LATERAL,"[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]"
2,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,53189527,10000032,train,"[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]",PA,"[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]"
3,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,53189527,10000032,train,"[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]",LATERAL,"[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]"
4,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,53911762,10000032,train,"[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]",AP,"[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]"


##### add image_tensor_path

In [188]:
basic = "/public_bme/data/lds/"

def get_image_file_path(row):
    p = "p" + str(row.subject_id)[:2]
    pp = 'p' + str(row.subject_id)
    s = "s" + str(row.study_id)
    img = row.dicom_id + ".jpg"
    file_path = f"{basic}/{p}/{pp}/{s}/{img}"
    return file_path

# 获取所有文件路径
file_paths = [get_image_file_path(row) for _, row in program_data_set.iterrows()]
num = len(file_paths)
interval = num // 100
# 检查所有文件路径是否存在
for i , file_path in enumerate(file_paths):
    if i % interval == 0:
      print(f"{i} / {interval} = {i/interval}")
    if not os.path.exists(file_path):
      raise RuntimeError(f"{file_path} does not exist.")

print("pass")

0 / 3770 = 0.0
3770 / 3770 = 1.0
7540 / 3770 = 2.0
11310 / 3770 = 3.0
15080 / 3770 = 4.0
18850 / 3770 = 5.0
22620 / 3770 = 6.0
26390 / 3770 = 7.0
30160 / 3770 = 8.0
33930 / 3770 = 9.0
37700 / 3770 = 10.0
41470 / 3770 = 11.0
45240 / 3770 = 12.0
49010 / 3770 = 13.0
52780 / 3770 = 14.0
56550 / 3770 = 15.0
60320 / 3770 = 16.0
64090 / 3770 = 17.0
67860 / 3770 = 18.0
71630 / 3770 = 19.0
75400 / 3770 = 20.0
79170 / 3770 = 21.0
82940 / 3770 = 22.0
86710 / 3770 = 23.0
90480 / 3770 = 24.0
94250 / 3770 = 25.0
98020 / 3770 = 26.0
101790 / 3770 = 27.0
105560 / 3770 = 28.0
109330 / 3770 = 29.0
113100 / 3770 = 30.0
116870 / 3770 = 31.0
120640 / 3770 = 32.0
124410 / 3770 = 33.0
128180 / 3770 = 34.0
131950 / 3770 = 35.0
135720 / 3770 = 36.0
139490 / 3770 = 37.0
143260 / 3770 = 38.0
147030 / 3770 = 39.0
150800 / 3770 = 40.0
154570 / 3770 = 41.0
158340 / 3770 = 42.0
162110 / 3770 = 43.0
165880 / 3770 = 44.0
169650 / 3770 = 45.0
173420 / 3770 = 46.0
177190 / 3770 = 47.0
180960 / 3770 = 48.0
184730 / 3770 

RuntimeError: /public_bme/data/lds//p18/p18726783/s53552031/d9df9050-058ec148-dbbc6042-617a1880-aab30d33.jpg does not exist.

In [189]:
BiomedClip_img_tensor_paths = [(lambda x: x.replace(".jpg", "_BioMedClip.pth"))(path) for path in file_paths]
Clip_img_tensor_path = [(lambda x: x.replace(".jpg", "_Clip.pth"))(path) for path in file_paths]

In [190]:
program_data_set.loc[:,"image_file_path"] = file_paths
program_data_set.loc[:,"BiomedClip_img_tensor_path"] = BiomedClip_img_tensor_paths
program_data_set.loc[:,"Clip_img_tensor_path"] = Clip_img_tensor_path

program_data_set.to_csv("/home_data/home/v-liudsh/coding/constrastive_P/diagnosisP/exchange/Fine-Grained_Features_Alignment_via_Constrastive_Learning/data/project_using_data/program_data_set_3_8.csv", index=False)  # index=False 表示不保存行索引

In [195]:
program_data_set = pd.read_csv("/home_data/home/v-liudsh/coding/constrastive_P/diagnosisP/exchange/Fine-Grained_Features_Alignment_via_Constrastive_Learning/data/project_using_data/program_data_set_3_8.csv")
program_data_set.head(1)

Unnamed: 0,dicom_id,study_id,subject_id,split,original_14_labels,strategy1_14_labels,ViewPosition,project_3_classes_14_labels,image_file_path,BiomedClip_img_tensor_path,Clip_img_tensor_path
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,50414267,10000032,train,"[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]",PA,"[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]",/public_bme/data/lds//p10/p10000032/s50414267/...,/public_bme/data/lds//p10/p10000032/s50414267/...,/public_bme/data/lds//p10/p10000032/s50414267/...


In [201]:
program_data_set[program_data_set["dicom_id"] == "d9df9050-058ec148-dbbc6042-617a1880-aab30d33"]["image_file_path"].values
os.path.exists("/public_bme/data/lds//p18/p18726783/s53552031/d9df9050-058ec148-dbbc6042-617a1880-aab30d33.jpg")


False

In [215]:
index = program_data_set[program_data_set["dicom_id"] == "d9df9050-058ec148-dbbc6042-617a1880-aab30d33"].index.item()
program_data_set_completed = program_data_set.loc[:index-1, :]

In [218]:
program_data_set_completed.to_csv("/home_data/home/v-liudsh/coding/constrastive_P/diagnosisP/exchange/Fine-Grained_Features_Alignment_via_Constrastive_Learning/data/project_using_data/program_data_set_complete_3_14.csv", index=False)  # index=False 表示不保存行索引

In [219]:
program_data_set_completed.split.unique()

array(['train', 'validate', 'test'], dtype=object)

## image preprocessing

In [2]:
# image preprocess logics -- BiomedClip & CLIP
try:
    BICUBIC = InterpolationMode.BICUBIC
except ImportError:
    BICUBIC = Image.BICUBIC

def _convert_image_to_rgb(image):
    return image.convert("RGB")

def _transform(n_px):
    return Compose([
        Resize(n_px, interpolation=BICUBIC),
        CenterCrop(n_px),
        _convert_image_to_rgb,
        ToTensor(),
        Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
    ])

def CLIP_Process(image_path, dest):
    img = Image.open(image_path)
    a = 224
    b = _transform(a)
    c = b(img)
    if ((dest.split(".")[-1]) != "pth"):
      dest+=".pth"
      
    torch.save(c, dest)
    return c

OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)

_FIELDS = '__dataclass_fields__'
def _is_dataclass_instance(obj):
    """Returns True if obj is an instance of a dataclass."""
    return hasattr(type(obj), _FIELDS)

def asdict(obj, *, dict_factory=dict):
    """Return the fields of a dataclass instance as a new dictionary mapping
    field names to field values.

    Example usage:

      @dataclass
      class C:
          x: int
          y: int

      c = C(1, 2)
      assert asdict(c) == {'x': 1, 'y': 2}

    If given, 'dict_factory' will be used instead of built-in dict.
    The function applies recursively to field values that are
    dataclass instances. This will also look into built-in containers:
    tuples, lists, and dicts.
    """
    if not _is_dataclass_instance(obj):
        raise TypeError("asdict() should be called on dataclass instances")
    return _asdict_inner(obj, dict_factory)

def _asdict_inner(obj, dict_factory):
    if _is_dataclass_instance(obj):
        result = []
        for f in fields(obj):
            value = _asdict_inner(getattr(obj, f.name), dict_factory)
            result.append((f.name, value))
        return dict_factory(result)
    elif isinstance(obj, tuple) and hasattr(obj, '_fields'):
        return type(obj)(*[_asdict_inner(v, dict_factory) for v in obj])
    elif isinstance(obj, (list, tuple)):
        # Assume we can create an object of this type by passing in a
        # generator (which is not true for namedtuples, handled
        # above).
        return type(obj)(_asdict_inner(v, dict_factory) for v in obj)
    elif isinstance(obj, dict):
        return type(obj)((_asdict_inner(k, dict_factory),
                          _asdict_inner(v, dict_factory))
                         for k, v in obj.items())
    else:
        return copy.deepcopy(obj)

class AugmentationCfg:
    scale: Tuple[float, float] = (0.9, 1.0)
    ratio: Optional[Tuple[float, float]] = None
    color_jitter: Optional[Union[float, Tuple[float, float, float]]] = None
    interpolation: Optional[str] = None
    re_prob: Optional[float] = None
    re_count: Optional[int] = None
    use_timm: bool = False

class ResizeMaxSize(nn.Module):
    def __init__(self, max_size, interpolation=InterpolationMode.BICUBIC, fn='max', fill=0):
        super().__init__()
        if not isinstance(max_size, int):
            raise TypeError(f"Size should be int. Got {type(max_size)}")
        self.max_size = max_size
        self.interpolation = interpolation
        self.fn = min if fn == 'min' else min
        self.fill = fill

    def forward(self, img):
        if isinstance(img, torch.Tensor):
            height, width = img.shape[:2]
        else:
            width, height = img.size
        scale = self.max_size / float(max(height, width))
        new_size = tuple(round(dim * scale) for dim in (height, width))
        if scale != 1.0:
            img = F.resize(img, new_size, self.interpolation)
        if not width == height:
            pad_h = self.max_size - new_size[0]
            pad_w = self.max_size - new_size[1]
            img = F.pad(img, padding=[pad_w//2, pad_h//2, pad_w - pad_w//2, pad_h - pad_h//2], fill=self.fill)
        return img

def image_transform(
        image_size: int,
        is_train:bool = False,
        mean: Optional[Tuple[float, ...]] = None,
        std: Optional[Tuple[float, ...]] = None,
        resize_longest_max: bool = False,
        fill_color: int = 0,
        aug_cfg: Optional[Union[Dict[str, Any], AugmentationCfg]] = None,
):
    mean = mean or OPENAI_DATASET_MEAN
    if not isinstance(mean, (list, tuple)):
        mean = (mean,) * 3

    std = std or OPENAI_DATASET_STD
    if not isinstance(std, (list, tuple)):
        std = (std,) * 3

    if isinstance(image_size, (list, tuple)) and image_size[0] == image_size[1]:
        # for square size, pass size as int so that Resize() uses aspect preserving shortest edge
        image_size = image_size[0]

    if isinstance(aug_cfg, dict):
        aug_cfg = AugmentationCfg(**aug_cfg)
    else:
        aug_cfg = aug_cfg or AugmentationCfg()
    normalize = Normalize(mean=mean, std=std)
    if is_train:
        raise NotImplemented("!!LDS!!")
    else:
        if resize_longest_max:
            transforms = [
                ResizeMaxSize(image_size, fill=fill_color)
            ]
        else:
            transforms = [
                Resize(image_size, interpolation=InterpolationMode.BICUBIC),
                CenterCrop(image_size),
            ]
        transforms.extend([
            _convert_image_to_rgb,
            ToTensor(),
            normalize,
        ])
        return Compose(transforms)

def BiomedCLIP_processor(image_path, dest):
    img = Image.open(image_path)
    preprocess_val = image_transform(224)
    data = preprocess_val(img)
    if ((dest.split(".")[-1]) != "pth"):
      dest+=".pth"
      
    torch.save(data, dest)
    return data


In [5]:
# generate .pth (CLIP and BiomedCLIP)
data = program_data_set
img_paths = data.image_file_path
BiomedClip_tensor_paths = data.BiomedClip_img_tensor_path
total = len(BiomedClip_tensor_paths)
print(total)
print(len(BiomedClip_tensor_paths), len(img_paths))
dev = total // 10
count = 0

for (img_path, tensor_path) in (zip(img_paths, BiomedClip_tensor_paths)):
  try:
    # print( type(img_path), img_path, type(ten,,l[pl-0o-or_path), tensor_path)
    BiomedCLIP_processor(img_path, tensor_path)
    if count%dev == 0:
      print(count/dev)
      print(img_path, tensor_path)
    count+=1
  except Exception as e:
    print(e)
  

377095
377095 377095
0.0
/public_bme/data/lds//p10/p10000032/s50414267/02aa804e-bde0afdd-112c0b34-7bc16630-4e384014.jpg /public_bme/data/lds//p10/p10000032/s50414267/02aa804e-bde0afdd-112c0b34-7bc16630-4e384014_BioMedClip.pth


KeyboardInterrupt: 

In [128]:
# generate .pth (CLIP and BiomedCLIP)
data = program_data_set
img_paths = data.image_file_path
BiomedClip_tensor_paths = data.BiomedClip_img_tensor_path
total = len(BiomedClip_tensor_paths)
print(total)
print(len(BiomedClip_tensor_paths), len(img_paths))
dev = total // 200
count = 0

for (img_path, tensor_path) in (zip(img_paths, BiomedClip_tensor_paths)):
  try:
    # print( type(img_path), img_path, type(ten,,l[pl-0o-or_path), tensor_path)
    BiomedCLIP_processor(img_path, tensor_path)
    if count%dev == 0:
      print(count/dev)
      print(img_path, tensor_path)
    count+=1
  except Exception as e:
    print(e)
  

377095
377095 377095
0.0
/public_bme/data/lds//p10/p10000032/s50414267/02aa804e-bde0afdd-112c0b34-7bc16630-4e384014.jpg /public_bme/data/lds//p10/p10000032/s50414267/02aa804e-bde0afdd-112c0b34-7bc16630-4e384014_BioMedClip.pth
1.0
/public_bme/data/lds//p10/p10047766/s55277368/835292d6-81c8046d-9a4de11c-62113dbc-9cc52b8c.jpg /public_bme/data/lds//p10/p10047766/s55277368/835292d6-81c8046d-9a4de11c-62113dbc-9cc52b8c_BioMedClip.pth
2.0
/public_bme/data/lds//p10/p10108435/s59903303/a504a681-8f865141-31abcfe8-4d268f82-ae526b89.jpg /public_bme/data/lds//p10/p10108435/s59903303/a504a681-8f865141-31abcfe8-4d268f82-ae526b89_BioMedClip.pth
3.0
/public_bme/data/lds//p10/p10160202/s58703965/d06ddd1b-4cb00b27-b9ad632d-9b727ee2-42c1d698.jpg /public_bme/data/lds//p10/p10160202/s58703965/d06ddd1b-4cb00b27-b9ad632d-9b727ee2-42c1d698_BioMedClip.pth
4.0
/public_bme/data/lds//p10/p10213338/s51687394/5f413890-5702f098-15f2f445-60527dad-e54c878c.jpg /public_bme/data/lds//p10/p10213338/s51687394/5f413890-5702f

In [None]:
Clip_img_tensor_paths = data.Clip_img_tensor_path
total = len(Clip_img_tensor_paths)
print(Clip_img_tensor_paths)
print(len(Clip_img_tensor_paths), len(img_paths))
dev = total // 10
count = 0


for  (img_path, tensor_path) in (zip(img_paths, Clip_img_tensor_paths)):
  try:
    # print( type(img_path), img_path, type(ten,,l[pl-0o-or_path), tensor_path)
    CLIP_Process(img_path, tensor_path)
    if count%dev == 0:
      print(count/dev)
      print(img_path, tensor_path)
    count+=1
  except Exception as e:
    print(e)

In [220]:
test_data = program_data_set_completed[program_data_set_completed['split'] == "test"]
validate_data = program_data_set_completed[program_data_set_completed['split'] == "validate"]
train_data = program_data_set_completed[program_data_set_completed['split'] == "train"]

print(len(test_data))
print(len(validate_data))
print(len(train_data))

4357
2646
322118


**preprocess testing data for biomedclip**

In [13]:
# generate .pth (CLIP and BiomedCLIP)
data = test_data
img_paths = data.image_file_path
BiomedClip_tensor_paths = data.BiomedClip_img_tensor_path
total = len(BiomedClip_tensor_paths)
print(total)
print(len(BiomedClip_tensor_paths), len(img_paths))
dev = total // 10
count = 0

for (img_path, tensor_path) in (zip(img_paths, BiomedClip_tensor_paths)):
  try:
    # print( type(img_path), img_path, type(ten,,l[pl-0o-or_path), tensor_path)
    BiomedCLIP_processor(img_path, tensor_path)
    if count%dev == 0:
      print(count/dev)
      print(img_path, tensor_path)
    count+=1
  except Exception as e:
    print(e)
  

5159
5159 5159
0.0
/public_bme/data/lds//p10/p10032725/s50331901/687754ce-7420bfd3-0a19911f-a27a3916-9019cd53.jpg /public_bme/data/lds//p10/p10032725/s50331901/687754ce-7420bfd3-0a19911f-a27a3916-9019cd53_BioMedClip.pth
1.0
/public_bme/data/lds//p11/p11413236/s51503417/2d291461-7354f6b1-b797f9c5-5c58ef2f-a516fa93.jpg /public_bme/data/lds//p11/p11413236/s51503417/2d291461-7354f6b1-b797f9c5-5c58ef2f-a516fa93_BioMedClip.pth
2.0
/public_bme/data/lds//p12/p12699874/s51280998/f46ebce4-270dbbd9-24602b65-695b054c-bcd8093c.jpg /public_bme/data/lds//p12/p12699874/s51280998/f46ebce4-270dbbd9-24602b65-695b054c-bcd8093c_BioMedClip.pth
3.0
/public_bme/data/lds//p13/p13475033/s54028344/7794e4cb-719a0b85-18532575-0b5ea119-8eb26b6a.jpg /public_bme/data/lds//p13/p13475033/s54028344/7794e4cb-719a0b85-18532575-0b5ea119-8eb26b6a_BioMedClip.pth
4.0
/public_bme/data/lds//p14/p14295224/s57630991/fdce2841-ba70c298-a83fb5a1-71e58044-dd1115a4.jpg /public_bme/data/lds//p14/p14295224/s57630991/fdce2841-ba70c298-a8

In [21]:
test_data_a = test_data[test_data['subject_id'].astype(str).str[:2] == '13']

**preprocess validation data for biomedclip**

In [24]:
# generate .pth (CLIP and BiomedCLIP)
data = validate_data
img_paths = data.image_file_path
BiomedClip_tensor_paths = data.BiomedClip_img_tensor_path
total = len(BiomedClip_tensor_paths)
print(total)
print(len(BiomedClip_tensor_paths), len(img_paths))
dev = total // 10
count = 0
error_count = 0

for (img_path, tensor_path) in (zip(img_paths, BiomedClip_tensor_paths)):
  try:
    # print( type(img_path), img_path, type(ten,,l[pl-0o-or_path), tensor_path)
    BiomedCLIP_processor(img_path, tensor_path)
    if count%dev == 0:
      print(count/dev)
      print(img_path, tensor_path)
    count+=1
  except Exception as e:
    error_count += 1
    print(e)

print(f"there are {error_count} sample does not exist")

2991
2991 2991
0.0
/public_bme/data/lds//p10/p10003502/s50084553/70d7e600-373c1311-929f5ff9-23ee3621-ff551ff9.jpg /public_bme/data/lds//p10/p10003502/s50084553/70d7e600-373c1311-929f5ff9-23ee3621-ff551ff9_BioMedClip.pth
1.0
/public_bme/data/lds//p11/p11135350/s53277637/f3a27e2d-1d0d73bc-b7394f0c-7ed82c79-189ddee5.jpg /public_bme/data/lds//p11/p11135350/s53277637/f3a27e2d-1d0d73bc-b7394f0c-7ed82c79-189ddee5_BioMedClip.pth
2.0
/public_bme/data/lds//p11/p11717909/s59882746/78ed3ced-cd79570f-e1427410-e2202da1-75dd1584.jpg /public_bme/data/lds//p11/p11717909/s59882746/78ed3ced-cd79570f-e1427410-e2202da1-75dd1584_BioMedClip.pth
3.0
/public_bme/data/lds//p12/p12669344/s51358230/7b4211fe-def2de24-c6991efa-026a3d44-2e4082f8.jpg /public_bme/data/lds//p12/p12669344/s51358230/7b4211fe-def2de24-c6991efa-026a3d44-2e4082f8_BioMedClip.pth
4.0
/public_bme/data/lds//p13/p13571108/s54496880/f418559d-05aff24a-246e401f-9575cf4e-de484f2a.jpg /public_bme/data/lds//p13/p13571108/s54496880/f418559d-05aff24a-24

## concate train_image_tensor

In [9]:
old_train.rename(columns={'Clip_img_tensor_path': 'Clip_img_tensor_path_old', 'Biomed_img_tensor_path': 'BiomedClip_img_tensor_path_old'}, inplace=True)
old_train.head()

NameError: name 'old_train' is not defined

In [10]:
train_data.loc[:, "BiomedClip_img_tensor_path_old"] = None
train_data.loc[:, "Clip_img_tensor_path_old"] = None
train_data.loc[:, "Biovil_img_tensor_path_old"] = None

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data.loc[:, "BiomedClip_img_tensor_path_old"] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data.loc[:, "Clip_img_tensor_path_old"] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data.loc[:, "Biovil_img_tensor_path_old"] = None


In [31]:
BiomedClip_img_tensor_path_old = []
Clip_img_tensor_path_old = []
Biovil_img_tensor_path_old = []
for index, row in train_data.iterrows():
    BiomedClip_img_tensor_path = row['BiomedClip_img_tensor_path']
    prefix = BiomedClip_img_tensor_path.split("_")
    if len(prefix) > 2:
      prefix.pop()
      prefix = "_".join(prefix)
    else:
      prefix = "".join(prefix)
    BiomedClip_img_tensor_path_old.append(prefix+"_biomed.pth")
    Biovil_img_tensor_path_old.append(prefix+"_biovil.pth")
    Clip_img_tensor_path_old.append(prefix+"_clip.pth")
    

In [32]:
train_data.loc[:, "BiomedClip_img_tensor_path_old"] = BiomedClip_img_tensor_path_old
train_data.loc[:, "Clip_img_tensor_path_old"] = Clip_img_tensor_path_old
train_data.loc[:, "Biovil_img_tensor_path_old"] = Biovil_img_tensor_path_old

In [21]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 368945 entries, 0 to 377094
Data columns (total 13 columns):
 #   Column                          Non-Null Count   Dtype 
---  ------                          --------------   ----- 
 0   dicom_id                        368945 non-null  object
 1   study_id                        368945 non-null  int64 
 2   subject_id                      368945 non-null  int64 
 3   split                           368945 non-null  object
 4   original_14_labels              368945 non-null  object
 5   strategy1_14_labels             368945 non-null  object
 6   ViewPosition                    353625 non-null  object
 7   image_file_path                 368945 non-null  object
 8   BiomedClip_img_tensor_path      368945 non-null  object
 9   Clip_img_tensor_path            368945 non-null  object
 10  BiomedClip_img_tensor_path_old  368945 non-null  object
 11  Clip_img_tensor_path_old        368945 non-null  object
 12  Biovil_img_tensor_path_old      368

In [33]:
train_data.to_csv("/home_data/home/v-liudsh/coding/constrastive_P/diagnosisP/exchange/Fine-Grained_Features_Alignment_via_Constrastive_Learning/data/project_using_data/all_train_data_3_11.csv", index=False)  # index=False 表示不保存行索引

In [34]:
test_data.to_csv("/home_data/home/v-liudsh/coding/constrastive_P/diagnosisP/exchange/Fine-Grained_Features_Alignment_via_Constrastive_Learning/data/project_using_data/all_test_3_11.csv", index=False)  # index=False 表示不保存行索引

In [35]:
validate_data.to_csv("/home_data/home/v-liudsh/coding/constrastive_P/diagnosisP/exchange/Fine-Grained_Features_Alignment_via_Constrastive_Learning/data/project_using_data/all_validate_3_11.csv", index=False)  # index=False 表示不保存行索引

----


### using data with sid smaller than 18000000

ensure do not get any missing files

In [58]:
test_data = pd.read_csv("/home_data/home/v-liudsh/coding/constrastive_P/diagnosisP/exchange/Fine-Grained_Features_Alignment_via_Constrastive_Learning/data/project_using_data/all_test_3_11.csv")
train_data = pd.read_csv("/home_data/home/v-liudsh/coding/constrastive_P/diagnosisP/exchange/Fine-Grained_Features_Alignment_via_Constrastive_Learning/data/project_using_data/all_train_data_3_11.csv")
validata_data = pd.read_csv("/home_data/home/v-liudsh/coding/constrastive_P/diagnosisP/exchange/Fine-Grained_Features_Alignment_via_Constrastive_Learning/data/project_using_data/all_validate_3_11.csv")

In [62]:
train_data.head(1)

Unnamed: 0,dicom_id,study_id,subject_id,split,original_14_labels,strategy1_14_labels,ViewPosition,image_file_path,BiomedClip_img_tensor_path,Clip_img_tensor_path,BiomedClip_img_tensor_path_old,Clip_img_tensor_path_old,Biovil_img_tensor_path_old
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,50414267,10000032,train,"dict_values([0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0...","dict_values([0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0...",PA,/public_bme/data/lds//p10/p10000032/s50414267/...,/public_bme/data/lds//p10/p10000032/s50414267/...,/public_bme/data/lds//p10/p10000032/s50414267/...,/public_bme/data/lds//p10/p10000032/s50414267/...,/public_bme/data/lds//p10/p10000032/s50414267/...,/public_bme/data/lds//p10/p10000032/s50414267/...


In [65]:
test_data.head()

Unnamed: 0,dicom_id,study_id,subject_id,split,original_14_labels,strategy1_14_labels,ViewPosition,image_file_path,BiomedClip_img_tensor_path,Clip_img_tensor_path
0,687754ce-7420bfd3-0a19911f-a27a3916-9019cd53,50331901,10032725,test,"dict_values([0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0...","dict_values([0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0...",,/public_bme/data/lds//p10/p10032725/s50331901/...,/public_bme/data/lds//p10/p10032725/s50331901/...,/public_bme/data/lds//p10/p10032725/s50331901/...
1,fd4126e5-c5485b35-3bbc48fc-acb448fa-fb1b42b7,55504914,10032725,test,"dict_values([0, 0.0, 0, 0, 0, 0, 0, 0, 1.0, 0,...","dict_values([0, 0.0, 0, 0, 0, 0, 0, 0, 1.0, 0,...",,/public_bme/data/lds//p10/p10032725/s55504914/...,/public_bme/data/lds//p10/p10032725/s55504914/...,/public_bme/data/lds//p10/p10032725/s55504914/...
2,427446c1-881f5cce-85191ce1-91a58ba9-0a57d3f5,50051329,10046166,test,"dict_values([0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0...","dict_values([0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0...",LATERAL,/public_bme/data/lds//p10/p10046166/s50051329/...,/public_bme/data/lds//p10/p10046166/s50051329/...,/public_bme/data/lds//p10/p10046166/s50051329/...
3,abea5eb9-b7c32823-3a14c5ca-77868030-69c83139,50051329,10046166,test,"dict_values([0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0...","dict_values([0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0...",AP,/public_bme/data/lds//p10/p10046166/s50051329/...,/public_bme/data/lds//p10/p10046166/s50051329/...,/public_bme/data/lds//p10/p10046166/s50051329/...
4,3a8a17fc-3cd357d9-83466363-91dc5a06-a401e5ed,51738740,10046166,test,"dict_values([0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0...","dict_values([0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0...",AP,/public_bme/data/lds//p10/p10046166/s51738740/...,/public_bme/data/lds//p10/p10046166/s51738740/...,/public_bme/data/lds//p10/p10046166/s51738740/...


In [64]:
def filter_df(df):
    # 遍历 DataFrame 的每一行
    for index, row in df.iterrows():
        bio_path = row['Clip_img_tensor_path_old']
        # 检查文件是否存在
        if not os.path.exists(bio_path):
            # 如果文件不存在，删除该行
            df.drop(index, inplace=True)
    return df
  
train_data_filer = filter_df(train_data)


KeyboardInterrupt: 

In [56]:
train_data_sub_set = train_data[train_data["subject_id"]<18000000]
test_data_sub_set = test_data[test_data["subject_id"]<18000000]
validata_data_sub_set = validata_data[validata_data["subject_id"]<18000000]

print(f"train_sub: {len(train_data_sub_set)}")
print(f"test_data_sub_set: {len(test_data_sub_set)}")
print(f"validata_data_sub_set: {len(validata_data_sub_set)}")

train_sub: 294998
test_data_sub_set: 4067
validata_data_sub_set: 2427


In [57]:
train_data_sub_set.to_csv("/home_data/home/v-liudsh/coding/constrastive_P/diagnosisP/exchange/Fine-Grained_Features_Alignment_via_Constrastive_Learning/data/project_using_data/all_train_data_sub_3_14.csv", index=False)  # index=False 表示不保存行索引
test_data.to_csv("/home_data/home/v-liudsh/coding/constrastive_P/diagnosisP/exchange/Fine-Grained_Features_Alignment_via_Constrastive_Learning/data/project_using_data/all_test_sub_3_14.csv", index=False)  # index=False 表示不保存行索引
validata_data.to_csv("/home_data/home/v-liudsh/coding/constrastive_P/diagnosisP/exchange/Fine-Grained_Features_Alignment_via_Constrastive_Learning/data/project_using_data/all_validate_sub_3_14.csv", index=False)  # index=False 表示不保存行索引

----