### Анализ рентгеновских снимков грудной клетки с прогнозированием заболеваний

#### [Ссылка на датасет](https://huggingface.co/datasets/Sohaibsoussi/NIH-Chest-X-ray-dataset-small)

```
class_label:
  '0': No Finding
  '1': Atelectasis
  '2': Cardiomegaly
  '3': Effusion
  '4': Infiltration
  '5': Mass
  '6': Nodule
  '7': Pneumonia
  '8': Pneumothorax
  '9': Consolidation
  '10': Edema
  '11': Emphysema
  '12': Fibrosis
  '13': Pleural_Thickening
  '14': Hernia
```

In [337]:
import os
import shutil

import pandas as pd

from PIL import Image

import torch
import torch.nn as nn
import torch.utils.data as data
import torchvision.transforms.v2 as tfs_v2

In [338]:
# Absolute path of the directory the notebook is executed from.
PROJECT_DIR_PATH = os.path.abspath(os.curdir)
# The parquet shards (and the unpacked images) live in "<project>/dataset".
DATASET_DIR_PATH = os.path.join(PROJECT_DIR_PATH, 'dataset')

In [339]:
class XRayDataset(data.Dataset):
    """Map-style dataset of X-ray images unpacked from parquet shards.

    On construction every ``<type_file>*.parquet`` file found directly in
    ``path`` is read, the image bytes of each row are written to
    ``<path>/<type_file>/image_<n>.png`` and one ``(file, label)`` sample is
    recorded per label of the row, so an image that carries several labels
    appears several times in the dataset.

    Args:
        path: Directory that contains the parquet shards.
        type_file: Split name (e.g. ``'train'``); used both as the shard
            file-name prefix and as the name of the output sub-directory.
        transform: Optional callable. When given, ``__getitem__`` opens the
            image file with Pillow and returns ``transform(image)`` instead
            of the file path.

    Raises:
        TypeError: If ``type_file`` is not a string.
        ValueError: If ``type_file`` is empty or is not a plain directory
            name (the output directory is deleted and recreated, so anything
            that could resolve outside ``path`` is rejected).
    """

    def __init__(self, path: str, type_file: str = 'train', transform=None):
        if not isinstance(type_file, str):
            raise TypeError(
                f"type_file must be a str, got {type(type_file).__name__}"
            )
        # The target directory is wiped with shutil.rmtree, so refuse values
        # such as '' or '..' that would make it point at `path` or its parent.
        if (
            not type_file
            or type_file in (os.curdir, os.pardir)
            or os.path.basename(type_file) != type_file
        ):
            raise ValueError(
                f"type_file must be a plain, non-empty name, got {type_file!r}"
            )

        self.path = path
        self.type_file = type_file
        self.transform = transform

        not_split_data = self.get_unpack_img()
        self.files, self.targets, self.length = self.split_data(not_split_data)

    def __getitem__(self, index):
        """Return the sample at ``index``.

        Without a transform the result is ``(file_path, target)``. With a
        transform the image is loaded from disk and the result is
        ``(transform(image), target)``.
        """
        file_path = self.files[index]
        target = self.targets[index]

        if self.transform is None:
            return file_path, target

        # Imported here so the class stays usable without Pillow when no
        # transform is requested.
        from PIL import Image

        with Image.open(file_path) as img:
            # Force decoding while the file is still open.
            img.load()
            sample = self.transform(img)

        return sample, target

    def __len__(self):
        """Return the number of ``(file, label)`` samples."""
        return self.length

    def get_unpack_img(self) -> list:
        """Unpack the images of this split to disk.

        The output directory ``<path>/<type_file>`` is removed and recreated
        on every call. Rows whose image payload is not ``bytes`` are skipped,
        so every returned path refers to a file that was actually written.

        Returns:
            A list of ``(image_path, labels)`` tuples, one per written image.
        """
        type_file_path = os.path.join(self.path, self.type_file)

        # Start from a clean directory so stale images never leak in.
        if os.path.exists(type_file_path):
            shutil.rmtree(type_file_path)
        os.makedirs(type_file_path, exist_ok=True)

        # Sorted: os.listdir order is arbitrary, and sample order should be
        # reproducible between runs.
        parquet_files = sorted(
            f for f in os.listdir(self.path)
            if f.startswith(self.type_file) and f.endswith('.parquet')
        )

        result = []
        # Running counter across all shards: the per-shard DataFrame index
        # restarts in every file and would overwrite earlier images.
        image_no = 0

        for file_name in parquet_files:
            df = pd.read_parquet(os.path.join(self.path, file_name))

            for image, labels in zip(df['image'], df['labels']):
                img_path = os.path.join(type_file_path, f"image_{image_no}.png")
                image_no += 1

                payload = image['bytes']
                if not isinstance(payload, bytes):
                    # Nothing to write, so do not emit a sample that points
                    # to a missing file.
                    continue

                with open(img_path, 'wb') as f:
                    f.write(payload)

                result.append((img_path, labels))

        return result

    def split_data(self, not_split_data):
        """Expand ``(path, labels)`` pairs into one sample per label.

        Args:
            not_split_data: Iterable of ``(path, labels)`` where ``labels``
                is a numpy array, a list/tuple, or a single scalar label.

        Returns:
            ``(files, targets, length)``: two parallel lists and their length.
        """
        files = []
        targets = []

        for path, labels in not_split_data:
            values = labels.tolist() if hasattr(labels, 'tolist') else labels
            if not isinstance(values, (list, tuple)):
                # A scalar label (or 0-d array) counts as a single target.
                values = [values]

            for target in values:
                files.append(path)
                targets.append(target)

        return files, targets, len(files)


In [340]:
# Build one dataset per split; each constructor call unpacks that split's
# images under DATASET_DIR_PATH.
# NOTE(review): "d_tarin" looks like a typo for "d_train"; the name is kept
# because a later cell prints it.
d_tarin, d_test, d_valid = (
    XRayDataset(DATASET_DIR_PATH, split_name)
    for split_name in ('train', 'test', 'validation')
)

In [353]:
# XRayDataset defines no __repr__/__str__, so this prints the default
# "<... object at 0x...>" representation rather than dataset contents.
print(d_tarin)

<__main__.XRayDataset object at 0x17fb75e80>


In [342]:
# def unpack_img(type_file) -> None:
#     type_file_path = os.path.join(DATASET_DIR_PATH, type_file)
    
#     if os.path.exists(type_file_path):
#         shutil.rmtree(type_file_path)
        
#     os.makedirs(type_file_path, exist_ok=True)

#     parquet_files = [f for f in os.listdir(DATASET_DIR_PATH) if f.startswith(type_file) and f.endswith('.parquet')]
    
#     for file_name in parquet_files:
#         df = pd.read_parquet(os.path.join(DATASET_DIR_PATH, file_name))
        
#         for idx, row in df.iterrows():
#             img_path = os.path.join(type_file_path, f"image_{idx}.png")
            
#             if isinstance(row['image']['bytes'], bytes):
#                 with open(img_path, 'wb') as f:
#                     f.write(row['image']['bytes'])
#             # else:
#             #     img = Image.fromarray(row['image'])
#             #     img.save(img_path)
            
#             # paths.append(img_path)
#             # classes.append(row['class'])

In [343]:
# if unpack:
#     unpack_img('test')
#     unpack_img('train')

# print(len(os.listdir(os.path.join(DATASET_DIR_PATH, 'train'))))

In [344]:
# Peek at the first rows of the first test shard.
# NOTE(review): DATASET_TEST_FILES is not defined in the visible cells;
# presumably a list of test parquet file names - confirm it is defined
# before this cell runs.
first_test_shard = os.path.join(DATASET_DIR_PATH, DATASET_TEST_FILES[0])
df = pd.read_parquet(first_test_shard)
df.head(5)

Unnamed: 0,image,labels
0,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,[0]
1,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,[1]
2,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,[1]
3,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,[0]
4,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,[0]


In [345]:
import numpy as np

# Scratch check: ndarray.tolist() turns an int64 array into a plain Python
# list (this is what XRayDataset.split_data relies on for the label arrays).
_sample_values = [1, 2, 3]
python_list = np.array(_sample_values, dtype=np.int64)

python_list.tolist()

[1, 2, 3]