### Анализ рентгеновских снимков с прогнозированием заболеваний

#### [Ссылка на датасет](https://huggingface.co/datasets/Sohaibsoussi/NIH-Chest-X-ray-dataset-small)

```
class_label:
  '0': No Finding
  '1': Atelectasis
  '2': Cardiomegaly
  '3': Effusion
  '4': Infiltration
  '5': Mass
  '6': Nodule
  '7': Pneumonia
  '8': Pneumothorax
  '9': Consolidation
  '10': Edema
  '11': Emphysema
  '12': Fibrosis
  '13': Pleural_Thickening
  '14': Hernia
```

In [26]:
# !source venv/bin/activate  
# !pip install -r requirements.txt 

In [2]:
import os
import shutil
import pandas as pd

from PIL import Image
from tqdm import tqdm
from datetime import datetime

from torchvision import models

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torchvision.transforms.v2 as tfs_v2

In [3]:
# Absolute path of the directory the notebook is run from; the parquet
# shards are expected in its "dataset" sub-directory.
PROJECT_DIR_PATH = os.path.abspath(os.curdir)
DATASET_DIR_PATH = os.path.join(PROJECT_DIR_PATH, 'dataset')

In [4]:
class XRayDataset(data.Dataset):
    def __init__(self, path: str, type_file: str, transform=None):
        self.path = path
        self.type_file = type_file
        self.transform = transform

        self.length = 0
        # self.files = []
        # self.targets = []
        self.classes = torch.eye(15)

        not_split_data = self.get_unpack_img()
        files, targets, _length = self.split_data(not_split_data)

        self.df = pd.DataFrame({'path': files, 'target': targets})
        self.length = self.df.shape[0]

    def __getitem__(self, index: int):
        file_path, target = self.df.iloc[index]
        img = Image.open(file_path).convert('RGB')
        
        if (self.transform is not None):
            img = self.transform(img)
        
        return img, self.classes[target]

    def __len__(self):
        return self.length

    def get_unpack_img(self) -> list:
        type_file_path = os.path.join(self.path, self.type_file)

        if os.path.exists(type_file_path):
            shutil.rmtree(type_file_path)

        os.makedirs(type_file_path, exist_ok=True)

        parquet_files = [f for f in os.listdir(self.path) if f.startswith(
            self.type_file) and f.endswith('.parquet')]

        result = []

        for file_name in tqdm(parquet_files, desc=f"Unpacking {self.type_file}"):
            df = pd.read_parquet(os.path.join(self.path, file_name))

            for idx, row in df.iterrows():
                img_path = os.path.join(type_file_path, f"image_{idx}.png")

                if isinstance(row['image']['bytes'], bytes):
                    with open(img_path, 'wb') as f:
                        f.write(row['image']['bytes'])
                # else:
                #     img = Image.fromarray(row['image'])
                #     img.save(img_path)

                result.append((img_path, row['labels']))

        return result

    def split_data(self, not_split_data: list):
        files = []
        targets = []
        length = 0

        for path, labels in not_split_data:
            for target in labels.tolist():
                files.append(path)
                targets.append(int(target))
                length += 1

        return files, targets, length

    def cut_dataframe(self, target: str | int, count: int) -> None:
        rows_to_drop = self.df[self.df['target'] == target].index[:count]
        self.df = self.df.drop(rows_to_drop)
        self.length = self.df.shape[0]

In [30]:
# Unpack the test and train shards from the dataset directory; each call
# rewrites the corresponding image folder on disk.
d_test = XRayDataset(path=DATASET_DIR_PATH, type_file='test')
d_train = XRayDataset(path=DATASET_DIR_PATH, type_file='train')

Unpacking test: 100%|██████████| 2/2 [00:01<00:00,  1.68it/s]
Unpacking train: 100%|██████████| 4/4 [00:03<00:00,  1.24it/s]


In [31]:
# Overview of the training split: frame layout first, then samples per label.
print('TRAIN_DATASET')
train_info = d_train.df.info()  # info() prints the summary itself and returns None
print(train_info, end='\n\n')
print(d_train.df['target'].value_counts())

TRAIN_DATASET
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5260 entries, 0 to 5259
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   path    5260 non-null   object
 1   target  5260 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 82.3+ KB
None

target
0     2489
4      685
3      459
1      431
6      243
5      194
9      165
8      144
13     108
2       77
12      75
11      71
10      61
7       43
14      15
Name: count, dtype: int64


In [32]:
# Overview of the test split: frame layout first, then samples per label.
print('TEST_DATASET')
test_info = d_test.df.info()  # info() prints the summary itself and returns None
print(test_info, end='\n\n')
print(d_test.df['target'].value_counts())

TEST_DATASET
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1828 entries, 0 to 1827
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   path    1828 non-null   object
 1   target  1828 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 28.7+ KB
None

target
0     541
4     284
3     213
1     160
8     146
5      96
9      77
6      73
11     60
13     55
2      42
10     31
12     26
7      23
14      1
Name: count, dtype: int64


In [33]:
# Undersample label 0 ("No Finding"), which dominates the training split, by
# dropping its first 1300 rows; then display the resulting label counts.
d_train.cut_dataframe(target=0, count=1300)
d_train.df['target'].value_counts()

target
0     1189
4      685
3      459
1      431
6      243
5      194
9      165
8      144
13     108
2       77
12      75
11      71
10      61
7       43
14      15
Name: count, dtype: int64

In [34]:
# Preprocessing: PIL image -> uint8 tensor image -> float32 scaled to [0, 1]
# -> normalisation with the ImageNet statistics the pretrained VGG16 expects.
# ToImage is used instead of v2.ToTensor, which is deprecated in the v2 API;
# together with ToDtype(scale=True) it yields the same values.
transform = tfs_v2.Compose([
    tfs_v2.ToImage(),
    tfs_v2.ToDtype(dtype=torch.float32, scale=True),
    tfs_v2.Normalize(mean=[0.485, 0.456, 0.406],
                     std=[0.229, 0.224, 0.225])
])

d_train.transform = transform
train_data = data.DataLoader(d_train, batch_size=4, shuffle=True)

model = models.vgg16(weights='IMAGENET1K_V1')

# Keep the pretrained convolutional features fixed; only the new head trains.
for param in model.features.parameters():
    param.requires_grad = False

# Replacement head: flattened feature map -> hidden layer -> 15 class logits.
model.classifier = nn.Sequential(
    nn.Linear(25088, 4096),  # 512*7*7 = 25088
    nn.ReLU(),
    nn.Linear(4096, 15)
)

optimizer = optim.Adam(params=model.parameters(), lr=1e-3, weight_decay=1e-3)
# The dataset yields one-hot float targets, which CrossEntropyLoss accepts as
# class probabilities. nn.BCEWithLogitsLoss would be the multi-label alternative.
loss_function = nn.CrossEntropyLoss()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)



In [None]:
# Every run writes its checkpoints into its own timestamped directory. The
# path is built once so that creating the directory and saving into it can
# never point at different locations.
modal_start_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
checkpoint_dir = os.path.join(PROJECT_DIR_PATH, 'models',
                              'vgg16', modal_start_time)
os.makedirs(checkpoint_dir, exist_ok=True)

epochs = 5

for _e in range(epochs):
    # Make sure training mode is active even if an evaluation cell that
    # called model.eval() was executed before this one.
    model.train()

    loss_mean = 0  # running mean of the loss over the current epoch
    lm_count = 0  # number of batches folded into the mean so far

    train_tqdm = tqdm(train_data, leave=True)

    for x_train, y_train in train_tqdm:
        x_train = x_train.to(device)
        y_train = y_train.to(device)

        y_pred = model(x_train)
        loss = loss_function(y_pred, y_train)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Incremental mean: new_mean = loss / n + old_mean * (n - 1) / n
        lm_count += 1
        loss_mean = 1/lm_count * loss.item() + (1 - 1/lm_count) * loss_mean
        train_tqdm.set_description(
            f"Epoch [{_e+1}/{epochs}], loss_mean={loss_mean:.3f}")

    # One checkpoint per epoch: transform, optimizer and model state.
    model_state_dict = {
        'tfs': transform.state_dict(),
        'opt': optimizer.state_dict(),
        'model': model.state_dict(),
    }

    torch.save(model_state_dict,
               os.path.join(checkpoint_dir, f"epoch_{_e + 1}.tar"))

```py
# weights_only=True означает, что выполняется загрузка примитивных типов данных, например: словарей, тензоров, списков, строк и т.п. 
model_data = torch.load('model_name.tar', weights_only=True)
model.load_state_dict(model_data['model'])
transform.load_state_dict(model_data['tfs'])
optimizer.load_state_dict(model_data['opt'])

```

In [5]:
# Unpack the validation shards; this rewrites the validation image folder.
d_valid = XRayDataset(path=DATASET_DIR_PATH, type_file='validation')

Unpacking validation: 100%|██████████| 2/2 [00:00<00:00,  2.27it/s]


In [17]:
# Overview of the validation split: frame layout first, then samples per label.
valid_info = d_valid.df.info()  # info() prints the summary itself and returns None
print(valid_info, end="\n\n")
print(d_valid.df['target'].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2079 entries, 0 to 2078
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   path    2079 non-null   object
 1   target  2079 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 32.6+ KB
None

target
0     995
4     292
1     177
3     168
6      87
5      72
9      59
8      52
13     50
2      30
12     30
11     26
10     18
7      16
14      7
Name: count, dtype: int64


In [6]:
# Same preprocessing as used for training: PIL image -> uint8 tensor image ->
# float32 scaled to [0, 1] -> ImageNet normalisation. ToImage is used instead
# of v2.ToTensor, which is deprecated in the v2 API; together with
# ToDtype(scale=True) it yields the same values.
transform = tfs_v2.Compose([
    tfs_v2.ToImage(),
    tfs_v2.ToDtype(dtype=torch.float32, scale=True),
    tfs_v2.Normalize(mean=[0.485, 0.456, 0.406],
                     std=[0.229, 0.224, 0.225])
])

d_valid.transform = transform
# Order is irrelevant for evaluation, so no shuffling and a larger batch.
valid_data = data.DataLoader(dataset=d_valid, batch_size=64, shuffle=False)


# Rebuild the training architecture so the saved state dict fits it.
model = models.vgg16(weights='IMAGENET1K_V1')

model.classifier = nn.Sequential(
    nn.Linear(25088, 4096),  # 512*7*7 = 25088
    nn.ReLU(),
    nn.Linear(4096, 15)
)

# The optimizer is recreated only so its saved state can be restored as well.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3, weight_decay=1e-3)
# The dataset yields one-hot float targets, which CrossEntropyLoss accepts as
# class probabilities. nn.BCEWithLogitsLoss would be the multi-label alternative.
loss_function = nn.CrossEntropyLoss()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)



In [None]:
# Restore a saved epoch. weights_only=True limits unpickling to tensors and
# plain containers; map_location places the tensors on the active device.
checkpoint_path = './models/vgg16/2025-05-06 06:41:01/epoch_4.tar'
model_data = torch.load(
    checkpoint_path,
    map_location=device,
    weights_only=True,
)

# Same restore order as the keys were used when saving; the last call stays
# a bare expression so the notebook still shows its result.
model.load_state_dict(model_data['model'])
optimizer.load_state_dict(model_data['opt'])
transform.load_state_dict(model_data['tfs'])

<All keys matched successfully>

In [None]:
# Smoke test of the restored model: run only the first validation batch and
# show the first sample's one-hot target next to the raw model output.
model.eval()
with torch.no_grad():
    valid_tqdm = tqdm(valid_data, leave=True)

    for x_valid, y_valid in valid_tqdm:
        x_valid, y_valid = x_valid.to(device), y_valid.to(device)
        y_pred = model(x_valid)

        print(y_valid[0], y_pred[0], sep='\n')

        # One batch is enough for this check.
        break
    

		
    

  0%|          | 0/33 [00:00<?, ?it/s]