# Pipeline

* Changes on Vanya pipeline: https://github.com/Amlaith/medical_diseases_recognition/blob/main/checkpoint_3_baseline/model.ipynb

    * переделка на многоклассовость

    * смена модели

* Для демонстрации используются меньшие значения N, N_COMP...

In [7]:
import os
from pathlib import Path
import zipfile
import gdown
import pydicom
import cv2
import numpy as np
import pandas as pd
from skimage.filters import sobel, rank
from skimage.feature import hog
from skimage import exposure
from skimage.morphology import disk
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from numpy.linalg import svd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, recall_score, f1_score, fbeta_score, roc_auc_score
import pickle

## Download data

In [8]:
# DATASET_PARENT_LOCATION = Path('.')  # UNCOMMENT WHEN IN COLAB
DATASET_PARENT_LOCATION = Path('..')  # UNCOMMENT WHEN IN REPO
DATASET_NAME = 'rsna-pneumonia-dataset'
DATASET_LOCATION = DATASET_PARENT_LOCATION / DATASET_NAME
GOOGLE_FILE_ID = '1nIW5qgn4MurehHDiulrTMHNQMpsu4SRJ'
ZIP_FILE_NAME = 'rsna-pneumonia-detection-challenge.zip'
wrong_age_entries   = [
    '3b8b8777-a1f6-4384-872a-28b95f59bf0d',
    'f632328d-5819-4b29-b54f-adf4934bbee6',
    '73aeea88-fc48-4030-8564-0a9d7fdecac4',
    'ec3697bd-184e-44ba-9688-ff8d5fbf9bbc',
    'a4e8e96d-93a6-4251-b617-91382e610fab',
    ]  # На этапе EDA у этих пациетнтов обнаружились ошибки в значении возраста
SEED = 74
np.random.seed(SEED)

In [42]:
def init(SIZE=128, N=200):
  if DATASET_NAME not in os.listdir(DATASET_PARENT_LOCATION):
    if ZIP_FILE_NAME not in os.listdir(DATASET_PARENT_LOCATION):
      print('Downloading the dataset')
      gdown.download(f'https://drive.google.com/uc?id={GOOGLE_FILE_ID}', ZIP_FILE_NAME, quiet=True)

    os.mkdir(DATASET_LOCATION)
    with zipfile.ZipFile(ZIP_FILE_NAME, 'r') as zip_ref:
      print('Extracting from .zip')
      zip_ref.extractall(DATASET_LOCATION)
    os.remove(ZIP_FILE_NAME)

  labels = pd.read_csv(DATASET_LOCATION / 'merged_df_v1.csv', usecols=['patient_id', 'enc_class'])

  labels = labels.drop_duplicates()  # Удаляем дубликаты (у пациентов с несколькими ббоксами)
  labels = labels[~np.isin(labels['patient_id'], wrong_age_entries)]  # Удаляем пациентов с неправильным значением возраста
  # labels = labels.groupby('Target')[['patientId', 'Target']].apply(lambda x: x.sample(N // 2, random_state=SEED)).reset_index(drop=True).sample(N)  # Выбираем n случайных пациентов

  classes = labels['enc_class'].unique()
  samples_per_class = round(N / len(classes))

  res = pd.DataFrame()
  for cls in classes:
      sub_sample = labels[labels['enc_class'] == cls].sample(
            n=samples_per_class, 
            random_state=SEED, 
            ignore_index=True  # the resulting index will be labeled 0, 1, … n-1  
        )
      res = pd.concat([res, sub_sample])

  labels = res


  images = []
  print('Extracting images from DICOM')
  for id in labels['patient_id']:
    dcm = pydicom.dcmread(DATASET_LOCATION / 'stage_2_train_images' / (id + '.dcm'))
    images.append(cv2.resize(dcm.pixel_array, (SIZE, SIZE)))

  images = np.array(images)
  labels = np.array(labels['enc_class'])

  return images, labels

In [46]:
SIZE = 128  # Размер изображения SIZE x SIZE
N = 2000  # Количество используемых изображений
images, labels = init(N=N, SIZE=SIZE)

(f'Изображения: {images.shape}, метки классов: {labels.shape}')

Extracting images from DICOM


'Изображения: (2001, 128, 128), метки классов: (2001,)'

In [45]:
np.unique(labels)

array([0, 1, 2])

In [47]:
X_train, X_test, y_train, y_test = train_test_split(images, labels, test_size=0.2, random_state=SEED)
X_train.shape

(1600, 128, 128)

In [48]:
N_COMP = 32
pca = PCA(n_components=N_COMP).fit(X_train.reshape(X_train.shape[0], -1))
pca

In [49]:
def preprocess(image_array):
    image_array = image_array / 255.
    features = []
    for img in range(image_array.shape[0]):
        cur_image = image_array[img, :, :]
        img_features = np.array([])

        footprint = disk(30)
        img_features = np.concatenate((img_features, rank.equalize(cur_image, footprint=footprint).reshape(-1)))

        features.append(img_features)

    features_array = np.array(features)
    pca_features = pca.transform(image_array.reshape(image_array.shape[0], -1))

    return np.concatenate((features_array, pca_features), axis=1)

In [50]:
X_train = preprocess(X_train)
X_test = preprocess(X_test)

X_train.shape, X_test.shape

  X_train = preprocess(X_train)
  X_test = preprocess(X_test)


((1600, 16416), (401, 16416))

In [51]:
# pip install lightgbm

In [52]:
from lightgbm import LGBMClassifier

In [70]:
LightGBM = LGBMClassifier(learning_rate=0.05, max_depth=8, n_estimators=150, random_state=SEED, verbose=-1)
LightGBM.fit(X_train, y_train)

y_pred = LightGBM.predict(X_test)

In [71]:
from sklearn.metrics import balanced_accuracy_score, f1_score, roc_auc_score
from sklearn.preprocessing import label_binarize
from typing import Tuple

def get_metrics(y_pred, y_true, rounding=8, verbouse=True, returns=False) -> Tuple:
    """Print/Return all metrics by predictions and true values""" 

    if returns == True:
        verbouse = False

    acc = np.round(balanced_accuracy_score(y_pred, y_true), rounding)
    f1 = np.round(f1_score(y_pred, y_true, average="weighted"), rounding)
    
    y_true_binarized = label_binarize(y_true, classes=[0, 1, 2])
    y_pred_binarized = label_binarize(y_pred, classes=[0, 1, 2])

    # Расчет ROC-AUC для многоклассовой задачи
    auc_score = roc_auc_score(y_true_binarized, y_pred_binarized,
                            multi_class='ovr', average=None)

    # тут в том числе взвешивание по частоте классов
    roc_auc_metric = pd.DataFrame(labels).value_counts(sort=False, normalize=True) @ auc_score[::-1]
    roc_auc_metric = np.round(roc_auc_metric, rounding)
    gini = 2 * roc_auc_metric - 1
    gini = np.round(gini, rounding)

    if verbouse:
        print(f"Balanced accuracy: {acc}")
        print(f"F1-score: {f1}")
        print(f"roc_auc_score: {roc_auc_metric}")
        print(f"gini_score:{gini}")

    if returns:
        return acc, f1, roc_auc_metric, gini

In [72]:
get_metrics(y_pred, y_test)

Balanced accuracy: 0.58051411
F1-score: 0.60663085
roc_auc_score: 0.69663434
gini_score:0.39326868


In [73]:
with open('model.pkl','wb') as f:
    pickle.dump(LightGBM, f)

with open('pca.pkl','wb') as f:
    pickle.dump(pca, f)