In [8]:
import os
import cv2
import json
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path
import SimpleITK as sitk
from pathlib import Path
from pprint import pprint

import torch
import torch.nn.functional as F
import torchvision.transforms.functional as TF
from torchvision.transforms import ToTensor
from torch.utils.data import Dataset, DataLoader

from torch.optim import Adam
from transformers import get_cosine_schedule_with_warmup

import matplotlib.pyplot as plt
import seaborn as sns
from pycm import ConfusionMatrix


from sklearn.metrics import classification_report



random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7f4a2309f450>

In [18]:
from models.cls_models import make_efficientnet_featurizer

# Data

In [32]:
base_data_path = os.path.join(Path(os.getcwd()).parent, 'DICOM_xy_position', 'Nii_files')
df_path = os.path.join(base_data_path, 'slices.xlsx')
df = pd.read_excel(df_path)
df = df[df.Name != "KURDUMOV.nii"] # broken file
df = df.reset_index()
df['Name'] = df.Name.apply(lambda x: os.path.join(base_data_path, x))
df

Unnamed: 0,index,Name,Begin,End1vertebra,End2vertebra,Sex,Age,x_center,y_center,Strange Scale Data
0,0,/media/dmitriy/main/data/DICOM_Sechenov/DICOM_...,56,4,NONE,m,26,273,293,0
1,1,/media/dmitriy/main/data/DICOM_Sechenov/DICOM_...,94,54,14,f,18,261,313,0
2,2,/media/dmitriy/main/data/DICOM_Sechenov/DICOM_...,157,116,75,f,25,259,346,0
3,3,/media/dmitriy/main/data/DICOM_Sechenov/DICOM_...,328,284,240,m,30,250,285,0
4,4,/media/dmitriy/main/data/DICOM_Sechenov/DICOM_...,291,248,209,m,35,231,327,0
...,...,...,...,...,...,...,...,...,...,...
218,219,/media/dmitriy/main/data/DICOM_Sechenov/DICOM_...,71,29,NONE,f,37,260,310,0
219,220,/media/dmitriy/main/data/DICOM_Sechenov/DICOM_...,90,44,NONE,NONE,45,261,332,0
220,221,/media/dmitriy/main/data/DICOM_Sechenov/DICOM_...,91,49,6,NONE,39,259,263,0
221,222,/media/dmitriy/main/data/DICOM_Sechenov/DICOM_...,85,41,6,f,42,260,278,0


In [30]:
model = make_efficientnet_featurizer()
norm = torch.nn.InstanceNorm2d(1)

Loaded pretrained weights for efficientnet-b0


In [33]:
dataset = []

for i in tqdm(range(df.shape[0]), position=0, leave=True):
    row = df.loc[i]
    image = sitk.GetArrayFromImage(sitk.ReadImage(row.Name)).astype(np.float32)
    down_line = -1 if isinstance(row['End1vertebra '], str) else row['End1vertebra ']
    
    for slide_i, slide in enumerate(image):
        with torch.no_grad():
            emb = model(norm(torch.tensor(slide).unsqueeze(0).unsqueeze(0)).cuda()).cpu()[0].numpy()
            dataset.append([row.Name, slide_i, emb, (1 if down_line<i and i<row['Begin'] else 0) ])


100%|██████████| 223/223 [11:17<00:00,  3.04s/it]


In [34]:
import pickle

with open(file='catboost_dataset.pkl', mode='wb') as f:
    pickle.dump(dataset, f)

# Train

In [36]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from catboost import CatBoostClassifier

In [37]:
from sklearn.model_selection import train_test_split
train_ds, test_ds = train_test_split(dataset, test_size=0.2, random_state=42)

In [62]:
train_x = [b[2] for b in train_ds]
train_y = np.array([b[3] for b in train_ds])
train_df = pd.DataFrame(data=list(zip(train_x, train_y)), columns=['emb', 'target'])
train_x = np.array([v for v in train_df['emb'].values])

test_x = [b[2] for b in test_ds]
test_y = np.array([b[3] for b in test_ds])
test_df = pd.DataFrame(data=list(zip(test_x, test_y)), columns=['emb', 'target'])
test_x = np.array([v for v in test_df['emb'].values])

In [63]:
ros = RandomOverSampler()
_, _ = ros.fit_resample(train_x, train_y)
train_idx = ros.sample_indices_
_, _ = ros.fit_resample(test_x, test_y)
test_idx = ros.sample_indices_

x_train, x_test = train_x[train_idx], test_x[test_idx]
y_train, y_test = train_y[train_idx], test_y[test_idx]

In [80]:
clsf = CatBoostClassifier(iterations=100, verbose=False)
clsf.fit(x_train, y_train)

<catboost.core.CatBoostClassifier at 0x7f496b152ca0>

In [81]:
from sklearn.metrics import classification_report

train_pred = clsf.predict(x_train)
pprint(classification_report(y_train, train_pred))

('              precision    recall  f1-score   support\n'
 '\n'
 '           0       0.80      0.77      0.78     38613\n'
 '           1       0.78      0.81      0.79     38613\n'
 '\n'
 '    accuracy                           0.79     77226\n'
 '   macro avg       0.79      0.79      0.79     77226\n'
 'weighted avg       0.79      0.79      0.79     77226\n')


In [82]:
from sklearn.metrics import classification_report

test_pred = clsf.predict(x_test)
pprint(classification_report(y_test, test_pred))

('              precision    recall  f1-score   support\n'
 '\n'
 '           0       0.51      0.67      0.58      9656\n'
 '           1       0.51      0.34      0.41      9656\n'
 '\n'
 '    accuracy                           0.51     19312\n'
 '   macro avg       0.51      0.51      0.49     19312\n'
 'weighted avg       0.51      0.51      0.49     19312\n')
