In [1]:
import os

import cv2 as cv
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
# Root folder(s) to scan for images; get_files_and_labels treats each entry
# directly inside the first root as one class.
root_dir = ['../data/original']

In [3]:
def get_files_and_labels(list_root_dir):
    """
    Index image files laid out as ``<root>/<class_name>/<file>``.

    The class list is taken from the sub-directories of the FIRST root only,
    sorted by name; a class's label is its position in that sorted list.
    Every root in ``list_root_dir`` is expected to contain those same class
    folders.

    Parameters
    ----------
    list_root_dir : list of str
        Root directories to scan. An empty list yields an empty table.

    Returns
    -------
    pandas.DataFrame
        One row per file with columns ``paths`` (root joined with class
        folder and file name) and ``labels`` (integer class index).

    Raises
    ------
    FileNotFoundError
        If a root, or a class folder inside a later root, does not exist.
    """
    images_path = []
    labels = []

    if list_root_dir:
        first_root = list_root_dir[0]
        # Only directories are classes; a stray file next to them would
        # otherwise be listed as a class and break the scan below.
        list_classes = sorted(
            entry for entry in os.listdir(first_root)
            if os.path.isdir(os.path.join(first_root, entry))
        )
    else:
        list_classes = []

    for one_root in list_root_dir:
        for label, one_class in enumerate(list_classes):
            class_dir = os.path.join(one_root, one_class)
            # os.listdir returns entries in arbitrary order; sorting keeps the
            # row order (and any seeded split built on it) reproducible.
            for file_name in sorted(os.listdir(class_dir)):
                images_path.append(os.path.join(class_dir, file_name))
                labels.append(label)

    return pd.DataFrame({'paths': images_path, 'labels': labels})

# Считывание картинок

In [4]:
# Index every file under root_dir into a table of file paths and integer class labels.
df_res = get_files_and_labels(root_dir)

In [5]:
# Show the resulting table of image paths and labels.
df_res

Unnamed: 0,paths,labels
0,../data/original\esophagitis\001fb927-4814-4ba...,0
1,../data/original\esophagitis\00687a70-bbad-4bf...,0
2,../data/original\esophagitis\0134d93d-0922-406...,0
3,../data/original\esophagitis\01a57b1a-780f-4e5...,0
4,../data/original\esophagitis\01f092a2-fccb-49f...,0
...,...,...
995,../data/original\normal-z-line\fcb8704e-f308-4...,1
996,../data/original\normal-z-line\fdb68e1c-a08b-4...,1
997,../data/original\normal-z-line\fde68ca4-2c9f-4...,1
998,../data/original\normal-z-line\ff257b71-d9b1-4...,1


## Удаление черных краев на картинках

In [6]:
def find_mask(img):
    """
    Build a binary mask that is 0 on dark, low-saturation pixels and 255 elsewhere.

    The image is converted from BGR to HSV, pixels falling inside the
    "dark" HSV box are selected with ``cv.inRange`` and the selection is
    inverted, so the black frame ends up as 0 in the returned mask.

    img: BGR image (as read by cv.imread()).
    Returns a single-channel mask with values 0 or 255.
    """
    # HSV box describing the dark frame: saturation <= 20 and value <= 90.
    # NOTE(review): the hue bound 350 is above OpenCV's 8-bit hue range
    # (0..179), so hue is effectively unconstrained here.
    dark_lower = np.array([0,0,0])
    dark_upper = np.array([350,20,90])

    hsv_img = cv.cvtColor(img, cv.COLOR_BGR2HSV)

    # Select the dark pixels, then flip so that the content is 255.
    return cv.bitwise_not(cv.inRange(hsv_img, dark_lower, dark_upper))

In [7]:
def save_img(img, img_path, add_name_img='', name_root_dir = '..\\data_prep\\'):
    """
    Save a processed image under the same class folder and base name as its source.

    The output file is
    ``<name_root_dir><class folder><separator><base name><add_name_img>.jpg``,
    where the class folder is the directory that directly contains
    ``img_path``. The output folder is created when missing.

    Parameters
    ----------
    img : numpy.ndarray
        Image to write.
    img_path : str
        Path of the original image; ``'\\'`` and ``'/'`` are both accepted
        as separators.
    add_name_img : str
        Optional suffix appended to the base name (before ``.jpg``).
    name_root_dir : str
        Output root, used as a prefix (it should end with a separator).
        Backslashes are mapped to the current OS separator.

    Raises
    ------
    ValueError
        If ``img_path`` has no parent folder component.
    OSError
        If OpenCV reports that the file could not be written.
    """
    # Splitting on '\\' alone only worked for Windows-style paths; paths built
    # by os.path.join on other systems use '/', so normalise both.
    parts = [part for part in img_path.replace('\\', '/').split('/') if part]
    if len(parts) < 2:
        raise ValueError('img_path must look like <class folder>/<file>: ' + repr(img_path))

    name_dir = parts[-2]
    # splitext drops only the extension; cutting at the first dot truncated
    # file names that contain dots.
    name_img = os.path.splitext(parts[-1])[0] + add_name_img + '.jpg'

    out_dir = name_root_dir.replace('\\', os.sep) + name_dir
    # cv.imwrite does not create missing folders.
    os.makedirs(out_dir, exist_ok=True)

    path = out_dir + os.sep + name_img
    # cv.imwrite reports failure through its boolean return value.
    if not cv.imwrite(path, img):
        raise OSError('cv.imwrite could not write ' + path)

In [8]:
def del_area_behind_countour(mask_img, src_img=None):
    """
    Black out everything outside the main contour of a mask and crop to it.

    The mask is blurred and re-thresholded, its contours are extracted, and
    the contour with the most points is filled into a fresh mask. That mask
    is applied to the source image, which is then cropped to the contour's
    bounding rectangle.

    Parameters
    ----------
    mask_img : numpy.ndarray
        Single-channel mask (for example the output of find_mask).
    src_img : numpy.ndarray, optional
        Image the mask belongs to. When omitted, the notebook-level variable
        ``img`` is used, which is what the one-argument call style relies on.

    Returns
    -------
    numpy.ndarray
        The masked source image cropped to the selected contour.

    Raises
    ------
    ValueError
        If no contour is found in ``mask_img``.
    NameError
        If ``src_img`` is omitted and no global ``img`` exists.
    """
    if src_img is None:
        # Legacy fallback: existing callers pass only the mask and expect the
        # global `img` to be used.
        src_img = img

    mask_img = cv.GaussianBlur(mask_img, (11, 11), 10)
    _, thresh = cv.threshold(mask_img, 127, 255, cv.THRESH_BINARY)

    # OpenCV 3.x returns (image, contours, hierarchy) and 4.x returns
    # (contours, hierarchy); the last two items are the same in both.
    contours, _hierarchy = cv.findContours(thresh, cv.RETR_TREE, cv.CHAIN_APPROX_SIMPLE)[-2:]
    if len(contours) == 0:
        raise ValueError('no contour found in mask_img; nothing to crop')

    # Keep the contour described by the most points (on ties, the last one).
    # NOTE(review): with CHAIN_APPROX_SIMPLE the point count is not the area;
    # confirm that "most points" is the intended selection rule.
    points = sorted(contours, key=len)[-1]

    mask = np.zeros(src_img.shape[0:2], dtype=np.uint8)
    # thickness=-1 fills the contour interior
    cv.drawContours(mask, [points], -1, (255, 255, 255), -1, cv.LINE_AA)
    res = cv.bitwise_and(src_img, src_img, mask=mask)

    x, y, w, h = cv.boundingRect(points)
    return res[y: y + h, x: x + w]


In [9]:
# Crop the dark frame off every indexed image and save the result via save_img.
for path in df_res['paths']:
    # Keep the name `img`: del_area_behind_countour reads this notebook-level
    # variable when it is called with the mask only.
    img = cv.imread(path, cv.IMREAD_COLOR)
    if img is None:
        # cv.imread returns None for a missing or undecodable file; report it
        # and carry on instead of failing inside cv.cvtColor.
        print('skip unreadable image:', path)
        continue
    img_res = del_area_behind_countour(find_mask(img))
    save_img(img_res, path)

In [10]:
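# # check on a single image
# # NOTE: `del_black_edge` is not defined in the notebook as shown
# # (the helper further below is named `del_black_edge_one`).
# path = df_res['paths'][10]
# img = cv.imread(path, 1)
# img_res = del_black_edge(del_letters(img))
# save_img(img_res, path)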

# Разделение полученного датасета

In [6]:
def train_test_split_(df_img_label, test_size=0.10, valid_size=0.20,
                      random_state=42, stratify=False):
    """
    Split a paths/labels table into train, validation and test tables.

    The test part is split off first; the validation part is then taken from
    what remains, so ``valid_size`` is a fraction of the non-test rows (with
    the defaults: 10% test, 18% validation, 72% train of the whole table).

    Parameters
    ----------
    df_img_label : pandas.DataFrame
        Table with columns ``paths`` and ``labels``.
    test_size : float or int
        ``test_size`` passed to the first split.
    valid_size : float or int
        ``test_size`` passed to the second split (applied to the remainder).
    random_state : int or None
        Seed used for both splits.
    stratify : bool
        When True, both splits are stratified by label. Off by default so
        existing calls produce the same split as before.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, valid, test)``, each with columns ``paths`` and ``labels``.
    """
    X = np.array(df_img_label['paths'])
    y = np.array(df_img_label['labels'])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
        stratify=y if stratify else None,
    )
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_train, y_train,
        test_size=valid_size,
        random_state=random_state,
        stratify=y_train if stratify else None,
    )

    name_columns = ['paths', 'labels']

    def _to_frame(paths, labels):
        # One row per (path, label) pair
        return pd.DataFrame(zip(paths, labels), columns=name_columns)

    return (
        _to_frame(X_train, y_train),
        _to_frame(X_valid, y_valid),
        _to_frame(X_test, y_test),
    )


def save_as_csv(train, valid, test, save_dir='../data/prepared/'):
    """
    Write the three splits to ``train.csv``, ``valid.csv`` and ``test.csv``.

    Parameters
    ----------
    train, valid, test : pandas.DataFrame
        Tables to write; the index is not written.
    save_dir : str or os.PathLike
        Target directory, with or without a trailing separator. It is
        created when missing. An empty string means the current directory.
    """
    if save_dir:
        # to_csv does not create missing folders.
        os.makedirs(save_dir, exist_ok=True)

    # os.path.join instead of string concatenation, so a directory given
    # without a trailing separator no longer yields e.g. 'preparedtrain.csv'.
    for file_name, frame in (('train.csv', train), ('valid.csv', valid), ('test.csv', test)):
        frame.to_csv(os.path.join(save_dir, file_name), index=False)

In [7]:
# Index the images under ../data_prep/ (the folder save_img writes to by default),
# using the same folder-per-class layout.
df_path_data_pred = get_files_and_labels(['../data_prep/'])
# Split the table into train / validation / test tables.
train, valid, test = train_test_split_(df_path_data_pred)
# Write train.csv, valid.csv and test.csv into save_as_csv's default directory.
save_as_csv(train, valid, test)

In [12]:
# Report how many rows ended up in each split.
for split_label, split_frame in (('len train ', train), ('len valid ', valid), ('len test ', test)):
    print(split_label, len(split_frame))

len train  720
len valid  180
len test  100


### Другие функции

In [None]:
def del_black_edge_one(src, thresh=0):
    """
    Add an alpha channel that makes the black areas of an image transparent.

    The image is converted to grayscale and thresholded: pixels whose gray
    value is greater than ``thresh`` get alpha 255, all others get alpha 0.
    The colour channels themselves are left unchanged.

    Parameters
    ----------
    src : numpy.ndarray
        3-channel BGR image, as read by cv.imread().
    thresh : int
        Gray level at or below which a pixel becomes transparent. The
        default 0 affects pure-black pixels only.

    Returns
    -------
    numpy.ndarray
        4-channel image with channels in the order B, G, R, alpha.
    """
    gray = cv.cvtColor(src, cv.COLOR_BGR2GRAY)
    _, alpha = cv.threshold(gray, thresh, 255, cv.THRESH_BINARY)

    b, g, r = cv.split(src)

    # cv.merge takes only the channel list; its optional second argument is
    # the destination array, not a channel count, so no `4` is passed.
    return cv.merge([b, g, r, alpha])

In [None]:
def del_letters(img, blur=False):
    """
    Remove bright overlay text from an image by inpainting.

    Pixels whose first channel exceeds 210 are treated as text and filled in
    with cv.inpaint (Navier-Stokes method, radius 8). Some digits and letters
    may survive. Run this before removing the black borders.

    Parameters
    ----------
    img : numpy.ndarray
        3-channel image, as read by cv.imread().
    blur : bool
        When True, a 5x5 Gaussian blur (sigma 5) is applied first and both
        the mask and the result are computed from the blurred image. Off by
        default because that blur step was commented out in the previous
        version of this function.

    Returns
    -------
    numpy.ndarray
        The inpainted image.
    """
    # The previous body still referred to `img_blur` after its assignment had
    # been commented out, so every call raised NameError.
    work_img = cv.GaussianBlur(img, (5, 5), 5) if blur else img

    # threshold() returns (retval, image); channel 0 of the thresholded image
    # is used as the single-channel inpainting mask.
    mask = cv.threshold(work_img, 210, 255, cv.THRESH_BINARY)[1][:, :, 0]
    img_res = cv.inpaint(work_img, mask, 8, cv.INPAINT_NS)

    return img_res