In [2]:
import os
import glob
import copy
import numpy as np
import pandas as pd
import cv2 as cv
import matplotlib.pyplot as plt
import seaborn as sns
from dotenv import load_dotenv
from PIL import Image
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler
from skimage.feature import graycomatrix, graycoprops
from skimage.feature import local_binary_pattern
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, roc_auc_score, balanced_accuracy_score, accuracy_score
import scipy
import pickle

In [3]:
load_dotenv('test.env')
raw_dir = os.environ['RAW_DIR']
interim_dir = os.environ['INTERIM_DIR']
test_dir = os.environ['TEST_DIR']

In [4]:
def medianmad_norm(arr_like):
    '''
    Normalize by median and MAD

    Modified version from the function made available by the professor
    '''
    np_arr = np.array(arr_like)
    median = np.median(np_arr)
    dev = np.abs(np_arr - median)
    mad = np.median(dev)
    norm = ((np_arr - median) / mad)
    return (norm,mad)

def tanh_modified_norm(arr_like):
    np_arr = np.array(arr_like)

    mean = np.mean(np_arr)
    std = np.std(np_arr)

    return 127*(np.tanh(0.01*((np_arr-mean)/std))+1)

In [5]:
# img_list = glob.glob(os.path.join(raw_dir,'AVC/*.bmp')) + glob.glob(os.path.join(raw_dir,'EM/*.bmp'))
img_list = glob.glob(os.path.join(test_dir,'*.bmp'))

In [6]:
def get_mask_path(img_path, mask_suffix='_mask.png'):
    img_wo_ext = os.path.splitext(img_path)[0]
    mask_path = img_wo_ext + mask_suffix
    return mask_path

In [7]:
def save_histogram(arr_like, output_path, title='', nbins=20):
    h, bin_edges = np.histogram(arr_like, nbins)
    w = np.max(arr_like)/nbins
    bin_centers = bin_edges[1:]-(w/2)
    plt.figure(dpi=100, frameon=False)
    if len(title) > 0:
        plt.title(title)
    plt.bar(bin_centers, h, width=w)
    plt.savefig(output_path)
    plt.close()

In [8]:
def crop_zero_borders(img_arr):
    coords = np.argwhere(img_arr > 0)

    y_min, x_min = np.min(coords, axis=0)
    y_max, x_max = np.max(coords, axis=0) + 1

    return img_arr[y_min:y_max, x_min:x_max], (y_min, y_max, x_min, x_max)

In [9]:
def get_lbp_histogram(img, radius=3, method='uniform', bins = 30):
    n_points = 8 * radius
    lbp = local_binary_pattern(img, n_points, radius, method)
    hist, bins = np.histogram(lbp, bins=bins)
    return lbp, hist, bins

def get_histogram_attributes(hist, prefix=''):
    return {
        # prefix + 'Min': np.min(hist),
        prefix + 'Max': np.max(hist),
        prefix + 'Mean': np.mean(hist),
        prefix + 'Variation': scipy.stats.variation(hist),
         # prefix + 'Mode' : scipy.stats.mode(hist)[0][0],
        prefix + 'Median': np.median(hist),
        prefix + 'Skewness': scipy.stats.skew(hist),
        prefix + 'Kurtosis': scipy.stats.kurtosis(hist),
        prefix + 'Entropy': scipy.stats.entropy(hist),
    }

In [10]:
texture_list = []
for img_path in img_list:
    mask_path = get_mask_path(img_path)
    if not os.path.exists(mask_path):
        continue

    img = cv.imread(img_path, cv.IMREAD_GRAYSCALE)
    if img is None:
        continue

    mask = cv.imread(mask_path, cv.IMREAD_UNCHANGED)
    if mask is None:
        continue

    # scaler = MinMaxScaler([0,255])
    # normed = scaler.fit_transform(img).astype(np.uint8)
    normed = tanh_modified_norm(img).astype(np.uint8)

    masked = normed * (mask == mask.max())

    masked_cropped, _ = crop_zero_borders(masked)

    hist, _ = np.histogram(masked_cropped, bins=50)
    hist_attr = get_histogram_attributes(hist)

    distances = [5]
    angles = [60]
    glcm = graycomatrix(masked_cropped, distances=distances, angles=angles, levels=256, symmetric=False, normed=False)
    a = graycoprops(glcm, prop='contrast').flatten()
    contrast = a[0]

    lbp, lbp_hist, lbp_bins = get_lbp_histogram(masked_cropped)
    lbp_attr = get_histogram_attributes(lbp_hist, 'Lbp')

    label = img_path.split('/')[-2]

    patient_id = img_path.split('/')[-1].split('_')[0]

    # print('mask', masked.max(), masked.min(), 'contrast', max_contrast, max_dist, max_angle, "hist", lbp_attr)

    attributes = {
        'Contrast': contrast,
        'Label': label,
        'PatientId': patient_id,
    }
    attributes.update(hist_attr)
    attributes.update(lbp_attr)

    texture_list.append(attributes)

    # out_path = os.path.join(interim_dir, img_path.split(raw_dir)[1])
    # out_path = os.path.join(interim_dir, "hists", os.path.splitext(img_path.split(raw_dir)[1])[0] + ".png")
    
    # ok = cv.imwrite(out_path, masked)
    # print("write", img_path, out_path, ok)

    # os.makedirs(os.path.dirname(out_path), exist_ok=True)
    # save_histogram(masked[masked > 0], out_path, "Modified norm lesion histogram", nbins=50)

df = pd.DataFrame(texture_list)

In [11]:
with open('../models/1.0-estimator.pkl', 'rb') as f:
    estimator = pickle.load(f)

In [13]:
x_vars=["Entropy", "Contrast"]
X_test = df[x_vars]

In [14]:
y_test = estimator.predict(X_test)

In [32]:
out = df[['PatientId']]

In [33]:
out

Unnamed: 0,PatientId
0,150
1,151
2,152
3,153
4,154
...,...
220,393
221,394
222,395
223,397


In [34]:
out['Predicted'] = y_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  out['Predicted'] = y_test


In [35]:
out

Unnamed: 0,PatientId,Predicted
0,150,AVC
1,151,EM
2,152,EM
3,153,EM
4,154,AVC
...,...,...
220,393,AVC
221,394,AVC
222,395,EM
223,397,AVC


In [36]:
out.to_csv(os.path.join(test_dir, "predicted.csv"), index=False)