In [1]:
import pandas as pd
import os
from imutils import paths
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso
import radiomics
from radiomics import featureextractor  # This module is used for interaction with pyradiomics
from PIL import Image
import SimpleITK as sitk
import cv2
from collections import Counter
import random
import itertools


In [2]:
def color_extraction(img):
    b, g, r = cv2.split(img)
    mean_b, std_b = cv2.meanStdDev(b)
    mean_g, std_g = cv2.meanStdDev(g)
    mean_r, std_r = cv2.meanStdDev(r)
    return {"mean_b": mean_b, "std_b": std_b, "mean_g": mean_g, "std_g": std_g, "mean_r": mean_r, "std_r": std_r,}
    

In [3]:
DATASET_PATH = "../Datasets/ham1000-segmentation-and-classification"
path = "../Datasets/new_data"
CSV_FILE_PATH = os.path.join(DATASET_PATH, "GroundTruth.csv")
IMAGE_DATASET_PATH = os.path.join(path, "images/")
MASK_DATASET_PATH = os.path.join(path, "masks/")
params = '../Datasets/Selected.yaml'


In [4]:
imagePaths = sorted(list(paths.list_images(IMAGE_DATASET_PATH)))
maskPaths = sorted(list(paths.list_images(MASK_DATASET_PATH)))
extractor = featureextractor.RadiomicsFeatureExtractor(params)


In [5]:
results = []
features = {}

for i in range(len(imagePaths)):
    path_original = imagePaths[i]
    path_label = maskPaths[i]
    original = cv2.imread(path_original)
    label = cv2.imread(path_label, 0)
    
    # Expand dimensions of mask array
    y = np.expand_dims(label, axis=2)
    newmask = np.concatenate((y, y, y), axis=2)
    
    # Multiply new mask with image
    cob = original * newmask
    
    dictionary = color_extraction(cob)
    original = cv2.cvtColor(original, cv2.COLOR_RGB2GRAY)
    original = Image.fromarray(original, "L")
    original.save("original.png")
    
    label = Image.fromarray(label, "L")
    label.save("label.png")
    result = extractor.execute("original.png", path_label)
    
    result['original_mean_b'] = dictionary['mean_b']
    result['original_std_b'] = dictionary['std_b']
    result['original_mean_g'] = dictionary['mean_g']
    result['original_std_g'] = dictionary['std_g']
    result['original_mean_r'] = dictionary['mean_r']
    result['original_std_r'] = dictionary['std_r']
    results.append(result)
    
    if i % 1000 == 0:
        print(i)
    
feature_names = list(sorted(filter ( lambda k: k.startswith("original_"), results[0] )))


0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000


In [6]:
lesion_dict = {
    "MEL": 0, 
    "NV": 1,
    "BCC": 2,
    "AKIEC": 3,
    "BKL": 4,
    "DF": 5,
    "VASC": 6
}

df = pd.read_csv(CSV_FILE_PATH)
categories = []
for index, row in df.iterrows():
    if row.eq(1).any():
        categories.append(lesion_dict[row[row.eq(1)].index[0]])
        categories.append(lesion_dict[row[row.eq(1)].index[0]])
        categories.append(lesion_dict[row[row.eq(1)].index[0]])
        categories.append(lesion_dict[row[row.eq(1)].index[0]])
        categories.append(lesion_dict[row[row.eq(1)].index[0]])
        categories.append(lesion_dict[row[row.eq(1)].index[0]])
        

In [7]:
samples = np.zeros((len(imagePaths),len(feature_names)))
for case_id in range(0 ,len(imagePaths)):
    a = np.array([])
    for feature_name in feature_names:
        a = np.append(a, results[case_id][feature_name])
    samples[case_id,:] = a
    
# May have NaNs
samples = np.nan_to_num(samples)


In [8]:
feature_names


['original_firstorder_90Percentile',
 'original_firstorder_Mean',
 'original_firstorder_Minimum',
 'original_glcm_Idmn',
 'original_glrlm_RunEntropy',
 'original_glszm_LargeAreaLowGrayLevelEmphasis',
 'original_glszm_SizeZoneNonUniformityNormalized',
 'original_mean_b',
 'original_mean_g',
 'original_mean_r',
 'original_shape2D_Elongation',
 'original_shape2D_Sphericity',
 'original_std_b',
 'original_std_g',
 'original_std_r']

In [9]:
len(categories)


10013

In [10]:
Counter(categories)


Counter({1: 6705, 0: 1112, 4: 1098, 5: 115, 3: 327, 2: 514, 6: 142})

In [11]:
new_lesion_dict = {
    0: [], 
    1: [],
    2: [],
    3: [],
    4: [],
    5: [],
    6: []
}

for i in range(len(categories)):
    new_lesion_dict[categories[i]].append(samples[i])
        

In [12]:
len(new_lesion_dict[1])

6705

In [13]:
X =[]
y = []
#for category in new_lesion_dict:
    #print(len(new_lesion_dict[category]))
    #for i in range(len(new_lesion_dict[category])):
        #X.append(new_lesion_dict[category][i])
        #y.append(category)
X =[]
y = []
for category in new_lesion_dict:
    for i in range(690):
        X.append(new_lesion_dict[category][i])
        y.append(category)
        
    
temp = list(zip(X, y))
random.shuffle(temp)
res1, res2 = zip(*temp)
# res1 and res2 come out as tuples, and so must be converted to lists.
X, y = list(res1), list(res2)


1112
6705
514
327
1098
115
142


In [14]:
df = pd.DataFrame()


In [15]:
column_names = feature_names.copy()


In [16]:
column_names.append('categories')


In [17]:
for col in column_names:
    df[col] = []
    

In [18]:
df


Unnamed: 0,original_firstorder_90Percentile,original_firstorder_Mean,original_firstorder_Minimum,original_glcm_Idmn,original_glrlm_RunEntropy,original_glszm_LargeAreaLowGrayLevelEmphasis,original_glszm_SizeZoneNonUniformityNormalized,original_mean_b,original_mean_g,original_mean_r,original_shape2D_Elongation,original_shape2D_Sphericity,original_std_b,original_std_g,original_std_r,categories


In [19]:
list_of_arrays = X.copy()
single_list = y.copy()


In [20]:
len(list_of_arrays[0])


15

In [21]:
for i in range(len(list_of_arrays)):
    if i < len(single_list):
        list_of_arrays[i] = np.append(list_of_arrays[i], single_list[i])


In [22]:
len(list_of_arrays[0])


16

In [23]:
for array in list_of_arrays:
    df = df.append(pd.DataFrame([array], columns=df.columns), ignore_index=True)
    


  df = df.append(pd.DataFrame([array], columns=df.columns), ignore_index=True)


In [24]:
df


Unnamed: 0,original_firstorder_90Percentile,original_firstorder_Mean,original_firstorder_Minimum,original_glcm_Idmn,original_glrlm_RunEntropy,original_glszm_LargeAreaLowGrayLevelEmphasis,original_glszm_SizeZoneNonUniformityNormalized,original_mean_b,original_mean_g,original_mean_r,original_shape2D_Elongation,original_shape2D_Sphericity,original_std_b,original_std_g,original_std_r,categories
0,148.0,99.254402,39.0,0.998102,6.515257,210230.762877,0.247313,30.140707,31.670589,19.905767,0.737379,0.891482,64.322458,66.687304,42.119899,1.0
1,169.0,129.821464,46.0,0.996646,5.668175,46729.427985,0.188598,81.407378,82.996719,46.094541,0.833797,0.785082,69.759356,68.239332,38.502182,4.0
2,146.0,119.164401,6.0,0.998160,5.931788,42756.840432,0.170025,59.306811,58.282089,37.539581,0.705322,0.881772,73.008919,71.310602,46.689367,4.0
3,146.0,91.100573,4.0,0.988422,5.048103,914.436754,0.278598,62.227933,62.854141,28.881063,0.855238,0.908769,87.420585,87.465697,46.377623,1.0
4,140.0,96.717312,40.0,0.995057,5.972747,67457.563352,0.153656,24.606481,24.866707,9.157274,0.958171,0.909431,61.029002,61.595477,24.156567,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10008,166.0,137.063923,76.0,0.996143,6.398813,295116.983208,0.303367,36.924407,33.732044,12.358352,0.867718,0.881203,61.864557,56.710330,22.003950,1.0
10009,160.0,127.688868,49.0,0.998259,6.887382,299867.977296,0.137681,74.559774,89.956778,49.707326,0.607779,0.757822,61.733552,71.514754,40.302658,2.0
10010,175.0,162.460995,90.0,0.997280,5.775538,166538.871918,0.214932,16.509848,15.387678,6.015959,0.710822,0.902451,38.901595,36.021534,14.122041,2.0
10011,120.0,65.306072,15.0,0.997522,6.513638,546518.311433,0.181826,67.531122,67.076107,46.509222,0.387973,0.790726,96.695735,95.435751,66.496983,1.0


In [25]:
print(df.dtypes)


original_firstorder_90Percentile                  float64
original_firstorder_Mean                          float64
original_firstorder_Minimum                       float64
original_glcm_Idmn                                float64
original_glrlm_RunEntropy                         float64
original_glszm_LargeAreaLowGrayLevelEmphasis      float64
original_glszm_SizeZoneNonUniformityNormalized    float64
original_mean_b                                   float64
original_mean_g                                   float64
original_mean_r                                   float64
original_shape2D_Elongation                       float64
original_shape2D_Sphericity                       float64
original_std_b                                    float64
original_std_g                                    float64
original_std_r                                    float64
categories                                        float64
dtype: object


In [26]:
df['categories'] = df['categories'].astype(int)


In [27]:
print(df.dtypes)


original_firstorder_90Percentile                  float64
original_firstorder_Mean                          float64
original_firstorder_Minimum                       float64
original_glcm_Idmn                                float64
original_glrlm_RunEntropy                         float64
original_glszm_LargeAreaLowGrayLevelEmphasis      float64
original_glszm_SizeZoneNonUniformityNormalized    float64
original_mean_b                                   float64
original_mean_g                                   float64
original_mean_r                                   float64
original_shape2D_Elongation                       float64
original_shape2D_Sphericity                       float64
original_std_b                                    float64
original_std_g                                    float64
original_std_r                                    float64
categories                                          int64
dtype: object


In [28]:
df.to_csv('../Datasets/selectedAlll.csv')
