In [1]:
import pandas as pd
import os
from imutils import paths
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso
import radiomics
from radiomics import featureextractor  # This module is used for interaction with pyradiomics
from PIL import Image
import SimpleITK as sitk
import cv2
from collections import Counter
import random
import itertools


In [2]:
def color_extraction(img):
    b, g, r = cv2.split(img)
    mean_b, std_b = cv2.meanStdDev(b)
    mean_g, std_g = cv2.meanStdDev(g)
    mean_r, std_r = cv2.meanStdDev(r)
    return {"mean_b": mean_b, "std_b": std_b, "mean_g": mean_g, "std_g": std_g, "mean_r": mean_r, "std_r": std_r,}
    

In [3]:
DATASET_PATH = "../Datasets/ham1000-segmentation-and-classification"
path = "../Datasets/new_data"
CSV_FILE_PATH = os.path.join(DATASET_PATH, "GroundTruth.csv")
IMAGE_DATASET_PATH = os.path.join(path, "images/")
MASK_DATASET_PATH = os.path.join(path, "masks/")
params = '../Datasets/Selected.yaml'


In [4]:
imagePaths = sorted(list(paths.list_images(IMAGE_DATASET_PATH)))
maskPaths = sorted(list(paths.list_images(MASK_DATASET_PATH)))
extractor = featureextractor.RadiomicsFeatureExtractor(params)


In [5]:
results = []
features = {}

for i in range(len(imagePaths)):
    path_original = imagePaths[i]
    path_label = maskPaths[i]
    original = cv2.imread(path_original)
    label = cv2.imread(path_label, 0)
    
    # Expand dimensions of mask array
    y = np.expand_dims(label, axis=2)
    newmask = np.concatenate((y, y, y), axis=2)
    
    # Multiply new mask with image
    cob = original * newmask
    
    dictionary = color_extraction(cob)
    original = cv2.cvtColor(original, cv2.COLOR_RGB2GRAY)
    original = Image.fromarray(original, "L")
    original.save("original.png")
    
    label = Image.fromarray(label, "L")
    label.save("label.png")
    result = extractor.execute("original.png", path_label)
    
    result['original_mean_b'] = dictionary['mean_b']
    result['original_std_b'] = dictionary['std_b']
    result['original_mean_g'] = dictionary['mean_g']
    result['original_std_g'] = dictionary['std_g']
    result['original_mean_r'] = dictionary['mean_r']
    result['original_std_r'] = dictionary['std_r']
    results.append(result)
    
    if i % 1000 == 0:
        print(i)
    
feature_names = list(sorted(filter ( lambda k: k.startswith("original_"), results[0] )))


0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000


In [6]:
lesion_dict = {
    "MEL": 0, 
    "NV": 1,
    "BCC": 2,
    "AKIEC": 3,
    "BKL": 4,
    "DF": 5,
    "VASC": 6
}

df = pd.read_csv(CSV_FILE_PATH)
categories = []
for index, row in df.iterrows():
    if row.eq(1).any():
        categories.append(lesion_dict[row[row.eq(1)].index[0]])
        categories.append(lesion_dict[row[row.eq(1)].index[0]])
        categories.append(lesion_dict[row[row.eq(1)].index[0]])
        categories.append(lesion_dict[row[row.eq(1)].index[0]])
        categories.append(lesion_dict[row[row.eq(1)].index[0]])
        categories.append(lesion_dict[row[row.eq(1)].index[0]])
        

In [7]:
samples = np.zeros((len(imagePaths),len(feature_names)))
for case_id in range(0 ,len(imagePaths)):
    a = np.array([])
    for feature_name in feature_names:
        a = np.append(a, results[case_id][feature_name])
    samples[case_id,:] = a
    
# May have NaNs
samples = np.nan_to_num(samples)


In [8]:
feature_names


['original_firstorder_90Percentile',
 'original_firstorder_Minimum',
 'original_glcm_Idmn',
 'original_gldm_SmallDependenceLowGrayLevelEmphasis',
 'original_glrlm_RunEntropy',
 'original_mean_b',
 'original_mean_g',
 'original_mean_r',
 'original_shape2D_Elongation',
 'original_shape2D_MaximumDiameter',
 'original_shape2D_Sphericity',
 'original_std_b',
 'original_std_g',
 'original_std_r']

In [9]:
len(categories)


60072

In [10]:
Counter(categories)


Counter({1: 40224, 0: 6672, 4: 6588, 5: 690, 3: 1962, 2: 3084, 6: 852})

In [11]:
new_lesion_dict = {
    0: [], 
    1: [],
    2: [],
    3: [],
    4: [],
    5: [],
    6: []
}

for i in range(len(categories)):
    new_lesion_dict[categories[i]].append(samples[i])
        

In [12]:
len(new_lesion_dict[1])

40224

In [13]:
X =[]
y = []
#for category in new_lesion_dict:
    #print(len(new_lesion_dict[category]))
    #for i in range(len(new_lesion_dict[category])):
        #X.append(new_lesion_dict[category][i])
        #y.append(category)
X =[]
y = []
for category in new_lesion_dict:
    for i in range(690):
        X.append(new_lesion_dict[category][i])
        y.append(category)
        
    
temp = list(zip(X, y))
random.shuffle(temp)
res1, res2 = zip(*temp)
# res1 and res2 come out as tuples, and so must be converted to lists.
X, y = list(res1), list(res2)


In [14]:
df = pd.DataFrame()


In [15]:
column_names = feature_names.copy()


In [16]:
column_names.append('categories')


In [17]:
for col in column_names:
    df[col] = []
    

In [18]:
df


Unnamed: 0,original_firstorder_90Percentile,original_firstorder_Minimum,original_glcm_Idmn,original_gldm_SmallDependenceLowGrayLevelEmphasis,original_glrlm_RunEntropy,original_mean_b,original_mean_g,original_mean_r,original_shape2D_Elongation,original_shape2D_MaximumDiameter,original_shape2D_Sphericity,original_std_b,original_std_g,original_std_r,categories


In [19]:
list_of_arrays = X.copy()
single_list = y.copy()


In [20]:
len(list_of_arrays[0])


14

In [21]:
for i in range(len(list_of_arrays)):
    if i < len(single_list):
        list_of_arrays[i] = np.append(list_of_arrays[i], single_list[i])


In [22]:
len(list_of_arrays[0])


15

In [23]:
for array in list_of_arrays:
    df = df.append(pd.DataFrame([array], columns=df.columns), ignore_index=True)
    


  df = df.append(pd.DataFrame([array], columns=df.columns), ignore_index=True)


In [24]:
df


Unnamed: 0,original_firstorder_90Percentile,original_firstorder_Minimum,original_glcm_Idmn,original_gldm_SmallDependenceLowGrayLevelEmphasis,original_glrlm_RunEntropy,original_mean_b,original_mean_g,original_mean_r,original_shape2D_Elongation,original_shape2D_MaximumDiameter,original_shape2D_Sphericity,original_std_b,original_std_g,original_std_r,categories
0,161.0,80.0,0.992064,0.005657,5.105083,3.347089,3.296307,1.598907,0.650120,118.067777,0.908828,20.480355,20.097959,9.772399,5.0
1,157.0,35.0,0.998601,0.000782,5.905211,46.529911,50.140863,38.869526,0.819743,484.648326,0.773553,54.464262,58.478562,45.320396,2.0
2,118.0,26.0,0.993128,0.003812,5.299070,17.860337,17.587300,6.862952,0.876956,213.775583,0.844228,52.982297,52.184568,20.791194,1.0
3,133.0,63.0,0.995780,0.004300,5.024385,37.157711,41.487370,21.078696,0.710968,370.303929,0.915339,61.014331,68.256350,34.937524,5.0
4,201.0,22.0,0.997075,0.001532,5.804363,49.121596,52.063385,28.804052,0.761081,550.019091,0.773354,56.408512,58.344577,40.215310,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4825,159.0,10.0,0.996447,0.002040,5.893693,28.175667,31.313352,22.366433,0.728082,342.327329,0.834432,53.957457,59.120496,42.866630,4.0
4826,144.0,42.0,0.997637,0.002324,6.852165,21.316111,22.361819,10.057922,0.536641,295.711008,0.793414,55.612097,57.986434,27.376454,1.0
4827,117.0,37.0,0.996760,0.004918,6.341468,25.865059,27.284896,12.193059,0.905291,255.519080,0.845017,63.444335,66.962507,31.261957,1.0
4828,168.0,18.0,0.997982,0.001595,6.226654,9.701993,10.688470,7.127826,0.825248,189.789357,0.896521,34.182972,37.205766,25.137743,5.0


In [25]:
print(df.dtypes)


original_firstorder_90Percentile                     float64
original_firstorder_Minimum                          float64
original_glcm_Idmn                                   float64
original_gldm_SmallDependenceLowGrayLevelEmphasis    float64
original_glrlm_RunEntropy                            float64
original_mean_b                                      float64
original_mean_g                                      float64
original_mean_r                                      float64
original_shape2D_Elongation                          float64
original_shape2D_MaximumDiameter                     float64
original_shape2D_Sphericity                          float64
original_std_b                                       float64
original_std_g                                       float64
original_std_r                                       float64
categories                                           float64
dtype: object


In [26]:
df['categories'] = df['categories'].astype(int)


In [27]:
print(df.dtypes)


original_firstorder_90Percentile                     float64
original_firstorder_Minimum                          float64
original_glcm_Idmn                                   float64
original_gldm_SmallDependenceLowGrayLevelEmphasis    float64
original_glrlm_RunEntropy                            float64
original_mean_b                                      float64
original_mean_g                                      float64
original_mean_r                                      float64
original_shape2D_Elongation                          float64
original_shape2D_MaximumDiameter                     float64
original_shape2D_Sphericity                          float64
original_std_b                                       float64
original_std_g                                       float64
original_std_r                                       float64
categories                                             int64
dtype: object


In [28]:
df.to_csv('../Datasets/selectedAlll.csv')
