In [1]:
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.cluster import SpectralClustering
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, silhouette_samples, homogeneity_completeness_v_measure
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.manifold import TSNE

In [2]:
metric_names = ['Silhouette Coefficients ', 'Average Silhouette Metric ', 'Scaled Multiplied Silhouette Metric ']
radio_classes = [0, 1, 2, 3]
radio_names = ['No Radiologist Agreement', 'Low Radiologist Agreement', 'High Radiologist Agreement', 'All Radiologists Agree']
scale = preprocessing.MinMaxScaler()

In [3]:
def cluster_centroid(embeddings, y_sc, numclusters):
    centroids = [] #center indice values
    for i in range(numclusters): #for each class...
        inds = np.where(y_sc==i)[0]#list of all images within the same proposed class
        print(inds)
        inds_embed = embeddings[inds] #feature vectors of all these images
        inds_embed1 = np.array(inds_embed) #numpy array of this matrix
        column_mean = inds_embed1.mean(axis=0) #mean per dimension (x, y, z, etc.)
        centroids.append(column_mean) #append to the list
    return centroids

In [4]:
def wss(centroids, embeddings, y_sc, numclusters, n): #in 2d - total wss (not average)
    wss_percluster = [] # list of total distance per class
    closest_inds = []
    farthest_inds = []
    for i in range(numclusters):
        inds = np.where(y_sc==i)[0] #list of all images within the same proposed class
        inds_embed = embeddings[inds] #feature vectors of all these images
        inds_embed1 = np.array(inds_embed) #numpy array of this matrix
        after_subtraction = inds_embed1 - centroids[i] #subtracting the centroid indices from the image indices
        squared_matrix = np.square(after_subtraction) #square the distance values
        sum_rows = np.sum(squared_matrix, axis = 1) #add all fo the distances of the same image 
        sum_cols = np.sum(sum_rows) #add all of image distances together of the same class 
        wss_percluster.append(sum_cols) #include this in the total distances per class
        ## find the images farthest/closest to centroid
        indclosest = np.argpartition(sum_rows, n)[:n]
        indfarthest = np.argpartition(sum_rows, -n)[-n:]
        indclosestnum = inds[indclosest]
        indfarthestnum = inds[indfarthest]
        closest_inds.append(indclosestnum)
        farthest_inds.append(indfarthestnum) #final image number
    tot_wss = np.sum(wss_percluster) #sum all of the distances from all of the clusters into 1 value 
    return tot_wss, closest_inds, farthest_inds

In [5]:
# Silhouette plot 
def silhouette_plt(y_sc, silhouette_vals, numclusters, indicator):
    cluster_labels = np.unique(y_sc)
    ax_lower, ax_upper = 0, 0
    cticks = []
    sil_plot = plt.figure()
    for i, k in enumerate(cluster_labels):
        c_silhouette_vals = silhouette_vals[y_sc == k]
        c_silhouette_vals.sort()
        ax_upper += len(c_silhouette_vals)
        #color = plt.jet()
        plt.barh(range(ax_lower, ax_upper), c_silhouette_vals, height=1.0, 
                         edgecolor='none', color=colors[i])
        cticks.append((ax_lower + ax_upper) / 2)
        ax_lower += len(c_silhouette_vals)
    silhouette_avg = np.mean(silhouette_vals)
    plt.axvline(silhouette_avg, color="red", linestyle="--") 
    plt.yticks(cticks, cluster_labels)
    plt.ylabel('Cluster')
    plt.xlabel('Silhouette coefficient')
    if indicator == 0:
        str2 = 'Unscaled Silhouette Plot For {} Clusters'.format(numclusters)
        #name = 'test case {}/{} cluster/og silhouette plot'.format(run, numclusters)
    else:
        str2 = 'Min-Max Silhouette Plot For {} Clusters'.format(numclusters)
        #name = 'test case {}/{} cluster/minmax silhouette plot'.format(run, numclusters)
    plt.title(str2)
    plt.tight_layout()
    # plt.show()
    # sil_plot.savefig(name)
    return silhouette_avg

In [6]:
# Find the average silhouette number per cluster
def silhouette_cluster_average(y_sc, silhouette_values, numclusters):
    silhouette_clusters = [] 
    for i in range(numclusters):
        inds = np.where(y_sc==i)[0]
        avg_ind = sum(silhouette_values[inds])/len(inds) 
        silhouette_clusters.append(avg_ind)
    return silhouette_clusters

In [7]:
# Make the silhouette cluster an array corresponding to the image numbers
def silhouette_cluster_array(y_sc, silhouette_clusters):
    silhouette_cluster_arr = []
    for im in range(len(y_sc)):
        cat = y_sc[im]
        c = silhouette_clusters[cat]
        silhouette_cluster_arr.append(c)
    silhouette_cluster_arr = np.array(silhouette_cluster_arr)
    return silhouette_cluster_arr

In [8]:
colors = ['#1f77b4', '#ff7f0e', '#a52a2a','#2ca02c'] 
radio_colors = ['#e81005', '#f7c00a', '#364acf', '#568208']

In [9]:
df1 = pd.read_csv('/Users/amalalmansour/Downloads/MaxSlice_LIDC.csv')
#Spiculation training:
df2 = pd.read_csv('/Users/amalalmansour/Desktop/New_Images/Spiculation/Train.csv')
#df2 = pd.read_csv('/Users/amalalmansour/Desktop/New_Images/Malignancy/Train.csv')
display(df2)

Unnamed: 0,noduleID,InstanceID,Binary_Rating_Spic
0,5,118,0
1,6,175,0
2,7,179,0
3,8,195,0
4,17,297,0
...,...,...,...
577,2638,40640,0
578,2640,40658,0
579,2646,40720,0
580,2655,40771,0


In [10]:
#Spiculation testing:
df3 = pd.read_csv('/Users/amalalmansour/Desktop/New_Images/Spiculation/Test.csv')
#df3 = pd.read_csv('/Users/amalalmansour/Desktop/New_Images/Malignancy/Test.csv')
display(df3)

Unnamed: 0,InstanceID,noduleID,Binary_Rating_Spic
0,270,15,1
1,425,34,0
2,487,42,0
3,515,36,0
4,1364,105,0
...,...,...,...
140,39376,2567,0
141,39440,2571,0
142,39806,2594,0
143,40002,2618,0


In [11]:
train_ids = df2['InstanceID']
#vis_tr_te = df2['Agreement'].tolist()
labels = df2[['InstanceID', 'Binary_Rating_Spic']]
#labels = df3[['InstanceID', 'Binary_Rating_Mal']]
display(labels)

Unnamed: 0,InstanceID,Binary_Rating_Spic
0,118,0
1,175,0
2,179,0
3,195,0
4,297,0
...,...,...
577,40640,0
578,40658,0
579,40720,0
580,40771,0


In [12]:
train_slices = pd.merge(labels, df1)
display(train_slices)

Unnamed: 0,InstanceID,Binary_Rating_Spic,noduleID,DicomImage,FilePath,subtlety,internalStructure,calcification,sphericity,margin,...,Correlation,Energy,Homogeneity,Entropy,x_3rdordermoment,Inversevariance,Sumaverage,Variance,Clustertendency,MaxProbability
0,118,0,5,present,C:\LIDC_FULL\LIDC-IDRI\LIDC-IDRI-0003\1.3.6.1....,4,1,6,4,5,...,0.793,0.0018,0.0764,6.43,8600000,0.0392,565.0,137000,492000,0.0052
1,175,0,6,present,C:\LIDC_FULL\LIDC-IDRI\LIDC-IDRI-0004\1.3.6.1....,5,1,3,5,5,...,0.634,0.0076,0.0296,4.98,15500000,0.0097,403.0,200000,650000,0.0105
2,179,0,7,present,C:\LIDC_FULL\LIDC-IDRI\LIDC-IDRI-0005\1.3.6.1....,3,1,6,4,5,...,0.489,0.0085,0.0315,4.86,1160000,0.0105,221.0,17300,51300,0.0095
3,195,0,8,present,C:\LIDC_FULL\LIDC-IDRI\LIDC-IDRI-0005\1.3.6.1....,3,1,6,5,5,...,0.641,0.0038,0.0358,5.69,2830000,0.0143,320.0,59200,194000,0.0053
4,297,0,17,present,C:\LIDC_FULL\LIDC-IDRI\LIDC-IDRI-0008\1.3.6.1....,3,1,6,3,3,...,0.600,0.0062,0.0247,5.19,6800000,0.0069,350.0,50300,161000,0.0071
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
577,40640,0,2638,present,C:\LIDC_FULL\LIDC-IDRI\LIDC-IDRI-1005\1.3.6.1....,2,1,6,5,4,...,0.516,0.0049,0.0219,5.43,-5800000,0.0108,561.0,90700,276000,0.0049
578,40658,0,2640,present,C:\LIDC_FULL\LIDC-IDRI\LIDC-IDRI-1005\1.3.6.1....,3,1,6,5,5,...,0.732,0.0022,0.0359,6.25,1560000,0.0120,341.0,85700,296000,0.0042
579,40720,0,2646,present,C:\LIDC_FULL\LIDC-IDRI\LIDC-IDRI-1007\1.3.6.1....,3,1,6,3,2,...,0.636,0.0060,0.0271,5.22,19800000,0.0060,361.0,91000,297000,0.0070
580,40771,0,2655,present,C:\LIDC_FULL\LIDC-IDRI\LIDC-IDRI-1010\1.3.6.1....,3,1,6,2,5,...,0.662,0.0071,0.0334,5.05,12500000,0.0130,446.0,101000,334000,0.0090


In [13]:
test_ids = df3['InstanceID']
#vis_tr_te = df2['Agreement'].tolist()
t_labels = df3[['InstanceID', 'Binary_Rating_Spic']]
#t_labels = df3[['InstanceID', 'Binary_Rating_Mal']]
display(t_labels)

Unnamed: 0,InstanceID,Binary_Rating_Spic
0,270,1
1,425,0
2,487,0
3,515,0
4,1364,0
...,...,...
140,39376,0
141,39440,0
142,39806,0
143,40002,0


In [14]:
test_slices = pd.merge(t_labels, df1)
display(test_slices)

Unnamed: 0,InstanceID,Binary_Rating_Spic,noduleID,DicomImage,FilePath,subtlety,internalStructure,calcification,sphericity,margin,...,Correlation,Energy,Homogeneity,Entropy,x_3rdordermoment,Inversevariance,Sumaverage,Variance,Clustertendency,MaxProbability
0,270,1,15,present,C:\LIDC_FULL\LIDC-IDRI\LIDC-IDRI-0007\1.3.6.1....,5,1,6,4,2,...,0.750,0.0004,0.0316,7.91,-2110000,0.0091,672.0,209000,731000,0.0039
1,425,0,34,present,C:\LIDC_FULL\LIDC-IDRI\LIDC-IDRI-0012\1.3.6.1....,4,1,6,4,4,...,0.745,0.0024,0.0825,6.15,2820000,0.0427,642.0,128000,447000,0.0065
2,487,0,42,present,C:\LIDC_FULL\LIDC-IDRI\LIDC-IDRI-0012\1.3.6.1....,4,1,6,4,3,...,0.469,0.0097,0.0181,4.73,10200000,0.0057,390.0,72900,213000,0.0097
3,515,0,36,present,C:\LIDC_FULL\LIDC-IDRI\LIDC-IDRI-0012\1.3.6.1....,3,1,6,5,5,...,0.631,0.0061,0.0323,5.20,20500000,0.0084,534.0,104000,338000,0.0091
4,1364,0,105,present,C:\LIDC_FULL\LIDC-IDRI\LIDC-IDRI-0039\1.3.6.1....,3,1,6,5,5,...,0.675,0.0057,0.0571,5.28,7070000,0.0209,534.0,127000,427000,0.0102
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140,39376,0,2567,present,C:\LIDC_FULL\LIDC-IDRI\LIDC-IDRI-0982\1.3.6.1....,5,1,6,3,4,...,0.652,0.0044,0.0265,5.53,1320000,0.0072,368.0,87000,286000,0.0049
141,39440,0,2571,present,C:\LIDC_FULL\LIDC-IDRI\LIDC-IDRI-0984\1.3.6.1....,3,1,6,3,4,...,0.391,0.0068,0.0192,5.10,1540000,0.0059,395.0,37600,104000,0.0073
142,39806,0,2594,present,C:\LIDC_FULL\LIDC-IDRI\LIDC-IDRI-0997\1.3.6.1....,4,1,3,5,4,...,0.678,0.0034,0.0288,5.78,4430000,0.0112,429.0,133000,444000,0.0055
143,40002,0,2618,present,C:\LIDC_FULL\LIDC-IDRI\LIDC-IDRI-1001\1.3.6.1....,5,1,6,5,5,...,0.599,0.0066,0.0198,5.11,18100000,0.0078,560.0,125000,398000,0.0066


In [15]:
numeric_feature_names = ['Area', 'ConvexArea', 'Perimeter', 'ConvexPerimeter', 'EquivDiameter',
                         'MajorAxisLength', 'MinorAxisLength',
                         'Elongation', 'Compactness', 'Eccentricity', 'Solidity', 'Extent',
                         'Circularity', 'RadialDistanceSD', 'SecondMoment', 'Roughness', 'MinIntensity',
                         'MaxIntensity', 'MeanIntensity', 'SDIntensity', 'MinIntensityBG',
                         'MaxIntensityBG', 'MeanIntensityBG', 'SDIntensityBG',
                         'IntensityDifference', 'markov1', 'markov2', 'markov3', 'markov4',
                         'markov5', 'gabormean_0_0', 'gaborSD_0_0', 'gabormean_0_1',
                         'gaborSD_0_1', 'gabormean_0_2', 'gaborSD_0_2', 'gabormean_1_0',
                         'gaborSD_1_0', 'gabormean_1_1', 'gaborSD_1_1', 'gabormean_1_2',
                         'gaborSD_1_2', 'gabormean_2_0', 'gaborSD_2_0', 'gabormean_2_1',
                         'gaborSD_2_1', 'gabormean_2_2', 'gaborSD_2_2', 'gabormean_3_0',
                         'gaborSD_3_0', 'gabormean_3_1', 'gaborSD_3_1', 'gabormean_3_2',
                         'gaborSD_3_2', 'Contrast', 'Correlation', 'Energy', 'Homogeneity',
                         'Entropy', 'x_3rdordermoment', 'Inversevariance', 'Sumaverage',
                         'Variance', 'Clustertendency']

In [16]:
df_feature = train_slices[numeric_feature_names]
df_label = train_slices['Binary_Rating_Spic']
#df_label = train_slices['Binary_Rating_Mal']
df_t_feature = test_slices[numeric_feature_names]
df_t_label = test_slices['Binary_Rating_Spic']
#df_t_label = train_slices['Binary_Rating_Mal']
#df_feature
#df_label

In [17]:
df_feature = pd.DataFrame(scale.fit_transform(df_feature.values), columns= numeric_feature_names)
df_t_feature = pd.DataFrame(scale.fit_transform(df_t_feature.values), columns= numeric_feature_names)

In [18]:
df_feature = np.array(df_feature)
df_label = np.array(df_label)
df_t_feature = np.array(df_t_feature)
df_t_label = np.array(df_t_label)

In [19]:
df_label[3]

0

In [20]:
df_label.shape

(582,)

In [21]:
df_label[3]

0

In [22]:
df_feature[3]

array([0.08286252, 0.07838284, 0.11144233, 0.16386317, 0.19321027,
       0.18352446, 0.19413001, 0.14411694, 0.04710665, 0.63911818,
       0.80520759, 0.58144573, 0.86005664, 0.13782457, 0.06065539,
       0.11004146, 0.7320132 , 0.3206064 , 0.49868198, 0.17119999,
       0.73024691, 0.36057392, 0.59126459, 0.0472866 , 0.19645905,
       0.0208902 , 0.01826769, 0.07131173, 0.06458299, 0.00786771,
       0.79278405, 0.75190501, 0.60708656, 0.84963861, 0.58862052,
       0.67270562, 0.40833163, 0.63988422, 0.53034415, 0.5646822 ,
       0.44979138, 0.70739794, 0.68027774, 0.71020549, 0.50261174,
       0.63428363, 0.59315883, 0.67829356, 0.50161918, 0.6717715 ,
       0.55720581, 0.64122379, 0.46776794, 0.67623739, 0.03569825,
       0.60746269, 0.19125683, 0.22054381, 0.38809524, 0.4289528 ,
       0.16708543, 0.18533605, 0.04390575, 0.04023641])

In [23]:
# define the keras model
model = Sequential()
model.add(Dense(36, input_shape=(64,), activation='relu'))
#model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
#model.add(Dense(3,activation='softmax'))
# compile the keras model
#model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Compile the model
model.compile(loss=tf.keras.losses.sparse_categorical_crossentropy,
              optimizer=tf.keras.optimizers.Adam(),
              metrics=['accuracy'])
# fit the keras model on the dataset
history = model.fit(df_feature, df_label, epochs=100, batch_size=10, verbose=0, validation_data=(df_t_feature, df_t_label))
# make class predictions with the model
#predictions = (model.predict(df_feature) > 0.5).astype(int)

ValueError: in user code:

    File "/Users/amalalmansour/miniforge3/envs/env_tensorflow/lib/python3.9/site-packages/keras/engine/training.py", line 878, in train_function  *
        return step_function(self, iterator)
    File "/Users/amalalmansour/miniforge3/envs/env_tensorflow/lib/python3.9/site-packages/keras/engine/training.py", line 867, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/amalalmansour/miniforge3/envs/env_tensorflow/lib/python3.9/site-packages/keras/engine/training.py", line 860, in run_step  **
        outputs = model.train_step(data)
    File "/Users/amalalmansour/miniforge3/envs/env_tensorflow/lib/python3.9/site-packages/keras/engine/training.py", line 809, in train_step
        loss = self.compiled_loss(
    File "/Users/amalalmansour/miniforge3/envs/env_tensorflow/lib/python3.9/site-packages/keras/engine/compile_utils.py", line 201, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/Users/amalalmansour/miniforge3/envs/env_tensorflow/lib/python3.9/site-packages/keras/losses.py", line 141, in __call__
        losses = call_fn(y_true, y_pred)
    File "/Users/amalalmansour/miniforge3/envs/env_tensorflow/lib/python3.9/site-packages/keras/losses.py", line 245, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/Users/amalalmansour/miniforge3/envs/env_tensorflow/lib/python3.9/site-packages/keras/losses.py", line 1807, in binary_crossentropy
        backend.binary_crossentropy(y_true, y_pred, from_logits=from_logits),
    File "/Users/amalalmansour/miniforge3/envs/env_tensorflow/lib/python3.9/site-packages/keras/backend.py", line 5158, in binary_crossentropy
        return tf.nn.sigmoid_cross_entropy_with_logits(labels=target, logits=output)

    ValueError: `logits` and `labels` must have the same shape, received ((None, 2) vs (None, 1)).


In [24]:
model.save("/Users/amalalmansour/Desktop/New_Images/Spiculation/NN/NN_model_softmax")
#model.save("/Users/amalalmansour/Desktop/New_Images/Malignancy/NN/NN_model_All")

In [25]:
# evaluate the keras model
y_pred, accuracy = model.evaluate(df_feature, df_label)
print('Accuracy: %.2f' % (accuracy*100))

In [26]:
t_predictions = (model.predict(df_t_feature) > 0.5).astype(int)
len(t_predictions)

In [27]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

confusion_matrix = confusion_matrix(df_t_label,  np.argmax(t_predictions,axis=1))
#confusion_matrix = multilabel_confusion_matrix(df_t_label, t_predictions)
cm_display = ConfusionMatrixDisplay(confusion_matrix = confusion_matrix)
cm_display.plot()
plt.show()


In [28]:
# evaluate the keras model
y_pred, accuracy = model.evaluate(df_t_feature, df_t_label)
print('Accuracy: %.2f' % (accuracy*100))

In [29]:
history_dict = history.history
#history_dict['val_accuracy']

In [30]:
epoch_count = range(1,100+1)
plt.plot(epoch_count, history_dict['loss'])
plt.plot(epoch_count, history_dict['val_loss'])
plt.legend(['Training Loss','Testing Loss'])
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.show()

In [31]:
epoch_count = range(1,100+1)
plt.plot(epoch_count, history_dict['accuracy'])
plt.plot(epoch_count, history_dict['val_accuracy'])
plt.legend(['Training accuracy','Testing accuracy'])
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.show()