In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import scikitplot as skplt
from sklearn.metrics import confusion_matrix

# Load the consolidated CancerSEEK table. The path is relative to the notebook's
# working directory and uses forward slashes so it resolves on Windows, Linux
# and macOS alike (a backslash-separated path is only understood on Windows).
df = pd.read_csv("dataset/Consolidated_CancerSEEK_Data.csv", low_memory=False)

# Plain NumPy view of the table; later cells address columns by position.
array = df.values

In [None]:
# Tumor types: the distinct labels stored in column 5.
# np.unique's defaults already return just the sorted unique values.
tumor_type_column = array[:, 5]
np.unique(tumor_type_column)

In [None]:
# Labels to tally in column 5, in the order they are reported below.
cancerTypes = ['Breast', 'Colorectum', 'Esophagus', 'Liver', 'Lung', 'Normal',
               'Ovary', 'Pancreas', 'Stomach']

# Number of rows carrying each label, aligned with cancerTypes.
tumor_labels = array[:, 5]
cancerTypeCases = [len(tumor_labels[tumor_labels == label]) for label in cancerTypes]

# Note: the 'Normal' label marks control patients, so the line printed as
# "Normal cancer cases" is really the number of controls.
for label, n_cases in zip(cancerTypes, cancerTypeCases):
    print(label + " cancer cases: " + str(n_cases))

In [None]:
# Donut chart of the tumor-type mix among cancer patients (the 'Normal'
# control label is deliberately left out of the slices).
names = ['Breast', 'Colorectum', 'Esophagus', 'Liver', 'Lung', 'Ovary', 'Pancreas', 'Stomach']

# Slice sizes are counted from column 5 of the loaded table rather than typed
# in by hand, so the chart cannot drift out of sync with the data.
tumor_labels = array[:, 5]
size = [tumor_labels[tumor_labels == name].shape[0] for name in names]

# White disc drawn over the centre of the pie to give the donut look.
my_circle = plt.Circle((0, 0), 0.5, color='white')

plt.pie(size, labels=names, colors=['royalblue', 'darkturquoise', 'mediumblue',
                                    'steelblue', 'skyblue', 'lightsteelblue', 'navy', 'lightskyblue'])

# pie chart of cancer types
p = plt.gcf()
p.gca().add_artist(my_circle)
plt.title("Types of Cancer")
plt.show()

In [None]:
# Stage types: the distinct labels stored in column 6.
# Every row is inspected; `array` comes from `df.values`, which carries no
# header row, so slicing from row 1 would silently drop the first patient.
np.unique(array[:, 6])

In [None]:
# Stage labels to tally in column 6, in reporting order.
stageTypes = ['0', 'I', 'II', 'III']

# Count over ALL rows. `array` is `df.values` and has no header row, so the
# previous `array[1:, 6]` slice skipped the first patient and under-counted
# that patient's stage by one.
stage_labels = array[:, 6]
stageTypeCases = [stage_labels[stage_labels == stage].shape[0] for stage in stageTypes]

for stage, n_cases in zip(stageTypes, stageTypeCases):
    print("Stage " + stage + " cancer cases: " + str(n_cases))

In [None]:
# Donut chart of the stage mix (stages I-III only).
names = ['I', 'II', 'III']

# Slice sizes are counted from column 6 over every row of the table. The
# previously hard-coded sizes were copied from a count that skipped the first
# row, which left stage I one patient short of the per-stage totals used in
# the classification bar chart further down.
stage_labels = array[:, 6]
size = [stage_labels[stage_labels == name].shape[0] for name in names]

# White disc drawn over the centre of the pie to give the donut look.
my_circle = plt.Circle((0, 0), 0.5, color='white')

plt.pie(size, labels=names, colors=['royalblue', 'darkturquoise', 'mediumblue'])

# pie chart of stage types
p = plt.gcf()
p.gca().add_artist(my_circle)
plt.title("Stage of Cancer")
plt.show()

In [None]:
# Distribution of control and cancer patients.
# Column 49 is read as the status flag: a value equal to 1 is counted as a
# cancer patient, anything else as a control.
totalCases = 1817.  # not used by this cell; kept for any later reference
status_column = array[:, 49]

cancerPatient = sum(1 for value in status_column if value == 1)
controlPatient = len(status_column) - cancerPatient

print("Total number of cancer patients: " + str(cancerPatient))
print("Total number of healthy patients: " + str(controlPatient))

In [None]:
# Tally the four confusion-matrix outcomes.
# Despite the *r names these are raw counts, not rates.
tpr = tnr = fpr = fnr = 0

# Columns from index 48 onward: position 0 is treated as the test call and
# position 1 as the reference status it is compared against.
predAndActual = array[:, 48:]

# counting tpr, tnr, fpr, fnr cases
for row in predAndActual:
    predicted, actual = row[0], row[1]
    if predicted == 0:
        # Negative call: correct when it matches the reference, otherwise a miss.
        if predicted == actual:
            tnr += 1
        else:
            fnr += 1
    elif predicted == actual:
        # Any non-zero call that matches the reference is a true positive.
        tpr += 1
    else:
        fpr += 1

print("Number of true positives: " + str(tpr))
print("Number of true negatives: " + str(tnr))
print("Number of false positives: " + str(fpr))
print("Number of false negatives: " + str(fnr))

In [None]:
# Creating the confusion matrix as a 2x2 float array.
# Cell order follows the plot labels used in the next cell:
# [[true negative, false positive], [false negative, true positive]].
cf_matrix = np.array([[805., 7.],
                      [379., 626.]])
print(cf_matrix)

In [None]:
# Plotting the confusion matrix as an annotated heat map: every cell shows its
# outcome name, its raw count and its share of all cases.
group_names = ['True Negative','False Positive','False Negative','True Positive']

cells = cf_matrix.flatten()
group_counts = [f"{value:0.0f}" for value in cells]
group_percentages = [f"{share:.2%}" for share in cells / np.sum(cf_matrix)]

# One three-line annotation per cell, reshaped to match the 2x2 matrix.
labels = np.asarray(
    ["\n".join(parts) for parts in zip(group_names, group_counts, group_percentages)]
).reshape(2, 2)

ax = plt.axes()
# fmt='' because the annotations are ready-made strings, not numbers to format.
sns.heatmap(cf_matrix, annot=labels, fmt='', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix for CancerSEEK accuracy')
plt.show()

In [None]:
# Break false negatives and true positives down by stage (column 6).
# Despite the *r names, tpr/tnr/fpr/fnr are raw counts, not rates.
tpr = tnr = fpr = fnr = 0

# Per-stage tallies; rows whose stage is not I, II or III are not tallied here.
missed_by_stage = {'I': 0, 'II': 0, 'III': 0}      # false negatives
detected_by_stage = {'I': 0, 'II': 0, 'III': 0}    # true positives

# Columns from index 48 onward: position 0 is treated as the test call and
# position 1 as the reference status it is compared against.
predAndActual = array[:, 48:]

for row_index, row in enumerate(predAndActual):
    predicted, actual = row[0], row[1]
    stage = array[row_index, 6]
    if predicted == 0:
        if predicted == actual:
            tnr += 1
        else:
            fnr += 1
            if stage in missed_by_stage:
                missed_by_stage[stage] += 1
    elif predicted == actual:
        tpr += 1
        if stage in detected_by_stage:
            detected_by_stage[stage] += 1
    else:
        fpr += 1

# Expose the tallies under the original names:
# count1x = false negatives, count2x = true positives, x = stage number.
count11, count12, count13 = (missed_by_stage[stage] for stage in ('I', 'II', 'III'))
count21, count22, count23 = (detected_by_stage[stage] for stage in ('I', 'II', 'III'))

print("False negatives")
print("Stage 1: " + str(count11))
print("Stage 2: " + str(count12))
print("Stage 3: " + str(count13))
print("True positives")
print("Stage 1: " + str(count21))
print("Stage 2: " + str(count22))
print("Stage 3: " + str(count23))

In [None]:
# Grouped bar chart: for each stage, the share of cancer patients the test
# missed (false negatives) versus the share it detected (true positives).

# The groups are stages, so the tick labels say "Stage" (they used to read "Type").
X = ['Stage I', 'Stage II', 'Stage III']

# Per-stage counts come from the tally cell above (count1x = false negatives,
# count2x = true positives) instead of being re-typed from its printed output.
false_negative_counts = [count11, count12, count13]
true_positive_counts = [count21, count22, count23]
stage_totals = [fn + tp for fn, tp in zip(false_negative_counts, true_positive_counts)]

# Fractions in [0, 1]; a stage with no patients yields 0.0 rather than a
# division error. Note these names now hold per-stage lists, not the scalar
# counts the earlier cells stored in them.
fnr = [fn / total if total else 0.0 for fn, total in zip(false_negative_counts, stage_totals)]
tpr = [tp / total if total else 0.0 for tp, total in zip(true_positive_counts, stage_totals)]

X_axis = np.arange(len(X))

# The legend promises percentages, so the fractions are scaled by 100 for display.
plt.bar(X_axis - 0.2, [100 * rate for rate in fnr], 0.4, label = '% of false negatives', color = 'darkblue')
plt.bar(X_axis + 0.2, [100 * rate for rate in tpr], 0.4, label = '% of true positives', color = 'blue')

plt.xticks(X_axis, X)
plt.ylabel("Share of cancer patients at that stage (%)")
plt.title("Classification of Cancer Patients")
plt.legend()
plt.show()

In [None]:
# Correlation matrix.
# The table also holds text columns (e.g. tumor type and stage labels), and
# DataFrame.corr() raises on non-numeric data from pandas 2.0 on. Restricting
# to numeric/bool columns first reproduces what older pandas did implicitly
# and works on every pandas version.
numeric_df = df.select_dtypes(include=["number", "bool"])
matrix = numeric_df.corr().values

# Correlations of the numeric column at position 39 with every numeric column.
# TODO confirm which variable sits at position 39 and refer to it by name.
print(matrix[39])

In [None]:
# Plotting the correlation matrix as a heat map.
# Only numeric/bool columns are correlated: the table also holds text columns,
# and DataFrame.corr() raises on those from pandas 2.0 on.
sns.heatmap(df.select_dtypes(include=["number", "bool"]).corr(), cmap="Blues")