### Imports

In [19]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from scipy import stats
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA


### Recolha de dados

In [20]:
# 64 patients with breast cancer and 52 healthy controls, 116 in total
# 1 - paciente sem cancro, 2 - paciente com cancro

file = pd.read_excel("../Data/dataR2.XLSX")
DataSet = pd.DataFrame(data=file.dropna()) #Remove NaN because of initial empty lines

# Find the indexes where the differente class samples are located
ixHealthy = np.where(DataSet['Classification'] == 1)
ixWithCancer = np.where(DataSet['Classification'] == 2)

# Definir classes
Classes = ["healthy","withCancer"]


#DataSet['Classification'].iloc[ixHealthy] = Classes[0] 
#DataSet['Classification'].iloc[ixWithCancer] = Classes[1] 

#for w in DataSet.columns:
#    for i in DataSet.columns: 
#        if w == 'Classification' or i == 'Classification' or w == i:
#            continue
#        fig = px.scatter_3d(x = DataSet.index, y = DataSet[w], z = DataSet[i], color = DataSet['Classification'], labels=dict(x = "N", y = w, z = i , color = "Classification"))
#        #fig = px.scatter_3d(x = CorkStopp["N.1"], y = CorkStopp["ARM"],z = CorkStopp["PRM"],color = CorkStopp['C'], labels=dict(x = "N", y = "ARM", color = "Cork Stoppers"))
#
#        fig.update_traces(marker_size=10)
#        fig.show()

# Partição
x = DataSet.iloc[:, :-1] # todas menos a última coluna

y = DataSet.iloc[:, -1] # apenas a última coluna


x_train, x_testInicial, y_train, y_testInicial = train_test_split( # Partições entre train e test
    DataSet, y,
    test_size=0.2,       # 20% dos dados para teste
    random_state=67,     # garante reprodutibilidade
    shuffle=True,
    stratify = y           # mantém a proporção das classes TODO: se calhar tirar depois
)


x_val, x_test, y_val, y_test = train_test_split( # Partições entre test e validation
    x_testInicial, y_testInicial,
    test_size=0.5,
    random_state=67,
    stratify=y_testInicial  # mantém a proporção das classes TODO: se calhar tirar depois
)



### Relevancy test

In [21]:
# Kruskall

X= x.to_numpy()
X=X.astype(float)

fnames=x.columns

Hs={}
for i in range(np.shape(X)[1]):
    st=stats.kruskal(X[ixHealthy,i].flatten(),X[ixWithCancer,i].flatten())
    Hs[fnames[i]]=st.statistic


Hs = sorted(Hs.items(), key=lambda x: x[1],reverse=True)  

print("Ranked features")

for f in Hs:
    print(f[0]+"-->"+str(f[1]))

#print(X)


### Redundancy test

In [22]:

# X = np.array([x[h[0]] for h in Hs])

# # Nomes das features
# labels = [h[0] for h in Hs]

# corrMat=np.corrcoef(X)
# fig = px.imshow(corrMat, text_auto=True,labels=dict(x="Features", y="Features", color="Correlation"),
#                 x=labels,
#                 y=labels,
#                 width=800, height=800,color_continuous_scale=px.colors.sequential.gray)
# fig.show()


firstFive = Hs[:]
corr = x[[f[0] for f in firstFive]].corr()
print(corr)


# HOMA - Insulin 0.932 Muito alta -> redundância clara


In [23]:
ixHealthCancer=np.concatenate((ixHealthy[0],ixWithCancer[0]))
y=DataSet['Classification'].to_numpy()[ixHealthCancer]


roc_auc=np.zeros(fnames.shape)
i=0
for f in fnames:#Go along features
    fpr, tpr, _= roc_curve(y,DataSet[f].to_numpy()[ixHealthCancer],pos_label=2)
    
    figR = go.Figure()
    figR.add_scatter(x=fpr, y=tpr,mode='lines+markers')
    figR.update_layout(autosize=False,width=700,height=700,title=dict(text=f))
    figR.update_xaxes(title_text="1-SP",range=[-0.01, 1.01])
    figR.update_yaxes(title_text="SS",range=[-0.01, 1.01])
    

    roc_auc[i] = auc(fpr, tpr)#Compute area under the ROC curve
    
    figR.add_annotation(x=0.5, y=0.5,
            text="AUC: "+str(roc_auc[i]),
            showarrow=False,
            yshift=10)
    figR.show()
    i=i+1

sortIx=np.flip(np.argsort(roc_auc))#Sort using AUC
print("Sorting accourding to ROC-AUC:")
for i in sortIx:
    print(fnames[i]+"-->"+str(roc_auc[i]))
print("\n")
print("Sorting accourding to Kruskall-Wallis:")

for f in Hs:
    print(f[0]+"-->"+str(f[1]))

### PCA

In [None]:
#Normalize
X=(x-np.mean(x,axis=0))/np.std(x,axis=0)

pca = PCA()
pca.fit(X)
# #PCA eigenvalues/Explained variance
# print("PCA eigenvalues/Explained variance")
# print(pca.explained_variance_)
# print("Sum of eigenvalues="+str(np.sum(pca.explained_variance_)))
# #PCA eigenvectors/Principal components
# print("PCA eigenvectors/Principal components")
# W=pca.components_.T
# print(W)

print("Variance (%) retained accourding to Kaiser: "+str(pca.explained_variance_[0]**2/(np.sum(pca.explained_variance_**2))*100))
print("Variance (%) retained accourding to Scree: "+str(np.sum(pca.explained_variance_[0:6]**2)/(np.sum(pca.explained_variance_**2))*100))


pca2=PCA(n_components=1)

X1D=pca2.fit_transform(X)
#print(np.shape(X1D))

#Plot projected data
fig = px.scatter(x=X1D[:, 0],y=np.zeros(np.shape(X1D)[0]),color=DataSet['Classification'].replace({1: "Healthy", 2: "With Cancer"}),labels=dict(x="PC1", y="", color="Classification"))

fig.update_traces(marker_size=8)
fig.update_xaxes
fig.show()



### MDC

In [32]:
NHealthy=DataSet.index.to_numpy()[ixHealthy]
NCancer=DataSet.index.to_numpy()[ixWithCancer]

gluHealthy=(DataSet['Glucose'].to_numpy())[ixHealthy]
gluCancer=(DataSet['Glucose'].to_numpy())[ixWithCancer]


muHealthy=np.array([[np.mean(NHealthy),np.mean(gluHealthy)]]).T
muCancer=np.array([[np.mean(NCancer),np.mean(gluCancer)]]).T

print("gHealthy(x)="+str(muHealthy.T)+"x-0.5"+str(muHealthy.T@muHealthy))
print("gCancer(x)="+str(muCancer.T)+"x-0.5"+str(muCancer.T@muCancer)+"\n")


X1=np.array([NHealthy,gluHealthy]).T
X2=np.array([NCancer,gluCancer]).T
X=np.concatenate((X1,X2),axis=0)
y=np.concatenate((DataSet['Classification'].to_numpy()[ixHealthy],DataSet['Classification'].to_numpy()[ixWithCancer]))

yp=np.ones(np.shape(y))

dx=((muHealthy-muCancer).T@(X.T-0.5*(muHealthy+muCancer))).flatten()
yp[dx<0]=2

Hits=np.shape(np.where((y==yp))[0])[0]
TP=np.shape(np.where((y[ixHealthy]==yp[ixHealthy]))[0])[0]
TN=np.shape(np.where((y[ixWithCancer]==yp[ixWithCancer]))[0])[0]
FP=np.shape(np.where((y[ixWithCancer]!=yp[ixWithCancer]))[0])[0]
FN=np.shape(np.where((y[ixHealthy]!=yp[ixHealthy]))[0])[0]

SS=TP/(TP+FN)
SP=TN/(TN+FP)
PR=TP/(TP+FP)
F1Score=2*(PR*SS)/(PR+SS)
AC=(TN+TP)/(TP+TN+FP+FN)


print("Sensitivity(%)="+str(SS*100))
print("Specificity(%)="+str(SP*100))
print("Precision(%)="+str(PR*100))
print("F1Score(%)="+str(F1Score*100))
print("Accuracy(%)="+str(AC*100))




gHealthy(x)=[[25.5        88.23076923]]x-0.5[[8434.91863905]]
gCancer(x)=[[ 83.5    105.5625]]x-0.5[[18115.69140625]]

Sensitivity(%)=100.0
Specificity(%)=93.75
Precision(%)=92.85714285714286
F1Score(%)=96.2962962962963
Accuracy(%)=96.55172413793103
