### 1. Imports

In [289]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from scipy import stats
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


### 2. Recolha de dados

In [290]:
# 64 patients with breast cancer and 52 healthy controls, 116 in total
# 1 - paciente sem cancro, 2 - paciente com cancro

file = pd.read_excel("../Data/dataR2.XLSX")
DataSet = pd.DataFrame(data=file.dropna()) #Remove NaN because of initial empty lines

ixHealthy = np.where(DataSet['Classification'] == 1)
ixWithCancer = np.where(DataSet['Classification'] == 2)

x = DataSet.iloc[:, :-1] # todas menos a última coluna
Y = DataSet.iloc[:, -1] # apenas a última coluna

print("Shape:", X.shape)
print("\nClass balance:")
print(Y.value_counts().rename({1:"Healthy", 2:"With Cancer"}))


Shape: (92, 1)

Class balance:
Classification
With Cancer    64
Healthy        52
Name: count, dtype: int64


### 3. Data partitioning (Train / Validation / Test)

In [291]:
x_train, x_temp, y_train, y_temp = train_test_split( # Partições entre train e test
    x, Y,
    test_size=0.2,       # 20% dos dados para teste
    random_state=67,     # garante reprodutibilidade (seed)
    shuffle=True,
    stratify = Y          # mantém a proporção das classes
)

x_val, x_test, y_val, y_test = train_test_split( # Partições entre test e validation
    x_temp, y_temp,
    test_size=0.5,
    random_state=67,
    stratify=y_temp  # mantém a proporção das classes
)

ixHealthyTrain = np.where(y_train == 1)
ixWithCancerTrain = np.where(y_train == 2)

print("Train:", x_train.shape, "Val:", x_val.shape, "Test:", x_test.shape)

Train: (92, 9) Val: (12, 9) Test: (12, 9)


### 4. Relevancy test

In [292]:
# Kruskall

X= x_train.to_numpy()
X=X.astype(float)

fnames=x_train.columns

Hs={}
for i in range(np.shape(X)[1]):
    st=stats.kruskal(X[ixHealthyTrain,i].flatten(),X[ixWithCancerTrain,i].flatten())
    Hs[fnames[i]]=st.statistic


Hs = sorted(Hs.items(), key=lambda x: x[1],reverse=True)  

print("Ranked features")

for f in Hs:
    print(f[0]+"-->"+str(f[1]))


Ranked features
Glucose-->16.960636026690125
Resistin-->8.243640178337273
HOMA-->7.028972092377501
Insulin-->4.448765747437663
BMI-->2.4315246355703573
MCP.1-->0.9875036349333824
Adiponectin-->0.19007728976720273
Leptin-->0.14515357677294105
Age-->0.06324105726834488


In [293]:
ixHealthCancer=np.concatenate((ixHealthyTrain[0],ixWithCancerTrain[0]))
y=DataSet['Classification'].to_numpy()[ixHealthCancer]


roc_auc=np.zeros(fnames.shape)
i=0
for f in fnames:#Go along features
    fpr, tpr, _= roc_curve(y,DataSet[f].to_numpy()[ixHealthCancer],pos_label=2)
    
#    figR = go.Figure()
#    figR.add_scatter(x=fpr, y=tpr,mode='lines+markers')
#    figR.update_layout(autosize=False,width=700,height=700,title=dict(text=f))
#    figR.update_xaxes(title_text="1-SP",range=[-0.01, 1.01])
#    figR.update_yaxes(title_text="SS",range=[-0.01, 1.01])
    

    roc_auc[i] = auc(fpr, tpr)#Compute area under the ROC curve
    
#    figR.add_annotation(x=0.5, y=0.5,
#            text="AUC: "+str(roc_auc[i]),
#            showarrow=False,
#            yshift=10)
#    figR.show()
    i=i+1

sortIx=np.flip(np.argsort(roc_auc))#Sort using AUC
print("Sorting accourding to ROC-AUC:")
for i in sortIx:
    print(fnames[i]+"-->"+str(roc_auc[i]))
print("\n")
print("Sorting accourding to Kruskall-Wallis:")

for f in Hs:
    print(f[0]+"-->"+str(f[1]))

Sorting accourding to ROC-AUC:
Glucose-->0.7701923076923077
Resistin-->0.7028846153846154
HOMA-->0.6442307692307693
MCP.1-->0.6120192307692307
Insulin-->0.6012019230769231
Adiponectin-->0.5341346153846154
Age-->0.4300480769230769
Leptin-->0.4096153846153846
BMI-->0.32091346153846156


Sorting accourding to Kruskall-Wallis:
Glucose-->16.960636026690125
Resistin-->8.243640178337273
HOMA-->7.028972092377501
Insulin-->4.448765747437663
BMI-->2.4315246355703573
MCP.1-->0.9875036349333824
Adiponectin-->0.19007728976720273
Leptin-->0.14515357677294105
Age-->0.06324105726834488


In [294]:
# --- Seleção de features com base em Kruskal e AUC ---

# Converter Hs (lista de tuplas) para dicionário para acesso rápido
Hs_dict = dict(Hs)

# Calcular médias
mean_H = np.mean(list(Hs_dict.values()))
mean_auc = np.mean(roc_auc)

print(f"\nMédia H (Kruskal): {mean_H}")
print(f"Média AUC: {mean_auc}\n")

# Selecionar features que passam ambos os critérios
selected_features = [
    f for f in fnames 
    if (Hs_dict[f] > mean_H) and (roc_auc[np.where(fnames == f)[0][0]] > mean_auc)
]

print("Features selecionadas (acima da média em ambos os critérios):")
for f in selected_features:
    print(f)

# Criar novo DataFrame apenas com essas features
x_train = x_train[selected_features]


print(f"\nNúmero total de features originais: {len(fnames)}")
print(f"Número de features selecionadas: {x_train.shape[1]}")



Média H (Kruskal): 4.499946026572754
Média AUC: 0.5583600427350428

Features selecionadas (acima da média em ambos os critérios):
Glucose
HOMA
Resistin

Número total de features originais: 9
Número de features selecionadas: 3


### Redundancy test

In [295]:

Features = Hs[:]
corr = x[[f[0] for f in Features]].corr()
print(corr)

corrAfter = x_train.corr()
print("\n",corrAfter)

# HOMA - Insulin 0.932 Muito alta -> redundância clara




              Glucose  Resistin      HOMA   Insulin       BMI     MCP.1  \
Glucose      1.000000  0.291327  0.696212  0.504653  0.138845  0.264879   
Resistin     0.291327  1.000000  0.231101  0.146731  0.195350  0.366474   
HOMA         0.696212  0.231101  1.000000  0.932198  0.114480  0.259529   
Insulin      0.504653  0.146731  0.932198  1.000000  0.145295  0.174356   
BMI          0.138845  0.195350  0.114480  0.145295  1.000000  0.224038   
MCP.1        0.264879  0.366474  0.259529  0.174356  0.224038  1.000000   
Adiponectin -0.122121 -0.252363 -0.056337 -0.031296 -0.302735 -0.200694   
Leptin       0.305080  0.256234  0.327210  0.301462  0.569593  0.014009   
Age          0.230106  0.002742  0.127033  0.032495  0.008530  0.013462   

             Adiponectin    Leptin       Age  
Glucose        -0.122121  0.305080  0.230106  
Resistin       -0.252363  0.256234  0.002742  
HOMA           -0.056337  0.327210  0.127033  
Insulin        -0.031296  0.301462  0.032495  
BMI           

### PCA

In [296]:
#Normalize
X=(x_train-np.mean(x_train,axis=0))/np.std(x_train,axis=0)

pca = PCA()
pca.fit(X)


y_train = pd.Series(y_train)

fig = px.scatter(
    x=np.arange(1, len(pca.explained_variance_) + 1),
    y=pca.explained_variance_,
    labels=dict(x="PC", y="Explained Variance")
)

fig.add_hline(y=1,line_width=3, line_dash="dash", line_color="red")
fig.update_traces(marker_size=10)
fig.show()

print("Variance (%) retained accourding to Kaiser: "+str(pca.explained_variance_[0]**2/(np.sum(pca.explained_variance_**2))*100))
print("Variance (%) retained accourding to Scree: "+str(np.sum(pca.explained_variance_[0:6]**2)/(np.sum(pca.explained_variance_**2))*100))


pca2=PCA(n_components=1)

X1D=pca2.fit_transform(X)
#print(np.shape(X1D))

#Plot projected data
fig = px.scatter(x=X1D[:, 0],y=np.zeros(np.shape(X1D)[0]),color=y_train.replace({1: "Healthy", 2: "With Cancer"}),labels=dict(x="PC1", y="", color="Classification"))

fig.update_traces(marker_size=8)
fig.update_xaxes
fig.show()


# Guardar os dois primeiros PCAs

x_pca=PCA(n_components=2)
X_pca = pca.fit_transform(X)

PC1 = X_pca[:, 0]
PC2 = X_pca[:, 1]



Variance (%) retained accourding to Kaiser: 83.34084305904138
Variance (%) retained accourding to Scree: 100.0


### LDA

In [297]:
lda = LinearDiscriminantAnalysis(n_components=1)
# prepare transform on dataset
lda.fit(x_train,y_train)
# apply transform to dataset
transformed = lda.transform(x_train)

# Converter classes para nomes legíveis (1 → Healthy, 2 → With Cancer)
y_labels = y_train.replace({1: "Healthy", 2: "With Cancer"})

#Plot transformed data
# Plot 1D estilo linha reta
fig = px.scatter(
    x=transformed.squeeze(),
    y=np.zeros(transformed.shape[0]),
    color=y_labels,
    labels=dict(x="LDA1", y="", color="Classification"),
    title="LDA - Projeção 1D das Classes"
)

fig.show()

# Guardar os dois primeiros LDAs

lda = LinearDiscriminantAnalysis(n_components=1)
X_lda = lda.fit_transform(X, y_train)

# Como só temos 2 classes, só temos uma componente 
LD1 = X_lda[:, 0]

### MDC

In [298]:
PC1 = np.array(PC1)
PC2 = np.array(PC2)
y = y_train.to_numpy()

ixHealthy = np.where(y == 1)[0]
ixCancer  = np.where(y == 2)[0]

PC1_Healthy = PC1[ixHealthy]
PC2_Healthy = PC2[ixHealthy]
PC1_Cancer  = PC1[ixCancer]
PC2_Cancer  = PC2[ixCancer]


muHealthy = np.array([[np.mean(PC1_Healthy), np.mean(PC2_Healthy)]]).T
muCancer  = np.array([[np.mean(PC1_Cancer), np.mean(PC2_Cancer)]]).T

print("gHealthy(x)="+str(muHealthy.T)+"x-0.5"+str(muHealthy.T@muHealthy))
print("gCancer(x)="+str(muCancer.T)+"x-0.5"+str(muCancer.T@muCancer)+"\n")


X1 = np.array([PC1_Healthy, PC2_Healthy]).T
X2 = np.array([PC1_Cancer,  PC2_Cancer]).T
X=np.concatenate((X1,X2),axis=0)
y = np.concatenate((y_train.to_numpy()[ixHealthyTrain],y_train.to_numpy()[ixWithCancerTrain]))


yp=np.ones(np.shape(y))

dx=((muHealthy-muCancer).T@(X.T-0.5*(muHealthy+muCancer))).flatten()
yp[dx<0]=2

Hits=np.shape(np.where((y==yp))[0])[0]

ixHealthy = np.where(y == 1)[0]
ixCancer = np.where(y == 2)[0]

TP=np.shape(np.where((y[ixHealthy]==yp[ixHealthy]))[0])[0]
TN=np.shape(np.where((y[ixCancer]==yp[ixCancer]))[0])[0]
FP=np.shape(np.where((y[ixCancer]!=yp[ixCancer]))[0])[0]
FN=np.shape(np.where((y[ixHealthy]!=yp[ixHealthy]))[0])[0]

SS=TP/(TP+FN)
SP=TN/(TN+FP)
PR=TP/(TP+FP)
F1Score=2*(PR*SS)/(PR+SS)
AC=(TN+TP)/(TP+TN+FP+FN)


print("Sensitivity(%)="+str(SS*100))
print("Specificity(%)="+str(SP*100))
print("Precision(%)="+str(PR*100))
print("F1Score(%)="+str(F1Score*100))
print("Accuracy(%)="+str(AC*100))


fig=go.Figure()
fig.add_trace(go.Scatter(x=PC1_Healthy,y=PC2_Healthy,name='Healthy'))
#fig.update_traces(marker=dict(color='green', size=10))
fig.add_trace(go.Scatter(x=PC1_Cancer,y=PC2_Cancer,name='Cancer'))
fig.update_traces(marker=dict(size=10))


ixFP = np.where((y == 2) & (yp == 1))[0]
ixFN = np.where((y == 1) & (yp == 2))[0]


fig.add_trace(go.Scatter(x=X[ixFN,0],y=X[ixFN,1],name='False Healthy',marker_size=15,
                         marker_symbol="circle-open",marker=dict(color="red",
                                                                 line=dict(width=3,color='red'))))
fig.add_trace(go.Scatter(x=X[ixFP,0],y=X[ixFP,1],name='False Cancer',marker_size=15,
                         marker_symbol="circle-open",marker=dict(color="blue",
                                                                 line=dict(width=3,color='blue'))))


fig.add_trace(go.Scatter(x=muHealthy[0],y=muHealthy[1], marker_size=20,
                         marker_symbol='x',name='Healthy mean',
                         marker=dict(color="green",line=dict(width=1,
                                        color='black'))))
fig.add_trace(go.Scatter(x=muCancer[0],y=muCancer[1], marker_size=20,
                         marker_symbol='x',name='Cancer mean',
                         marker=dict(color="yellow",line=dict(width=1,
                                        color='black'))))
fig.add_trace(go.Scatter(x=(muHealthy[0]+muCancer[0])/2,y=(muHealthy[1]+muCancer[1])/2, marker_size=20,
                         marker_symbol="circle-dot",name='Mean of the means',
                         marker=dict(color="purple",line=dict(width=1,
                                        color='black'))))

fig.update_traces(mode='markers')
fig.add_trace(go.Scatter(x=[muHealthy[0][0],muCancer[0][0]],y=[muHealthy[1][0],muCancer[1][0]],
                         mode='lines', line=dict(color="black",width=4),name='Inter mean segment'))


W=(muHealthy-muCancer)
b=-0.5*(muHealthy-muCancer).T@(muHealthy+muCancer)

x1=np.arange(20,130,1)

x2=-(W[0,0]/W[1,0])*x1-b/W[1,0]
fig.add_trace(go.Scatter(x=x1,y=x2.flatten(),
                         mode='lines', line=dict(dash='dash',color="gray",width=4),name='Hyperplane'))





fig.update_xaxes(title_text='PC1')
fig.update_yaxes(title_text='PC2')
fig.update_layout(
    title="Euclidean MDC - PCA1 vs PCA2",
    autosize=False,
    width=900,
    height=800)
fig.show()






ixHealthy = np.where(y == 1)[0]
ixCancer  = np.where(y == 2)[0]

LD1_Healthy = LD1[ixHealthy]
LD1_Cancer  = LD1[ixCancer]

# Médias projetadas
muHealthy = np.mean(LD1_Healthy)
muCancer  = np.mean(LD1_Cancer)

print("gHealthy(x) = " + str(muHealthy) + "x - 0.5" + str(muHealthy**2))
print("gCancer(x)  = " + str(muCancer) + "x - 0.5" + str(muCancer**2) + "\n")

X = LD1.reshape(-1, 1)
y = y_train.to_numpy()

# Classificação Euclidiana
yp = np.ones_like(y)
dx = (muHealthy - muCancer) * (X.flatten() - 0.5 * (muHealthy + muCancer))
yp[dx < 0] = 2

# Métricas
TP = np.sum((y == 1) & (yp == 1))
TN = np.sum((y == 2) & (yp == 2))
FP = np.sum((y == 2) & (yp == 1))
FN = np.sum((y == 1) & (yp == 2))

SS = TP / (TP + FN)
SP = TN / (TN + FP)
PR = TP / (TP + FP)
F1Score = 2 * (PR * SS) / (PR + SS)
AC = (TN + TP) / len(y)

print("Sensitivity(%) =", SS * 100)
print("Specificity(%) =", SP * 100)
print("Precision(%) =", PR * 100)
print("F1Score(%) =", F1Score * 100)
print("Accuracy(%) =", AC * 100)

# --- Gráfico ---
import plotly.graph_objects as go

fig = go.Figure()

# Pontos Healthy e Cancer
fig.add_trace(go.Scatter(
    x=LD1_Healthy,
    y=np.zeros_like(LD1_Healthy),
    name='Healthy',
    mode='markers',
    marker=dict(color='green', size=10)
))
fig.add_trace(go.Scatter(
    x=LD1_Cancer,
    y=np.zeros_like(LD1_Cancer),
    name='Cancer',
    mode='markers',
    marker=dict(color='orange', size=10)
))

# Médias
fig.add_trace(go.Scatter(
    x=[muHealthy], y=[0],
    name='Healthy mean',
    mode='markers',
    marker=dict(color='green', size=15, symbol='x')
))
fig.add_trace(go.Scatter(
    x=[muCancer], y=[0],
    name='Cancer mean',
    mode='markers',
    marker=dict(color='yellow', size=15, symbol='x')
))

# Hiperplano de decisão (ponto médio)
decision_boundary = 0.5 * (muHealthy + muCancer)
fig.add_trace(go.Scatter(
    x=[decision_boundary],
    y=[0],
    name='Decision Boundary',
    mode='markers',
    marker=dict(color='purple', size=15, symbol='circle')
))

# Linha entre médias
fig.add_trace(go.Scatter(
    x=[muHealthy, muCancer],
    y=[0, 0],
    mode='lines',
    line=dict(color="black", width=3),
    name='Inter mean segment'
))

fig.update_layout(
    title="Euclidean MDC - LDA1",
    xaxis_title="LDA1",
    yaxis_visible=False,
    width=900,
    height=400,
    showlegend=True
)

fig.show()



gHealthy(x)=[[-0.5726661  -0.03638699]]x-0.5[[0.32927047]]
gCancer(x)=[[0.46037863 0.02925229]]x-0.5[[0.21280418]]

Sensitivity(%)=90.2439024390244
Specificity(%)=52.94117647058824
Precision(%)=60.65573770491803
F1Score(%)=72.54901960784315
Accuracy(%)=69.56521739130434


gHealthy(x) = 0.04707143408153153x - 0.50.002215719906491968
gCancer(x)  = -0.037841741124368576x - 0.50.0014319973713237278

Sensitivity(%) = 12.195121951219512
Specificity(%) = 47.05882352941176
Precision(%) = 15.625
F1Score(%) = 13.698630136986301
Accuracy(%) = 31.521739130434785


### Mahalanobis

In [299]:


CHealthy=np.cov(np.array([NHealthy,gluHealthy]))
CCancer=np.cov(np.array([NCancer,gluCancer]))

C=(CHealthy+CCancer)/2
Ci=np.linalg.inv(C)
print(Ci)
            
print("gSup(x)="+str(muHealthy.T@Ci)+"x-0.5"+str(muHealthy.T@Ci@muHealthy))
print("gAvg(x)="+str(muCancer.T@Ci)+"x-0.5"+str(muCancer.T@Ci@muCancer))

print("d(x)="+str((muHealthy-muCancer).T@Ci)+"x-0.5"+str((muHealthy-muCancer).T@Ci@(muHealthy+muCancer)))


X1=np.array([NHealthy,gluHealthy]).T
X2=np.array([NCancer,gluCancer]).T
X=np.concatenate((X1,X2),axis=0)
y=np.concatenate((y_train.to_numpy()[ixHealthy],y_train.to_numpy()[ixWithCancerTrain]))

yp=np.ones(np.shape(y))

dx=((muHealthy-muCancer).T@Ci@(X.T-0.5*(muHealthy+muCancer))).flatten()
yp[dx<0]=2

Hits=np.shape(np.where((y==yp))[0])[0]
TP=np.shape(np.where((y[ixHealthy]==yp[ixHealthy]))[0])[0]
TN=np.shape(np.where((y[ixCancer]==yp[ixCancer]))[0])[0]
FP=np.shape(np.where((y[ixCancer]!=yp[ixCancer]))[0])[0]
FN=np.shape(np.where((y[ixHealthy]!=yp[ixHealthy]))[0])[0]

SS=TP/(TP+FN)
SP=TN/(TN+FP)
PR=TP/(TP+FP)
F1Score=2*(PR*SS)/(PR+SS)
AC=(TN+TP)/(TP+TN+FP+FN)


print("Sensitivity(%)="+str(SS*100))
print("Specificity(%)="+str(SP*100))
print("Precision(%)="+str(PR*100))
print("F1Score(%)="+str(F1Score*100))
print("Accuracy(%)="+str(AC*100))



[[ 1.05296049 -0.28464934]
 [-0.28464934  1.32348936]]


ValueError: matmul: Input operand 0 does not have enough dimensions (has 0, gufunc core with signature (n?,k),(k,m?)->(n?,m?) requires 1)

### LDA FISHER

In [None]:
# Converter X para NumPy para evitar erro de indexação
X_np = x_train.to_numpy()
y_np = y_train.to_numpy()

# Índices das classes (1=Healthy, 2=With Cancer)
ixHealthy = np.where(y_np == 1)[0]
ixCancer = np.where(y_np == 2)[0]

# Médias de cada classe (vetores coluna)
muHealthy = np.array([np.mean(X_np[ixHealthy, :], axis=0)]).T
muCancer = np.array([np.mean(X_np[ixCancer, :], axis=0)]).T

# Matrizes de dispersão intra-classe
S1 = (X_np[ixHealthy, :].T - muHealthy) @ (X_np[ixHealthy, :].T - muHealthy).T
S2 = (X_np[ixCancer, :].T - muCancer) @ (X_np[ixCancer, :].T - muCancer).T

# Dispersão total dentro das classes
Sw = S1 + S2
SwInv = np.linalg.inv(Sw)

# Vetor discriminante de Fisher
w = SwInv @ (muHealthy - muCancer)
w = w / np.linalg.norm(w)  # normalização

# Projetar os dados originais nessa direção
Xp = X_np @ w
Xp = Xp.squeeze()

# Criar rótulos legíveis
y_labels = y_train.replace({1: "Healthy", 2: "With Cancer"})

# Plot 1D (linha horizontal)
fig = px.scatter(
    x=Xp,
    y=np.zeros_like(Xp),
    color=y_labels,
    labels=dict(x="Fisher LDA1", y="", color="Classification"),
    title="Fisher Linear Discriminant - Projeção 1D das Classes"
)

b = 0.5 * (w.T @ (muHealthy + muCancer))

# Adicionar linha de separação
fig.add_vline(x=b.squeeze(), line=dict(color='gray', dash='dash', width=3), name='Decision boundary')

fig.update_layout(
    title="Fisher LDA - Projeção 1D das Classes",
    xaxis_title="Fisher Linear Discriminant (wᵗx)",
    yaxis=dict(showticklabels=False, title=""),

)

fig.update_layout(height=400)
fig.show()

