### Imports

In [60]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from scipy import stats
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


### Recolha de dados

In [61]:
# Dataset: 116 subjects in total -- 52 healthy controls and 64 breast-cancer patients.
# 'Classification' coding: 1 = patient without cancer, 2 = patient with cancer.

file = pd.read_excel("../Data/dataR2.XLSX")
# Rows containing NaN are dropped (the original note attributes them to
# empty lines at the start of the sheet).
DataSet = pd.DataFrame(file.dropna())

# Positional row indexes of each class in the full dataset.
# np.where returns a 1-tuple, so the index array itself is ixHealthy[0].
classColumn = DataSet['Classification']
ixHealthy = np.where(classColumn == 1)
ixWithCancer = np.where(classColumn == 2)

# Human-readable class names
Classes = ["healthy", "withCancer"]

# Features are every column except the last one; the last column is the target.
x = DataSet.iloc[:, :-1]
y = DataSet.iloc[:, -1]

# Seed shared by both splits so the partitions are reproducible.
splitSeed = 67

# First split: 80% training, 20% held out.
# Stratified so each part keeps the class proportions (TODO: maybe remove later).
x_train, x_testInicial, y_train, y_testInicial = train_test_split(
    x, y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=splitSeed,
)

# Second split: the held-out part is halved into validation and test,
# again stratified (TODO: maybe remove later).
x_val, x_test, y_val, y_test = train_test_split(
    x_testInicial, y_testInicial,
    test_size=0.5,
    stratify=y_testInicial,
    random_state=splitSeed,
)

# Positional row indexes of each class inside the training partition
ixHealthyTrain = np.where(y_train == 1)
ixWithCancerTrain = np.where(y_train == 2)



### Relevancy test

In [62]:
# Feature relevancy: Kruskal-Wallis H statistic of every feature,
# comparing the healthy group against the cancer group.
# NOTE(review): the ranking is computed on the full dataset (x), not only on
# x_train -- confirm this is intended, since the held-out rows take part in it.

X = x.to_numpy().astype(float)
fnames = x.columns

# One H statistic per feature column, keyed by feature name
Hs = {
    fnames[col]: stats.kruskal(
        X[ixHealthy, col].flatten(),
        X[ixWithCancer, col].flatten(),
    ).statistic
    for col in range(X.shape[1])
}

# From here on Hs is a list of (feature name, H) pairs, most discriminative first
Hs = sorted(Hs.items(), key=lambda item: item[1], reverse=True)

print("Ranked features")
for name, score in Hs:
    print(name + "-->" + str(score))


Ranked features
Glucose-->23.91957997825514
Resistin-->9.699241863905343
HOMA-->8.887111686390483
Insulin-->4.931023337417101
BMI-->1.6374345259560847
Age-->0.5053505863773902
MCP.1-->0.4512315968813051
Adiponectin-->0.08986720937572039
Leptin-->0.004437869822481844


### Redundancy test

In [63]:

# Redundancy check: pairwise correlation between the features, listed in
# the Kruskal-Wallis ranking order.
# (An earlier heat-map rendering of this matrix with px.imshow was disabled.)

# Despite its name, firstFive keeps every ranked (feature, H) pair, not just five.
firstFive = list(Hs)
rankedNames = [name for name, _ in firstFive]

corr = x.loc[:, rankedNames].corr()
print(corr)

# HOMA vs Insulin: 0.932, very high -> clear redundancy


              Glucose  Resistin      HOMA   Insulin       BMI       Age  \
Glucose      1.000000  0.291327  0.696212  0.504653  0.138845  0.230106   
Resistin     0.291327  1.000000  0.231101  0.146731  0.195350  0.002742   
HOMA         0.696212  0.231101  1.000000  0.932198  0.114480  0.127033   
Insulin      0.504653  0.146731  0.932198  1.000000  0.145295  0.032495   
BMI          0.138845  0.195350  0.114480  0.145295  1.000000  0.008530   
Age          0.230106  0.002742  0.127033  0.032495  0.008530  1.000000   
MCP.1        0.264879  0.366474  0.259529  0.174356  0.224038  0.013462   
Adiponectin -0.122121 -0.252363 -0.056337 -0.031296 -0.302735 -0.219813   
Leptin       0.305080  0.256234  0.327210  0.301462  0.569593  0.102626   

                MCP.1  Adiponectin    Leptin  
Glucose      0.264879    -0.122121  0.305080  
Resistin     0.366474    -0.252363  0.256234  
HOMA         0.259529    -0.056337  0.327210  
Insulin      0.174356    -0.031296  0.301462  
BMI          0

In [64]:
# ROC analysis of every single feature used directly as a score,
# with "with cancer" (label 2) as the positive class.

# Row positions with the healthy samples first, followed by the cancer samples
ixHealthCancer = np.concatenate((ixHealthy[0], ixWithCancer[0]))
y = DataSet['Classification'].to_numpy()[ixHealthCancer]

roc_auc = np.zeros(fnames.shape)
for i, f in enumerate(fnames):  # one ROC curve per feature
    featureScores = DataSet[f].to_numpy()[ixHealthCancer]
    fpr, tpr, _ = roc_curve(y, featureScores, pos_label=2)

    # Area under this feature's ROC curve
    roc_auc[i] = auc(fpr, tpr)

    figR = go.Figure()
    figR.add_scatter(x=fpr, y=tpr, mode='lines+markers')
    figR.update_layout(autosize=False, width=700, height=700, title=dict(text=f))
    figR.update_xaxes(title_text="1-SP", range=[-0.01, 1.01])
    figR.update_yaxes(title_text="SS", range=[-0.01, 1.01])
    figR.add_annotation(
        x=0.5,
        y=0.5,
        text="AUC: " + str(roc_auc[i]),
        showarrow=False,
        yshift=10,
    )
    figR.show()

# Feature indexes ordered by decreasing AUC
sortIx = np.argsort(roc_auc)[::-1]

print("Sorting accourding to ROC-AUC:")
for i in sortIx:
    print(fnames[i] + "-->" + str(roc_auc[i]))
print("\n")

print("Sorting accourding to Kruskall-Wallis:")
for f in Hs:
    print(f[0] + "-->" + str(f[1]))

Sorting accourding to ROC-AUC:
Glucose-->0.7645733173076923
Resistin-->0.6685697115384615
HOMA-->0.661358173076923
Insulin-->0.6201923076923077
MCP.1-->0.5363581730769231
Adiponectin-->0.5162259615384616
Leptin-->0.5036057692307693
Age-->0.4615384615384615
BMI-->0.43073918269230765


Sorting accourding to Kruskall-Wallis:
Glucose-->23.91957997825514
Resistin-->9.699241863905343
HOMA-->8.887111686390483
Insulin-->4.931023337417101
BMI-->1.6374345259560847
Age-->0.5053505863773902
MCP.1-->0.4512315968813051
Adiponectin-->0.08986720937572039
Leptin-->0.004437869822481844


### PCA

In [None]:
# PCA on the standardised training features.

# Standardise every feature with the training mean and standard deviation
X = (x_train - np.mean(x_train, axis=0)) / np.std(x_train, axis=0)

pca = PCA()
pca.fit(X)

# explained_variance_ already holds the eigenvalues (variance along each PC)
eigenvalues = pca.explained_variance_

# Scree plot; the dashed line marks the Kaiser threshold (eigenvalue = 1)
fig = px.scatter(
    x=np.arange(1, len(eigenvalues) + 1),
    y=eigenvalues,
    labels=dict(x="PC", y="Explained Variance")
)
fig.add_hline(y=1, line_width=3, line_dash="dash", line_color="red")
fig.update_traces(marker_size=10)
fig.show()

# Fraction of the total variance carried by each PC. The eigenvalues must not
# be squared again: they are variances already.
varianceRatio = pca.explained_variance_ratio_

# Kaiser criterion: keep the components whose eigenvalue is greater than 1
nKaiser = int(np.sum(eigenvalues > 1))
# Scree criterion: number of components chosen by reading the plot above
nScree = 6

print("Variance (%) retained accourding to Kaiser: "+str(np.sum(varianceRatio[:nKaiser])*100))
print("Variance (%) retained accourding to Scree: "+str(np.sum(varianceRatio[:nScree])*100))

# Projection of the training data onto the first principal component only
pca2 = PCA(n_components=1)
X1D = pca2.fit_transform(X)

# 1-D plot of the projected samples, coloured by class
fig = px.scatter(
    x=X1D[:, 0],
    y=np.zeros(np.shape(X1D)[0]),
    color=y_train.replace({1: "Healthy", 2: "With Cancer"}),
    labels=dict(x="PC1", y="", color="Classification")
)
fig.update_traces(marker_size=8)
fig.show()



ValueError: All arguments should have the same length. The length of argument `y` is 9, whereas the length of  previously-processed arguments ['x'] is 10

### MDC (Minimum Distance Classifier)

In [None]:
# Minimum Distance Classifier (Euclidean) on two features, Resistin and Glucose.
# The class means are estimated on the training partition and the classifier is
# evaluated on those same training samples.

# Per-class feature values taken from the training rows of each class
NHealthy = x_train['Resistin'].to_numpy()[ixHealthyTrain]
NCancer = x_train['Resistin'].to_numpy()[ixWithCancerTrain]

gluHealthy = x_train['Glucose'].to_numpy()[ixHealthyTrain]
gluCancer = x_train['Glucose'].to_numpy()[ixWithCancerTrain]

# Class means as 2x1 column vectors [Resistin; Glucose]
muHealthy = np.array([[np.mean(NHealthy), np.mean(gluHealthy)]]).T
muCancer = np.array([[np.mean(NCancer), np.mean(gluCancer)]]).T

print("gHealthy(x)="+str(muHealthy.T)+"x-0.5"+str(muHealthy.T@muHealthy))
print("gCancer(x)="+str(muCancer.T)+"x-0.5"+str(muCancer.T@muCancer)+"\n")

# Sample matrix with the healthy rows first and the cancer rows after them.
# y follows the same order, so it is NOT aligned with y_train any more.
X1 = np.array([NHealthy, gluHealthy]).T
X2 = np.array([NCancer, gluCancer]).T
X = np.concatenate((X1, X2), axis=0)

y = np.concatenate((y_train.to_numpy()[ixHealthyTrain], y_train.to_numpy()[ixWithCancerTrain]))

# Decision rule: dx >= 0 -> closer to the healthy mean (1), dx < 0 -> cancer (2)
yp = np.ones(np.shape(y))
dx = ((muHealthy-muCancer).T@(X.T-0.5*(muHealthy+muCancer))).flatten()
yp[dx < 0] = 2

# Confusion matrix. Masks are built from y itself because X, y and yp are in
# healthy-then-cancer order; ixHealthyTrain / ixWithCancerTrain are positions in
# the original y_train order and must not be used to index these arrays.
# Positive class = with cancer (label 2), the same convention as pos_label=2
# in the ROC analysis.
isHealthy = (y == 1)
isCancer = (y == 2)

Hits = int(np.sum(y == yp))
TP = int(np.sum(isCancer & (yp == 2)))   # cancer correctly detected
TN = int(np.sum(isHealthy & (yp == 1)))  # healthy correctly cleared
FP = int(np.sum(isHealthy & (yp == 2)))  # healthy flagged as cancer
FN = int(np.sum(isCancer & (yp == 1)))   # cancer missed

SS = TP/(TP+FN)
SP = TN/(TN+FP)
# Precision and F1 are undefined when nothing is predicted/detected as positive
PR = TP/(TP+FP) if (TP+FP) > 0 else float('nan')
F1Score = 2*(PR*SS)/(PR+SS) if (PR+SS) > 0 else float('nan')
AC = (TN+TP)/(TP+TN+FP+FN)

print("Sensitivity(%)="+str(SS*100))
print("Specificity(%)="+str(SP*100))
print("Precision(%)="+str(PR*100))
print("F1Score(%)="+str(F1Score*100))
print("Accuracy(%)="+str(AC*100))

# Scatter plot of both classes
fig = go.Figure()
fig.add_trace(go.Scatter(x=NHealthy, y=gluHealthy, name='Healthy'))
fig.add_trace(go.Scatter(x=NCancer, y=gluCancer, name='Cancer'))
fig.update_traces(marker=dict(size=10))

# Row positions (in X) of the misclassified samples
ixFP = np.where(isHealthy & (yp == 2))[0]  # healthy predicted as cancer
ixFN = np.where(isCancer & (yp == 1))[0]   # cancer predicted as healthy

fig.add_trace(go.Scatter(x=X[ixFN, 0], y=X[ixFN, 1], name='False Healthy', marker_size=15,
                         marker_symbol="circle-open", marker=dict(color="red",
                                                                 line=dict(width=3, color='red'))))
fig.add_trace(go.Scatter(x=X[ixFP, 0], y=X[ixFP, 1], name='False Cancer', marker_size=15,
                         marker_symbol="circle-open", marker=dict(color="blue",
                                                                 line=dict(width=3, color='blue'))))

# Class means and the midpoint between them
fig.add_trace(go.Scatter(x=muHealthy[0], y=muHealthy[1], marker_size=20,
                         marker_symbol='x', name='Healthy mean',
                         marker=dict(color="green", line=dict(width=1,
                                        color='black'))))
fig.add_trace(go.Scatter(x=muCancer[0], y=muCancer[1], marker_size=20,
                         marker_symbol='x', name='Cancer mean',
                         marker=dict(color="yellow", line=dict(width=1,
                                        color='black'))))
fig.add_trace(go.Scatter(x=(muHealthy[0]+muCancer[0])/2, y=(muHealthy[1]+muCancer[1])/2, marker_size=20,
                         marker_symbol="circle-dot", name='Mean of the means',
                         marker=dict(color="purple", line=dict(width=1,
                                        color='black'))))

# Every trace added so far is drawn as markers; the line traces come afterwards
fig.update_traces(mode='markers')
fig.add_trace(go.Scatter(x=[muHealthy[0][0], muCancer[0][0]], y=[muHealthy[1][0], muCancer[1][0]],
                         mode='lines', line=dict(color="black", width=4), name='Inter mean segment'))

# Decision hyperplane W^T x + b = 0, i.e. the perpendicular bisector of the
# segment joining the two means
W = (muHealthy-muCancer)
b = -0.5*(muHealthy-muCancer).T@(muHealthy+muCancer)

# Draw the line across the observed Resistin range instead of a fixed interval
x1 = np.linspace(X[:, 0].min(), X[:, 0].max(), 200)
x2 = -(W[0, 0]/W[1, 0])*x1-b/W[1, 0]
fig.add_trace(go.Scatter(x=x1, y=x2.flatten(),
                         mode='lines', line=dict(dash='dash', color="gray", width=4), name='Hyperplane'))

fig.update_xaxes(title_text='Resistin')
fig.update_yaxes(title_text='Glucose')
fig.update_layout(
    autosize=False,
    width=900,
    height=800)
fig.show()



### LDA

In [None]:
# LDA projection onto a single discriminant axis.
# Uses the X (samples) and y (labels) arrays left by the preceding MDC cell.
lda = LinearDiscriminantAnalysis(n_components=1)
# Fit the transform on the samples
lda.fit(X, y)
# Project the same samples
transformed = lda.transform(X)

# Readable class names (1 -> Healthy, 2 -> With Cancer). They are derived from
# the y that was used for fitting so that they have the same length and row
# order as `transformed`; DataSet['Classification'] covers all rows of the
# dataset and would not line up with it.
y_labels = pd.Series(np.asarray(y)).map({1: "Healthy", 2: "With Cancer"})

# 1-D plot: every sample placed on a straight horizontal line
fig = px.scatter(
    x=transformed.squeeze(),
    y=np.zeros(transformed.shape[0]),
    color=y_labels,
    labels=dict(x="LDA1", y="", color="Classification"),
    title="LDA - Projeção 1D das Classes"
)

fig.show()

### LDA FISHER

In [None]:
# Fisher linear discriminant computed by hand on the X / y arrays left by the
# MDC cell.

# np.asarray accepts both a DataFrame and an ndarray (X is already an ndarray
# after the MDC cell, so X.values would fail).
X_np = np.asarray(X, dtype=float)

# Row positions of each class (1=Healthy, 2=With Cancer).
# NOTE(review): this rebinds ixHealthy, which the data-loading cell defined as
# the positions in the full dataset -- re-run that cell before reusing it.
ixHealthy = np.where(y == 1)[0]
ixCancer = np.where(y == 2)[0]

# Class means as column vectors
muHealthy = np.array([np.mean(X_np[ixHealthy, :], axis=0)]).T
muCancer = np.array([np.mean(X_np[ixCancer, :], axis=0)]).T

# Within-class scatter matrices
S1 = (X_np[ixHealthy, :].T - muHealthy) @ (X_np[ixHealthy, :].T - muHealthy).T
S2 = (X_np[ixCancer, :].T - muCancer) @ (X_np[ixCancer, :].T - muCancer).T

# Total within-class scatter
Sw = S1 + S2
SwInv = np.linalg.inv(Sw)

# Fisher discriminant direction, normalised to unit length
w = SwInv @ (muHealthy - muCancer)
w = w / np.linalg.norm(w)

# Project the samples onto that direction
Xp = X_np @ w
Xp = Xp.squeeze()

# Decision threshold on the projected axis: projection of the midpoint between
# the two class means. (The MDC bias `b` belongs to the unprojected feature
# space and is not a position on this axis.)
threshold = float((w.T @ (0.5 * (muHealthy + muCancer))).squeeze())

# Readable labels with the same length and row order as Xp
y_labels = pd.Series(np.asarray(y)).map({1: "Healthy", 2: "With Cancer"})

# 1-D plot (horizontal line)
fig = px.scatter(
    x=Xp,
    y=np.zeros_like(Xp),
    color=y_labels,
    labels=dict(x="Fisher LDA1", y="", color="Classification")
)

# Separation line
fig.add_vline(x=threshold, line=dict(color='gray', dash='dash', width=3), name='Decision boundary')

fig.update_layout(
    title="Fisher LDA - Projeção 1D das Classes",
    xaxis_title="Fisher Linear Discriminant (wᵗx)",
    yaxis=dict(showticklabels=False, title=""),
    height=400,
)
fig.show()
