### 1. Imports

In [15]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from scipy import stats
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import mixture
from sklearn.neighbors import KNeighborsClassifier
from sklearn.inspection import DecisionBoundaryDisplay
import matplotlib.pyplot as plt



### 2. Recolha de dados

In [2]:

# 64 patients with breast cancer and 52 healthy controls, 116 in total
# 1 - paciente sem cancro (Healthy), 2 - paciente com cancro (With Cancer)

file = pd.read_excel("../Data/dataR2.XLSX")
DataSet = pd.DataFrame(data=file.dropna())  # Remove NaN por linhas vazias iniciais

# Features e label
x = DataSet.iloc[:, :-1].copy()
y = DataSet.iloc[:, -1].copy()

print("Shape:", x.shape)
print("\nClass balance:")
print(y.value_counts().rename({1: "Healthy", 2: "With Cancer"}))


Shape: (116, 9)

Class balance:
With Cancer    64
Healthy        52
Name: Classification, dtype: int64


### 3. Data partitioning (Train / Validation / Test)

In [3]:
X_train, X_temp, Y_train, Y_temp = train_test_split( # Partições entre train e test
    x, y,
    test_size = 0.2,       # 20% dos dados para teste
    random_state = 67,     # garante reprodutibilidade (seed)
    shuffle = True,
    stratify = y          # mantém a proporção das classes
)

X_val, X_test, Y_val, Y_test = train_test_split( # Partições entre test e validation
    X_temp, Y_temp,
    test_size = 0.5,
    random_state = 67,
    stratify = Y_temp  # mantém a proporção das classes
)

ixHealthyTrain = np.where(Y_train == 1)
ixWithCancerTrain = np.where(Y_train == 2)

print("Train:", X_train.shape, "Val:", X_val.shape, "Test:", X_test.shape)

Train: (92, 9) Val: (12, 9) Test: (12, 9)


### 4. Feature Relevancy Test (Kruskal & ROC-AUC)

In [4]:

X = X_train.to_numpy().astype(float)
fnames = X_train.columns

# Kruskal-Wallis test
Hs = {}
for i in range(np.shape(X)[1]):
    st = stats.kruskal(X[ixHealthyTrain, i].flatten(), X[ixWithCancerTrain, i].flatten())
    Hs[fnames[i]] = st.statistic

Hs = sorted(Hs.items(), key=lambda x: x[1], reverse=True)

print("Ranked features by Kruskal-Wallis:")
print("==========================")
for f in Hs:
    print(f[0], "-->", f[1])

#=========================

# ROC-AUC computation (using only training data)
y_train_arr = Y_train.to_numpy()
roc_auc = np.zeros(len(fnames))

for i, f in enumerate(fnames):
    x_train_arr = X_train[f].to_numpy()
    fpr, tpr, _ = roc_curve(y_train_arr, x_train_arr, pos_label=2)
    roc_auc[i] = auc(fpr, tpr)

# Sort and display
sortIx = np.flip(np.argsort(roc_auc))
print("\nRanked by ROC-AUC:")
print("==========================")
for i in sortIx:
    print(fnames[i], "-->", roc_auc[i])


Ranked features by Kruskal-Wallis:
Glucose --> 16.960636026690125
Resistin --> 8.243640178337273
HOMA --> 7.028972092377501
Insulin --> 4.448765747437663
BMI --> 2.4315246355703573
MCP.1 --> 0.9875036349333824
Adiponectin --> 0.19007728976720273
Leptin --> 0.14515357677294105
Age --> 0.06324105726834488

Ranked by ROC-AUC:
Glucose --> 0.750597800095648
Resistin --> 0.6747967479674797
HOMA --> 0.6614060258249641
Insulin --> 0.6284074605451937
MCP.1 --> 0.5604973696795792
Age --> 0.48469631755141074
Leptin --> 0.47680535628885695
Adiponectin --> 0.4734576757532281
BMI --> 0.40506934481109513


### 5. Feature Selection (Kruskal + ROC-AUC thresholds)

In [5]:

Hs_dict = dict(Hs)
auc_dict = dict(zip(fnames, roc_auc))

mean_H = np.nanmean(list(Hs_dict.values()))
mean_auc = np.nanmean(list(auc_dict.values()))

print(f"\nMean H (Kruskal): {mean_H:.3f}")
print(f"Mean AUC: {mean_auc:.3f}\n")

selected_features = [f for f in fnames if (Hs_dict.get(f, 0) > mean_H) and (auc_dict.get(f, 0) > mean_auc)]

print("Selected features above mean in both criteria:")
for f in selected_features:
    print(f)

X_train = X_train[selected_features]

# Aplicar as mesmas features aos restantes conjuntos
X_val = X_val[selected_features]
X_test = X_test[selected_features]

print(f"\nOriginal features: {len(fnames)}")
print(f"Selected features: {X_train.shape[1]}")





Mean H (Kruskal): 4.500
Mean AUC: 0.568

Selected features above mean in both criteria:
Glucose
HOMA
Resistin

Original features: 9
Selected features: 3


### 6. Redundancy Test (Correlation)

In [None]:

Features = Hs[:]
corr_full = x.corr()
corr_after = X_train.corr()

print("Correlation before feature selection:")
print("==========================")
print(corr_full)

print("\nCorrelation after feature selection:")
print("==========================")
print(corr_after)





Correlation before feature selection:
              Glucose  Resistin      HOMA   Insulin       BMI     MCP.1  \
Glucose      1.000000  0.291327  0.696212  0.504653  0.138845  0.264879   
Resistin     0.291327  1.000000  0.231101  0.146731  0.195350  0.366474   
HOMA         0.696212  0.231101  1.000000  0.932198  0.114480  0.259529   
Insulin      0.504653  0.146731  0.932198  1.000000  0.145295  0.174356   
BMI          0.138845  0.195350  0.114480  0.145295  1.000000  0.224038   
MCP.1        0.264879  0.366474  0.259529  0.174356  0.224038  1.000000   
Adiponectin -0.122121 -0.252363 -0.056337 -0.031296 -0.302735 -0.200694   
Leptin       0.305080  0.256234  0.327210  0.301462  0.569593  0.014009   
Age          0.230106  0.002742  0.127033  0.032495  0.008530  0.013462   

             Adiponectin    Leptin       Age  
Glucose        -0.122121  0.305080  0.230106  
Resistin       -0.252363  0.256234  0.002742  
HOMA           -0.056337  0.327210  0.127033  
Insulin        -0.03129

### 7. Principal Component Analysis (PCA)

In [6]:

X_norm = (X_train - np.mean(X_train, axis=0)) / np.std(X_train, axis=0)
pca = PCA()
pca.fit(X_norm)

# Scree plot (variância explicada)
fig = px.scatter(
    x=np.arange(1, len(pca.explained_variance_) + 1),
    y=pca.explained_variance_,
    labels=dict(x="PC", y="Explained Variance"),
    title="PCA Scree Plot"
)
fig.add_hline(y=1, line_width=3, line_dash="dash", line_color="red")
fig.update_traces(marker_size=10)
fig.show()


# Variância retida (métodos Kaiser e Scree)
var_total = np.sum(pca.explained_variance_**2)
var_kaiser = (pca.explained_variance_[0]**2 / var_total) * 100
var_scree = (np.sum(pca.explained_variance_[:6]**2) / var_total) * 100

print(f"Variance retained (Kaiser): {var_kaiser:.2f}%")
print(f"Variance retained (Scree, first 6 PCs): {var_scree:.2f}%")


labels = Y_train.replace({1: "Healthy", 2: "With Cancer"})

# 1D projection (PC1)
X_pca_1d = pca.transform(X_norm)[:, :1]
fig = px.scatter(
    x=X_pca_1d[:, 0],
    y=np.zeros_like(X_pca_1d[:, 0]),
    color=labels,
    labels=dict(x="PC1", y="", color="Classification"),
    title="PCA Projection - First Principal Component"
)
fig.update_traces(marker_size=8)
fig.show()

# 2D projection (PC1 vs PC2)
X_pca_2d = pca.transform(X_norm)[:, :2]
PC1, PC2 = X_pca_2d[:, 0], X_pca_2d[:, 1]

fig = px.scatter(
    x=PC1, y=PC2,
    color=labels,
    labels=dict(x="PC1", y="PC2", color="Classification"),
    title="PCA Projection - First Two Principal Components"
)
fig.update_traces(marker_size=8)
fig.show()



Variance retained (Kaiser): 83.34%
Variance retained (Scree, first 6 PCs): 100.00%


### 8. Linear Discriminant Analysis (LDA)

In [7]:
lda = LinearDiscriminantAnalysis(n_components=1)
lda.fit(X_norm, Y_train)
X_lda = lda.transform(X_norm)
LD1 = X_lda[:, 0]

y_labels = Y_train.replace({1: "Healthy", 2: "With Cancer"})
fig = px.scatter(x=LD1, y=np.zeros_like(LD1), color=y_labels, title="LDA Projection (1D)")
fig.show()

### 9. Minimum Distance Classifier (MDC)

In [8]:
y_train_np = Y_train.to_numpy()

ix_healthy = np.where(y_train_np == 1)[0]
ix_cancer  = np.where(y_train_np == 2)[0]

PC1 = np.array(PC1)
PC2 = np.array(PC2)

PC1_healthy, PC2_healthy = PC1[ix_healthy], PC2[ix_healthy]
PC1_cancer,  PC2_cancer  = PC1[ix_cancer],  PC2[ix_cancer]

mu_healthy = np.array([[np.mean(PC1_healthy), np.mean(PC2_healthy)]]).T
mu_cancer  = np.array([[np.mean(PC1_cancer),  np.mean(PC2_cancer)]]).T

print(f"gHealthy(x) = {mu_healthy.T}x - 0.5{mu_healthy.T @ mu_healthy}")
print(f"gCancer(x)  = {mu_cancer.T} x - 0.5{mu_cancer.T @ mu_cancer}\n")

X_healthy = np.vstack((PC1_healthy, PC2_healthy)).T
X_cancer  = np.vstack((PC1_cancer,  PC2_cancer)).T
X_all = np.vstack((X_healthy, X_cancer))
y_all = np.concatenate((np.ones(len(ix_healthy)), np.full(len(ix_cancer), 2)))

dx = ((mu_healthy - mu_cancer).T @ (X_all.T - 0.5 * (mu_healthy + mu_cancer))).flatten()
y_pred = np.where(dx < 0, 2, 1)

TP = np.sum((y_all == 1) & (y_pred == 1))
TN = np.sum((y_all == 2) & (y_pred == 2))
FP = np.sum((y_all == 2) & (y_pred == 1))
FN = np.sum((y_all == 1) & (y_pred == 2))

SS = TP / (TP + FN)
SP = TN / (TN + FP)
PR = TP / (TP + FP)
F1 = 2 * (PR * SS) / (PR + SS)
AC = (TP + TN) / len(y_all)

print("=== Euclidean MDC (PCA1 vs PCA2) ===")
print(f"Sensitivity(%) = {SS*100:.2f}")
print(f"Specificity(%) = {SP*100:.2f}")
print(f"Precision(%)   = {PR*100:.2f}")
print(f"F1-Score(%)    = {F1*100:.2f}")
print(f"Accuracy(%)    = {AC*100:.2f}")

fig = go.Figure()

fig.add_trace(go.Scatter(x=PC1_healthy, y=PC2_healthy, name='Healthy', mode='markers'))
fig.add_trace(go.Scatter(x=PC1_cancer,  y=PC2_cancer,  name='Cancer',  mode='markers'))

ix_fp = np.where((y_all == 2) & (y_pred == 1))[0]
ix_fn = np.where((y_all == 1) & (y_pred == 2))[0]
fig.add_trace(go.Scatter(x=X_all[ix_fn,0], y=X_all[ix_fn,1],
                         name='False Healthy', mode='markers',
                         marker=dict(color='red', symbol='circle-open', size=13)))
fig.add_trace(go.Scatter(x=X_all[ix_fp,0], y=X_all[ix_fp,1],
                         name='False Cancer', mode='markers',
                         marker=dict(color='blue', symbol='circle-open', size=13)))

fig.add_trace(go.Scatter(x=[mu_healthy[0,0]], y=[mu_healthy[1,0]], name='Healthy mean',
                         mode='markers', marker=dict(color='green', size=14, symbol='x')))
fig.add_trace(go.Scatter(x=[mu_cancer[0,0]], y=[mu_cancer[1,0]], name='Cancer mean',
                         mode='markers', marker=dict(color='orange', size=14, symbol='x')))

W = (mu_healthy - mu_cancer)
b = -0.5 * (mu_healthy - mu_cancer).T @ (mu_healthy + mu_cancer)
x1 = np.linspace(min(PC1) - 5, max(PC1) + 5, 100)
x2 = -(W[0,0]/W[1,0])*x1 - b/W[1,0]
fig.add_trace(go.Scatter(x=x1, y=x2.flatten(), mode='lines',
                         line=dict(color='gray', dash='dash'), name='Decision boundary'))

fig.update_layout(
    title="Euclidean MDC - PCA1 vs PCA2",
    xaxis_title="PC1", yaxis_title="PC2",
)
fig.show()

# =========================

y_train_np = Y_train.to_numpy()
ix_healthy = np.where(y_train_np == 1)[0]
ix_cancer  = np.where(y_train_np == 2)[0]

LD1 = LD1.flatten()
LD1_healthy = LD1[ix_healthy]
LD1_cancer  = LD1[ix_cancer]

mu_healthy = np.mean(LD1_healthy)
mu_cancer  = np.mean(LD1_cancer)

print(f"gHealthy(x) = {mu_healthy:.3f}x - 0.5({mu_healthy**2:.3f})")
print(f"gCancer(x)  = {mu_cancer:.3f}x - 0.5({mu_cancer**2:.3f})\n")

decision_boundary = 0.5 * (mu_healthy + mu_cancer)
dx = (mu_healthy - mu_cancer) * (LD1 - decision_boundary)
y_pred = np.where(dx < 0, 2, 1)

TP = np.sum((y_train_np == 1) & (y_pred == 1))
TN = np.sum((y_train_np == 2) & (y_pred == 2))
FP = np.sum((y_train_np == 2) & (y_pred == 1))
FN = np.sum((y_train_np == 1) & (y_pred == 2))

SS = TP / (TP + FN)
SP = TN / (TN + FP)
PR = TP / (TP + FP)
F1 = 2 * (PR * SS) / (PR + SS)
AC = (TP + TN) / len(y_train_np)

print("=== Euclidean MDC (LDA1) ===")
print(f"Sensitivity(%) = {SS*100:.2f}")
print(f"Specificity(%) = {SP*100:.2f}")
print(f"Precision(%)   = {PR*100:.2f}")
print(f"F1-Score(%)    = {F1*100:.2f}")
print(f"Accuracy(%)    = {AC*100:.2f}")

fig = go.Figure()
fig.add_trace(go.Scatter(x=LD1_healthy, y=np.zeros_like(LD1_healthy),
                         mode='markers', name='Healthy', marker=dict(color='green', size=9)))
fig.add_trace(go.Scatter(x=LD1_cancer, y=np.zeros_like(LD1_cancer),
                         mode='markers', name='Cancer', marker=dict(color='orange', size=9)))
fig.add_trace(go.Scatter(x=[mu_healthy], y=[0], name='Healthy mean',
                         mode='markers', marker=dict(color='green', symbol='x', size=12)))
fig.add_trace(go.Scatter(x=[mu_cancer], y=[0], name='Cancer mean',
                         mode='markers', marker=dict(color='yellow', symbol='x', size=12)))
fig.add_trace(go.Scatter(x=[decision_boundary], y=[0], name='Decision boundary',
                         mode='markers', marker=dict(color='purple', size=10, symbol='circle')))

fig.add_trace(go.Scatter(
    x=[mu_healthy, mu_cancer], y=[0, 0],
    mode='lines', line=dict(color="black", width=3), name='Inter mean segment'
))

fig.update_layout(
    title="Euclidean MDC - LDA1",
    xaxis_title="LDA1",
    yaxis_visible=False,
    showlegend=True
)
fig.show()

gHealthy(x) = [[-0.5726661  -0.03638699]]x - 0.5[[0.32927047]]
gCancer(x)  = [[0.46037863 0.02925229]] x - 0.5[[0.21280418]]

=== Euclidean MDC (PCA1 vs PCA2) ===
Sensitivity(%) = 90.24
Specificity(%) = 52.94
Precision(%)   = 60.66
F1-Score(%)    = 72.55
Accuracy(%)    = 69.57


gHealthy(x) = 0.478x - 0.5(0.228)
gCancer(x)  = -0.384x - 0.5(0.147)

=== Euclidean MDC (LDA1) ===
Sensitivity(%) = 85.37
Specificity(%) = 56.86
Precision(%)   = 61.40
F1-Score(%)    = 71.43
Accuracy(%)    = 69.57


### 10. Fisher Linear Discriminant

In [9]:
X_np = X_train.to_numpy()
y_np = Y_train.to_numpy()

ixHealthy = np.where(y_np == 1)[0]
ixCancer = np.where(y_np == 2)[0]

muHealthy = np.array([np.mean(X_np[ixHealthy, :], axis=0)]).T
muCancer = np.array([np.mean(X_np[ixCancer, :], axis=0)]).T

S1 = (X_np[ixHealthy, :].T - muHealthy) @ (X_np[ixHealthy, :].T - muHealthy).T
S2 = (X_np[ixCancer, :].T - muCancer) @ (X_np[ixCancer, :].T - muCancer).T

Sw = S1 + S2
SwInv = np.linalg.inv(Sw)

w = SwInv @ (muHealthy - muCancer)
w = w / np.linalg.norm(w)  # normalização

Xp = X_np @ w
Xp = Xp.squeeze()

y_labels = Y_train.replace({1: "Healthy", 2: "With Cancer"})

fig = px.scatter(
    x=Xp,
    y=np.zeros_like(Xp),
    color=y_labels,
    labels=dict(x="Fisher LDA1", y="", color="Classification"),
    title="Fisher Linear Discriminant - Projeção 1D das Classes"
)

b = 0.5 * (w.T @ (muHealthy + muCancer))

fig.add_vline(x=b.squeeze(), line=dict(color='gray', dash='dash', width=3), name='Decision boundary')

fig.update_layout(
    title="Fisher LDA - Projeção 1D das Classes",
    xaxis_title="Fisher Linear Discriminant (wᵗx)",
    yaxis=dict(showticklabels=False, title=""),

)

fig.update_layout(height=400)
fig.show()



### 11. Not Naive Bayes Classifier (não é naive porque usa covariância completa??)

In [None]:

    #Fit bayes for the training data
def fitBayes(Xtr,ytr):
    ix1=np.where(ytr==1)[0]
    ix2=np.where(ytr==2)[0]
    
    #Compute priors
    Pw1=ix1.shape[0]/(ix1.shape[0]+ix2.shape[0])
    Pw2=ix2.shape[0]/(ix1.shape[0]+ix2.shape[0])
    
    # Estimate gaussian conditional PDFs --> fit two Gaussian Mixture Models
    clf1 = mixture.GaussianMixture(n_components=1)
    clf2 = mixture.GaussianMixture(n_components=1)
    mod1=clf1.fit(Xtr[ix1,:])
    mod2=clf2.fit(Xtr[ix2,:])
    
    return {'mean1':mod1.means_.squeeze(),'mean2':mod2.means_.squeeze(),
            'cov1':mod1.covariances_[0],'cov2':mod2.covariances_[0],
            'Pw1':Pw1,'Pw2':Pw2}


#Function to evaluate a gaussian PDF acourding to 
def pdfGauss(X,mean,cov):
    covInv=np.linalg.inv(cov)
    dim=cov.shape[0]
    val=np.array([])
    for i in range(X.shape[0]):
        dist=((np.array([X[i,:]-mean]))@covInv@(np.array([X[i,:]-mean])).T).squeeze()
        multivariate_pdf = np.exp(-0.5*dist)/((2*np.pi)**(dim/2)*np.linalg.det(cov)**0.5)
        val=np.append(val, multivariate_pdf)
    return np.array([val]).T



#Apply a trained bayes classifier
def useBayes(Xte,model):
    Pw1X = pdfGauss(Xte,model['mean1'],model['cov1'])*model['Pw1']
    Pw2X = pdfGauss(Xte,model['mean2'],model['cov2'])*model['Pw2']
    
    return ((-np.sign(Pw1X-Pw2X))*0.5+1.5).squeeze() #1-W1, 2-W2
    # Adicionado o sinal negativo para ter em conta a approach original do toolbox em matlab


def plotDecision(X,y,model,granularity=0.01,classContours=False,f1name="f1",f2name="f2"):
    #Identify class indexes
    ix1=np.where(y==1)[0]
    ix2=np.where(y==2)[0]
    
    #Create vectors for plotting
    xRange=np.arange(X[:,0].min(),X[:,0].max()+0.5,granularity)
    yRange=np.arange(X[:,1].min(),X[:,1].max(),granularity)
    
    Xp, Yp = np.meshgrid(xRange, yRange)
    xy = np.column_stack([Xp.flat, Yp.flat])
    XX = np.array([Xp.ravel(), Yp.ravel()]).T
    #Sample class one pdf, just to get contours
    Z1 = pdfGauss(XX,model['mean1'],model['cov1'])
    Z1 = Z1.reshape(Xp.shape)
    #Sample class two pdf, just to get contours
    Z2=pdfGauss(XX,model['mean2'],model['cov2'])
    Z2 = Z2.reshape(Xp.shape)
    
    #Plot samples
    fig=go.Figure()
    fig.add_trace(go.Scatter(x=X[ix1,0],y=X[ix1,1],name='1'))
    fig.add_trace(go.Scatter(x=X[ix2,0],y=X[ix2,1],name='2'))
    fig.update_traces(marker=dict(size=8))
    #Plot means
    fig.add_trace(go.Scatter(x=np.array(model['mean1'][0]),y=np.array(model['mean1'][1]), marker_size=20,
                         marker_symbol='x',name='1 mean',
                         marker=dict(color="blue",line=dict(width=1,
                                        color='black'))))
    fig.add_trace(go.Scatter(x=np.array(model['mean2'][0]),y=np.array(model['mean2'][1]), marker_size=20,
                         marker_symbol='x',name='2 mean',
                         marker=dict(color="red",line=dict(width=1,
                                        color='black'))))
    fig.update_traces(mode='markers')
    
    
    #Compute Bayes decision for the grid created by xRange and yRange
    yteP=np.zeros((yRange.shape[0],xRange.shape[0]))
    for i in range(xRange.shape[0]):
        for j in range(yRange.shape[0]):
            yteP[j,i]=useBayes(np.array([[xRange[i],yRange[j]]]),model)
    
    if classContours:
        #Add class 1 contours
        fig.add_trace(go.Contour(
            z=Z1,x=xRange,y=yRange,
            contours_coloring='lines',
            line_width=1,showscale=False,))
    
        #Add class 2 contours
        fig.add_trace(go.Contour(
            z=Z2,x=xRange,y=yRange,
            contours_coloring='lines',
            line_width=1,showscale=False,))
        
    
    #Add decision contour
    fig.add_trace(go.Contour(
        z=yteP,x=xRange,y=yRange,contours_coloring='lines',name='Decision Boundary',colorscale='Greys',
        line_width=4,showscale=False))
    fig.update_traces(ncontours=100, selector=dict(type='contour'))
    fig.update_xaxes(title_text=f1name)
    fig.update_yaxes(title_text=f2name)
    fig.update_layout(autosize=False,width=900,height=800)
    fig.show()


In [None]:
# treinar Bayes
model_bayes = fitBayes(X_train.to_numpy(), Y_train.to_numpy())

print("Bayes classifier treinado com sucesso!")
print("Mean class 1:", model_bayes['mean1'])
print("Mean class 2:", model_bayes['mean2'])

Bayes classifier treinado com sucesso!
Mean class 1: [88.82926829  1.64448088 11.99783707]
Mean class 2: [106.92156863   3.66645962  18.17847765]



KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.


KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.



In [12]:
y_pred_test = useBayes(X_test.to_numpy(), model_bayes)
Yt = Y_test.to_numpy()

TP = np.sum((Yt == 1) & (y_pred_test == 1))
TN = np.sum((Yt == 2) & (y_pred_test == 2))
FP = np.sum((Yt == 2) & (y_pred_test == 1))
FN = np.sum((Yt == 1) & (y_pred_test == 2))

SS = TP / (TP + FN)
SP = TN / (TN + FP)
PR = TP / (TP + FP)
F1 = 2 * (PR * SS) / (PR + SS)
ACC = (TP + TN) / len(Yt)

print("\n=========== BAYES CLASSIFIER (Test Set) ===========")
print(f"Sensitivity(%) = {SS*100:.2f}")
print(f"Specificity(%) = {SP*100:.2f}")
print(f"Precision(%)   = {PR*100:.2f}")
print(f"F1-Score(%)    = {F1*100:.2f}")
print(f"Accuracy(%)    = {ACC*100:.2f}")



Sensitivity(%) = 100.00
Specificity(%) = 28.57
Precision(%)   = 50.00
F1-Score(%)    = 66.67
Accuracy(%)    = 58.33


In [None]:
X2 = X_train[selected_features].to_numpy()[:, :2]   # primeiras 2 features selecionadas
model_fs = fitBayes(X2, Y_train.to_numpy())

plotDecision(X2, Y_train.to_numpy(), model_fs,
             granularity=0.1,
             f1name=selected_features[0],
             f2name=selected_features[1])



KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.


KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.



### 12. KNN

In [16]:

Xtr = X_train.to_numpy()
ytr = Y_train.to_numpy()

K = 5   # Podes mudar depois

knn = KNeighborsClassifier(n_neighbors=K, metric='euclidean')
knn.fit(Xtr, ytr)

Xt = X_test.to_numpy()
yt = Y_test.to_numpy()

y_pred_knn_test = knn.predict(Xt)

TP = np.sum((yt == 1) & (y_pred_knn_test == 1))
TN = np.sum((yt == 2) & (y_pred_knn_test == 2))
FP = np.sum((yt == 2) & (y_pred_knn_test == 1))
FN = np.sum((yt == 1) & (y_pred_knn_test == 2))

SS = TP / (TP + FN)
SP = TN / (TN + FP)
PR = TP / (TP + FP)
F1 = 2 * (PR * SS) / (PR + SS)
ACC = (TP + TN) / len(yt)

print("\n=========== KNN (Test Set) ===========")
print(f"Sensitivity(%) = {SS*100:.2f}")
print(f"Specificity(%) = {SP*100:.2f}")
print(f"Precision(%)   = {PR*100:.2f}")
print(f"F1-Score(%)    = {F1*100:.2f}")
print(f"Accuracy(%)    = {ACC*100:.2f}")



Sensitivity(%) = 80.00
Specificity(%) = 57.14
Precision(%)   = 57.14
F1-Score(%)    = 66.67
Accuracy(%)    = 66.67


#### TALVEZ TIRAR OU TENTAR FAZER POR MIM DEPOIS, ATÉ É UMA BOA IDEIA DO CHAT E MAYBE FAZER O PLOT AINDA

In [None]:


K_values = range(1, 16)
accs = []

for K in K_values:
    knn = KNeighborsClassifier(n_neighbors=K)
    knn.fit(Xtr, ytr)
    pred = knn.predict(Xt)
    accs.append(np.mean(pred == yt))

best_K = K_values[np.argmax(accs)]
print(f"\nMelhor K = {best_K} (Validation Accuracy = {max(accs):.3f})")



Melhor K = 1 (Validation Accuracy = 0.750)


### 13. SVM (Linear+Non-Linear)

In [None]:
def clipAlphasJ(aj,H,L):
      if aj > H:
            aj = H
      if L > aj:
            aj = L
      return aj

def selectJrandom(i,m):
      j=i
      while (j==i):
            j = int(np.random.uniform(0,m))
      return j

def simplifiedSMO(dataX, classY):
       C=1e10
       tol=1e-3
       max_passes=100
       X = np.mat(dataX)
       Y = np.mat(classY).T
       m,n = np.shape(X)
       # Initialize b: threshold for solution
       b = 0;      
       # Initialize alphas: lagrange multipliers for solution
       alphas = np.mat(np.zeros((m,1)))
       passes = 0
       while (passes < max_passes):
              num_changed_alphas = 0
              for i in range(m):
                     # Calculate Ei = f(xi) - yi
                     fXi = float(np.multiply(alphas,Y).T*(X*X[i,:].T)) + b
                     Ei = fXi - float(Y[i])
                     if ((Y[i]*Ei < -tol) and (alphas[i] < C)) or ((Y[i]*Ei > tol) 
                                  and (alphas[i] > 0)):
                           # select j # i randomly
                           j = selectJrandom(i,m)
                           # Calculate Ej = f(xj) - yj
                           fXj = float(np.multiply(alphas,Y).T*(X*X[j,:].T)) + b
                           Ej = fXj - float(Y[j])
                           # save old alphas's
                           alphaIold = alphas[i].copy();
                           alphaJold = alphas[j].copy();
                           # compute L and H
                           if (Y[i] != Y[j]):
                                  L = max(0, alphas[j] - alphas[i])
                                  H = min(C, C + alphas[j] - alphas[i])
                           else:
                                  L = max(0, alphas[j] + alphas[i] - C)
                                  H = min(C, alphas[j] + alphas[i])
                           # if L = H the continue to next i
                           if L==H:
                                  continue
                           # compute eta
                           eta = 2.0 * X[i,:]*X[j,:].T - X[i,:]*X[i,:].T - X[j,:]*X[j,:].T
                           # if eta >= 0 then continue to next i
                           if eta >= 0:
                                  continue
                           # compute new value for alphas j
                           alphas[j] -= Y[j]*(Ei - Ej)/eta
                           # clip new value for alphas j
                           alphas[j] = clipAlphasJ(alphas[j],H,L)
                           # if |alphasj - alphasold| < 0.00001 then continue to next i
                           if (abs(alphas[j] - alphaJold) < 0.00001):
                                  continue
                           # determine value for alphas i
                           alphas[i] += Y[j]*Y[i]*(alphaJold - alphas[j])
                           # compute b1 and b2
                           b1 = b - Ei- Y[i]*(alphas[i]-alphaIold)*X[i,:]*X[i,:].T -Y[j]*(alphas[j]-alphaJold)*X[i,:]*X[j,:].T
                           b2 = b - Ej- Y[i]*(alphas[i]-alphaIold)*X[i,:]*X[j,:].T - Y[j]*(alphas[j]-alphaJold)*X[j,:]*X[j,:].T
                           # compute b
                           if (0 < alphas[i]) and (C > alphas[i]):
                                  b = b1
                           elif (0 < alphas[j]) and (C > alphas[j]):
                                  b = b2
                           else:
                                  b = (b1 + b2)/2.0                      
                           num_changed_alphas += 1
                     if (num_changed_alphas == 0): passes += 1
                     else: passes = 0
       return alphas