In [None]:
import tensorflow as tf
import numpy as np
import random
import keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, Add, UpSampling1D, LSTM, Dense, Dropout, Flatten, Bidirectional, BatchNormalization, LayerNormalization, Activation, Attention, MultiHeadAttention 
from tensorflow.keras.optimizers import Adam, Adamax
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
from tensorflow.keras.initializers import glorot_uniform
from tensorflow.keras.metrics import AUC
from sklearn.metrics import r2_score, roc_curve, auc
import time
from keras import layers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, MinMaxScaler
import matplotlib.pyplot as plt
from matplotlib import rc
rc("font",family = "Malgun Gothic")
%matplotlib inline
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.losses import BinaryFocalCrossentropy
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score
import seaborn as sns
from sklearn.utils.multiclass import unique_labels
import itertools

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix_rowwise(y_true, y_pred, class_names=None, figsize=(6,6), cmap="Blues", title=None):
    """
    Plot a row-normalised confusion matrix whose cells read "count (percent%)".

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted class labels.
    class_names : sequence of str, optional
        Tick labels. When given, class index ``i`` is labelled
        ``class_names[i]`` and the matrix covers indices
        ``0 .. len(class_names) - 1``. When omitted, the classes are the
        sorted distinct values found in ``y_true`` and ``y_pred``.
    figsize : tuple
        Figure size passed to ``plt.figure``.
    cmap : str or colormap
        Colormap passed to ``sns.heatmap``.
    title : str, optional
        Figure title; a generic title is used when omitted.

    Returns
    -------
    cm : ndarray
        Raw count matrix.
    cm_norm : ndarray
        Row-wise percentages (0-100). A row with no true samples is all zeros.
    accuracy : float
        ``trace(cm) / sum(cm)``; NaN when the matrix is empty.
    """
    # Resolve the label set; previously class_names=None crashed on len(None).
    if class_names is None:
        labels = np.unique(np.concatenate([np.ravel(y_true), np.ravel(y_pred)]))
        class_names = [str(value) for value in labels]
    else:
        labels = np.arange(len(class_names))

    # Raw counts
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    # Row-wise percentages. Rows whose total is 0 (class absent from y_true)
    # stay at 0 instead of producing NaN through a division by zero.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_norm = np.divide(cm.astype(float), row_totals,
                        out=np.zeros(cm.shape, dtype=float),
                        where=row_totals != 0) * 100

    # Cell annotations "count (percent%)"; built from Python strings so the
    # text is never truncated by a fixed-width NumPy string dtype.
    annot = np.array([[f"{cm[i, j]} ({cm_norm[i, j]:.1f}%)" for j in range(cm.shape[1])]
                      for i in range(cm.shape[0])])

    # Heatmap
    plt.figure(figsize=figsize, dpi=120)
    sns.heatmap(cm_norm, annot=annot, fmt="", cmap=cmap, xticklabels=class_names,
                yticklabels=class_names, linewidths=0.8, linecolor='gray', square=True)

    # Axes and title
    plt.ylabel("True label", fontsize=12, fontweight='bold')
    plt.xlabel("Predicted label", fontsize=12, fontweight='bold')
    if title is None:
        title = "Confusion Matrix (Counts + Row-wise %)"
    plt.title(title, fontsize=14, fontweight='bold', pad=15)

    plt.xticks(rotation=45, ha='right', fontsize=11)
    plt.yticks(rotation=0, fontsize=11)
    plt.grid(False)

    # Accuracy from the raw counts
    total = np.sum(cm)
    accuracy = np.trace(cm) / total if total else np.nan
    print(f"Accuracy from CM: {accuracy:.4f}")

    plt.tight_layout()
    plt.show()

    return cm, cm_norm, accuracy


### Dataset

In [None]:
# Load the training samples from a .npy file.
# NOTE(review): the path looks like a placeholder absolute path — point it at the real file.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can run
# arbitrary code from an untrusted file — confirm the file source, or drop the flag
# if the array is purely numeric.
train_dataset = np.load(r'C:/your_training_dataset.npy',allow_pickle=True)

# Show the array contents and its shape as a quick sanity check.
print("train_dataset:", train_dataset)
print(train_dataset.shape)

In [None]:
# Load the held-out test samples from a .npy file.
# NOTE(review): the path looks like a placeholder absolute path — point it at the real file.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can run
# arbitrary code from an untrusted file — confirm the file source, or drop the flag
# if the array is purely numeric.
test_dataset = np.load(r'C:/your_testing_dataset.npy',allow_pickle=True)

# Show the array contents and its shape as a quick sanity check.
print("test_dataset:", test_dataset)
print(test_dataset.shape)

-------------

### Label

In [None]:
# Load the training labels from a .npy file.
# NOTE(review): the path looks like a placeholder absolute path — point it at the real file.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can run
# arbitrary code from an untrusted file — confirm the file source, or drop the flag
# if the array is purely numeric.
train_label = np.load(r'C:/your_training_Labels.npy',allow_pickle=True)

# Show the labels and their shape as a quick sanity check.
print("train_label:", train_label)
print(train_label.shape)

In [None]:
# Load the held-out test labels from a .npy file.
# NOTE(review): the path looks like a placeholder absolute path — point it at the real file.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can run
# arbitrary code from an untrusted file — confirm the file source, or drop the flag
# if the array is purely numeric.
test_label = np.load(r'C:/your_testing_Labels.npy',allow_pickle=True)

# Show the labels and their shape as a quick sanity check.
print("test_label:", test_label)
print(test_label.shape)

-----------

In [None]:
# 2. Data Checks (A) -------------------------------------------
def check_data_distribution(y, set_name):
    """Print the number of samples per class in ``y`` and each class's share.

    Parameters
    ----------
    y : array-like
        Class labels; distinct values are found with ``np.unique``.
    set_name : str
        Name of the data split, used in the printed header.

    The share is computed against ``len(y)``. Nothing is returned.
    """
    classes, frequencies = np.unique(y, return_counts=True)
    print(f"\n{set_name} Class Distribution:")
    for position in range(len(classes)):
        cls = classes[position]
        count = frequencies[position]
        share = count/len(y)*100
        print(f"Class {cls}: {count} samples ({share:.2f}%)")

# Class balance of the raw (unsplit) training and test labels
check_data_distribution(train_label, "Raw Training")
check_data_distribution(test_label, "Raw Testing")

# Leakage check: flatten every sample to one row, turn each row into a hashable
# tuple, and count the rows that occur in both the training and the test set.
train_rows = {tuple(row) for row in train_dataset.reshape(train_dataset.shape[0], -1)}
test_rows = {tuple(row) for row in test_dataset.reshape(test_dataset.shape[0], -1)}
common_samples = train_rows.intersection(test_rows)
print(f"\nCommon samples between train and test: {len(common_samples)}")

In [None]:
# Reproducibility: seed Python, NumPy and TensorFlow with the same value.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Run TensorFlow ops on a single thread, both within an op and across ops
# (presumably to reduce run-to-run nondeterminism — TODO confirm).
tf.config.threading.set_intra_op_parallelism_threads(1)
tf.config.threading.set_inter_op_parallelism_threads(1)

# Stratified, shuffled 5-fold cross-validation splitter
n_splits = 5
kfold = StratifiedKFold(shuffle=True, n_splits=n_splits, random_state=SEED)

# Training callbacks, both driven by the validation loss:
# stop after 10 stagnant epochs (restoring the best weights) and
# multiply the learning rate by 0.2 after 5 stagnant epochs, down to 1e-6.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=1e-6)
callbacks = [early_stopping, lr_scheduler]

# Test labels as a flat vector (no one-hot encoding; the model has one sigmoid output)
y_test = test_label.reshape(-1)

In [None]:
class_names = ['Low CVD Risk', 'High CVD Risk']
# Lists to store metrics
val_accuracies = []
val_losses = []
test_accuracies = []
test_losses = []
histories = []

# ======================== Training Loop ==========================
for fold, (train_idx, val_idx) in enumerate(kfold.split(train_dataset, train_label), start=1):
    print(f"Fold {fold}/{n_splits}")

    X_train_fold, X_val_fold = train_dataset[train_idx], train_dataset[val_idx]
    y_train_fold, y_val_fold = train_label[train_idx].reshape(-1), train_label[val_idx].reshape(-1)

    check_data_distribution(y_train_fold, f"Fold {fold} Training")
    check_data_distribution(y_val_fold, f"Fold {fold} Validation")

    # ======================== Build Model ==========================
    input_layer = Input(shape=(128, 3))
    conv1 = Conv1D(64, kernel_size=3, kernel_regularizer=l2(0.001),
                   kernel_initializer=tf.keras.initializers.glorot_uniform(seed=SEED))(input_layer)
    ln1 = LayerNormalization()(conv1)
    act1 = Activation('elu')(ln1)
    pool1 = MaxPooling1D(pool_size=2)(act1)

    conv2 = Conv1D(32, kernel_size=1, activation='elu')(pool1)
    ln2 = LayerNormalization()(conv2)
    act2 = Activation('elu')(ln2)
    pool2 = MaxPooling1D(pool_size=2)(act2)
    drop0 = Dropout(0.3)(pool2)

    lstm1 = LSTM(32, activation='tanh', return_sequences=True)(drop0)
    drop1 = Dropout(0.2)(lstm1)
    lstm2 = LSTM(16, activation='tanh')(drop1)
    drop2 = Dropout(0.2)(lstm2)
    
    flatten = Flatten()(drop2)
    dense = Dense(16, activation='elu')(flatten)
    output_layer = Dense(1, activation='sigmoid')(dense)

    model = Model(inputs=input_layer, outputs=output_layer)
    model.summary()

    # ======================== Compile with Focal Loss ==========================
    focal_loss = BinaryFocalCrossentropy(alpha=0.80, gamma=2.0)
    model.compile(loss=focal_loss, optimizer=Adamax(learning_rate=0.0005), metrics=['accuracy'])

    # ======================== Train ==========================
    start_time = time.time()
    hist = model.fit(
        X_train_fold, y_train_fold,
        batch_size=128,
        epochs=100,
        validation_data=(X_val_fold, y_val_fold),
        shuffle=True,
        callbacks=callbacks,
        verbose=1
    )
    histories.append(hist.history)
    end_time = time.time()

    # ======================== Evaluate ==========================
    val_loss, val_acc = model.evaluate(X_val_fold, y_val_fold, verbose=0)
    test_loss, test_acc = model.evaluate(test_dataset, y_test, verbose=0)
    val_losses.append(val_loss); val_accuracies.append(val_acc)
    test_losses.append(test_loss); test_accuracies.append(test_acc)

    # ======================== Plots ==========================
    plt.figure(figsize=(12, 4))
    plt.subplot(1, 2, 1)
    plt.plot(hist.history['loss'], c='b')
    plt.plot(hist.history['val_loss'], '-.', c='r')
    plt.title(f'Fold {fold} Loss'); plt.ylabel('Loss'); plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper right')

    plt.subplot(1, 2, 2)
    plt.plot(hist.history['accuracy'], c='b')
    plt.plot(hist.history['val_accuracy'], '-.', c='r')
    plt.title(f'Fold {fold} Accuracy'); plt.ylabel('Accuracy'); plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='lower right')
    plt.show()

    print(f"\nFold {fold} Results:")
    print(f"Validation Loss: {val_loss:.4f} | Accuracy: {val_acc:.4f}")
    print(f"Test Loss: {test_loss:.4f} | Accuracy: {test_acc:.4f}")
    print(f"Training Time: {end_time - start_time:.2f} seconds")

    # ======================== Confusion Matrix + AUC ==========================
    # Validation
    y_val_prob = model.predict(X_val_fold).ravel()
    y_val_pred = (y_val_prob > 0.5).astype(int)
    plot_confusion_matrix_rowwise(y_val_fold, y_val_pred, class_names=class_names, 
                                  title=f"Fold {fold} Validation CM")
    print(f"Validation AUC: {roc_auc_score(y_val_fold, y_val_prob):.4f}")
    
    # Test
    y_test_prob = model.predict(test_dataset).ravel()
    y_test_pred = (y_test_prob > 0.5).astype(int)
    plot_confusion_matrix_rowwise(y_test, y_test_pred, class_names=class_names, 
                                  title=f"Fold {fold} Test CM")
    print(f"Test AUC: {roc_auc_score(y_test, y_test_prob):.4f}")


# ======================== Final CV Metrics ==========================
print("\n\nFinal Cross-Validation Results:")
print(f"Validation Loss: {np.mean(val_losses):.4f} ± {np.std(val_losses):.4f}")
print(f"Validation Accuracy: {np.mean(val_accuracies):.4f} ± {np.std(val_accuracies):.4f}")
print(f"Test Loss: {np.mean(test_losses):.4f} ± {np.std(test_losses):.4f}")
print(f"Test Accuracy: {np.mean(test_accuracies):.4f} ± {np.std(test_accuracies):.4f}")

In [None]:
X_test = test_dataset
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import numpy as np

# ----------------- K-fold ROC storage -----------------
y_test_probs_all = []  # lưu xác suất class 1 (High Risk)
y_test_labels = y_test if y_test.ndim == 1 else y_test.reshape(-1)

for fold, (train_idx, val_idx) in enumerate(kfold.split(train_dataset, train_label), start=1):
    
    # ... training như code bạn đã có ...
    
    # Dự đoán xác suất trên test set
    y_test_pred_prob = model.predict(X_test)  # shape (n_samples, 1)
    y_test_probs_all.append(y_test_pred_prob.flatten())  # sigmoid -> chỉ 1 output

# ----------------- Tính mean ROC và CI -----------------
all_fpr = np.linspace(0, 1, 100)
tprs = []

for y_pred_high in y_test_probs_all:
    fpr, tpr, _ = roc_curve(y_test_labels, y_pred_high)
    tpr_interp = np.interp(all_fpr, fpr, tpr)
    tpr_interp[0] = 0.0
    tprs.append(tpr_interp)

tprs = np.array(tprs)
mean_tpr = tprs.mean(axis=0)
std_tpr = tprs.std(axis=0)
roc_auc_mean = auc(all_fpr, mean_tpr)
tpr_upper = np.minimum(mean_tpr + std_tpr, 1)
tpr_lower = np.maximum(mean_tpr - std_tpr, 0)

# ----------------- Vẽ ROC với uncertainty -----------------
plt.figure(figsize=(8,6), dpi=120)
plt.fill_between(all_fpr, tpr_lower, tpr_upper, color='darkorange', alpha=0.2, label='±1 std. dev.')
plt.plot(all_fpr, mean_tpr, color='darkorange', lw=3, label=f'Mean ROC (AUC={roc_auc_mean:.3f})')
plt.plot([0,1], [0,1], color='gray', lw=1, linestyle='--', label='Random Classifier')

plt.title('Test ROC Curve with Uncertainty (k-fold)', fontsize=14, fontweight='bold')
plt.xlabel('False Positive Rate', fontsize=12)
plt.ylabel('True Positive Rate', fontsize=12)
plt.grid(True, linestyle='--', alpha=0.5)
plt.legend(loc='lower right', frameon=True, facecolor='white', edgecolor='gray')
plt.tight_layout()
plt.show()


In [None]:
X_test = test_dataset
# Labels stay a flat 0/1 vector (no one-hot encoding; the model has one sigmoid output)
y_train_all = train_label.reshape(-1)

# ------------------- Build final model -------------------
# Same layer stack as the per-fold models, written as a single chain.
input_layer = Input(shape=(128, 3))

# Encoder block 1: conv -> layer norm -> ELU -> pooling
net = Conv1D(
    64,
    kernel_size=3,
    kernel_regularizer=l2(0.001),
    kernel_initializer=tf.keras.initializers.glorot_uniform(seed=SEED),
)(input_layer)
net = LayerNormalization()(net)
net = Activation('elu')(net)
net = MaxPooling1D(pool_size=2)(net)

# Encoder block 2
# NOTE(review): ELU is applied inside this Conv1D and again after the
# LayerNormalization — confirm the double activation is intentional.
net = Conv1D(32, kernel_size=1, activation='elu')(net)
net = LayerNormalization()(net)
net = Activation('elu')(net)
net = MaxPooling1D(pool_size=2)(net)
net = Dropout(0.3)(net)

# Recurrent block: the second LSTM returns only its last state
net = LSTM(32, activation='tanh', return_sequences=True)(net)
net = Dropout(0.2)(net)
net = LSTM(16, activation='tanh')(net)
net = Dropout(0.2)(net)

# Head: the named Flatten layer is what the t-SNE cell looks up by name
net = Flatten(name="flatten_final")(net)
net = Dense(16, activation='elu')(net)
output_layer = Dense(1, activation='sigmoid')(net)   # sigmoid for binary output

model_final = Model(inputs=input_layer, outputs=output_layer)
model_final.summary()

In [None]:

# ------------------- Compile with focal loss -------------------
focal_loss = BinaryFocalCrossentropy(gamma=2.0, alpha=0.8)  # gamma can be tuned
model_final.compile(
    optimizer=Adamax(learning_rate=0.0005),
    loss=focal_loss,
    metrics=['accuracy'],
)

# ------------------- Train final model -------------------
# Fit on the full training set for a fixed 100 epochs; no validation data
# or callbacks are passed here.
start_time = time.time()
hist_final = model_final.fit(
    train_dataset,
    y_train_all,
    epochs=100,
    batch_size=128,
    shuffle=True,
    verbose=1,
)
end_time = time.time()
print(f"\nFinal Model Training Time: {end_time - start_time:.2f} seconds")


In [None]:

# ------------------- Plot training curves -------------------
# One panel per tracked metric: (history key, line colour, legend label, title, y-axis label)
panels = (
    ('loss', 'b', 'Training Loss', 'Final Model Training Loss', 'Loss'),
    ('accuracy', 'g', 'Training Accuracy', 'Final Model Training Accuracy', 'Accuracy'),
)

plt.figure(figsize=(12, 4))
for position, (key, color, legend_label, panel_title, y_label) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(hist_final.history[key], c=color, label=legend_label)
    plt.title(panel_title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.grid(linestyle='--')
    plt.legend()

plt.tight_layout()
plt.show()

# ------------------- Evaluate on test -------------------
# evaluate() returns [loss, accuracy] for the metrics the model was compiled with
score = model_final.evaluate(X_test, y_test, verbose=0)
final_test_loss, final_test_accuracy = score[0], score[1]
print("Test loss:", final_test_loss)
print("Test accuracy:", final_test_accuracy)


In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import numpy as np

# ------------------- ROC Curve -------------------
from sklearn.metrics import roc_curve, auc

# Predicted probabilities for the test set, flattened to shape (n_samples,)
y_pred_prob = model_final.predict(X_test).ravel()
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8,6), dpi=120)
# Shaded area under the curve, then the curve itself and the chance diagonal
plt.fill_between(fpr, tpr, alpha=0.2, color='darkorange')
plt.plot(fpr, tpr, color='darkorange', lw=3, label=f'ROC Curve (AUC={roc_auc:.3f})')
plt.plot([0,1], [0,1], color='gray', lw=1, linestyle='--', label='Random Classifier')
plt.title('Receiver Operating Characteristic (ROC)', fontsize=14, fontweight='bold')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.grid(True, linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Class names, listed in label-index order (index 0 first, index 1 second)
class_names = ['Low CVD Risk', 'High CVD Risk']

# ==================== Hàm vẽ CM row-wise ====================
def plot_confusion_matrix_rowwise(y_true, y_pred, class_names=None, figsize=(6,6), cmap=plt.cm.Blues, title=None):
    """
    Plot a confusion matrix whose cells read "count (row-wise fraction)".

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted class labels.
    class_names : sequence of str, optional
        Tick labels. When given, class index ``i`` is labelled
        ``class_names[i]`` and the matrix covers indices
        ``0 .. len(class_names) - 1``. When omitted, the classes are the
        sorted distinct values found in ``y_true`` and ``y_pred``.
    figsize : tuple
        Figure size passed to ``plt.figure``.
    cmap : str or colormap
        Colormap passed to ``sns.heatmap``.
    title : str, optional
        Figure title. Accepted so that callers passing ``title=`` keep
        working after this definition replaces an earlier one of the same
        name; the original fixed title is used when omitted.

    Returns
    -------
    cm : ndarray
        Raw count matrix.
    cm_norm : ndarray
        Row-wise fractions (0-1). A row with no true samples is all zeros.
    accuracy : float
        ``trace(cm) / sum(cm)``; NaN when the matrix is empty.
    """
    # Resolve the label set; previously class_names=None crashed on len(None).
    if class_names is None:
        labels = np.unique(np.concatenate([np.ravel(y_true), np.ravel(y_pred)]))
        class_names = [str(value) for value in labels]
    else:
        labels = np.arange(len(class_names))

    # Raw counts
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    # Row-wise normalisation. Rows whose total is 0 (class absent from y_true)
    # stay at 0 instead of producing NaN through a division by zero.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_norm = np.divide(cm.astype(float), row_totals,
                        out=np.zeros(cm.shape, dtype=float),
                        where=row_totals != 0)

    # Cell annotations "count (fraction)"; built from Python strings so the
    # text is never truncated by a fixed-width NumPy string dtype.
    annot = np.array([[f"{cm[i, j]} ({cm_norm[i, j]:.2f})" for j in range(cm.shape[1])]
                      for i in range(cm.shape[0])])

    # Heatmap
    plt.figure(figsize=figsize)
    sns.heatmap(cm_norm, annot=annot, fmt="", cmap=cmap,
                xticklabels=class_names, yticklabels=class_names)
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
    if title is None:
        title = "Confusion Matrix (Counts + Row-wise%)"
    plt.title(title)
    plt.show()

    # Accuracy from the raw counts
    total = np.sum(cm)
    accuracy = np.trace(cm) / total if total else np.nan
    print(f"Accuracy from CM: {accuracy:.4f}")

    return cm, cm_norm, accuracy

# ==================== Predicted labels ====================
# Flatten the ground truth and the sigmoid outputs to 1-D, then threshold
# the probabilities at 0.5 to obtain hard 0/1 predictions.
y_true = y_test.reshape(-1)
y_pred_prob = model_final.predict(X_test).flatten()
is_positive = y_pred_prob > 0.5
y_pred = is_positive.astype(int)

# ==================== Plot the confusion matrix ====================
cm, cm_norm, acc = plot_confusion_matrix_rowwise(
    y_true,
    y_pred,
    class_names,
)


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from sklearn.manifold import TSNE
from tensorflow.keras.models import Model

# Default plotting font: Arial
# (matplotlib is expected to fall back to its own default font when Arial is
# not installed — TODO confirm on the target machine)
matplotlib.rcParams['font.family'] = 'Arial'

# --- Extract features from the model ---
# Collect the names of all layers whose name contains "flatten" and take the last one
flatten_layers = [
    layer.name
    for layer in model_final.layers
    if "flatten" in layer.name
]
last_flatten = flatten_layers[-1]

# Sub-model that maps the model input to the output of that Flatten layer
flatten_output = model_final.get_layer(last_flatten).output
intermediate_layer_model = Model(inputs=model_final.input, outputs=flatten_output)

# Features before learning: each raw training sample flattened to one row
n_train_samples = train_dataset.shape[0]
features_before = train_dataset.reshape(n_train_samples, -1)
# Features after learning: activations of the Flatten layer
features_after = intermediate_layer_model.predict(train_dataset, verbose=0)

# Labels as an integer NumPy array
y_train_labels = np.array(train_label).astype(int)


# --- Hàm vẽ t-SNE ---
def plot_tsne(features, labels, title="t-SNE Visualization"):
    """Project ``features`` to 2-D with t-SNE and scatter-plot them by class.

    Parameters
    ----------
    features : array-like of shape (n_samples, n_features)
        One feature row per sample, passed to ``TSNE.fit_transform``.
    labels : array-like
        One class label per sample. Any shape is accepted and flattened, so
        an (n_samples, 1) column works as well as a 1-D vector.
    title : str
        Figure title.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of embedded samples.

    Label 0 ("Low CVD Risk") is drawn in blue and label 1 ("High CVD Risk")
    in red. Any other label gets a generic "Class <label>" legend entry and
    matplotlib's default colour.
    """
    # Flatten so that set-like iteration and boolean masking work on column vectors too
    labels = np.asarray(labels).reshape(-1)

    tsne = TSNE(
        n_components=2,
        random_state=42,
        perplexity=30,
        max_iter=1000,
        learning_rate="auto",
        init="pca"
    )
    reduced = tsne.fit_transform(features)

    if labels.shape[0] != reduced.shape[0]:
        raise ValueError(
            f"labels has {labels.shape[0]} entries but features has {reduced.shape[0]} samples"
        )

    # Fixed colour per class. The keys now agree with class_names below;
    # previously label 0 (Low risk) was drawn in the red meant for High risk.
    colors = {0: "#1f77b4",  # blue for Low risk
              1: "#d62728"}  # red for High risk
    class_names = {0: "Low CVD Risk", 1: "High CVD Risk"}

    plt.figure(figsize=(8, 6), dpi=120)
    for label in np.unique(labels):
        idx = labels == label
        # Plain Python scalar for dictionary lookups and the fallback legend text
        key = label.item() if hasattr(label, "item") else label
        plt.scatter(
            reduced[idx, 0],
            reduced[idx, 1],
            c=colors.get(key),
            label=class_names.get(key, f"Class {key}"),
            alpha=0.75,
            s=60,
            edgecolors="k",   # thin black outline
            linewidth=0.5
        )

    # Presentation
    plt.title(title, fontsize=14, fontweight="bold", pad=15)
    plt.grid(True, linestyle="--", alpha=0.5)
    plt.legend(frameon=True, facecolor="white", framealpha=0.9, edgecolor="gray")
    plt.tight_layout()
    plt.show()


# --- Plot ---
# First figure: t-SNE of the raw, flattened training samples.
# Second figure: t-SNE of the final model's Flatten-layer activations for the same samples.
plot_tsne(features_before, y_train_labels, "t-SNE Before Training (Final Model)")
plot_tsne(features_after, y_train_labels, "t-SNE After Training (Final Model)")
