In [5]:
# MODEL cơ bản nhất - tham số tối ưu nhất

import json
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import BatchNormalization
# Đọc dữ liệu
with open('Datasets/train.json', 'r') as file:
    train_data = json.load(file)
with open('Datasets/test.json', 'r') as file:
    test_data = json.load(file)

train_data = pd.DataFrame(train_data)
test_data = pd.DataFrame(test_data)

# Chuẩn bị embedding
train_X = np.stack(train_data['audio_embedding'].apply(lambda x: np.mean(x, axis=0)))
train_Y = train_data['is_turkey'].values

valid_idx = test_data['audio_embedding'].apply(lambda x: isinstance(x, list) and len(x) > 0)
test_X = np.stack(test_data.loc[valid_idx, 'audio_embedding'].apply(lambda x: np.mean(x, axis=0)))

# Chuẩn hóa
scaler = StandardScaler()
Z = scaler.fit_transform(train_X)
test_Z = scaler.transform(test_X)

# Chia tập train/val
train_Z, val_Z, train_Y, val_Y = train_test_split(Z, train_Y, test_size=0.2, random_state=45)

# Xây mô hình MLP
model = Sequential([
    Dense(200, activation='relu', input_shape=(train_Z.shape[1],)),
    BatchNormalization(),
    Dense(50, activation='relu'),
    # # Dropout(0.3),
    Dense(1, activation='sigmoid')
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['auc']
)

# Huấn luyện
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

history = model.fit(
    train_Z, train_Y,
    validation_data=(val_Z, val_Y),
    epochs=80,
    batch_size=32,
    callbacks=[early_stop],
    verbose=1
)

# Dự đoán trên tập validation
y_pred_prob = model.predict(val_Z).ravel()
y_pred = (y_pred_prob >= 0.5).astype(int)

# In các chỉ số đánh giá
print("AUC Score   :", roc_auc_score(val_Y, y_pred_prob))
print("Accuracy    :", accuracy_score(val_Y, y_pred))
print("Precision   :", precision_score(val_Y, y_pred))
print("Recall      :", recall_score(val_Y, y_pred))
print("F1 Score    :", f1_score(val_Y, y_pred))

# Dự đoán trên tập test
test_pred_prob = model.predict(test_Z).ravel()
test_data['is_turkey'] = -1.0
test_data.loc[valid_idx, 'is_turkey'] = test_pred_prob

# Lưu kết quả
test_data.loc[valid_idx, ['vid_id', 'is_turkey']].to_csv('result.csv', index=False)


Epoch 1/80


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - auc: 0.8841 - loss: 0.3908 - val_auc: 0.9930 - val_loss: 0.1433
Epoch 2/80
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - auc: 0.9949 - loss: 0.1016 - val_auc: 0.9943 - val_loss: 0.1291
Epoch 3/80
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - auc: 0.9939 - loss: 0.1017 - val_auc: 0.9944 - val_loss: 0.1248
Epoch 4/80
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - auc: 0.9991 - loss: 0.0633 - val_auc: 0.9945 - val_loss: 0.1108
Epoch 5/80
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - auc: 0.9997 - loss: 0.0359 - val_auc: 0.9962 - val_loss: 0.0935
Epoch 6/80
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - auc: 0.9999 - loss: 0.0330 - val_auc: 0.9935 - val_loss: 0.1075
Epoch 7/80
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - auc: 0.9999 - loss: 0.02

In [None]:
#Vòng lặp

import json
import numpy as np
import pandas as pd
import time
import tracemalloc

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

# Load dữ liệu
with open('Datasets/train.json', 'r') as file:
    train_data = json.load(file)
with open('Datasets/test.json', 'r') as file:
    test_data = json.load(file)

train_data = pd.DataFrame(train_data)
test_data = pd.DataFrame(test_data)

# Tiền xử lý
train_X = np.stack(train_data['audio_embedding'].apply(lambda x: np.mean(x, axis=0)))
train_Y = train_data['is_turkey'].values

valid_idx = test_data['audio_embedding'].apply(lambda x: isinstance(x, list) and len(x) > 0)
test_X = np.stack(test_data.loc[valid_idx, 'audio_embedding'].apply(lambda x: np.mean(x, axis=0)))

# Chuẩn hóa dữ liệu
scaler = StandardScaler()
Z = scaler.fit_transform(train_X)
test_Z = scaler.transform(test_X)

# Chia train / validation
train_Z, val_Z, train_Y, val_Y = train_test_split(Z, train_Y, test_size=0.3, random_state=97)

# Danh sách lưu các mô hình thỏa điều kiện
qualified_models = []

# Huấn luyện
for i in range(200):
    print(f"\n▶️ Huấn luyện lần {i + 1}")

    start_time = time.time()
    tracemalloc.start()

    model = Sequential([
        Input(shape=(train_Z.shape[1],)),
        Dense(300, activation='relu'),
        Dense(100, activation='tanh'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['auc'])

    early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

    model.fit(
        train_Z, train_Y,
        validation_data=(val_Z, val_Y),
        epochs=200,
        batch_size=32,
        callbacks=[early_stop],
        verbose=0
    )

    # Dự đoán validation
    y_pred_prob = model.predict(val_Z).ravel()
    y_pred = (y_pred_prob >= 0.5).astype(int)

    auc = roc_auc_score(val_Y, y_pred_prob)
    acc = accuracy_score(val_Y, y_pred)
    prec = precision_score(val_Y, y_pred)
    rec = recall_score(val_Y, y_pred)
    f1 = f1_score(val_Y, y_pred)

    end_time = time.time()
    current, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()

    if acc > 0.97 and prec >= 0.97 and rec >= 0.95:
        print(f"AUC       : {auc:.4f}")
        print(f"Accuracy  : {acc:.4f}")
        print(f"Precision : {prec:.4f}")
        print(f"Recall    : {rec:.4f}")
        print(f"F1 Score  : {f1:.4f}")
        print(f"⏱ Thời gian huấn luyện: {end_time - start_time:.2f} giây")
        print(f"📦 Bộ nhớ đỉnh sử dụng: {peak / 1024 / 1024:.2f} MB")

        test_pred = model.predict(test_Z).ravel()

        qualified_models.append({
            "model": model,
            "precision": prec,
            "auc": auc,
            "acc": acc,
            "recall": rec,
            "f1": f1,
            "time": end_time - start_time,
            "memory": peak / 1024 / 1024,
            "test_pred": test_pred
        })

# Sắp xếp theo precision giảm dần
qualified_models = sorted(qualified_models, key=lambda x: x["precision"], reverse=True)

# Lưu 3 mô hình có precision cao nhất
top_models = qualified_models[:3]

# In thông tin các mô hình được lưu
for idx, m in enumerate(top_models):
    print(f"\n✅ model{idx + 1} (Precision: {m['precision']:.4f}):")
    print(f"AUC      : {m['auc']:.4f}")
    print(f"Accuracy : {m['acc']:.4f}")
    print(f"Precision: {m['precision']:.4f})")
    print(f"Recall   : {m['recall']:.4f}")
    print(f"F1 Score : {m['f1']:.4f}")
    print(f"⏱ Time   : {m['time']:.2f} giây")
    print(f"📦 Memory : {m['memory']:.2f} MB")

# Dự đoán và lưu kết quả với model1
if top_models:
    test_data['is_turkey'] = -1.0
    test_data.loc[valid_idx, 'is_turkey'] = top_models[0]['test_pred']
    test_data.loc[valid_idx, ['vid_id', 'is_turkey']].to_csv('result.csv', index=False)
else:
    print("\n❌ Không có mô hình nào đạt yêu cầu.")



▶️ Huấn luyện lần 1
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step

▶️ Huấn luyện lần 2
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step

▶️ Huấn luyện lần 3
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step

▶️ Huấn luyện lần 4
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step

▶️ Huấn luyện lần 5
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step

▶️ Huấn luyện lần 6
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step

▶️ Huấn luyện lần 7
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step

▶️ Huấn luyện lần 8
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step

▶️ Huấn luyện lần 9
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step

▶️ Huấn luyện lần 10
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step

▶️ Huấn luyện lần 11
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━

In [2]:
test_data['is_turkey'] = -1.0
test_data.loc[valid_idx, 'is_turkey'] = top_models[1]['test_pred']
test_data.loc[valid_idx, ['vid_id', 'is_turkey']].to_csv('result.csv', index=False)

In [1]:
# STD, MIN, MAX, MEAN
import json
import numpy as np
import pandas as pd
import time
import tracemalloc

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.callbacks import EarlyStopping


# ===========================
# Hàm trích xuất đặc trưng mở rộng
# ===========================
def extract_features(embedding_list):
    """
    Trích xuất đặc trưng từ audio_embedding bằng cách nối:
    - Trung bình
    - Độ lệch chuẩn
    - Giá trị nhỏ nhất
    - Giá trị lớn nhất
    """
    vec = np.array(embedding_list)
    features = np.concatenate([
        np.mean(vec, axis=0),
        np.std(vec, axis=0),
        np.min(vec, axis=0),
        np.max(vec, axis=0)
    ])
    return features


# ===========================
# Load dữ liệu
# ===========================
with open('Datasets/train.json', 'r') as file:
    train_data = json.load(file)
with open('Datasets/test.json', 'r') as file:
    test_data = json.load(file)

train_data = pd.DataFrame(train_data)
test_data = pd.DataFrame(test_data)

# ===========================
# Tiền xử lý
# ===========================
train_X = np.stack(train_data['audio_embedding'].apply(extract_features))
train_Y = train_data['is_turkey'].values

valid_idx = test_data['audio_embedding'].apply(lambda x: isinstance(x, list) and len(x) > 0)
test_X = np.stack(test_data.loc[valid_idx, 'audio_embedding'].apply(extract_features))

# ===========================
# Chuẩn hóa dữ liệu
# ===========================
scaler = StandardScaler()
Z = scaler.fit_transform(train_X)
test_Z = scaler.transform(test_X)

# ===========================
# Chia train / validation
# ===========================
train_Z, val_Z, train_Y, val_Y = train_test_split(Z, train_Y, test_size=0.25, random_state=45)

# ===========================
# Vòng lặp huấn luyện
# ===========================
qualified_models = []

for i in range(150):
    print(f"\n▶️ Huấn luyện lần {i + 1}")

    start_time = time.time()
    tracemalloc.start()

    model = Sequential([
        Input(shape=(train_Z.shape[1],)),
        Dense(200, activation='gelu'),
        Dense(100, activation='relu'),
        # Dense(100, activation='tanh'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['auc'])

    early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

    model.fit(
        train_Z, train_Y,
        validation_data=(val_Z, val_Y),
        epochs=200,
        batch_size=32,
        callbacks=[early_stop],
        verbose=0
    )

    # Dự đoán validation
    y_pred_prob = model.predict(val_Z).ravel()
    y_pred = (y_pred_prob >= 0.5).astype(int)

    auc = roc_auc_score(val_Y, y_pred_prob)
    acc = accuracy_score(val_Y, y_pred)
    prec = precision_score(val_Y, y_pred)
    rec = recall_score(val_Y, y_pred)
    f1 = f1_score(val_Y, y_pred)

    end_time = time.time()
    current, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()

    if prec >= 0.97 and rec >= 0.95:
        print(f"AUC       : {auc:.4f}")
        print(f"Accuracy  : {acc:.4f}")
        print(f"Precision : {prec:.4f}")
        print(f"Recall    : {rec:.4f}")
        print(f"F1 Score  : {f1:.4f}")
        print(f"⏱ Thời gian huấn luyện: {end_time - start_time:.2f} giây")
        print(f"📦 Bộ nhớ đỉnh sử dụng: {peak / 1024 / 1024:.2f} MB")

        test_pred = model.predict(test_Z).ravel()

        qualified_models.append({
            "model": model,
            "precision": prec,
            "auc": auc,
            "acc": acc,
            "recall": rec,
            "f1": f1,
            "time": end_time - start_time,
            "memory": peak / 1024 / 1024,
            "test_pred": test_pred
        })

# ===========================
# Chọn top model
# ===========================
qualified_models = sorted(qualified_models, key=lambda x: x["precision"], reverse=True)
top_models = qualified_models[:3]

for idx, m in enumerate(top_models):
    print(f"\n✅ model{idx + 1} (Precision: {m['precision']:.4f}):")
    print(f"AUC      : {m['auc']:.4f}")
    print(f"Accuracy : {m['acc']:.4f}")
    print(f"Precision: {m['precision']:.4f}")
    print(f"Recall   : {m['recall']:.4f}")
    print(f"F1 Score : {m['f1']:.4f}")
    print(f"⏱ Time   : {m['time']:.2f} giây")
    print(f"📦 Memory : {m['memory']:.2f} MB")

# ===========================
# Xuất kết quả
# ===========================
if top_models:
    test_data['is_turkey'] = -1.0
    test_data.loc[valid_idx, 'is_turkey'] = top_models[0]['test_pred']
    test_data.loc[valid_idx, ['vid_id', 'is_turkey']].to_csv('result.csv', index=False)
else:
    print("\n❌ Không có mô hình nào đạt yêu cầu.")



▶️ Huấn luyện lần 1
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step

▶️ Huấn luyện lần 2
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step

▶️ Huấn luyện lần 3
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step

▶️ Huấn luyện lần 4
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step

▶️ Huấn luyện lần 5
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step

▶️ Huấn luyện lần 6
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step

▶️ Huấn luyện lần 7
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step

▶️ Huấn luyện lần 8
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step

▶️ Huấn luyện lần 9
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step

▶️ Huấn luyện lần 10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step

▶️ Huấn luyện lần 11
[1m10/10[0m [32m━━━━━━━━

In [4]:
test_data['is_turkey'] = -1.0
test_data.loc[valid_idx, 'is_turkey'] = top_models[2]['test_pred']
test_data.loc[valid_idx, ['vid_id', 'is_turkey']].to_csv('result.csv', index=False)

In [None]:
# Regularization
import json
import numpy as np
import pandas as pd
import time
import tracemalloc

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import regularizers  # ⬅️ Thêm import regularization

# Load dữ liệu
with open('Datasets/train.json', 'r') as file:
    train_data = json.load(file)
with open('Datasets/test.json', 'r') as file:
    test_data = json.load(file)

train_data = pd.DataFrame(train_data)
test_data = pd.DataFrame(test_data)

# Tiền xử lý
train_X = np.stack(train_data['audio_embedding'].apply(lambda x: np.mean(x, axis=0)))
train_Y = train_data['is_turkey'].values

valid_idx = test_data['audio_embedding'].apply(lambda x: isinstance(x, list) and len(x) > 0)
test_X = np.stack(test_data.loc[valid_idx, 'audio_embedding'].apply(lambda x: np.mean(x, axis=0)))

# Chuẩn hóa dữ liệu
scaler = StandardScaler()
Z = scaler.fit_transform(train_X)
test_Z = scaler.transform(test_X)

# Chia train / validation
train_Z, val_Z, train_Y, val_Y = train_test_split(Z, train_Y, test_size=0.6, random_state=97)

# Danh sách lưu các mô hình thỏa điều kiện
qualified_models = []

# Huấn luyện
for i in range(100):
    print(f"\n▶️ Huấn luyện lần {i + 1}")

    start_time = time.time()
    tracemalloc.start()

    model = Sequential([
        Input(shape=(train_Z.shape[1],)),
        Dense(500, activation='relu', kernel_regularizer=regularizers.l2(1e-4)),  # ⬅️ Regularization
        LayerNormalization(),
        Dense(500, activation='tanh', kernel_regularizer=regularizers.l2(1e-4)),  # ⬅️ Regularization
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['auc'])

    early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

    model.fit(
        train_Z, train_Y,
        validation_data=(val_Z, val_Y),
        epochs=80,
        batch_size=8,
        callbacks=[early_stop],
        verbose=0
    )

    # Dự đoán validation
    y_pred_prob = model.predict(val_Z).ravel()
    y_pred = (y_pred_prob >= 0.5).astype(int)

    auc = roc_auc_score(val_Y, y_pred_prob)
    acc = accuracy_score(val_Y, y_pred)
    prec = precision_score(val_Y, y_pred)
    rec = recall_score(val_Y, y_pred)
    f1 = f1_score(val_Y, y_pred)

    end_time = time.time()
    current, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()

    if prec >= 0.975:
        print(f"AUC       : {auc:.4f}")
        print(f"Accuracy  : {acc:.4f}")
        print(f"Precision : {prec:.4f}")
        print(f"Recall    : {rec:.4f}")
        print(f"F1 Score  : {f1:.4f}")
        print(f"⏱ Thời gian huấn luyện: {end_time - start_time:.2f} giây")
        print(f"📦 Bộ nhớ đỉnh sử dụng: {peak / 1024 / 1024:.2f} MB")

        test_pred = model.predict(test_Z).ravel()

        qualified_models.append({
            "model": model,
            "precision": prec,
            "auc": auc,
            "acc": acc,
            "recall": rec,
            "f1": f1,
            "time": end_time - start_time,
            "memory": peak / 1024 / 1024,
            "test_pred": test_pred
        })

# Sắp xếp theo precision giảm dần
qualified_models = sorted(qualified_models, key=lambda x: x["precision"], reverse=True)

# Lưu 3 mô hình có precision cao nhất
top_models = qualified_models[:3]

# In thông tin các mô hình được lưu
for idx, m in enumerate(top_models):
    print(f"\n✅ model{idx + 1} (Precision: {m['precision']:.4f}):")
    print(f"AUC      : {m['auc']:.4f}")
    print(f"Accuracy : {m['acc']:.4f}")
    print(f"Precision: {m['precision']:.4f})")
    print(f"Recall   : {m['recall']:.4f}")
    print(f"F1 Score : {m['f1']:.4f}")
    print(f"⏱ Time   : {m['time']:.2f} giây")
    print(f"📦 Memory : {m['memory']:.2f} MB")

# Dự đoán và lưu kết quả với model1
if top_models:
    test_data['is_turkey'] = -1.0
    test_data.loc[valid_idx, 'is_turkey'] = top_models[0]['test_pred']
    test_data.loc[valid_idx, ['vid_id', 'is_turkey']].to_csv('result.csv', index=False)
else:
    print("\n❌ Không có mô hình nào đạt yêu cầu.")



▶️ Huấn luyện lần 1
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step

▶️ Huấn luyện lần 2
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step

▶️ Huấn luyện lần 3
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step

▶️ Huấn luyện lần 4
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step

▶️ Huấn luyện lần 5
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step

▶️ Huấn luyện lần 6
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step

▶️ Huấn luyện lần 7
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step

▶️ Huấn luyện lần 8
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step

▶️ Huấn luyện lần 9
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step

▶️ Huấn luyện lần 10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step

▶️ Huấn luyện lần 11
[1m23/23[0m [32m━━━━━━━━━━━━━━━━

In [None]:
# # K FOLD with Weighted Ensemble
# import json
# import numpy as np
# import pandas as pd
# import time
# import tracemalloc

# from sklearn.model_selection import KFold
# from sklearn.preprocessing import StandardScaler
# from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Input
# from tensorflow.keras.callbacks import EarlyStopping

# # Load dữ liệu
# with open('Datasets/train.json', 'r') as file:
#     train_data = json.load(file)
# with open('Datasets/test.json', 'r') as file:
#     test_data = json.load(file)

# train_data = pd.DataFrame(train_data)
# test_data = pd.DataFrame(test_data)

# # Tiền xử lý
# train_X = np.stack(train_data['audio_embedding'].apply(lambda x: np.mean(x, axis=0)))
# train_Y = train_data['is_turkey'].values

# valid_idx = test_data['audio_embedding'].apply(lambda x: isinstance(x, list) and len(x) > 0)
# if valid_idx.sum() == 0:
#     raise ValueError("❌ Không có dữ liệu audio_embedding hợp lệ trong test_data. Vui lòng kiểm tra lại file test.json.")
# test_X = np.stack(test_data.loc[valid_idx, 'audio_embedding'].apply(lambda x: np.mean(x, axis=0)))

# # Chuẩn hóa
# scaler = StandardScaler()
# Z = scaler.fit_transform(train_X)
# test_Z = scaler.transform(test_X)

# # Danh sách các mô hình tổng hợp sau mỗi vòng ensemble
# ensemble_models = []

# # Chạy nhiều vòng ensemble
# for round_idx in range(5):
#     print(f"\n🔁 Round {round_idx + 1}")

#     kf = KFold(n_splits=11, shuffle=True, random_state=35 + round_idx)
#     fold_models = []
#     fold_precisions = []
#     fold_test_preds = []
#     fold_metrics = []

#     for fold, (train_idx, val_idx) in enumerate(kf.split(Z)):
#         print(f"\n🌀 Fold {fold + 1}")

#         X_train, X_val = Z[train_idx], Z[val_idx]
#         y_train, y_val = train_Y[train_idx], train_Y[val_idx]

#         start_time = time.time()
#         tracemalloc.start()

#         model = Sequential([
#             Input(shape=(X_train.shape[1],)),
#             Dense(200, activation='relu'),
#             Dense(1, activation='sigmoid')
#         ])
#         model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['AUC'])

#         early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

#         model.fit(
#             X_train, y_train,
#             validation_data=(X_val, y_val),
#             epochs=80,
#             batch_size=32,
#             callbacks=[early_stop],
#             verbose=0
#         )

#         y_pred_prob = model.predict(X_val).ravel()
#         y_pred = (y_pred_prob >= 0.5).astype(int)

#         auc = roc_auc_score(y_val, y_pred_prob)
#         acc = accuracy_score(y_val, y_pred)
#         prec = precision_score(y_val, y_pred)
#         rec = recall_score(y_val, y_pred)
#         f1 = f1_score(y_val, y_pred)

#         end_time = time.time()
#         current, peak = tracemalloc.get_traced_memory()
#         tracemalloc.stop()

#         print(f"AUC       : {auc:.4f}")
#         print(f"Accuracy  : {acc:.4f}")
#         print(f"Precision : {prec:.4f}")
#         print(f"Recall    : {rec:.4f}")
#         print(f"F1 Score  : {f1:.4f}")

#         if prec >= 0.98:
#             fold_models.append(model)
#             fold_precisions.append(prec)
#             fold_test_preds.append(model.predict(test_Z).ravel())
#             fold_metrics.append((auc, acc, prec, rec, f1))

#     if len(fold_models) == 0:
#         print("⚠️ Không có mô hình nào đạt precision >= 0.98 trong round này.")
#         continue

#     fold_precisions = np.array(fold_precisions)
#     weights = fold_precisions / fold_precisions.sum()

#     weighted_preds = np.average(fold_test_preds, axis=0, weights=weights)

#     avg_precision = fold_precisions.mean()
#     ensemble_models.append({
#         "test_pred": weighted_preds,
#         "avg_precision": avg_precision,
#         "weights": weights,
#         "metrics": fold_metrics
#     })

# # Chọn 3 mô hình có precision trung bình cao nhất
# ensemble_models = sorted(ensemble_models, key=lambda x: -x['avg_precision'])

# print("\n🏆 Top 3 mô hình có precision cao nhất:")
# for i, model in enumerate(ensemble_models[:3]):
#     print(f"\n✨ Model #{i+1} (Precision trung bình: {model['avg_precision']:.4f})")
#     avg_auc = np.mean([m[0] for m in model['metrics']])
#     avg_acc = np.mean([m[1] for m in model['metrics']])
#     avg_prec = np.mean([m[2] for m in model['metrics']])
#     avg_rec = np.mean([m[3] for m in model['metrics']])
#     avg_f1 = np.mean([m[4] for m in model['metrics']])
#     print(f"AUC       : {avg_auc:.4f}")
#     print(f"Accuracy  : {avg_acc:.4f}")
#     print(f"Precision : {avg_prec:.4f}")
#     print(f"Recall    : {avg_rec:.4f}")
#     print(f"F1 Score  : {avg_f1:.4f}")

# # Dự đoán tập test bằng mô hình có precision cao nhất
# best_model = ensemble_models[0]
# test_data['is_turkey'] = -1.0
# test_data.loc[valid_idx, 'is_turkey'] = best_model['test_pred']
# test_data.loc[valid_idx, ['vid_id', 'is_turkey']].to_csv('result.csv', index=False)

# print("\n✅ Kết thúc. Dự đoán đã được lưu vào result.csv")



🔁 Round 1

🌀 Fold 1
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
AUC       : 0.9774
Accuracy  : 0.9358
Precision : 0.9500
Recall    : 0.8837
F1 Score  : 0.9157

🌀 Fold 2
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
AUC       : 0.9987
Accuracy  : 0.9633
Precision : 0.9800
Recall    : 0.9423
F1 Score  : 0.9608
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 973us/step

🌀 Fold 3
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
AUC       : 0.9891
Accuracy  : 0.9541
Precision : 0.9524
Recall    : 0.9302
F1 Score  : 0.9412

🌀 Fold 4
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
AUC       : 0.9852
Accuracy  : 0.9541
Precision : 0.9318
Recall    : 0.9535
F1 Score  : 0.9425

🌀 Fold 5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
AUC       : 0.9907
Accuracy  : 0.9541
Precision : 0.9714
Recall    : 0.8947
F1 Score  : 0.9315

🌀 Fold 6
[1m4/4[0m [32m━━

In [5]:
best_model = ensemble_models[2]
test_data['is_turkey'] = -1.0
test_data.loc[valid_idx, 'is_turkey'] = best_model['test_pred']
test_data.loc[valid_idx, ['vid_id', 'is_turkey']].to_csv('result.csv', index=False)

In [10]:
import json
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import BatchNormalization
# Đọc dữ liệu
with open('Datasets/train.json', 'r') as file:
    train_data = json.load(file)
with open('Datasets/test.json', 'r') as file:
    test_data = json.load(file)

train_data = pd.DataFrame(train_data)
test_data = pd.DataFrame(test_data)

# Chuẩn bị embedding: lấy trung bình theo trục 0
train_X = np.stack(train_data['audio_embedding'].apply(lambda x: np.mean(x, axis=0)))
train_Y = train_data['is_turkey'].values

# Thêm Gaussian noise cho Data Augmentation
def add_noise(X, noise_level=0.05):
    noise = np.random.normal(0, noise_level, X.shape)
    return X + noise

# Nhân đôi dữ liệu huấn luyện bằng bản sao nhiễu
aug_X = add_noise(train_X, noise_level=0.05)
aug_Y = train_Y.copy()

# Gộp dữ liệu gốc và dữ liệu nhiễu
train_X_augmented = np.concatenate([train_X, aug_X], axis=0)
train_Y_augmented = np.concatenate([train_Y, aug_Y], axis=0)

# Chuẩn hóa
scaler = StandardScaler()
Z = scaler.fit_transform(train_X_augmented)

# Xử lý test embedding
valid_idx = test_data['audio_embedding'].apply(lambda x: isinstance(x, list) and len(x) > 0)
test_X = np.stack(test_data.loc[valid_idx, 'audio_embedding'].apply(lambda x: np.mean(x, axis=0)))
test_Z = scaler.transform(test_X)

# Chia tập train/val (50/50 từ dữ liệu đã được augment)
train_Z, val_Z, train_Y, val_Y = train_test_split(Z, train_Y_augmented, test_size=0.5, random_state=45)

# Xây mô hình MLP
model = Sequential([
    Dense(10, activation='relu', input_shape=(train_Z.shape[1],)),
    BatchNormalization(),
    Dropout(0.5),
    # Dense(64, activation='relu'),
    # Dropout(0.3),
    Dense(1, activation='sigmoid')
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Huấn luyện
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

history = model.fit(
    train_Z, train_Y,
    validation_data=(val_Z, val_Y),
    epochs=30,
    batch_size=2,
    callbacks=[early_stop],
    verbose=1
)

# Dự đoán trên tập validation
y_pred_prob = model.predict(val_Z).ravel()
y_pred = (y_pred_prob >= 0.5).astype(int)

# In các chỉ số đánh giá
print("AUC Score   :", roc_auc_score(val_Y, y_pred_prob))
print("Accuracy    :", accuracy_score(val_Y, y_pred))
print("Precision   :", precision_score(val_Y, y_pred))
print("Recall      :", recall_score(val_Y, y_pred))
print("F1 Score    :", f1_score(val_Y, y_pred))

# Dự đoán trên tập test
test_pred_prob = model.predict(test_Z).ravel()
test_data['is_turkey'] = -1.0
test_data.loc[valid_idx, 'is_turkey'] = test_pred_prob

# Lưu kết quả
test_data.loc[valid_idx, ['vid_id', 'is_turkey']].to_csv('mlp_result.csv', index=False)


Epoch 1/30


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m598/598[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.5098 - loss: 0.9313 - val_accuracy: 0.8452 - val_loss: 0.3893
Epoch 2/30
[1m598/598[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7001 - loss: 0.5983 - val_accuracy: 0.8987 - val_loss: 0.2620
Epoch 3/30
[1m598/598[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7129 - loss: 0.5452 - val_accuracy: 0.9155 - val_loss: 0.2454
Epoch 4/30
[1m598/598[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7641 - loss: 0.5157 - val_accuracy: 0.9247 - val_loss: 0.2202
Epoch 5/30
[1m598/598[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7948 - loss: 0.4934 - val_accuracy: 0.9305 - val_loss: 0.2127
Epoch 6/30
[1m598/598[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7703 - loss: 0.5231 - val_accuracy: 0.9280 - val_loss: 0.2220
Epoch 7/30
[1m598/598[0m [32m━━━━━━━

In [119]:
# # JOBLIB 
# import json
# import numpy as np
# import pandas as pd
# from sklearn.preprocessing import StandardScaler
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Dropout
# from tensorflow.keras.callbacks import EarlyStopping
# import keras_tuner as kt
# from joblib import Parallel, delayed

# # ==== Load dữ liệu ====
# with open('Datasets/train.json', 'r') as file:
#     train_data = json.load(file)
# train_data = pd.DataFrame(train_data)

# with open('Datasets/test.json', 'r') as file:
#     test_data = json.load(file)
# test_data = pd.DataFrame(test_data)

# # ==== Xử lý embedding ====
# train_X = np.stack(train_data['audio_embedding'].apply(lambda x: np.mean(x, axis=0)))
# train_Y = train_data['is_turkey'].values

# valid_idx = test_data['audio_embedding'].apply(lambda x: isinstance(x, list) and len(x) > 0)
# test_X = np.stack(test_data.loc[valid_idx, 'audio_embedding'].apply(lambda x: np.mean(x, axis=0)))

# # ==== Chuẩn hóa ====
# scaler = StandardScaler()
# Z = scaler.fit_transform(train_X)
# test_Z = scaler.transform(test_X)

# # ==== Mô hình ====
# def build_model(hp):
#     model = Sequential()
#     model.add(Dense(
#         units=hp.Int('units1', 64, 256, step=64),
#         activation='relu',
#         input_shape=(Z.shape[1],)
#     ))
#     model.add(Dropout(hp.Float('dropout1', 0.2, 0.5, step=0.1)))

#     model.add(Dense(
#         units=hp.Int('units2', 32, 128, step=32),
#         activation='relu'
#     ))
#     model.add(Dropout(hp.Float('dropout2', 0.2, 0.5, step=0.1)))

#     model.add(Dense(1, activation='sigmoid'))
#     model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
#     return model

# # ==== Hàm chạy tuning trên 1 bộ split ====
# def run_tuning(test_size, seed):
#     train_Z, val_Z, train_Y_, val_Y_ = train_test_split(Z, train_Y, test_size=test_size, random_state=seed)

#     tuner = kt.RandomSearch(
#         build_model,
#         objective='val_accuracy',
#         max_trials=5,
#         executions_per_trial=1,
#         overwrite=True,
#         directory='mlp_search',
#         project_name=f'turkey_tune_{test_size}_{seed}'
#     )

#     early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

#     tuner.search(train_Z, train_Y_, epochs=40, validation_data=(val_Z, val_Y_), callbacks=[early_stop], verbose=0)

#     best_model = tuner.get_best_models(1)[0]
#     val_pred_prob = best_model.predict(val_Z).flatten()
#     val_pred = (val_pred_prob >= 0.5).astype(int)
#     auc = roc_auc_score(val_Y_, val_pred_prob)

#     return {
#         'model': best_model,
#         'auc': auc,
#         'val_pred_prob': val_pred_prob,
#         'val_Y': val_Y_,
#         'val_pred': val_pred,
#         'test_size': test_size,
#         'seed': seed
#     }

# # ==== Chạy song song các tổ hợp ====
# param_grid = [(i, j) for i in np.arange(0.1, 0.9, 0.11) for j in range(1, 5)]
# results = Parallel(n_jobs=-1)(delayed(run_tuning)(i, j) for i, j in param_grid)

# # ==== Chọn mô hình tốt nhất ====
# best_result = max(results, key=lambda x: x['auc'])
# best_model = best_result['model']

# print("Best config - test_size:", best_result['test_size'], ", seed:", best_result['seed'])
# print("AUC Score   :", best_result['auc'])
# print("Accuracy    :", accuracy_score(best_result['val_Y'], best_result['val_pred']))
# print("Precision   :", precision_score(best_result['val_Y'], best_result['val_pred']))
# print("Recall      :", recall_score(best_result['val_Y'], best_result['val_pred']))
# print("F1 Score    :", f1_score(best_result['val_Y'], best_result['val_pred']))

# # ==== Dự đoán trên test set ====
# test_pred_prob = best_model.predict(test_Z).flatten()
# test_data['is_turkey'] = -1.0
# test_data.loc[valid_idx, 'is_turkey'] = test_pred_prob

# # ==== Lưu kết quả ====
# test_data.loc[valid_idx, ['vid_id', 'is_turkey']].to_csv('mlp_tuned_result.csv', index=False)


In [None]:
# # GRIDSEARCH & KERAS TUNER
# import json
# import numpy as np
# import pandas as pd
# import tensorflow as tf
# from sklearn.preprocessing import StandardScaler
# from sklearn.model_selection import train_test_split
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
# from tensorflow.keras.callbacks import EarlyStopping
# import keras_tuner as kt

# # ==== LOAD DATA ====
# with open('Datasets/train.json', 'r') as file:
#     train_data = json.load(file)
# train_data = pd.DataFrame(train_data)

# with open('Datasets/test.json', 'r') as file:
#     test_data = json.load(file)
# test_data = pd.DataFrame(test_data)

# # ==== EMBEDDING HANDLING ====
# train_X = np.stack(train_data['audio_embedding'].apply(lambda x: np.mean(x, axis=0)))
# train_Y = train_data['is_turkey'].values

# valid_idx = test_data['audio_embedding'].apply(lambda x: isinstance(x, list) and len(x) > 0)
# test_X = np.stack(test_data.loc[valid_idx, 'audio_embedding'].apply(lambda x: np.mean(x, axis=0)))

# # ==== SCALING ====
# scaler = StandardScaler()
# Z = scaler.fit_transform(train_X)
# test_Z = scaler.transform(test_X)

# train_Z, val_Z, train_Y, val_Y = train_test_split(Z, train_Y, test_size=0.2, random_state=42)

# # ==== BUILD MODEL ====
# def build_model(hp):
#     model = Sequential()
    
#     model.add(Dense(
#         units=hp.Int('units1', min_value=64, max_value=512, step=64),
#         activation='relu',
#         input_shape=(train_Z.shape[1],)
#     ))
#     model.add(BatchNormalization())
#     # model.add(Dropout(hp.Float('dropout1', min_value=0.1, max_value=0.6, step=0.1)))

#     model.add(Dense(
#         units=hp.Int('units2', min_value=32, max_value=256, step=32),
#         activation='relu'
#     ))
#     model.add(BatchNormalization())
#     # model.add(Dropout(hp.Float('dropout2', min_value=0.1, max_value=0.6, step=0.1)))

#     model.add(Dense(1, activation='sigmoid'))

#     model.compile(
#         optimizer=tf.keras.optimizers.Adam(),
#         loss='binary_crossentropy',
#         metrics=[tf.keras.metrics.AUC(name='auc')]
#     )
#     return model

# # ==== GRID SEARCH TUNER ====
# tuner = kt.GridSearch(
#     build_model,
#     objective=kt.Objective('val_auc', direction='max'),
#     max_trials=30,               # tăng số lượng thử nghiệm
#     executions_per_trial=2,     # chạy mỗi cấu hình 2 lần
#     overwrite=True,
#     directory='mlp_search',
#     project_name='is_turkey_tune'
# )

# early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# # ==== TUNING ====
# tuner.search(
#     train_Z,
#     train_Y,
#     epochs=100,
#     validation_data=(val_Z, val_Y),
#     callbacks=[early_stop],
#     batch_size=32
# )

# # ==== BEST MODEL ====
# best_model = tuner.get_best_models(num_models=1)[0]
# best_hps = tuner.get_best_hyperparameters(1)[0]

# print("Best Hyperparameters:")
# print(f"units1:    {best_hps.get('units1')}")
# print(f"dropout1:  {best_hps.get('dropout1')}")
# print(f"units2:    {best_hps.get('units2')}")
# print(f"dropout2:  {best_hps.get('dropout2')}")

# # ==== VALIDATION PERFORMANCE ====
# val_pred_prob = best_model.predict(val_Z).flatten()
# val_pred = (val_pred_prob >= 0.5).astype(int)

# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# print("\nValidation Performance:")
# print("AUC Score   :", roc_auc_score(val_Y, val_pred_prob))
# print("Accuracy    :", accuracy_score(val_Y, val_pred))
# print("Precision   :", precision_score(val_Y, val_pred))
# print("Recall      :", recall_score(val_Y, val_pred))
# print("F1 Score    :", f1_score(val_Y, val_pred))

# # ==== FINAL PREDICTION ON TEST SET ====
# test_pred_prob = best_model.predict(test_Z).flatten()
# test_data['is_turkey'] = -1.0
# test_data.loc[valid_idx, 'is_turkey'] = test_pred_prob

# # ==== SAVE TO CSV ====
# test_data.loc[valid_idx, ['vid_id', 'is_turkey']].to_csv('mlp_tuned_result.csv', index=False)


Trial 30 Complete [00h 00m 11s]
val_auc: 0.9812867939472198

Best val_auc So Far: 0.9866834878921509
Total elapsed time: 00h 04m 48s
Best Hyperparameters:
units1:    64
dropout1:  0.1
units2:    96
dropout2:  0.2
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step


  saveable.load_own_variables(weights_store.get(inner_path))



Validation Performance:
AUC Score   : 0.9876647042332491
Accuracy    : 0.9246861924686193
Precision   : 0.9298245614035088
Recall      : 0.9137931034482759
F1 Score    : 0.9217391304347826
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
