# Phase 2: MLP - PyTorch

## Import các thư viện cần thiết

In [373]:
import json
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import optuna

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

## Đọc dữ liệu huấn luyện và dữ liệu test

In [374]:
# Directory that holds train.json and test.json
path_to_data="."

In [375]:
# Parse the two JSON files one after the other; each handle is closed before the next opens
train_json_path = f"{path_to_data}/train.json"
test_json_path = f"{path_to_data}/test.json"

with open(train_json_path, mode='r') as fh:
    train_data = json.load(fh)
with open(test_json_path, mode='r') as fh:
    test_data = json.load(fh)

# Wrap the parsed records in DataFrames so they are easier to inspect
train_df = pd.DataFrame(data=train_data)
test_df = pd.DataFrame(data=test_data)

In [376]:
# Preview the first rows of the training data
train_df.head()

Unnamed: 0,audio_embedding,is_turkey,vid_id,end_time_seconds_youtube_clip,start_time_seconds_youtube_clip
0,"[[172, 34, 216, 110, 208, 46, 95, 66, 161, 125...",0,kDCk3hLIVXo,70,60
1,"[[169, 20, 165, 102, 205, 62, 110, 103, 211, 1...",1,DPcGzqHoo7Y,40,30
2,"[[148, 8, 138, 60, 237, 48, 121, 108, 145, 177...",1,7yM63MTHh5k,240,230
3,"[[151, 0, 162, 88, 171, 71, 47, 90, 179, 190, ...",1,luG3RmUAxxM,520,510
4,"[[162, 17, 187, 111, 211, 105, 92, 67, 203, 15...",0,PIm3cjxTpOk,10,0


In [377]:
# Preview the first rows of the test data
test_df.head()

Unnamed: 0,audio_embedding,vid_id,end_time_seconds_youtube_clip,start_time_seconds_youtube_clip
0,"[[177, 20, 226, 132, 198, 81, 111, 59, 132, 18...",pyKh38FXD3E,10,0
1,"[[169, 21, 204, 161, 195, 72, 60, 39, 152, 184...",THhP1idrWXA,40,30
2,"[[165, 13, 198, 141, 199, 81, 173, 54, 119, 11...",jsw3T6GY2Nw,40,30
3,"[[167, 18, 188, 159, 198, 63, 156, 36, 179, 22...",nFkXTMHcjMU,24,14
4,"[[178, 32, 181, 100, 198, 46, 82, 83, 136, 227...",Au8g9kAlrLQ,40,30


## Phát triển mô hình

### Xử lý dữ liệu trước khi huấn luyện

In [378]:
def combined_embeddings(embeddings):
    """Reduce a sequence of embedding vectors to their element-wise mean.

    Args:
        embeddings: array-like (e.g. a list of equal-length lists); it is
            converted to a NumPy array and averaged along axis 0.

    Returns:
        numpy.ndarray: the mean taken over the first axis.
    """
    return np.array(embeddings).mean(axis=0)

In [379]:
# One row per clip: each audio_embedding is averaged along its first axis by combined_embeddings
train_X = np.stack([combined_embeddings(emb) for emb in train_df['audio_embedding']])
# Labels of the training clips
train_Y = train_df['is_turkey'].values

# Boolean mask of test rows whose embedding is a non-empty list.
# NOTE(review): this mask is computed but not applied by any cell visible in this notebook.
valid_idx = test_df['audio_embedding'].apply(lambda emb: isinstance(emb, list) and len(emb) > 0)
# Same per-clip averaging for the test clips
test_X = np.stack([combined_embeddings(emb) for emb in test_df['audio_embedding']])

In [380]:
# Split BEFORE scaling: fitting the scaler on the full training matrix would let the
# validation rows influence the mean/std used to normalise them (data leakage).
# random_state is fixed, so the rows assigned to each split are reproducible.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    train_X, train_Y, test_size=0.15, random_state=42
)

# Fit the standardisation on the training split only, then reuse it everywhere else
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)
# NOTE: test_X is overwritten with its scaled version; re-running this cell on its own
# would scale it twice, so re-run the feature-extraction cell first.
test_X = scaler.transform(test_X)


### Chuyển dataset thành kiểu dữ liệu phù hợp với PyTorch

In [381]:
class MyDataset(Dataset):
    """Tensor-backed dataset of features with optional labels.

    Args:
        X: array-like of features, stored as a float32 tensor.
        y: optional array-like of labels; when given it is stored as a
            float32 tensor with a trailing dimension of size 1.

    Indexing returns ``(features, label)`` when labels were supplied and
    ``features`` alone otherwise.
    """

    def __init__(self, X, y=None):
        self.X = torch.tensor(X, dtype=torch.float32)
        if y is None:
            self.y = None
        else:
            # unsqueeze(1) turns shape (N,) into (N, 1)
            self.y = torch.tensor(y, dtype=torch.float32).unsqueeze(1)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        features = self.X[idx]
        if self.y is None:
            return features
        return features, self.y[idx]

In [382]:
BATCH_SIZE = 64

# Wrap each split in a MyDataset; the test split is built without labels
train_ds = MyDataset(X_train, y_train)
val_ds = MyDataset(X_val, y_val)
test_ds = MyDataset(test_X)

# Only the training loader shuffles; validation and test keep their row order
train_loader = DataLoader(dataset=train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset=val_ds, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(dataset=test_ds, batch_size=BATCH_SIZE, shuffle=False)

### Xây dựng mô hình

In [383]:
class ResidualBlock(nn.Module):
    """Linear -> BatchNorm1d -> Dropout branch added back to its input, then LeakyReLU.

    Args:
        dim: width of both the input and the output of the block.
        dropout: dropout probability applied to the branch before the skip addition.
    """

    def __init__(self, dim, dropout=0.3):
        super().__init__()
        self.linear = nn.Linear(in_features=dim, out_features=dim)
        self.bn = nn.BatchNorm1d(num_features=dim)
        self.relu = nn.LeakyReLU()  # attribute is called `relu`, but the activation is LeakyReLU
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        """Return ``LeakyReLU(dropout(bn(linear(x))) + x)``."""
        branch = self.dropout(self.bn(self.linear(x)))
        # Skip connection: the untouched input is added before the activation
        return self.relu(branch + x)
        
class ResidualMLP(nn.Module):
    """MLP with one residual block that outputs a sigmoid probability.

    Layout: Linear(input_dim, hidden_dim) -> BatchNorm1d -> ReLU,
    one ResidualBlock(hidden_dim), then Linear(hidden_dim, 1) -> Sigmoid.

    Args:
        input_dim: number of input features.
        hidden_dim: width of the hidden representation.
        dropout: dropout probability forwarded to the residual block.
    """

    def __init__(self, input_dim, hidden_dim=128, dropout=0.3):
        super().__init__()
        stem_layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
        ]
        self.input_layer = nn.Sequential(*stem_layers)
        self.res_block = ResidualBlock(hidden_dim, dropout)
        head_layers = [nn.Linear(hidden_dim, 1), nn.Sigmoid()]
        self.output_layer = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return one sigmoid output per input row, shape (batch, 1)."""
        hidden = self.res_block(self.input_layer(x))
        return self.output_layer(hidden)

In [384]:
  # Initialize the model with the input size (done inside train_model, which builds a ResidualMLP with input_dim=X_train.shape[1])


### EarlyStopping

In [385]:
class EarlyStopping:
    """Flag when the validation loss has stopped improving.

    Call the instance once per epoch with the current validation loss. A loss
    counts as an improvement only if it is below ``best_loss - delta``; after
    ``patience`` consecutive calls without improvement ``early_stop`` is set
    to True.

    Args:
        patience: number of consecutive non-improving calls tolerated.
        delta: minimum decrease of the loss that counts as an improvement.
    """

    def __init__(self, patience=5, delta=0):
        self.patience = patience
        self.delta = delta
        self.best_loss = None
        self.counter = 0
        self.early_stop = False

    def __call__(self, val_loss):
        # The first observed loss only seeds the baseline
        if self.best_loss is None:
            self.best_loss = val_loss
            return

        if val_loss < self.best_loss - self.delta:
            # Improvement: remember it and restart the patience window
            self.best_loss = val_loss
            self.counter = 0
            return

        self.counter += 1
        if self.counter >= self.patience:
            self.early_stop = True

## Huấn luyện mô hình

In [386]:
# Upper bound on the number of epochs per training run; train_model reads this module-level value
num_epochs = 100
# Only referenced by the commented-out single-run training loop; train_model takes its learning rates as an argument
learning_rate = 0.001

In [387]:
def _fit_with_early_stopping(model, optimizer, criterion, patience):
    """Train ``model`` in place until the validation loss stalls or ``num_epochs`` is reached.

    Reads the module-level ``train_loader``, ``val_loader`` and ``num_epochs``.
    """
    early_stopping = EarlyStopping(patience=patience)
    for _ in range(num_epochs):
        model.train()
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            loss = criterion(model(X_batch), y_batch)
            loss.backward()
            optimizer.step()

        # The mean per-batch validation loss drives early stopping
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                val_loss += criterion(model(X_batch), y_batch).item()
        val_loss /= len(val_loader)

        early_stopping(val_loss)
        if early_stopping.early_stop:
            break


def _validation_metrics(model):
    """Score ``model`` on the module-level ``val_loader`` and return the five reported metrics."""
    model.eval()
    y_true, y_pred, y_prob = [], [], []
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            output = model(X_batch)
            # reshape(-1) rather than squeeze(): squeeze() would turn a final batch of
            # one sample into a 0-d array, which list.extend cannot iterate over.
            y_prob.extend(output.reshape(-1).numpy())
            y_pred.extend((output > 0.5).int().reshape(-1).numpy())
            y_true.extend(y_batch.reshape(-1).numpy())
    return {
        'roc_auc': roc_auc_score(y_true, y_prob),
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
        'f1_score': f1_score(y_true, y_pred),
    }


def train_model(hiddens, dropouts, learning_rates, weight_decays, n_repeats=10, patience=5):
    """Grid-search ResidualMLP hyperparameters and return the selected configuration.

    Every combination of the four lists is trained ``n_repeats`` times from scratch
    (Adam + BCELoss, early stopping on validation loss) and its validation metrics
    are averaged over the repeats. A configuration replaces the current best only if
    its average ROC AUC is higher AND the mean of its average accuracy, precision,
    recall and F1 is higher as well.

    Reads the module-level ``X_train`` (for the input width), ``train_loader``,
    ``val_loader`` and ``num_epochs``.

    Args:
        hiddens: hidden_dim values to try.
        dropouts: dropout probabilities to try.
        learning_rates: Adam learning rates to try.
        weight_decays: Adam weight_decay values to try.
        n_repeats: training runs per configuration whose metrics are averaged.
        patience: early-stopping patience, in epochs without improvement.

    Returns:
        dict with keys 'model', 'hidden', 'dropout', 'learning_rate', 'weight_decay',
        'roc_auc', 'accuracy', 'precision', 'recall', 'f1_score'. The metrics are
        averages over the repeats; 'model' is the network from the LAST repeat of
        the selected configuration.

    Raises:
        ValueError: if ``n_repeats`` is below 1 or any hyperparameter list is empty.
    """
    if n_repeats < 1:
        raise ValueError("n_repeats must be at least 1")

    metric_names = ('roc_auc', 'accuracy', 'precision', 'recall', 'f1_score')
    threshold_metrics = ('accuracy', 'precision', 'recall', 'f1_score')

    best_model = None
    criterion = nn.BCELoss()
    for hidden in hiddens:
        for dropout in dropouts:
            for lr in learning_rates:
                for weight_decay in weight_decays:
                    print(f"Training with hidden={hidden}, dropout={dropout}, learning_rate={lr}, weight_decay={weight_decay}")

                    runs = []
                    for _ in range(n_repeats):
                        # Fresh model and optimizer for every repeat
                        model = ResidualMLP(input_dim=X_train.shape[1], hidden_dim=hidden, dropout=dropout)
                        optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
                        _fit_with_early_stopping(model, optimizer, criterion, patience)
                        runs.append(_validation_metrics(model))

                    avg = {name: np.mean([run[name] for run in runs]) for name in metric_names}
                    print(f"ROC AUC: {avg['roc_auc']:.4f}, Accuracy: {avg['accuracy']:.4f}, Precision: {avg['precision']:.4f}, Recall: {avg['recall']:.4f}, F1 Score: {avg['f1_score']:.4f}")

                    combined = sum(avg[name] for name in threshold_metrics) / len(threshold_metrics)
                    is_better = best_model is None or (
                        avg['roc_auc'] > best_model['roc_auc']
                        and combined > sum(best_model[name] for name in threshold_metrics) / len(threshold_metrics)
                    )
                    if is_better:
                        best_model = {
                            'model': model,
                            'hidden': hidden,
                            'dropout': dropout,
                            'learning_rate': lr,
                            'weight_decay': weight_decay,
                            'roc_auc': avg['roc_auc'],
                            'accuracy': avg['accuracy'],
                            'precision': avg['precision'],
                            'recall': avg['recall'],
                            'f1_score': avg['f1_score'],
                        }

    if best_model is None:
        raise ValueError("train_model needs at least one value in every hyperparameter list")

    print(f"Best model found with ROC AUC: {best_model['roc_auc']:.4f}, hidden={best_model['hidden']}, dropout={best_model['dropout']}, learning_rate={best_model['learning_rate']}, weight_decay={best_model['weight_decay']}")
    return best_model
                

In [388]:
# Hyperparameter grid handed to train_model; every combination of the four lists is trained
hiddens=[32,64,128]  # hidden_dim values for ResidualMLP
dropouts=[0,0.05,0.1]  # dropout probability passed to ResidualMLP
learning_rates=[0.05]  # Adam learning rate
weight_decays=[0.01]  # Adam weight_decay

In [389]:
# Run the grid search; the returned dict holds the selected hyperparameters, averaged validation metrics and a trained model
best_model = train_model(hiddens, dropouts, learning_rates, weight_decays)

Training with hidden=32, dropout=0, learning_rate=0.05, weight_decay=0.01
ROC AUC: 0.9763, Accuracy: 0.9089, Precision: 0.9269, Recall: 0.8762, F1 Score: 0.8995
Training with hidden=32, dropout=0.05, learning_rate=0.05, weight_decay=0.01
ROC AUC: 0.9779, Accuracy: 0.9111, Precision: 0.9310, Recall: 0.8762, F1 Score: 0.9016
Training with hidden=32, dropout=0.1, learning_rate=0.05, weight_decay=0.01
ROC AUC: 0.9762, Accuracy: 0.9100, Precision: 0.9256, Recall: 0.8798, F1 Score: 0.9011
Training with hidden=64, dropout=0, learning_rate=0.05, weight_decay=0.01
ROC AUC: 0.9767, Accuracy: 0.9044, Precision: 0.9270, Recall: 0.8655, F1 Score: 0.8935
Training with hidden=64, dropout=0.05, learning_rate=0.05, weight_decay=0.01
ROC AUC: 0.9762, Accuracy: 0.9089, Precision: 0.9246, Recall: 0.8786, F1 Score: 0.8996
Training with hidden=64, dropout=0.1, learning_rate=0.05, weight_decay=0.01
ROC AUC: 0.9776, Accuracy: 0.9172, Precision: 0.9383, Recall: 0.8821, F1 Score: 0.9084
Training with hidden=128

In [390]:
# Validation metrics of the selected configuration (averages over its repeated training runs)
print("Metrics of the best model:")
print(f"ROC AUC: {best_model['roc_auc']:.4f}, Accuracy: {best_model['accuracy']:.4f}, Precision: {best_model['precision']:.4f}, Recall: {best_model['recall']:.4f}, F1 Score: {best_model['f1_score']:.4f}")

Metrics of the best model:
ROC AUC: 0.9779, Accuracy: 0.9111, Precision: 0.9310, Recall: 0.8762, F1 Score: 0.9016


In [391]:
# criterion = nn.BCELoss() # Sử dụng hàm mất mát nhị phân với logits
# optimizer = optim.Adam(model.parameters(), lr=learning_rate,weight_decay=0.1)
# early_stopping = EarlyStopping(patience=5)  # Khởi tạo EarlyStopping

# for epoch in range(num_epochs):
#     model.train()
#     total_loss = 0
#     for X_batch, y_batch in train_loader:
#         optimizer.zero_grad()
#         output = model(X_batch)
#         loss = criterion(output, y_batch)
#         loss.backward()
#         optimizer.step()
#         total_loss += loss.item()
#     print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

#     model.eval()
#     val_loss=0
#     with torch.no_grad():
#         for X_batch, y_batch in val_loader:
#             val_output = model(X_batch)
#             loss = criterion(val_output, y_batch)
#             val_loss += loss.item()
#     val_loss /= len(val_loader)
#     print(f"Validation Loss: {val_loss:.4f}")
#     early_stopping(val_loss)
#     if early_stopping.early_stop:
#         break


In [392]:
# model.eval()
# y_true = []
# y_pred = []
# y_prob = []

# with torch.no_grad():
#     for X_batch, y_batch in val_loader:
#         output = model(X_batch)
#         probs = output.squeeze().numpy()
#         preds = (output > 0.5).int().squeeze().numpy()
#         y_true.extend(y_batch.squeeze().numpy())
#         y_pred.extend(preds)
#         y_prob.extend(probs)

# # Metrics
# acc = accuracy_score(y_true, y_pred)
# prec = precision_score(y_true, y_pred)
# rec = recall_score(y_true, y_pred)
# f1 = f1_score(y_true, y_pred)
# roc_auc = roc_auc_score(y_true, y_prob)

# print(f"ROC AUC: {roc_auc}")
# print(f"Accuracy: {acc}")
# print(f"Precision: {prec}")
# print(f"Recall: {rec}")
# print(f"F1-Score: {f1}")


### Dự đoán trên tập test

In [393]:
# Use the network stored by train_model for the selected configuration
model = best_model['model']
model.eval()

with torch.no_grad():
    # reshape(-1) always yields a flat list; squeeze() would return a bare float
    # for a single-row test set and break the assignment below.
    probs = model(test_ds.X).reshape(-1).tolist()

# test_ds was built from test_df row by row, so predictions line up positionally.
# Assigning the whole column at once avoids the per-row vid_id lookup, which was
# quadratic and overwrote every row sharing a vid_id with the last probability.
assert len(probs) == len(test_df), "expected exactly one prediction per test row"
test_df['is_turkey'] = [round(prob, 6) for prob in probs]

test_df[['vid_id', 'is_turkey']].to_csv('result.csv', index=False)