# Huấn luyện với tách data frame

# Import thư viện

In [36]:
import torch 
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader,random_split,TensorDataset
import torchvision
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
import timm 
import tqdm as notebook_tqdm
import torchaudio
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score

In [61]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import sys
import json

In [38]:
print("System version: ", sys.version)
print("PyTorch version: ", torch.__version__)
print("Torchvision version: ", torchvision.__version__)
print("Numpy version: ", np.__version__)
print("Pandas version: ", pd.__version__)

System version:  3.12.6 (tags/v3.12.6:a4a2d2b, Sep  6 2024, 20:11:23) [MSC v.1940 64 bit (AMD64)]
PyTorch version:  2.7.0+cpu
Torchvision version:  0.22.0+cpu
Numpy version:  2.1.3
Pandas version:  2.2.3


# Đọc dữ liệu

In [39]:
path_to_data='../../dataset'

In [40]:
with open(path_to_data+'/train.json', 'r') as f:
    train_val_data = json.load(f)
with open(path_to_data+'/test.json', 'r') as f:
    test_data = json.load(f)

In [41]:
train_val_data=pd.DataFrame(train_val_data)
test_data=pd.DataFrame(test_data)

In [60]:
test_data.shape

(1196, 4)

# Tách list vector thành các vector/ mẫu mới

In [None]:
def expand_audio_embeddings(data):
    expanded_rows = []
    for idx, row in data.iterrows():
        embeddings = row['audio_embedding']
        for emb in embeddings:
            new_row = row.copy()
            new_row['audio_embedding'] = emb
            expanded_rows.append(new_row)
    expanded_data = pd.DataFrame(expanded_rows)
    expanded_data.reset_index(drop=True, inplace=True)
    return expanded_data

In [43]:
expanded_train_val_data = expand_audio_embeddings(train_val_data)
expanded_test_data = expand_audio_embeddings(test_data)

In [59]:
expanded_train_val_data['is_turkey'].value_counts()

is_turkey
0    6954
1    4841
Name: count, dtype: int64

In [44]:
expanded_train_val_data.head(5)

Unnamed: 0,audio_embedding,is_turkey,vid_id,end_time_seconds_youtube_clip,start_time_seconds_youtube_clip
0,"[172, 34, 216, 110, 208, 46, 95, 66, 161, 125,...",0,kDCk3hLIVXo,70,60
1,"[171, 39, 199, 121, 238, 62, 59, 61, 170, 146,...",0,kDCk3hLIVXo,70,60
2,"[169, 33, 200, 97, 210, 22, 73, 51, 169, 129, ...",0,kDCk3hLIVXo,70,60
3,"[180, 39, 218, 118, 213, 73, 80, 43, 160, 147,...",0,kDCk3hLIVXo,70,60
4,"[166, 31, 204, 134, 211, 59, 62, 73, 187, 167,...",0,kDCk3hLIVXo,70,60


# Tính vector trung bình nếu dùng list vector

In [45]:
def combined_embeddings(embeddings): 
    X= np.array(embeddings)
    return np.mean(X, axis=0)

# Trích thuộc tính và đặc trưng

In [46]:
X_train_val=pd.DataFrame(expanded_train_val_data['audio_embedding'])
Y_train_val=pd.DataFrame(expanded_train_val_data['is_turkey'])
X_test=pd.DataFrame(expanded_test_data['audio_embedding'])

In [47]:
audio_features = pd.DataFrame(X_train_val['audio_embedding'].tolist(), columns=[f'feature_{i}' for i in range(128)])
X_test_features = pd.DataFrame(X_test['audio_embedding'].tolist(), columns=[f'feature_{i}' for i in range(128)])

# Chuyển đổi dữ liệu chuẩn bị học

In [48]:
scaler= StandardScaler()

train_X = scaler.fit_transform(audio_features)
test_X = scaler.transform(X_test_features)

# Chuyển dataset thành kiểu dữ liệu phù hợp cho pytorch

In [49]:
torch_X_train= torch.tensor(train_X, dtype=torch.float32)
torch_Y_train= torch.tensor(Y_train_val['is_turkey'].values, dtype=torch.float32)
torch_X_test= torch.tensor(test_X, dtype=torch.float32)


In [50]:
dataset=TensorDataset(torch_X_train, torch_Y_train)
train_size=int(0.8 * len(dataset))
val_size=len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Training model

## Chuyển thành DataLoader

In [51]:
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128)

## Cấu hình mạng neuron

In [52]:
class MLPClassifier(nn.Module):
    def __init__(self, input_dim=128, hidden_dim1=128, hidden_dim2=64,hidden_dim3=32, dropout=0.2):
        super().__init__()
        self.classifier = nn.Sequential(
            nn.Linear(input_dim, hidden_dim1),
            nn.BatchNorm1d(hidden_dim1),
            nn.ReLU(),
            nn.Dropout(dropout),

            nn.Linear(hidden_dim1, hidden_dim2),
            nn.BatchNorm1d(hidden_dim2),
            nn.ReLU(),
            nn.Dropout(dropout),
            
            nn.Linear(hidden_dim2, hidden_dim3),
            nn.BatchNorm1d(hidden_dim3),
            nn.ReLU(),
            nn.Dropout(dropout),
            
            nn.Linear(hidden_dim3, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.classifier(x)


## Early Stopping

In [53]:
class EarlyStopping:
    def __init__(self,patience=5,delta=0):
        self.patience = patience
        self.delta = delta
        self.best_loss= None
        self.counter = 0
        self.early_stop = False
    def __call__(self,val_loss):
        if self.best_loss is None:
            self.best_loss = val_loss
        elif val_loss < self.best_loss - self.delta:
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

In [54]:
model = MLPClassifier()
loss_fn = nn.BCELoss()  
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3,weight_decay=0.001)
early_stopping = EarlyStopping(patience=5)

### Huấn luyện

In [55]:
num_epochs = 100

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch_X, batch_Y in train_loader:
        optimizer.zero_grad()
        output = model(batch_X).squeeze()  # [batch_size]
        loss = loss_fn(output, batch_Y.float())
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

    model.eval()
    val_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            val_output = model(X_batch).squeeze()
            loss = loss_fn(val_output, y_batch.float())
            val_loss += loss.item()
    val_loss /= len(val_loader)
    print(f"Validation Loss: {val_loss:.4f}")
    early_stopping(val_loss)
    if early_stopping.early_stop:
        break


Epoch 1, Loss: 28.2721
Validation Loss: 0.2692
Epoch 2, Loss: 19.2360
Validation Loss: 0.2384
Epoch 3, Loss: 16.9694
Validation Loss: 0.2353
Epoch 4, Loss: 16.0874
Validation Loss: 0.2301
Epoch 5, Loss: 15.0459
Validation Loss: 0.2194
Epoch 6, Loss: 14.2060
Validation Loss: 0.2208
Epoch 7, Loss: 13.3737
Validation Loss: 0.2256
Epoch 8, Loss: 12.9425
Validation Loss: 0.2175
Epoch 9, Loss: 11.9787
Validation Loss: 0.2173
Epoch 10, Loss: 11.2525
Validation Loss: 0.2229
Epoch 11, Loss: 11.2759
Validation Loss: 0.2286
Epoch 12, Loss: 10.3326
Validation Loss: 0.2276
Epoch 13, Loss: 9.6408
Validation Loss: 0.2390
Epoch 14, Loss: 9.7163
Validation Loss: 0.2333


### Đánh giá chỉ số

In [56]:
model.eval()
all_labels = []
all_preds = []
all_probs = []

with torch.no_grad():
    for batch_X, batch_Y in val_loader:
        output = model(batch_X).squeeze()
        probs = output.detach().cpu().numpy()
        preds = (output >= 0.5).long().cpu().numpy()
        all_labels.extend(batch_Y.cpu().numpy())
        all_preds.extend(preds)
        all_probs.extend(probs)

accuracy = np.mean(np.array(all_preds) == np.array(all_labels))
auc = roc_auc_score(all_labels, all_probs)
precision = precision_score(all_labels, all_preds)
recall = recall_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds)

print(f"AUC: {auc:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")


AUC: 0.9691
Accuracy: 0.9224
Precision: 0.9085
Recall: 0.9020
F1-score: 0.9052


# Dự đoán kết quả

In [57]:
model.eval()
with torch.no_grad():
    outputs = model(torch_X_test).squeeze()

In [58]:
result_df = pd.DataFrame({
    'vid_id': test_data['vid_id'],
    'is_turkey': np.round(outputs.numpy()[:len(test_data)], 6)
})
result_df.to_csv('result.csv', index=False)