In [1]:
import random
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import argparse
import os
import wandb
from scipy.stats import pearsonr
import audiofile
from transformers import Wav2Vec2Model



In [2]:
# 글로벌 변수 선언
LANG = 'en'
LABEL_TYPE1 = 'pron'
LABEL_TYPE2 = 'prosody'
DIR_LIST = 'data_list'
DEVICE = 'cuda'
DIR_MODEL = 'model'

MODEL_TYPE = 'cnn+lstm'
BASE_DIM = 1024
#lstm, mlp
MLP_HIDDEN = 512
NUM_LAYERS = 4


LR = 0.01
EPOCHS = 200
BATCH_SIZE = 256
PATIENCE = 20
NUM_WORKERS = 10
AUDIO_LEN_MAX = 200000

BASE_MODEL = None
DIR_DATA = None
DIR_RESUME = None
SEED = 42

In [3]:
# Random seed 고정
np.random.seed(SEED)
random.seed(SEED)
torch.manual_seed(SEED)
if DEVICE == 'cuda':
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [4]:
# class CNN_LSTM_RegressionModel(nn.Module):
#     def __init__(self, input_dim=1024, hidden_dim=128, num_layers=2, dropout=0.3):
#         super(CNN_LSTM_RegressionModel, self).__init__()
#         self.cnn = nn.Sequential(
#             nn.Conv2d(1, 32, kernel_size=(3, 3), padding=(1, 1)),
#             nn.ReLU(),
#             nn.MaxPool2d(kernel_size=(2, 2)),
#             nn.Conv2d(32, 64, kernel_size=(3, 3), padding=(1, 1)),
#             nn.ReLU(),
#             nn.MaxPool2d(kernel_size=(2, 2))
#         )
#         # Adjust the LSTM input dimension based on the output of CNN
#         cnn_output_dim = 64 * (input_dim // 16)
#         self.lstm = nn.LSTM(cnn_output_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)
#         self.fc = nn.Linear(hidden_dim, 1)

#     def forward(self, x):
#         batch_size, feature_dim = x.size()
#         x = x.view(batch_size, 1, 32, -1)  # Adjust to [batch_size, 1, height, width] format
#         x = self.cnn(x)
#         x = x.view(batch_size, -1)
#         x = x.unsqueeze(1)  # [batch_size, 1, cnn_output_dim]
#         x, _ = self.lstm(x)
#         x = x[:, -1, :]  # Get the last output of the LSTM
#         x = self.fc(x)
#         return x

class CNN_LSTM_RegressionModel(nn.Module):
    def __init__(self, input_dim=1024, hidden_dim=MLP_HIDDEN, num_layers=NUM_LAYERS, dropout=0.3):
        super(CNN_LSTM_RegressionModel, self).__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=(3, 3), padding=(1, 1)),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 2)),
            nn.Conv2d(32, 64, kernel_size=(3, 3), padding=(1, 1)),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 2))
        )
        # Adjust the LSTM input dimension based on the output of CNN
        cnn_output_dim = 64 * (input_dim // 16)
        self.lstm = nn.LSTM(cnn_output_dim, MLP_HIDDEN, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(MLP_HIDDEN, 1)

    def forward(self, x):
        batch_size, feature_dim = x.size()
        x = x.view(batch_size, 1, 32, -1)  # Adjust to [batch_size, 1, height, width] format
        x = self.cnn(x)
        x = x.view(batch_size, -1)
        x = x.unsqueeze(1)  # [batch_size, 1, cnn_output_dim]
        x, _ = self.lstm(x)
        x = x[:, -1, :]  # Get the last output of the LSTM
        x = self.fc(x)
        return x

In [5]:
def open_file(filename):
    with open(filename) as f:
        return f.readlines()

def feat_extraction(data_type):
    ''' wav2vec2 feature extraction part '''
    fname_list = os.path.join(DIR_LIST, f'lang_{LANG}', f'{data_type}_list.txt')
    fname_list = open_file(fname_list)
    features = []
    labels = []
    for filename in fname_list:
        filename = filename.strip()
        audio_file = os.path.join(DIR_DATA, filename + '.wav')
        audio_data, sampling_rate = audiofile.read(audio_file)
        audio_data = audio_data[:AUDIO_LEN_MAX]
        
        # wav2vec2 model
        model = Wav2Vec2ForCTC.from_pretrained(BASE_MODEL)
        input_values = model.feature_extractor(audio_data, sampling_rate=sampling_rate, return_tensors='pt').input_values
        with torch.no_grad():
            features.append(model(input_values).logits.numpy().squeeze())

        # load label
        label_file = os.path.join(DIR_DATA, filename + f'_{LABEL_TYPE1}_{LABEL_TYPE2}.npy')
        labels.append(np.load(label_file))

    return np.array(features), np.array(labels)

In [6]:
def load_or_extract_features(data_type):
    """Load features from file if they exist, otherwise extract them and save to file."""
    feature_dir = os.path.join("Datasets_full_list", f"lang_{LANG}")
    os.makedirs(feature_dir, exist_ok=True)
    feature_file = os.path.join(feature_dir, f"{LABEL_TYPE1}_{data_type}.npz")

    if os.path.exists(feature_file): # 파일이 존재하면 로드
        print(f"Loading features from {feature_file}")
        data = np.load(feature_file)
        feat_X, feat_Y = data["X"], data["Y"]
    else: # 존재하지 않으면 추출
        print(f"Extracting features and saving to {feature_file}")
        feat_X, feat_Y = feat_extraction(data_type)
        np.savez(feature_file, X=feat_X, Y=feat_Y)

    print(f"wav2vec2 feature {data_type}, {feat_X.shape}, {feat_Y.shape}")
    return feat_X, feat_Y

In [7]:
def load_data():
    trn_feat_x, trn_feat_y = load_or_extract_features('trn')  # feature extraction or loading for training data
    val_feat_x, val_feat_y = load_or_extract_features('val')  # feature extraction or loading for validation data
    test_feat_x, test_feat_y = load_or_extract_features('test')  # feature extraction or loading for test data

    tr_dataset = TensorDataset(torch.tensor(trn_feat_x), torch.tensor(trn_feat_y))
    val_dataset = TensorDataset(torch.tensor(val_feat_x), torch.tensor(val_feat_y))
    test_dataset = TensorDataset(torch.tensor(test_feat_x), torch.tensor(test_feat_y))

    train_dataloader = DataLoader(tr_dataset, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, pin_memory=True, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS)
    test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS)
    print(f"wav2vec2 feature LOADED!!") 
    return train_dataloader, val_dataloader, test_dataloader


In [8]:
def calculate_pearsonr(labels, preds):
    if np.all(labels == labels[0]) or np.all(preds == preds[0]):
        return 0  # If all values are the same, set PCC to 0
    else:
        return pearsonr(labels, preds)[0]

In [9]:
def train():
    global BASE_MODEL
    # Initialize wandb
    config = {
        "lang": LANG,
        "label_type1": LABEL_TYPE1,
        "label_type2": LABEL_TYPE2,
        "dir_list": DIR_LIST,
        "audio_len_max": AUDIO_LEN_MAX,
        "device": DEVICE,
        "num_workers": NUM_WORKERS,
        "base_model": BASE_MODEL,
        "dir_model": DIR_MODEL,
        "dir_data": DIR_DATA,
        "dir_resume": DIR_RESUME,
        "base_dim": BASE_DIM,
        "mlp_hidden": MLP_HIDDEN,
        "lr": LR,
        "epochs": EPOCHS,
        "batch_size": BATCH_SIZE,
        "patience": PATIENCE,
        "model_type": MODEL_TYPE
    }
    wandb.init(project="model_study_pron+articulation", config=config)
    dir_save_model = f'{DIR_MODEL}/lang_{LANG}'
    os.makedirs(dir_save_model, exist_ok=True)

    if LANG == 'en':
        base_model = 'facebook/wav2vec2-large-robust-ft-libri-960h'
    print(f'base wav2vec2 model: {base_model}')
    
    train_dataloader, val_dataloader, test_dataloader = load_data()

    if MODEL_TYPE =='transformer':
    # Model hyperparameters
        hidden_dim = 8   # Transformer hidden dimension
        num_layers = 2     # Number of Transformer layers
        num_heads = 1      # Number of attention heads
        output_dim = 1     # Regression output dimension (1 for a single value)
    # elif MODEL_TYPE =='cnn+lstm':
    #     hidden_dim = 128   # LSTM hidden dimension
    #     num_layers = 2     # Number of LSTM layers
    #     dropout = 0.3      # Dropout rate

    net = CNN_LSTM_RegressionModel(input_dim=BASE_DIM).to(DEVICE)

    if DIR_RESUME is not None:
        dir_resume_model = os.path.join(DIR_RESUME, f'lang_{LANG}', f'{LABEL_TYPE1}_{LABEL_TYPE2}_checkpoint.pt')
        net.load_state_dict(torch.load(dir_resume_model, map_location=DEVICE))
        print(f'Training a model from {dir_resume_model}')
    else:
        print(f'Training a model from scratch')

    optimizer = torch.optim.Adam(net.parameters(), lr=LR)  # training optimizer
    loss_func = torch.nn.MSELoss()  # MSE loss for regression task

    # 학습률 스케줄러 추가
    steps_per_epoch = len(train_dataloader)
    total_steps = EPOCHS * steps_per_epoch

    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=LR,
        total_steps=total_steps,
        pct_start=0.1,  # 웜업 비율 (첫 10% 단계에서 학습률 증가)
        anneal_strategy='cos',  # 코사인 에닐링
        cycle_momentum=False  # Adam에서는 False로 설정
    )
    
    eval_best_pcc = -9
    early_stop_counter = 0
    stop_flag = False

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    for epoch in range(EPOCHS):
        net.train()
        train_loss = 0
        for train_data in train_dataloader:
            feat_x, feat_y = train_data
            optimizer.zero_grad()

            prediction = net(feat_x.to(DEVICE))
            loss = loss_func(prediction, feat_y.to(DEVICE))
            
            # Gradient Clipping
            torch.nn.utils.clip_grad_norm_(net.parameters(), max_norm=1.0)

            loss.backward()
            optimizer.step()
            scheduler.step()  # 스케줄러 단계 업데이트
            train_loss += loss.item()

        train_loss /= len(train_dataloader)
        current_lr = optimizer.param_groups[0]['lr']  # 현재 학습률 가져오기
        wandb.log({"epoch": epoch, "train_loss": train_loss, "learning_rate": current_lr})  # 학습률을 로그에 추가

        # Validation
        net.eval()
        val_loss = 0
        val_labels = []
        val_preds = []
        with torch.no_grad():
            for val_data in val_dataloader:
                feat_x, feat_y = val_data

                val_labels.extend(feat_y.tolist())
                prediction = net(feat_x.to(DEVICE))
                val_preds.extend(prediction.cpu().tolist())

                loss = loss_func(prediction, feat_y.to(DEVICE))
                val_loss += loss.item()

        val_loss /= len(val_dataloader)
        val_labels = np.array(val_labels).squeeze()
        val_preds = np.clip(np.array(val_preds).squeeze(), 0, 5)

        eval_pcc = calculate_pearsonr(val_labels, val_preds)
        wandb.log({"epoch": epoch, "val_loss": val_loss})

        # Testing
        net.eval()
        test_labels = []
        test_preds = []
        with torch.no_grad():
            for test_data in test_dataloader:
                feat_x, feat_y = test_data
                
                test_labels.extend(feat_y.tolist())
                prediction = net(feat_x.to(DEVICE))
                test_preds.extend(prediction.cpu().tolist())

        test_labels = np.array(test_labels).squeeze()
        test_preds = np.clip(np.array(test_preds).squeeze(), 0, 5)

        test_pcc = calculate_pearsonr(test_labels, test_preds)
        
        print(f'epoch {epoch},train loss": {train_loss}, eval_pcc: {eval_pcc}, test_pcc: {test_pcc}')
        wandb.log({"epoch": epoch,"train loss": {train_loss}, "eval_pcc": eval_pcc, "test_pcc": test_pcc})

        # Early stopping
        if eval_pcc > eval_best_pcc and not stop_flag:
            eval_best_pcc = eval_pcc
            test_best_pcc = test_pcc
            early_stop_counter = 0
            torch.save(net.state_dict(), os.path.join(dir_save_model, f'{LABEL_TYPE1}_{LABEL_TYPE2}_{MODEL_TYPE}_checkpoint.pt'))
        else:
            early_stop_counter += 1

        if early_stop_counter > PATIENCE and not stop_flag:
            print("Early stopping triggered.")
            break

    print(f'Final Test PCC: {test_best_pcc}')
    wandb.log({"final_test_pcc": test_best_pcc})
    wandb.finish()

if __name__ == "__main__":
    train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mcoldbrew[0m ([33mx_team[0m). Use [1m`wandb login --relogin`[0m to force relogin


base wav2vec2 model: facebook/wav2vec2-large-robust-ft-libri-960h
Loading features from Datasets_full_list/lang_en/pron_trn.npz
wav2vec2 feature trn, (64748, 1024), (64748, 1)
Loading features from Datasets_full_list/lang_en/pron_val.npz
wav2vec2 feature val, (17958, 1024), (17958, 1)
Loading features from Datasets_full_list/lang_en/pron_test.npz
wav2vec2 feature test, (8813, 1024), (8813, 1)
wav2vec2 feature LOADED!!
Training a model from scratch
Using device: cuda


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


epoch 0,train loss": 1.4039912303917021, eval_pcc: 0.2787077654781437, test_pcc: 0.27359563157537303
epoch 1,train loss": 0.7997416707838005, eval_pcc: 0.3787144069304195, test_pcc: 0.37076371866327396
epoch 2,train loss": 0.8002692102914742, eval_pcc: 0.4027585246019682, test_pcc: 0.39069212412895593
epoch 3,train loss": 0.6535092359003813, eval_pcc: 0.5764474338600057, test_pcc: 0.5687934499672395
epoch 4,train loss": 0.5471796608960675, eval_pcc: 0.6148732506293988, test_pcc: 0.6073116044850868
epoch 5,train loss": 0.5188709509231356, eval_pcc: 0.6424666944005856, test_pcc: 0.633396571167901
epoch 6,train loss": 0.4947819580202517, eval_pcc: 0.6633757717609917, test_pcc: 0.6569927682141887
epoch 7,train loss": 0.46986798241204425, eval_pcc: 0.6724403418016263, test_pcc: 0.6659701545968604
epoch 8,train loss": 0.4682197033652204, eval_pcc: 0.6727839023790146, test_pcc: 0.6692036038863377
epoch 9,train loss": 0.4634464856664183, eval_pcc: 0.678227590451191, test_pcc: 0.674043067157036

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
eval_pcc,▁▃▃▆▇▇██████████████████████████████████
final_test_pcc,▁
learning_rate,▁▁▁▂▂▃▃▄▅▆▆▇▇███████████████████████████
test_pcc,▁▃▃▆▇▇██████████████████████████████████
train_loss,█▄▄▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,▇█▇▃▃▂▂▂▂▂▂▂▁▁▂▁▂▁▁▁▁▂▂▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▂

0,1
epoch,49.0
eval_pcc,0.69517
final_test_pcc,0.69133
learning_rate,0.00933
test_pcc,0.6917
train_loss,0.4429
val_loss,0.44944


In [10]:
# *transformer train command* 
#python train_transformer.py --lang='en' --label_type1='pron' --label_type2='articulation'  --dir_list='/home/coldbrew/fluent/01.발음평가모델/1.모델소스코드/datasets_full_list' --epochs=200 --patience=20 --batch_size=256 --dir_model='model_transformer' --model_type='transformer'