In [1]:
import joblib
import os
import numpy as np
from scipy.stats import pearsonr

class Args:
    dir_model = 'model_svr'
    lang = 'en'
    label_type1 = 'pron'
    label_type2 = 'prosody'
    dir_list = ''
    dir_list='/mnt/f/fluent/AI_exp_repo/datasets_list'
    base_dim = 1024
    output_dim = 1
    audio_len_max = 200000
    device ='cuda'
args = Args()

# Load the saved model and scaler
model = joblib.load(os.path.join(args.dir_model, 'svr_model.joblib'))
scaler = joblib.load(os.path.join(args.dir_model, 'scaler.joblib'))

def load_or_extract_features(args, data_type):
    """Load features from file if they exist, otherwise extract them and save to file."""
    feature_dir = os.path.join("datasets_full_list_feature_extracted", f"lang_{args.lang}")
    os.makedirs(feature_dir, exist_ok=True)
    feature_file = os.path.join(feature_dir, f"{args.label_type1}_{data_type}.npz")
    print(f"feature_file : {feature_file}")
    
    if os.path.exists(feature_file):
        print(f"Loading features from {feature_file}")
        data = np.load(feature_file)
        feat_X, feat_Y = data["X"], data["Y"]
    else:
        print(f"Extracting features and saving to {feature_file}")
        feat_X, feat_Y = feat_extraction(args, data_type)
        np.savez(feature_file, X=feat_X, Y=feat_Y)

    print(f"wav2vec2 feature {data_type}, {feat_X.shape}, {feat_Y.shape}")
    return feat_X, feat_Y
    
def open_file(filename):
    with open(filename) as f:
        return f.readlines()


In [2]:
def feat_extraction(args, data_type):
    ''' wav2vec2 feature extraction part '''

    fname_list = os.path.join(args.dir_list, f'lang_{args.lang}', f'{args.label_type1}_{data_type}.list')
    filelist = open_file(fname_list)
    data_len = len(filelist)

    feat_X = np.zeros((data_len, args.base_dim), dtype=np.float32)  # features
    feat_Y = np.zeros((data_len, 1), dtype=np.float32)  # labels

    model = Wav2Vec2ForCTC.from_pretrained(args.base_model).to(args.device)  # load wav2vec2 model

    for idx, line in enumerate(filelist):

        try:
            fname, score1, score2, text = line.split('\t')  # wavfile path, articulation score, prosody score, script
        except:
            data_len -= 1  # if list file format is wrong, we exclude it
            continue

        try:
            x, sr = audiofile.read(fname)
        except:
            data_len -= 1
            continue

        if args.label_type2 == 'articulation':
            score = score1
        else:
            score = score2

        if x.shape[-1] > args.audio_len_max:
            x = x[:args.audio_len_max]  # if audio file is long, cut it to audio_len_max

        x = torch.tensor(x, device=args.device).reshape(1, -1)
        output = model(x, output_attentions=True, output_hidden_states=True, return_dict=True)  # wav2vec2 model output

        feat_x = output.hidden_states[-1]  # last hidden state of wav2vec2, (1, frame, 1024)
        feat_x = torch.mean(feat_x, axis=1).cpu().detach().numpy()  # pooled output along time axis, (1, 1024)

        feat_X[idx, :] = feat_x
        feat_Y[idx, 0] = float(score)

    print(f"wav2vec2 feature extraction {data_type}, {feat_X[:data_len, :].shape}, {feat_Y[:data_len, :].shape}")

    return feat_X[:data_len, :], feat_Y[:data_len, :]

In [3]:
# Load or extract features for the dataset (e.g., 'test')
test_feat_x, test_feat_y = load_or_extract_features(args, 'test')

# Scale the features using the loaded scaler
test_feat_x = scaler.transform(test_feat_x)

# Make predictions using the loaded model
test_preds = model.predict(test_feat_x)

# Calculate the Pearson correlation coefficient
pearson_corr, _ = pearsonr(test_feat_y, test_preds)

print(f"Pearson Correlation Coefficient: {pearson_corr}")

feature_file : datasets_full_list_feature_extracted/lang_en/pron_test.npz
Loading features from datasets_full_list_feature_extracted/lang_en/pron_test.npz
wav2vec2 feature test, (8813, 1024), (8813, 1)


ValueError: shapes (8813,1) and (8813,) not aligned: 1 (dim 1) != 8813 (dim 0)

In [17]:
np.squeeze(test_feat_y).shape

(8813,)

In [19]:
# Calculate the Pearson correlation coefficient
pearson_corr, _ = pearsonr(np.squeeze(test_feat_y), test_preds)


print(f"Pearson Correlation Coefficient: {pearson_corr}")

Pearson Correlation Coefficient: 0.7469743287759553


In [23]:
test_feat_x[0].shape

(1024,)

In [31]:
from sklearn.metrics import mean_squared_error
import time
start_time = time.time()

# Load or extract features for the dataset (e.g., 'test')
test_feat_x, test_feat_y = load_or_extract_features(args, 'test')

# Scale the features using the loaded scaler
test_feat_x = scaler.transform(test_feat_x)

# Use only the first sample
first_sample_x = test_feat_x  # Reshape to 2D array
first_sample_y = test_feat_y
print(first_sample_x.shape)
print(first_sample_y.shape)

# Make prediction using the loaded model
first_sample_pred = model.predict(first_sample_x)

# Calculate the Mean Squared Error (MSE)
mse_value = mean_squared_error([first_sample_y], first_sample_pred)

print(f"Mean Squared Error: {mse_value}")

end_time = time.time()
elapsed_time = end_time - start_time

print(f"Time taken for the process: {elapsed_time:.2f} seconds")

feature_file : datasets_full_list_feature_extracted/lang_en/pron_test.npz
Loading features from datasets_full_list_feature_extracted/lang_en/pron_test.npz
wav2vec2 feature test, (8813, 1024), (8813, 1)
(8813, 1024)
(8813, 1)


ValueError: Found input variables with inconsistent numbers of samples: [1, 8813]