In [1]:
import os
import numpy as np
import torch
import librosa
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix


In [2]:
# Initialize the pretrained Wav2Vec2 model and feature extractor
model_name = "facebook/wav2vec2-large-xlsr-53"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = Wav2Vec2Model.from_pretrained(model_name)
min_duration = 4.0
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Utilize all available GPUs
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = torch.nn.DataParallel(model)
    
model.to(device)

Wav2Vec2Model(
  (feature_extractor): Wav2Vec2FeatureEncoder(
    (conv_layers): ModuleList(
      (0): Wav2Vec2LayerNormConvLayer(
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
        (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (activation): GELUActivation()
      )
      (1-4): 4 x Wav2Vec2LayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
        (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (activation): GELUActivation()
      )
      (5-6): 2 x Wav2Vec2LayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
        (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (activation): GELUActivation()
      )
    )
  )
  (feature_projection): Wav2Vec2FeatureProjection(
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=512, out_features=1024, bias=True)
    (dropout)

In [4]:
def load_segment_labels(segment_labels_file):
    return np.load(segment_labels_file, allow_pickle=True).item()

def extract_features(audio_file, device, model, feature_extractor):
    # Load and preprocess audio file
    audio_input, _ = librosa.load(audio_file, sr=16000)
    
    # Extract input features for the model
    input_values = feature_extractor(audio_input, sampling_rate=16000, return_tensors="pt", padding=True).input_values
    input_values = input_values.to(device)
    
    # Get model's hidden states and the last CNN layer features
    with torch.no_grad():
        if isinstance(model, torch.nn.DataParallel):
            outputs = model.module(input_values)  # Use model.module to access the underlying model
        else:
            outputs = model(input_values)
        last_cnn_layer = outputs.last_hidden_state
    
    return last_cnn_layer.squeeze(0).cpu().numpy()

In [5]:
def extract_features_for_classification(audio_dir, segment_labels_file, device, model, feature_extractor, max_files):
    segment_labels_dict = load_segment_labels(segment_labels_file)
    features_last_cnn_layer_list = []
    labels_list = []
    processed_files = 0

    for audio_name, segment_labels in tqdm(segment_labels_dict.items(), desc="Extracting features"):
        if processed_files >= max_files:
            break

        audio_file = os.path.join(audio_dir, audio_name + ".wav")
        if not os.path.exists(audio_file):
            print(f"File '{audio_file}' not found. Skipping...")
            continue

        features_last_cnn_layer = extract_features(audio_file, device, model, feature_extractor)

        # Ensure segment labels are numeric (in case they're strings)
        if isinstance(segment_labels[0], str):
            segment_labels = [float(label) for label in segment_labels]

        # Assign segment-level labels to each frame
        frame_labels = np.repeat(segment_labels, features_last_cnn_layer.shape[0] // len(segment_labels))

        features_last_cnn_layer_list.append(features_last_cnn_layer)
        labels_list.append(frame_labels)
        processed_files += 1

    # Flatten labels and check for unique values
    all_labels_flat = np.concatenate(labels_list)
    print(f"Unique labels found: {np.unique(all_labels_flat)}")
    print(f"Label counts: {np.bincount(all_labels_flat.astype(int))}")

    return features_last_cnn_layer_list, labels_list

In [6]:
def get_max_length(features):
    return max([len(feat) for feat in features])


def pad_to_max_length(features, labels, max_length):
    """
    Pad or truncate features and labels to match the max length.
    
    Args:
    - features: list of feature arrays (e.g., [feature1, feature2, ...])
    - labels: list of corresponding labels (e.g., [label1, label2, ...])
    - max_length: the length to which features and labels should be padded or truncated.
    
    Returns:
    - padded_features: np.array of padded features
    - padded_labels: np.array of padded or truncated labels
    """
    padded_features = []
    padded_labels = []

    for i in range(len(features)):
        feature = features[i]
        label = labels[i]

        # Padding or truncating features
        if feature.shape[0] > max_length:
            padded_feature = feature[:max_length]  # Truncate to max_length
        else:
            padding = np.zeros((max_length - feature.shape[0], feature.shape[1]))
            padded_feature = np.vstack((feature, padding))  # Pad with zeros

        padded_features.append(padded_feature)

        # Ensure labels are scalar or truncate to a consistent length
        if isinstance(label, (list, np.ndarray)):
            if len(label) > max_length:
                padded_label = label[:max_length]  # Truncate labels
            else:
                padded_label = np.pad(label, (0, max_length - len(label)), mode='constant')  # Pad with zeros
        else:
            padded_label = label  # If it's already a scalar, no need for padding

        padded_labels.append(padded_label)

    # Convert to numpy arrays
    padded_features = np.array(padded_features)

    try:
        padded_labels = np.array(padded_labels)
    except ValueError as e:
        print(f"Error while converting labels to np.array: {e}")
        print(f"Shape of padded_labels: {[label.shape if isinstance(label, np.ndarray) else 'scalar' for label in padded_labels]}")

    return padded_features, padded_labels

In [10]:
if __name__ == "__main__":
    # Example setup
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = Wav2Vec2Model.from_pretrained('facebook/wav2vec2-base-960h').to(device)
    feature_extractor = Wav2Vec2FeatureExtractor()
    
    
    # Process datasets
    train_features, train_labels = extract_features_for_classification(
        "F:\\Awais_data\\Datasets\\PartialSpoof\\Train\\con_wav\\",
        "F:\\Awais_data\\Datasets\\PartialSpoof\\database_segment_labels\\database\\segment_labels\\train_seglab_0.16.npy",
        device, model, feature_extractor,max_files=3000
    )
    dev_features, dev_labels = extract_features_for_classification(
        "F:\\Awais_data\\Datasets\\PartialSpoof\\dev\\con_wav\\",
        "F:\\Awais_data\\Datasets\\PartialSpoof\\database_segment_labels\\database\\segment_labels\\dev_seglab_0.16.npy",
        device, model, feature_extractor,max_files=3000
    )
    eval_features, eval_labels = extract_features_for_classification(
        "F:\\Awais_data\\Datasets\\PartialSpoof\\eval\\con_wav\\",
        "F:\\Awais_data\\Datasets\\PartialSpoof\\database_segment_labels\\database\\segment_labels\\eval_seglab_0.16.npy",
        device, model, feature_extractor,max_files=3000
    )

    # Determine max length for padding
    max_len = get_max_length(train_features)

    # Pad features and labels
    X_train, y_train = pad_to_max_length(train_features, train_labels, max_len)
    X_val, y_val = pad_to_max_length(dev_features, dev_labels, max_len)
    X_eval, y_eval = pad_to_max_length(eval_features, eval_labels, max_len)

    # Convert to tensors
    X_train = torch.tensor(X_train, dtype=torch.float32)
    y_train = torch.tensor(y_train, dtype=torch.float32)
    X_val = torch.tensor(X_val, dtype=torch.float32)
    y_val = torch.tensor(y_val, dtype=torch.float32)
    X_eval = torch.tensor(X_eval, dtype=torch.float32)
    y_eval = torch.tensor(y_eval, dtype=torch.float32)

    # # Create DataLoader
    train_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(X_train, y_train), batch_size=8, shuffle=True)
    val_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(X_val, y_val), batch_size=8, shuffle=False)
    eval_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(X_eval, y_eval), batch_size=8, shuffle=False)

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Extracting features:  12%|██████▌                                                 | 3000/25380 [00:44<05:33, 67.10it/s]


Unique labels found: [0. 1.]
Label counts: [ 30751 416388]


Extracting features:  12%|██████▊                                                 | 3000/24844 [00:48<05:52, 62.00it/s]


Unique labels found: [0. 1.]
Label counts: [ 31486 427665]


Extracting features:   4%|██▎                                                     | 3000/71237 [01:36<36:23, 31.25it/s]


Unique labels found: [0. 1.]
Label counts: [    56 455189]


In [11]:
import numpy as np
import torch
from torch.utils.data import DataLoader, WeightedRandomSampler, TensorDataset

# Define the configuration dictionary
config = {
    'batch_size': 32,
    'learning_rate': 0.001,
    'num_epochs': 10,
    'model_save_path': 'best_model.pth',
    'max_len': 2000,  # Example maximum length
}

# Pad features and labels dynamically to max length
def pad_to_max_length(features, labels, max_len):
    padded_features = []
    padded_labels = []

    for feat, lbl in zip(features, labels):
        feat_len = feat.shape[0]

        if feat_len < max_len:
            padding_feat = np.zeros((max_len - feat_len, feat.shape[1]))  # Pad features
            padding_lbl = np.full((max_len - feat_len,), lbl[-1])  # Pad labels with the last value
            feat_padded = np.vstack([feat, padding_feat])
            lbl_padded = np.concatenate([lbl, padding_lbl])
        else:
            feat_padded = feat[:max_len]
            lbl_padded = lbl[:max_len]

        padded_features.append(feat_padded)
        padded_labels.append(lbl_padded)

    # Convert lists to numpy arrays after ensuring all elements are consistent in shape
    padded_features = np.array([np.array(x) for x in padded_features])
    padded_labels = np.array([np.array(x) for x in padded_labels])

    return padded_features, padded_labels

# Reshape and handle label segmentation dynamically
def reshape_and_pad(y_train, X_train):
    frames_per_segment = X_train.shape[1]  # Number of frames per segment
    total_frames = y_train.shape[0]
    remainder = total_frames % frames_per_segment

    if remainder > 0:
        # Calculate padding needed
        padding_length = frames_per_segment - remainder
        # Reshape padding to match the second dimension of y_train
        padding = y_train[-1].repeat(padding_length, 1)  # Repeat last label with the same shape as y_train
        y_train_padded = torch.cat([y_train, padding])
        print(f"Padding {padding_length} frames to y_train. New shape: {y_train_padded.shape}")
    else:
        y_train_padded = y_train

    # Reshape y_train to match segments
    y_train_reshaped = y_train_padded.view(-1, frames_per_segment)
    y_train_segmented = y_train_reshaped.flatten()  # Flatten for class balance

    print(f"Segmented y_train to shape: {y_train_segmented.shape}")
    return y_train_segmented

# Process datasets dynamically
def process_and_pad_features(train_features, train_labels, max_len):
    X_train, y_train = pad_to_max_length(train_features, train_labels, max_len)
    
    # Convert to tensors
    X_train = torch.tensor(X_train, dtype=torch.float32)
    y_train = torch.tensor(y_train, dtype=torch.float32)
    
    print(f"Initial shape of X_train: {X_train.shape}, y_train: {y_train.shape}")
    
    # Reshape and handle label segmentation dynamically
    y_train_segmented = reshape_and_pad(y_train, X_train)
    
    return X_train, y_train_segmented

# Create a weighted sampler for class balancing
def create_weighted_sampler(y_train_segmented):
    class_counts = torch.bincount(y_train_segmented.long())
    weights = 1.0 / class_counts.float()
    sample_weights = weights[y_train_segmented.long()]
    sampler = WeightedRandomSampler(weights=sample_weights, num_samples=len(sample_weights), replacement=True)
    return sampler

# Define a simple model (replace with your own model)
class YourModel(torch.nn.Module):
    def __init__(self):
        super(YourModel, self).__init__()
        self.fc = torch.nn.Linear(768, 2)  # Example linear layer (adjust as necessary)

    def forward(self, x):
        return self.fc(x)

# Training function
def train_epoch(model, data_loader, criterion, optimizer):
    model.train()
    total_loss = 0.0

    for X_batch, y_batch in data_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch.long())
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    return total_loss / len(data_loader)

# Evaluation function
def evaluate_model(model, data_loader, criterion):
    model.eval()
    total_loss = 0.0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for X_batch, y_batch in data_loader:
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch.long())
            total_loss += loss.item()
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(y_batch.cpu().numpy())

    # Calculate metrics
    from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    f1 = f1_score(all_labels, all_preds, average='weighted')
    auc = roc_auc_score(all_labels, all_preds)
    
    return total_loss / len(data_loader), precision, recall, f1, auc

# Main function
if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

       # Process datasets
    X_train, y_train_segmented = process_and_pad_features(train_features, train_labels, config['max_len'])
    X_val, y_val = process_and_pad_features(dev_features, dev_labels, config['max_len'])
    X_eval, y_eval = process_and_pad_features(eval_features, eval_labels, config['max_len'])

    # Ensure the size match after reshaping and padding
    if X_train.shape[0] != y_train_segmented.shape[0]:
        raise ValueError("Mismatch between the number of samples in X_train and y_train_segmented.")
    
    # Create a weighted sampler for class balancing
    sampler = create_weighted_sampler(y_train_segmented)
    
    # Create datasets
    train_dataset = TensorDataset(X_train, y_train_segmented)
    val_dataset = TensorDataset(X_val, y_val)
    eval_dataset = TensorDataset(X_eval, y_eval)
    
    # Create DataLoaders
    train_loader = DataLoader(train_dataset, sampler=sampler, batch_size=config['batch_size'])
    val_loader = DataLoader(val_dataset, batch_size=config['batch_size'], shuffle=False)
    eval_loader = DataLoader(eval_dataset, batch_size=config['batch_size'], shuffle=False)
    
    # Model, optimizer, and loss function setup
    model = YourModel().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate'])
    criterion = torch.nn.CrossEntropyLoss()

    best_val_loss = float('inf')
    best_auc = 0.0

    # Training loop
    for epoch in range(config['num_epochs']):
        train_loss = train_epoch(model, train_loader, criterion, optimizer)
        val_loss, val_precision, val_recall, val_f1, val_auc = evaluate_model(model, val_loader, criterion)
        
        print(f"Epoch {epoch + 1}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}, Val AUC = {val_auc:.4f}")
        print(f"Precision: {val_precision:.4f}, Recall: {val_recall:.4f}, F1: {val_f1:.4f}")
        
        # Save the best model based on validation loss and AUC
        if val_loss < best_val_loss and val_auc > best_auc:
            best_val_loss = val_loss
            best_auc = val_auc
            torch.save(model.state_dict(), config['model_save_path'])
            print(f"Best model saved with Val Loss = {val_loss:.4f}, Val AUC = {val_auc:.4f}")

    # Testing with the best saved model
    print("Testing on Evaluation Data")
    model.load_state_dict(torch.load(config['model_save_path']))
    test_loss, test_precision, test_recall, test_f1, test_auc = evaluate_model(model, eval_loader, criterion)

    print(f"Test Loss: {test_loss:.4f}, Test Precision: {test_precision:.4f}, Test Recall: {test_recall:.4f}")
    print(f"Test F1: {test_f1:.4f}, Test AUC: {test_auc:.4f}")


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (3000,) + inhomogeneous part.

In [None]:
class FrameLevelClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(FrameLevelClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x

In [None]:
config = {
    'hidden_dim': 1024,
    'output_dim': 1,  # Binary classification
    'num_epochs': 50,
    'learning_rate': 0.0001,
    'model_save_path': 'best_model.pth'
}

In [None]:
# Initialize the model
model = FrameLevelClassifier(X_train.shape[2], config['hidden_dim'], config['output_dim']).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate'])

In [None]:
# Tracking best model based on AUC and validation loss
best_auc = 0
best_val_loss = float('inf')

In [None]:
# Updated metrics evaluation function to handle errors
def evaluate_metrics(true_labels, pred_labels):
    # Ensure predictions are binary
    unique_labels = np.unique(pred_labels)
    
    # Handle binary case for precision, recall, and F1
    precision = precision_score(true_labels, pred_labels, average='weighted', zero_division=0)
    recall = recall_score(true_labels, pred_labels, average='weighted', zero_division=0)
    f1 = f1_score(true_labels, pred_labels, average='weighted', zero_division=0)
    
    # AUC calculation only if both classes are present
    auc = None
    if len(unique_labels) == 2:
        auc = roc_auc_score(true_labels, pred_labels, average='weighted', multi_class='ovo')
    else:
        print("AUC calculation skipped: Only one class present in predictions.")
    
    # Calculate EER (Equal Error Rate)
    fpr, tpr, thresholds = roc_curve(true_labels.ravel(), pred_labels.ravel())
    fnr = 1 - tpr
    eer_threshold = thresholds[np.nanargmin(np.absolute((fnr - fpr)))]
    eer = fpr[np.nanargmin(np.absolute((fnr - fpr)))]  # EER is the point where FPR == FNR
    
    return precision, recall, f1, auc, eer

In [None]:
def train_epoch(model, loader, criterion, optimizer):
    model.train()
    total_loss = 0
    
    for X_batch, y_batch in tqdm(loader, desc="Training", leave=False):
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = model(X_batch).squeeze()
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    return total_loss / len(loader)

In [None]:
# Modify your evaluation loop to handle missing AUC calculations
def evaluate_model(model, loader, criterion):
    model.eval()
    total_loss = 0
    all_outputs = []
    all_labels = []

    with torch.no_grad():
        for X_batch, y_batch in loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch).squeeze()
            loss = criterion(outputs, y_batch)
            total_loss += loss.item()
            all_outputs.append(torch.sigmoid(outputs).cpu().numpy())
            all_labels.append(y_batch.cpu().numpy())

    # Flatten lists
    all_outputs = np.concatenate(all_outputs)
    all_labels = np.concatenate(all_labels)
    
    # Binarize outputs
    all_outputs_bin = (all_outputs > 0.5).astype(int)

    # Ensure true labels are binary for metrics calculation
    all_labels_bin = (all_labels > 0.5).astype(int)

    precision, recall, f1, auc, eer = evaluate_metrics(all_labels_bin, all_outputs_bin)
    return total_loss / len(loader), precision, recall, f1, auc, eer


In [None]:
for epoch in range(config['num_epochs']):
    train_loss = train_epoch(model, train_loader, criterion, optimizer)
    val_loss, val_precision, val_recall, val_f1, val_auc, val_eer = evaluate_model(model, val_loader, criterion)

    print(f"Epoch {epoch+1}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}, Val AUC = {val_auc:.4f}, Val EER = {val_eer:.4f}")
    print(f"Precision: {val_precision:.4f}, Recall: {val_recall:.4f}, F1: {val_f1:.4f}")

    # Save best model based on validation loss and AUC
    if val_loss < best_val_loss and (val_auc is None or val_auc > best_auc):
        best_val_loss = val_loss
        best_auc = val_auc
        torch.save(model.state_dict(), config['model_save_path'])
        print(f"Best model saved with Val Loss = {val_loss:.4f}, Val AUC = {val_auc:.4f}, Val EER = {val_eer:.4f}")



In [None]:

# Testing with best saved model
print("Testing on Evaluation Data")
model.load_state_dict(torch.load(config['model_save_path']))
test_loss, test_precision, test_recall, test_f1, test_auc, test_eer = evaluate_model(model, eval_loader, criterion)

print(f"Test Loss: {test_loss:.4f}, Test Precision: {test_precision:.4f}, Test Recall: {test_recall:.4f}")
print(f"Test F1: {test_f1:.4f}, Test AUC: {test_auc:.4f}, Test EER: {test_eer:.4f}")
