In [1]:
import os
import numpy as np
import torch
import librosa
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
from tqdm import tqdm

In [2]:
# Checkpoint identifier shared by the feature extractor and the encoder so the
# two are always loaded from the same pretrained model.
model_name = "facebook/wav2vec2-large-xlsr-53"

# NOTE(review): not referenced anywhere else in the visible notebook;
# presumably a minimum clip duration in seconds -- confirm before relying on it.
min_duration = 4.0

# The feature extractor prepares raw waveforms as model inputs; the model
# itself produces the frame-level representations used further down.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = Wav2Vec2Model.from_pretrained(model_name)

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Left as the trailing expression so the cell still displays the module layout.
model.to(device)

Wav2Vec2Model(
  (feature_extractor): Wav2Vec2FeatureEncoder(
    (conv_layers): ModuleList(
      (0): Wav2Vec2LayerNormConvLayer(
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
        (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (activation): GELUActivation()
      )
      (1-4): 4 x Wav2Vec2LayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
        (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (activation): GELUActivation()
      )
      (5-6): 2 x Wav2Vec2LayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
        (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (activation): GELUActivation()
      )
    )
  )
  (feature_projection): Wav2Vec2FeatureProjection(
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=512, out_features=1024, bias=True)
    (dropout)

In [4]:
def load_segment_labels(segment_labels_file):
    """Load the object stored in a ``.npy`` segment-label file.

    Args:
        segment_labels_file: Path to a ``.npy`` file holding a single pickled
            Python object (the caller iterates it as a dict keyed by audio name).

    Returns:
        The unpacked Python object.

    Note:
        ``allow_pickle=True`` executes pickle deserialisation; only use this on
        label files from a trusted source.
    """
    packed = np.load(segment_labels_file, allow_pickle=True)
    # The object is wrapped in a 0-d array; ``item()`` unwraps it.
    return packed.item()

def expand_segment_labels(segment_labels, num_frames):
    """Stretch per-segment labels so that every frame receives exactly one label.

    Frame ``i`` is assigned the label of segment ``(i * num_segments) // num_frames``.
    When ``num_frames`` is an exact multiple of the number of segments this is
    identical to ``np.repeat(segment_labels, num_frames // num_segments)``;
    otherwise it still returns exactly ``num_frames`` labels instead of a
    shorter (or empty) array.

    Args:
        segment_labels: 1-D sequence of labels, one per segment.
        num_frames: Number of frames the labels must cover.

    Returns:
        np.ndarray of length ``num_frames`` with the dtype of ``segment_labels``.

    Raises:
        ValueError: If ``segment_labels`` is empty.
    """
    labels = np.atleast_1d(np.asarray(segment_labels))
    num_segments = labels.shape[0]
    if num_segments == 0:
        raise ValueError("segment_labels is empty")
    if num_frames <= 0:
        return labels[:0]
    # i < num_frames guarantees the computed index is always < num_segments.
    frame_to_segment = (np.arange(num_frames) * num_segments) // num_frames
    return labels[frame_to_segment]


def extract_features(audio_file, device, model, feature_extractor, sampling_rate=16000):
    """Compute frame-level wav2vec2 representations for one audio file.

    Args:
        audio_file: Path of the audio file to load.
        device: torch device the model lives on; inputs are moved there.
        model: A ``Wav2Vec2Model``.
        feature_extractor: The matching ``Wav2Vec2FeatureExtractor``.
        sampling_rate: Rate the audio is resampled to on load (default 16000,
            the value the original code hard-coded).

    Returns:
        Tuple ``(hidden_states, last_cnn_layer)`` of numpy arrays with the
        batch dimension removed:
        * ``hidden_states``  -- last entry of ``outputs.hidden_states``.
        * ``last_cnn_layer`` -- ``outputs.extract_features``, i.e. the output of
          the convolutional feature encoder. The previous code returned
          ``outputs.last_hidden_state`` here, which duplicated the transformer
          output instead of providing the CNN features.
    """
    # Load and resample the waveform
    audio_input, _ = librosa.load(audio_file, sr=sampling_rate)

    # Prepare the waveform as model input
    input_values = feature_extractor(
        audio_input, return_tensors="pt", sampling_rate=sampling_rate
    ).input_values
    input_values = input_values.to(device)

    # Inference only: no gradients needed
    with torch.no_grad():
        outputs = model(input_values, output_hidden_states=True)
        hidden_states = outputs.hidden_states[-1]
        last_cnn_layer = outputs.extract_features

    return hidden_states.squeeze(0).cpu().numpy(), last_cnn_layer.squeeze(0).cpu().numpy()


# Main feature extraction function
def extract_and_save_features(audio_dir, segment_labels_file, output_dir, chunk_size=40000, max_files=7000):
    """Extract features for every labelled utterance and save them to disk.

    Relies on the notebook-level ``device``, ``model`` and ``feature_extractor``
    objects and on ``save_to_numpy`` for writing.

    Args:
        audio_dir: Directory containing ``<audio_name>.wav`` files.
        segment_labels_file: ``.npy`` file read with ``load_segment_labels``;
            iterated as ``{audio_name: segment_labels}``.
        output_dir: Directory the ``.npy`` outputs are written to (created if
            missing).
        chunk_size: Number of utterances buffered in memory before they are
            flushed with ``save_to_numpy``. Note that all buffered features stay
            in RAM until the flush, so large values need a lot of memory.
        max_files: Stop after this many utterances have been processed.
    """
    segment_labels_dict = load_segment_labels(segment_labels_file)
    os.makedirs(output_dir, exist_ok=True)

    hidden_states_list = []
    features_last_cnn_layer_list = []
    labels_list = []
    file_counter = 1
    processed_files = 0

    for audio_name, segment_labels in tqdm(segment_labels_dict.items(), desc="Extracting features"):
        # Stop processing if we've reached the max_files limit
        if processed_files >= max_files:
            break

        audio_file = os.path.join(audio_dir, audio_name + ".wav")
        if not os.path.exists(audio_file):
            print(f"File '{audio_file}' not found. Skipping...")
            continue

        if len(segment_labels) == 0:
            # Nothing to align the frames against.
            print(f"No segment labels for '{audio_name}'. Skipping...")
            continue

        # Extract features
        hidden_states, features_last_cnn_layer = extract_features(audio_file, device, model, feature_extractor)

        # One label per frame, so features and labels always have equal length
        frame_labels = expand_segment_labels(segment_labels, hidden_states.shape[0])

        # Store features and labels
        hidden_states_list.append(hidden_states)
        features_last_cnn_layer_list.append(features_last_cnn_layer)
        labels_list.append(frame_labels)

        processed_files += 1  # Increment the count of processed files

        # Save in chunks
        if len(hidden_states_list) >= chunk_size:
            save_to_numpy(hidden_states_list, features_last_cnn_layer_list, labels_list, output_dir, file_counter)
            hidden_states_list, features_last_cnn_layer_list, labels_list = [], [], []
            file_counter += 1

    # Flush whatever is left after the last full chunk
    if hidden_states_list:
        save_to_numpy(hidden_states_list, features_last_cnn_layer_list, labels_list, output_dir, file_counter)



In [None]:
def save_to_numpy(hidden_states_list, features_last_cnn_layer_list, labels_list, output_dir, file_counter):
    """Write one ``.npy`` triple (hidden states, CNN features, labels) per item.

    Args:
        hidden_states_list: Per-item hidden-state arrays; its length decides how
            many triples are written.
        features_last_cnn_layer_list: Per-item CNN feature arrays, indexed in
            parallel with ``hidden_states_list``.
        labels_list: Per-item label arrays, indexed in parallel as well.
        output_dir: Existing directory that receives the files.
        file_counter: Chunk number, embedded zero-padded (2 digits) in each
            file name, followed by the zero-padded (4 digits) item index.
    """
    for item_idx, hidden_states in enumerate(hidden_states_list):
        # Convert everything first so no file is written if any item is missing.
        arrays_by_filename = {
            f"XLSR_PS_hidd_framelevel_feat{file_counter:02d}_{item_idx:04d}.npy": np.array(hidden_states),
            f"XLSR_PS_last_cnn_framelevel_feat{file_counter:02d}_{item_idx:04d}.npy": np.array(features_last_cnn_layer_list[item_idx]),
            f"XLSR_PS_labels{file_counter:02d}_{item_idx:04d}.npy": np.array(labels_list[item_idx]),
        }
        for filename, values in arrays_by_filename.items():
            np.save(os.path.join(output_dir, filename), values)


In [None]:
if __name__ == "__main__":
    # Root directory that receives one sub-directory of features per split.
    base_output_path = "F:\\Awais_data\\Datasets\\PartialSpoof\\Features\\Frame_level_features"

    train_output_path = os.path.join(base_output_path, "train")
    dev_output_path = os.path.join(base_output_path, "dev")
    eval_output_path = os.path.join(base_output_path, "eval")

    # Create every output directory up front, before any extraction starts.
    for split_output_path in (train_output_path, dev_output_path, eval_output_path):
        os.makedirs(split_output_path, exist_ok=True)

    # (audio directory, segment-label file, output directory, file limit) per split.
    extraction_jobs = (
        ("F:\\Awais_data\\Datasets\\PartialSpoof\\Train\\con_wav\\",
         "F:\\Awais_data\\Datasets\\PartialSpoof\\database_segment_labels\\database\\segment_labels\\train_seglab_0.16.npy",
         train_output_path, 25380),
        ("F:\\Awais_data\\Datasets\\PartialSpoof\\dev\\con_wav\\",
         "F:\\Awais_data\\Datasets\\PartialSpoof\\database_segment_labels\\database\\segment_labels\\dev_seglab_0.16.npy",
         dev_output_path, 24000),
        ("F:\\Awais_data\\Datasets\\PartialSpoof\\eval\\con_wav\\",
         "F:\\Awais_data\\Datasets\\PartialSpoof\\database_segment_labels\\database\\segment_labels\\eval_seglab_0.16.npy",
         eval_output_path, 71000),
    )

    for audio_dir, labels_file, split_output_path, file_limit in extraction_jobs:
        extract_and_save_features(audio_dir, labels_file, split_output_path, max_files=file_limit)

In [None]:
pwd

In [None]:
import numpy as np

# Spot-check one saved feature/label pair from the training split.
sample_features_file = "F:\\Awais_data\\Datasets\\PartialSpoof\\Features\\Frame_level_features\\train\\XLSR_PS_hidd_framelevel_feat01_0000.npy"
sample_labels_file = "F:\\Awais_data\\Datasets\\PartialSpoof\\Features\\Frame_level_features\\train\\XLSR_PS_labels01_0000.npy"

features = np.load(sample_features_file)
labels = np.load(sample_labels_file)

# Report dtypes first, then the first five rows of each array.
for caption, array in (("Features", features), ("Labels", labels)):
    print(f"{caption} dtype:", array.dtype)
for caption, array in (("Features", features), ("Labels", labels)):
    print(f"{caption} sample:", array[:5])


Classification

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix
import numpy as np
from tqdm import tqdm

In [None]:
# Define the MLP model for frame-level classification
class FrameLevelClassifier(nn.Module):
    """Three-layer MLP applied independently to each feature vector.

    Layout: Linear -> ReLU -> Dropout(0.5) -> Linear -> ReLU -> Dropout(0.5) -> Linear.
    The final layer returns raw scores (no activation is applied here).

    Args:
        input_dim: Size of each input feature vector.
        hidden_dim: Width of both hidden layers.
        output_dim: Number of output scores per input vector.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # Attribute names are kept stable: they define the state_dict keys.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Map ``x`` of shape ``(..., input_dim)`` to scores of shape ``(..., output_dim)``."""
        hidden = self.dropout(self.relu(self.fc1(x)))
        hidden = self.dropout(self.relu(self.fc2(hidden)))
        return self.fc3(hidden)

In [None]:
# Hyperparameters
config = dict(
    hidden_dim=1024,
    output_dim=1,  # Binary classification for each frame (real/spoof)
    num_epochs=50,
    batch_size=128,
    learning_rate=0.0001,
    model_save_path='W2V_best_frame_level_model_PS',
)

In [None]:
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset

def load_data(features_file, labels_file):
    """Load one feature file and one label file as aligned float32 tensors.

    Args:
        features_file: Path to a ``.npy`` array whose first axis indexes rows.
        labels_file: Path to a ``.npy`` array with one label per row; string
            labels are converted to float.

    Returns:
        Tuple ``(X, y)`` of ``torch.float32`` tensors. One-dimensional labels
        are reshaped to a column. If the row counts differ, a warning is
        printed and both are truncated to the shorter length.
    """
    features = np.load(features_file)
    targets = np.load(labels_file)

    # String labels (e.g. '0'/'1') must become numbers before tensor conversion.
    if np.issubdtype(targets.dtype, np.str_):
        targets = targets.astype(float)

    # A flat label vector becomes an (N, 1) column.
    if targets.ndim == 1:
        targets = targets.reshape(-1, 1)

    n_feature_rows, n_label_rows = features.shape[0], targets.shape[0]
    if n_feature_rows != n_label_rows:
        print(f"Warning: Number of features ({n_feature_rows}) does not match number of labels ({n_label_rows}).")
        keep = min(n_feature_rows, n_label_rows)
        features, targets = features[:keep], targets[:keep]

    feature_tensor = torch.tensor(features, dtype=torch.float32)
    target_tensor = torch.tensor(targets, dtype=torch.float32)
    return feature_tensor, target_tensor

# Define DataLoader for training, validation, and test sets
def get_data_loader(X, y, batch_size, shuffle=True):
    """Wrap feature/label tensors in a ``DataLoader``.

    Args:
        X: Feature tensor; first dimension indexes samples.
        y: Label tensor with the same first dimension as ``X``.
        batch_size: Number of samples per batch.
        shuffle: Whether to reshuffle the samples every epoch. Defaults to
            ``True`` (the previous hard-wired behaviour); pass ``False`` for
            validation/test loaders where a stable order is preferable.

    Returns:
        A ``DataLoader`` over ``TensorDataset(X, y)``.

    Raises:
        AssertionError: If ``X`` and ``y`` have different numbers of samples.
    """
    # Check tensor sizes
    print(f"X size: {X.size()}, y size: {y.size()}")
    assert X.size(0) == y.size(0), "Size mismatch between features and labels"

    dataset = TensorDataset(X, y)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

# Feature/label files: one saved item (chunk 01, index 0000) per split
train_features_file = "F:\\Awais_data\\Datasets\\PartialSpoof\\Features\\Frame_level_features\\train\\XLSR_PS_hidd_framelevel_feat01_0000.npy"
train_labels_file = "F:\\Awais_data\\Datasets\\PartialSpoof\\Features\\Frame_level_features\\train\\XLSR_PS_labels01_0000.npy"
val_features_file = "F:\\Awais_data\\Datasets\\PartialSpoof\\Features\\Frame_level_features\\dev\\XLSR_PS_hidd_framelevel_feat01_0000.npy"
val_labels_file = "F:\\Awais_data\\Datasets\\PartialSpoof\\Features\\Frame_level_features\\dev\\XLSR_PS_labels01_0000.npy"
test_features_file = "F:\\Awais_data\\Datasets\\PartialSpoof\\Features\\Frame_level_features\\eval\\XLSR_PS_hidd_framelevel_feat01_0000.npy"
test_labels_file = "F:\\Awais_data\\Datasets\\PartialSpoof\\Features\\Frame_level_features\\eval\\XLSR_PS_labels01_0000.npy"

# Load datasets
X_train, y_train = load_data(train_features_file, train_labels_file)
X_val, y_val = load_data(val_features_file, val_labels_file)
X_test, y_test = load_data(test_features_file, test_labels_file)

# Create DataLoaders for each dataset
loader_batch_size = 32  # Adjust batch size as needed
train_loader = get_data_loader(X_train, y_train, batch_size=loader_batch_size)
val_loader = get_data_loader(X_val, y_val, batch_size=loader_batch_size)
test_loader = get_data_loader(X_test, y_test, batch_size=loader_batch_size)


In [None]:
import numpy as np
import torch

def pad_data(X, y, max_len):
    """Pad features and labels along the first axis up to ``max_len`` rows.

    Args:
        X: 2-D array-like of features; missing rows are filled with ``0``.
        y: 2-D array-like of labels; missing rows are filled with ``-1``.
        max_len: Target number of rows. Inputs that already have at least this
            many rows are left at their current length (no truncation).

    Returns:
        Tuple ``(X, y)`` converted to ``torch.float32`` tensors.
    """
    # Both deficits are measured on the inputs as received.
    feature_deficit = max_len - X.shape[0]
    label_deficit = max_len - y.shape[0]

    if feature_deficit > 0:
        # Append zero rows to the features
        X = np.pad(X, ((0, feature_deficit), (0, 0)), mode='constant', constant_values=0)

    if label_deficit > 0:
        # Append -1 rows to the labels
        y = np.pad(y, ((0, label_deficit), (0, 0)), mode='constant', constant_values=-1)

    padded_features = torch.tensor(X, dtype=torch.float32)
    padded_labels = torch.tensor(y, dtype=torch.float32)
    return padded_features, padded_labels

# Example usage
max_len = 142  # Use the maximum length from your dataset

# Pad every split to the same number of rows. The inner tuple of current
# tensors is built before any name is rebound, so each split is padded from
# its existing value.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    pad_data(split_features, split_labels, max_len)
    for split_features, split_labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)


In [None]:
def check_shapes(X, y):
    """Print the shapes of ``X`` and ``y`` and whether their first dimensions agree.

    Args:
        X: Feature array/tensor exposing ``.shape``.
        y: Label array/tensor exposing ``.shape``.
    """
    print(f"Features shape: {X.shape}")
    print(f"Labels shape: {y.shape}")

    lengths_agree = X.shape[0] == y.shape[0]
    verdict = (
        "Features and labels match after padding."
        if lengths_agree
        else "Warning: Number of features does not match number of labels after padding."
    )
    print(verdict)

# Report shapes for the train, validation and test splits, in that order.
for split_features, split_labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test)):
    check_shapes(split_features, split_labels)


In [None]:
def check_dataloader(loader):
    """Print the shapes of the first batch yielded by ``loader``.

    Only the first batch is inspected; an empty loader prints nothing.

    Args:
        loader: Iterable yielding ``(X_batch, y_batch)`` pairs whose elements
            expose ``.shape``.
    """
    nothing = object()
    first_batch = next(iter(loader), nothing)
    if first_batch is nothing:
        return

    X_batch, y_batch = first_batch
    print(f"Batch X shape: {X_batch.shape}")
    print(f"Batch y shape: {y_batch.shape}")
    if X_batch.shape[0] != y_batch.shape[0]:
        print("Warning: Batch features and labels do not match.")

# Inspect the first batch of the train, validation and test loaders, in that order.
for split_loader in (train_loader, val_loader, test_loader):
    check_dataloader(split_loader)


In [None]:
# Initialize the model
# NOTE: this rebinds the notebook-level name `model`, which earlier cells use
# for the wav2vec2 encoder; from here on `model` is the frame-level classifier.
model = FrameLevelClassifier(
    input_dim=X_train.shape[1],
    hidden_dim=config['hidden_dim'],
    output_dim=config['output_dim'],
).to(device)

# The classifier returns raw logits, so the sigmoid is folded into the loss.
criterion = nn.BCEWithLogitsLoss()  # For binary classification
optimizer = optim.Adam(model.parameters(), lr=config['learning_rate'])

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU,
# and make sure the classifier lives on that device.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = model.to(device)


In [None]:
# Training loop
def train_model(model, dataloader, criterion, optimizer):
    """Run one training epoch and return the mean per-batch loss.

    Batches are moved to whichever device the model's parameters live on, so
    the function works on CPU-only machines as well as on GPUs.

    Args:
        model: Module mapping a feature batch to one logit per sample.
        dataloader: Sized iterable of ``(X_batch, y_batch)`` tensor pairs.
        criterion: Loss taking ``(logits, targets)`` of identical shape.
        optimizer: Optimizer over ``model``'s parameters.

    Returns:
        float: Sum of the batch losses divided by ``len(dataloader)``.
    """
    model.train()

    # Follow the model's device instead of assuming CUDA is available.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    total_loss = 0.0
    for X_batch, y_batch in tqdm(dataloader, desc="Training", leave=False):
        X_batch = X_batch.to(target_device)
        y_batch = y_batch.float().to(target_device)  # BCEWithLogitsLoss needs float targets

        optimizer.zero_grad()
        # Match the target's shape explicitly. A bare squeeze() would collapse a
        # one-sample batch to a 0-d tensor and break the loss computation.
        outputs = model(X_batch).reshape(y_batch.shape)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate_model(model, dataloader, criterion):
    """Evaluate ``model`` on ``dataloader`` and compute classification metrics.

    Batches are moved to whichever device the model's parameters live on.
    Logits are converted to probabilities with a sigmoid and thresholded at
    0.5 to obtain hard predictions.

    Args:
        model: Module mapping a feature batch to one logit per sample.
        dataloader: Sized iterable of ``(X_batch, y_batch)`` tensor pairs.
        criterion: Loss taking ``(logits, targets)`` of identical shape.

    Returns:
        Tuple ``(mean_loss, accuracy, precision, recall, f1, auc, eer, cm)``
        where ``eer`` is the false-positive rate at the ROC point at which the
        false-positive and false-negative rates are closest, and ``cm`` is the
        confusion matrix.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.eval()

    # Follow the model's device instead of assuming CUDA is available.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    total_loss = 0.0
    label_chunks = []
    prob_chunks = []
    num_batches = len(dataloader)

    with torch.no_grad():
        for batch_idx, (X_batch, y_batch) in enumerate(tqdm(dataloader, desc="Evaluation", leave=False)):
            X_batch = X_batch.to(target_device)
            y_batch = y_batch.float().to(target_device)
            # Match the target's shape explicitly; a bare squeeze() would
            # collapse a one-sample batch to a 0-d tensor.
            outputs = model(X_batch).reshape(y_batch.shape)
            loss = criterion(outputs, y_batch)
            total_loss += loss.item()
            label_chunks.append(y_batch.cpu().numpy().reshape(-1))
            # Use sigmoid to convert logits to probabilities
            prob_chunks.append(torch.sigmoid(outputs).cpu().numpy().reshape(-1))

            # Debug information: print batch index and batch size
            if batch_idx % 100 == 0:  # Print every 100 batches
                print(f"Processed batch {batch_idx}/{num_batches}, batch size: {len(X_batch)}")

    if not label_chunks:
        raise ValueError("dataloader yielded no batches; nothing to evaluate")

    # Flat 1-D arrays: one entry per evaluated sample
    all_labels = np.concatenate(label_chunks)
    all_outputs = np.concatenate(prob_chunks)

    # Binary classification predictions
    predictions = (all_outputs > 0.5).astype(int)

    # Accuracy
    accuracy = accuracy_score(all_labels, predictions)

    # Precision, Recall, F1
    precision = precision_score(all_labels, predictions, zero_division=1)
    recall = recall_score(all_labels, predictions)
    f1 = f1_score(all_labels, predictions)

    # AUC
    auc = roc_auc_score(all_labels, all_outputs)

    # EER: operating point where false-negative and false-positive rates are closest
    fpr, tpr, thresholds = roc_curve(all_labels, all_outputs)
    fnr = 1 - tpr
    eer_index = np.nanargmin(np.abs(fnr - fpr))
    eer = fpr[eer_index]

    # Confusion Matrix
    cm = confusion_matrix(all_labels, predictions)

    return total_loss / num_batches, accuracy, precision, recall, f1, auc, eer, cm

In [None]:
# Start training
import os
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix

# Early stopping parameters
epochs_no_improve = 0   # consecutive epochs without a better validation EER
n_epochs_stop = 10      # stop once this many epochs pass without improvement
best_val_loss = float('inf')
best_val_eer = float('inf')
best_model_path = config['model_save_path']

for epoch in range(config['num_epochs']):
    print(f'Starting epoch {epoch+1}/{config["num_epochs"]}')
    # `train_loader` / `val_loader` are the loaders built in the data-loading cell.
    # The previous names (`train_val_loader`, `create_dataloader`) are not defined
    # anywhere in this notebook and raised NameError.
    train_loss = train_model(model, train_loader, criterion, optimizer)

    # Validate using the validation set
    val_loss, val_accuracy, val_precision, val_recall, val_f1, val_auc, val_eer, cm = evaluate_model(model, val_loader, criterion)

    print(f'Epoch {epoch+1}/{config["num_epochs"]}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}, Val Precision: {val_precision:.4f}, Val Recall: {val_recall:.4f}, Val F1: {val_f1:.4f}, Val AUC: {val_auc:.4f}, Val EER: {val_eer:.4f}')

    # Model selection is driven by validation EER (lower is better)
    if val_eer < best_val_eer:
        best_val_eer = val_eer
        best_val_loss = val_loss
        epochs_no_improve = 0

        # Save the current model
        torch.save(model.state_dict(), best_model_path)
        print(f'Saved best model with Val EER: {val_eer:.4f} to {best_model_path}')
    else:
        epochs_no_improve += 1

    # Early stopping
    if epochs_no_improve >= n_epochs_stop:
        print(f'Early stopping at epoch {epoch+1}')
        break

In [None]:
# Test the model
def test_model(model, test_loader, criterion):
    print("\nEvaluating the model on the test set:")
    test_loss, test_acc = evaluate(model, test_loader, criterion)
    print(f"Test Loss: {test_loss}")
    print(f"Test Accuracy: {test_acc}")

In [None]:
# Testing the model
(test_loss, test_accuracy, test_precision, test_recall,
 test_f1, test_auc, test_eer, test_cm) = evaluate_model(model, test_loader, criterion)

# Scalar metrics share one format; the confusion matrix is printed on its own lines.
scalar_metrics = (
    ("Loss", test_loss),
    ("Accuracy", test_accuracy),
    ("Precision", test_precision),
    ("Recall", test_recall),
    ("F1", test_f1),
    ("AUC", test_auc),
    ("EER", test_eer),
)
for metric_name, metric_value in scalar_metrics:
    print(f'Test {metric_name}: {metric_value:.4f}')
print(f'Test Confusion Matrix:\n{test_cm}')