In [3]:
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2Model
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
import numpy as np

In [5]:
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-large-xlsr-53")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-large-xlsr-53")

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Create a synthetic audio sample (4 seconds of random noise, sampled at 16000 Hz)
sample_rate = 16000
duration = 4  # 4 seconds
audio = np.random.randn(duration * sample_rate).astype(np.float32)

# Dummy labels (one label per segment, just for demonstration)
segment_length = 400      # 400 samples per segment
hop_length = 160          # Overlap of 160 samples
num_segments = (len(audio) - segment_length) // hop_length + 1
dummy_labels = np.random.randint(0, 2, size=num_segments)

In [7]:
# Define the feature extraction function
def extract_segment_embeddings(audio, segment_length=400, hop_length=160):
    segments = [audio[i:i+segment_length] for i in range(0, len(audio) - segment_length, hop_length)]
    embeddings = []
    
    for segment in segments:
        # Preprocess each segment
        inputs = processor(segment, sampling_rate=sample_rate, return_tensors="pt", padding=True)
        
        # Extract embeddings without computing gradients
        with torch.no_grad():
            hidden_states = model(**inputs).last_hidden_state
        
        # Average across time dimension to obtain a 1D embedding for each segment
        segment_embedding = torch.mean(hidden_states, dim=1).squeeze()
        embeddings.append(segment_embedding.cpu().numpy())
    
    return np.array(embeddings)

In [8]:
# Extract segment embeddings and print feature shape
segment_embeddings = extract_segment_embeddings(audio)
print("Shape of segment embeddings:", segment_embeddings.shape)

# Display the shape of features and the dummy labels
print("Shape of dummy labels:", dummy_labels.shape)
print("Features (first segment):", segment_embeddings[0])
print("Dummy labels:", dummy_labels)


NameError: name 'processor' is not defined

In [12]:
import numpy as np
import torch
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model

# Load the feature extractor and model
# feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-large-xlsr-53")
# model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-large-xlsr-53")

model_name = "facebook/wav2vec2-large-xlsr-53"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = Wav2Vec2Model.from_pretrained(model_name)

# Create a synthetic audio sample (4 seconds of random noise, sampled at 16000 Hz)
sample_rate = 16000
duration = 4  # 4 seconds
audio = np.random.randn(duration * sample_rate).astype(np.float32)

# Dummy labels (one label per segment, just for demonstration)
segment_length = 400      # 400 samples per segment
hop_length = 160          # Overlap of 160 samples
num_segments = (len(audio) - segment_length) // hop_length + 1
dummy_labels = np.random.randint(0, 2, size=num_segments)

# Define the feature extraction function
def extract_segment_embeddings(audio, segment_length=400, hop_length=160):
    segments = [audio[i:i+segment_length] for i in range(0, len(audio) - segment_length, hop_length)]
    embeddings = []
    
    for segment in segments:
        # Preprocess each segment using the feature extractor
        inputs = feature_extractor(segment, sampling_rate=sample_rate, return_tensors="pt", padding=True)
        
        # Extract embeddings without computing gradients
        with torch.no_grad():
            hidden_states = model(**inputs).last_hidden_state
        
        # Average across time dimension to obtain a 1D embedding for each segment
        segment_embedding = torch.mean(hidden_states, dim=1).squeeze()
        embeddings.append(segment_embedding.cpu().numpy())
    
    return np.array(embeddings)

# Extract segment embeddings and print feature shape
segment_embeddings = extract_segment_embeddings(audio)
print("Shape of segment embeddings:", segment_embeddings.shape)

# Display the shape of features and the dummy labels
print("Shape of dummy labels:", dummy_labels.shape)
print("Features (first segment):", segment_embeddings[0])
print("Dummy labels:", dummy_labels)


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Shape of segment embeddings: (398, 1024)
Shape of dummy labels: (398,)
Features (first segment): [-0.03088856  0.21337077  0.00226705 ... -0.08575208  0.01292966
 -0.02562373]
Dummy labels: [0 1 0 0 1 0 0 0 1 1 0 1 1 0 1 1 1 1 0 1 1 0 0 1 1 0 1 0 0 0 0 0 0 1 0 1 0
 1 0 0 0 1 0 1 1 0 0 0 0 1 0 1 0 1 1 1 0 1 1 0 0 0 1 0 0 0 1 1 0 1 0 1 0 1
 1 0 1 0 1 1 0 0 1 1 1 1 0 1 0 1 1 0 1 0 0 1 1 1 0 1 1 0 1 0 0 0 0 0 0 1 1
 1 0 0 1 0 0 0 0 0 0 0 1 1 1 0 1 1 0 1 0 1 0 1 0 0 1 1 0 1 0 0 1 0 0 0 0 1
 1 1 1 0 1 1 1 1 1 0 1 1 0 0 1 1 0 0 1 0 0 0 1 0 1 0 1 0 0 1 0 0 0 0 1 1 0
 0 0 0 0 1 0 1 1 1 0 0 1 1 1 0 0 1 0 0 0 0 0 1 0 1 1 1 0 0 0 0 1 0 0 1 1 0
 0 0 1 1 1 1 0 1 1 0 1 0 1 1 0 1 1 1 1 0 1 0 1 0 1 0 1 0 1 1 0 0 0 1 0 0 0
 0 1 1 1 1 0 0 1 0 0 0 1 1 1 0 0 0 1 0 1 0 0 1 1 0 0 1 1 1 1 1 0 1 0 1 0 0
 1 1 1 0 1 1 0 0 0 0 1 1 1 1 1 0 0 0 1 1 0 1 0 0 0 1 0 0 0 1 0 0 1 0 0 1 1
 0 1 1 0 1 0 1 0 0 1 0 0 1 0 1 0 1 0 0 0 1 0 0 1 1 1 0 1 0 0 1 1 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 1 1 1 1 0 0 1 1 0]


In [17]:
# Replace with your desired output file name
import pandas as pd

# Define the input and output CSV file paths
input_csv_file = 'C:\\Notebooks\\rrl_source\\dataset_raw\\train_segment_Wav2Vec2.csv'  # Replace with your input file name
output_csv_file = 'C:\\Notebooks\\rrl_source\\dataset_raw\\train_segment_W2V2.csv'  # Replace with your desired output file name

# Read the CSV file
df = pd.read_csv(input_csv_file)
print(df.head()) 

# Rename the feature columns
feature_columns = [f'Wav2Vec2_Feature_{i}' for i in range(1, 1025)]  # Update range to 1 to 1024
new_feature_columns = [f'W2V_F_{i}' for i in range(1, 1025)]  # Update range to 1 to 1024

# Check if the expected columns exist in the dataframe
if all(col in df.columns for col in feature_columns):
    # Create a mapping dictionary for renaming
    rename_dict = dict(zip(feature_columns, new_feature_columns))
    
    # Rename the columns
    df.rename(columns=rename_dict, inplace=True)
    
    # Save the modified DataFrame to a new CSV file
    df.to_csv(output_csv_file, index=False)
    print(f'Successfully saved the modified DataFrame to {output_csv_file}')
else:
    print("Error: The expected feature columns do not exist in the input file.")



          FileID  Wav2Vec2_Feature_1  Wav2Vec2_Feature_2  Wav2Vec2_Feature_3  \
0  CON_T_0000000           -0.031782            0.207258            0.001993   
1  CON_T_0000000           -0.031079            0.209909            0.002559   
2  CON_T_0000000           -0.031795            0.207865            0.002931   
3  CON_T_0000000           -0.032433            0.206759            0.003525   
4  CON_T_0000000           -0.031276            0.214853            0.002343   

   Wav2Vec2_Feature_4  Wav2Vec2_Feature_5  Wav2Vec2_Feature_6  \
0           -0.014273           -0.040113            0.030700   
1           -0.014819           -0.041912            0.031210   
2           -0.014959           -0.042345            0.032232   
3           -0.014638           -0.044052            0.032079   
4           -0.013823           -0.040726            0.029733   

   Wav2Vec2_Feature_7  Wav2Vec2_Feature_8  Wav2Vec2_Feature_9  ...  \
0            0.044732            0.028875            0.000