## Creating Metafile

In [1]:
import os
import pandas as pd

In [9]:
def create_file_label_df(root_dir, output_csv):
    # Define folder paths relative to root_dir
    human_folder = os.path.join(root_dir, "data", "human1")
    robot_folder = os.path.join(root_dir, "data", "robot")

    # Initialize a list to hold file locations and labels
    data = []

    # Process Human files
    for filename in os.listdir(human_folder):
        if filename.endswith(".wav"):  # Check for .wav files
            file_location = os.path.join(human_folder, filename)
            data.append({"file_location": file_location, "label": "Human"})

    # Process Robot files
    for filename in os.listdir(robot_folder):
        if filename.endswith(".wav"):  # Check for .wav files
            file_location = os.path.join(robot_folder, filename)
            data.append({"file_location": file_location, "label": "Robot"})

    # Create a DataFrame from the list
    df = pd.DataFrame(data)

    # Save the DataFrame to a CSV file
    df.to_csv(output_csv, index=False)

    print(f"DataFrame created and saved to: {output_csv}")


In [13]:
root_dir = "/home/arunb/Abhijeet_2021509"
output_csv = "/home/arunb/Abhijeet_2021509/data/metadata.csv"

In [14]:
create_file_label_df(root_dir, output_csv)

DataFrame created and saved to: /home/arunb/Abhijeet_2021509/data/metadata.csv


## Wav2Vec 2.0 Model

In [9]:
import pandas as pd
from IPython.display import Audio
import torch
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2Model
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np
from tqdm import tqdm

In [10]:
data = pd.read_csv("/home/arunb/Abhijeet_2021509/data/metadata.csv")
test = data.iloc[0]['file_location']
Audio(test)

In [11]:
# Separate the data into two classes
robot_data = data[data['label'] == 'Robot']
human_data = data[data['label'] == 'Human']

# Sample 1100 entries from each class
robot_sample = robot_data.sample(n=100, random_state=42)
human_sample = human_data.sample(n=100, random_state=42)

# Combine the two samples into one DataFrame
sampled_data = pd.concat([robot_sample, human_sample])

In [1]:
# torch.cuda.empty_cache()
!nvidia-smi

Fri Oct 18 17:28:19 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:2F:00.0 Off |                    0 |
| N/A   41C    P0            262W /  400W |   24882MiB /  40960MiB |    100%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA A100-SXM4-40GB          Off |   00

In [4]:
gpu_id = 3
device = torch.device(f'cuda:{gpu_id}' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:3


In [6]:
model_name = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2Model.from_pretrained(model_name).to(device)

Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2Model: ['lm_head.bias', 'lm_head.weight']
- This IS expected if you are initializing Wav2Vec2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Function to load audio
def load_audio(file_path):
    waveform, sample_rate = torchaudio.load(file_path)
    if sample_rate != 16000:
        waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
    return waveform

# Function to extract features using Wav2Vec 2.0
def extract_features(audio_path):
    waveform, sampling_rate = torchaudio.load(audio_path)

    # Convert stereo to mono by averaging the two channels
    if waveform.shape[0] == 2:
        waveform = waveform.mean(dim=0, keepdim=True)  # Shape becomes [1, length]

    # Process waveform and move input tensor to GPU
    input_values = processor(waveform.squeeze(0), sampling_rate=16000, return_tensors="pt").input_values.to(device)

    # Extract hidden states from the model and move result to CPU for further processing
    with torch.no_grad():
        hidden_states = model(input_values).last_hidden_state.cpu()
    
    # Take the mean of the hidden states across the time dimension to get a feature vector
    features = torch.mean(hidden_states, dim=1).squeeze().numpy()

    del hidden_states
    del input_values
    return features

In [8]:
# Initialize arrays for features and labels
X = []
y = []

# Map labels to binary values: 'Human' -> 0, 'Robot' -> 1
label_mapping = {'Human': 0, 'Robot': 1}

# Process audio files and extract features with progress tracking
for index, row in tqdm(sampled_data.iterrows(), total=sampled_data.shape[0], desc="Processing Audio Files"):
    audio_file = row['file_location']
    label = label_mapping.get(row['label'])
    features = extract_features(audio_file)
    X.append(features)
    y.append(label)

Processing Audio Files:   2%|▎         | 5/200 [00:06<03:54,  1.20s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 7.34 GiB (GPU 3; 39.50 GiB total capacity; 7.98 GiB already allocated; 5.80 GiB free; 8.23 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
X = np.array(X)
y = np.array(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

In [None]:
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.2f}")