In [1]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
import torch
import torchaudio
import numpy as np
import pandas as pd
from tqdm import tqdm
from IPython.display import Audio
import torch.multiprocessing as mp
from multiprocessing import Process, Manager
from transformers import Wav2Vec2FeatureExtractor, WavLMModel
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report


In [3]:
def extract_features(audio_path, processor, model, device):
    """Return one mean-pooled embedding for a single audio file.

    The file is loaded with ``torchaudio.load``, resampled to 16 kHz when its
    native rate differs, down-mixed to a single channel, run through ``model``
    and the resulting ``last_hidden_state`` is averaged over dim 1.

    Args:
        audio_path: Path handed to ``torchaudio.load``.
        processor: Feature extractor; called with the mono waveform,
            ``sampling_rate`` and ``return_tensors="pt"`` and must expose
            ``.input_values`` on its result.
        model: Callable whose result exposes ``last_hidden_state``.
        device: Device the model input is moved to before the forward pass.

    Returns:
        NumPy array with the averaged hidden state, singleton dims squeezed.
    """
    waveform, sampling_rate = torchaudio.load(audio_path)

    target_sampling_rate = 16000
    if sampling_rate != target_sampling_rate:
        resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=target_sampling_rate)
        waveform = resampler(waveform)

    # Down-mix ANY multi-channel recording, not just stereo. With more than
    # one channel left, squeeze(0) below is a no-op and the processor would
    # receive a 2-D tensor (each channel treated as a separate batch item).
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    input_values = processor(
        waveform.squeeze(0),
        sampling_rate=target_sampling_rate,
        return_tensors="pt",
    ).input_values.to(device)

    # Inference only: skip autograd bookkeeping and bring the result back to
    # the CPU so it can be converted to NumPy.
    with torch.no_grad():
        hidden_states = model(input_values).last_hidden_state.cpu()

    features = torch.mean(hidden_states, dim=1).squeeze().numpy()

    return features

def process_audio_files(data_split, gpu_id, return_dict):
    """Worker entry point: embed every audio file listed in ``data_split``.

    Loads its own copy of the ``microsoft/wavlm-base`` feature extractor and
    model on ``cuda:{gpu_id}`` (or on the CPU when CUDA is unavailable), runs
    ``extract_features`` on each row and publishes the results.

    Args:
        data_split: DataFrame with a ``file_location`` column (audio path) and
            a ``label`` column containing ``'Human'`` or ``'Robot'``.
        gpu_id: Index of the GPU to use; also the key the results are stored
            under in ``return_dict``.
        return_dict: Mutable mapping shared with the parent; receives
            ``return_dict[gpu_id] = (X_split, y_split)`` where ``X_split`` is
            a list of embeddings and ``y_split`` the matching integer labels.

    Raises:
        ValueError: If ``data_split`` contains a label other than
            ``'Human'`` / ``'Robot'``.
    """
    label_mapping = {'Human': 0, 'Robot': 1}

    # Validate labels BEFORE loading the model: an unmapped label would
    # otherwise become None and silently corrupt the label array after the
    # whole (slow) extraction has already run.
    unknown_labels = [lbl for lbl in data_split['label'].unique() if lbl not in label_mapping]
    if unknown_labels:
        raise ValueError(
            f"Unknown label(s) {unknown_labels!r} in split for GPU {gpu_id}; "
            f"expected one of {list(label_mapping)}"
        )

    device = torch.device(f'cuda:{gpu_id}' if torch.cuda.is_available() else 'cpu')
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("microsoft/wavlm-base")
    model = WavLMModel.from_pretrained("microsoft/wavlm-base").to(device)

    X_split = []
    y_split = []

    print(f"Starting processing on GPU {gpu_id} with {len(data_split)} samples.")

    # Iterate over the two needed columns directly instead of building a
    # Series per row with iterrows().
    rows = zip(data_split['file_location'], data_split['label'])
    for audio_file, label_name in tqdm(rows, total=data_split.shape[0], desc=f"Processing on GPU {gpu_id}"):
        embeddings = extract_features(audio_file, feature_extractor, model, device)
        X_split.append(embeddings)
        y_split.append(label_mapping[label_name])

    return_dict[gpu_id] = (X_split, y_split)
    print(f"Completed processing on GPU {gpu_id}.")


In [5]:
# Read the first 12470 metadata rows, report the frame's shape, then render an
# audio player for the first listed clip as a quick sanity check.
data = pd.read_csv(
    "/home/arunb/Abhijeet_2021509/data/metadata.csv",
    nrows=12470,
)
print(data.shape)
test = data['file_location'].iloc[0]
Audio(test)

(12470, 2)


In [None]:
# gpu_id = 0
# device = torch.device(f'cuda:{gpu_id}' if torch.cuda.is_available() else 'cpu')
# print(device)

In [None]:
# feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("microsoft/wavlm-base")
# model = WavLMModel.from_pretrained("microsoft/wavlm-base").to(device)

In [6]:
# One worker per visible GPU. With no GPU, fall back to a single worker:
# process_audio_files selects the CPU itself when CUDA is unavailable, whereas
# splitting into 0 sections would raise.
num_gpus = torch.cuda.device_count()
num_workers = max(num_gpus, 1)

# Split by row position into near-equal contiguous chunks rather than passing
# the DataFrame itself to np.array_split.
row_chunks = np.array_split(np.arange(len(data)), num_workers)
data_splits = [data.iloc[rows] for rows in row_chunks]

manager = mp.Manager()
return_dict = manager.dict()  # Shared dictionary to store results from each process
processes = []

# Launch processes
for gpu_id, data_split in enumerate(data_splits):
    p = mp.Process(target=process_audio_files, args=(data_split, gpu_id, return_dict))
    p.start()
    processes.append(p)

# Wait for all processes to complete
for p in processes:
    p.join()

# A worker that crashed never writes its entry. Report that explicitly instead
# of failing later with a bare KeyError on return_dict.
failed_workers = [
    worker_id
    for worker_id, proc in enumerate(processes)
    if proc.exitcode != 0 or worker_id not in return_dict
]
if failed_workers:
    raise RuntimeError(
        f"Feature extraction failed in worker(s) {failed_workers}; see the worker output above."
    )

# Collect results in worker order so rows keep the order of `data`
X = []
y = []
for gpu_id in range(num_workers):
    X_split, y_split = return_dict[gpu_id]
    X.extend(X_split)
    y.extend(y_split)

# Convert X and y to numpy arrays
X = np.array(X)
y = np.array(y)

Starting processing on GPU 3 with 3117 samples.


Processing on GPU 3:   0%|          | 0/3117 [00:00<?, ?it/s]

Starting processing on GPU 0 with 3118 samples.


Processing on GPU 0:   0%|          | 0/3118 [00:00<?, ?it/s]

Starting processing on GPU 2 with 3117 samples.


Processing on GPU 2:   0%|          | 0/3117 [00:00<?, ?it/s]

Starting processing on GPU 1 with 3118 samples.


Processing on GPU 3: 100%|██████████| 3117/3117 [1:05:05<00:00,  1.25s/it]


Completed processing on GPU 3.


Processing on GPU 1: 100%|██████████| 3118/3118 [1:07:31<00:00,  1.30s/it]


Completed processing on GPU 1.


Processing on GPU 0: 100%|██████████| 3118/3118 [1:08:08<00:00,  1.31s/it]


Completed processing on GPU 0.


Processing on GPU 2: 100%|██████████| 3117/3117 [1:08:12<00:00,  1.31s/it]


Completed processing on GPU 2.


In [6]:
# Ask PyTorch to hand back GPU memory it has cached but is not using in THIS
# process.
# NOTE(review): the extraction above ran in child processes, so the notebook
# process itself presumably holds little or no cached memory — confirm.
torch.cuda.empty_cache()
# IPython shell escape: print the driver's per-GPU memory/utilisation table.
!nvidia-smi

Sun Nov 10 14:17:11 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:2F:00.0 Off |                    0 |
| N/A   31C    P0             56W /  400W |    1426MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA A100-SXM4-40GB          Off |   00

In [7]:
# Make sure both collections are ndarrays, then write them to disk so the
# extraction does not have to be repeated.
X, y = np.array(X), np.array(y)

np.save(file='/home/arunb/Abhijeet_2021509/WavLM/embeddings/features.npy', arr=X)
np.save(file='/home/arunb/Abhijeet_2021509/WavLM/embeddings/labels.npy', arr=y)

In [5]:
# Reload the saved embeddings and labels from disk.
X, y = (
    np.load(npy_path)
    for npy_path in (
        '/home/arunb/Abhijeet_2021509/WavLM/embeddings/features.npy',
        '/home/arunb/Abhijeet_2021509/WavLM/embeddings/labels.npy',
    )
)

# Hold out 20% of the samples for evaluation; the seed is fixed so the split
# is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Candidate classifiers, all seeded where the estimator accepts a seed so the
# comparison is repeatable.
classifiers = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "SVM": SVC(random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

for name, clf in classifiers.items():
    print(f"\nClassifier: {name}")

    # Fit on the TRAINING labels. Passing y_test here pairs the training
    # features with labels of a different length and from the wrong split.
    clf.fit(X_train, y_train)

    # Evaluate on the held-out split only
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"Test Accuracy: {accuracy:.2f}")

    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print()


Classifier: Random Forest
Test Accuracy: 1.00
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2231
           1       1.00      0.98      0.99       263

    accuracy                           1.00      2494
   macro avg       1.00      0.99      0.99      2494
weighted avg       1.00      1.00      1.00      2494



Classifier: Logistic Regression
Test Accuracy: 1.00
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2231
           1       1.00      0.99      1.00       263

    accuracy                           1.00      2494
   macro avg       1.00      1.00      1.00      2494
weighted avg       1.00      1.00      1.00      2494



Classifier: SVM
Test Accuracy: 1.00
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2231
           1       1.00      

## Extensive Testing

In [19]:
# Load the 30% train/test metadata tables; only the first 8820 test rows are
# kept.
train_df = pd.read_csv("/home/arunb/Abhijeet_2021509/data/train30%.csv")
test_df = pd.read_csv("/home/arunb/Abhijeet_2021509/data/test30%.csv")
test_df = test_df.iloc[:8820]

In [13]:
# Display (rows, columns) for the train and test frames, in that order
tuple(frame.shape for frame in (train_df, test_df))

((3648, 2), (8820, 2))

In [None]:
# One worker per visible GPU. With no GPU, fall back to a single worker:
# process_audio_files selects the CPU itself when CUDA is unavailable, whereas
# splitting into 0 sections would raise.
num_gpus = torch.cuda.device_count()
num_workers = max(num_gpus, 1)

# Split by row position into near-equal contiguous chunks rather than passing
# the DataFrame itself to np.array_split.
row_chunks = np.array_split(np.arange(len(train_df)), num_workers)
data_splits = [train_df.iloc[rows] for rows in row_chunks]

manager = mp.Manager()
return_dict = manager.dict()  # Shared dictionary to store results from each process
processes = []

# Launch processes
for gpu_id, data_split in enumerate(data_splits):
    p = mp.Process(target=process_audio_files, args=(data_split, gpu_id, return_dict))
    p.start()
    processes.append(p)

# Wait for all processes to complete
for p in processes:
    p.join()

# A worker that crashed never writes its entry. Report that explicitly instead
# of failing later with a bare KeyError on return_dict.
failed_workers = [
    worker_id
    for worker_id, proc in enumerate(processes)
    if proc.exitcode != 0 or worker_id not in return_dict
]
if failed_workers:
    raise RuntimeError(
        f"Feature extraction failed in worker(s) {failed_workers}; see the worker output above."
    )

# Collect results in worker order so rows keep the order of `train_df`
X_train = []
y_train = []
for gpu_id in range(num_workers):
    X_split, y_split = return_dict[gpu_id]
    X_train.extend(X_split)
    y_train.extend(y_split)

# Convert X and y to numpy arrays
X_train = np.array(X_train)
y_train = np.array(y_train)

Starting processing on GPU 3 with 912 samples.


Processing on GPU 3:   0%|          | 0/912 [00:00<?, ?it/s]

Starting processing on GPU 0 with 912 samples.


Processing on GPU 0:   0%|          | 0/912 [00:00<?, ?it/s]

Starting processing on GPU 2 with 912 samples.


Processing on GPU 2:   0%|          | 0/912 [00:00<?, ?it/s]

Starting processing on GPU 1 with 912 samples.


Processing on GPU 0:  96%|█████████▌| 872/912 [19:38<00:52,  1.32s/it]


Completed processing on GPU 3.


Processing on GPU 0: 100%|██████████| 912/912 [20:19<00:00,  1.34s/it]


Completed processing on GPU 0.


Processing on GPU 1: 100%|██████████| 912/912 [20:20<00:00,  1.34s/it]


Completed processing on GPU 1.


Processing on GPU 2: 100%|██████████| 912/912 [20:21<00:00,  1.34s/it]


Completed processing on GPU 2.


In [7]:
# Ensure ndarray form, then persist the training embeddings and labels.
X_train, y_train = (np.array(part) for part in (X_train, y_train))

np.save(
    '/home/arunb/Abhijeet_2021509/WavLM/embeddings/train30%_features.npy',
    X_train,
)
np.save(
    '/home/arunb/Abhijeet_2021509/WavLM/embeddings/train30%_labels.npy',
    y_train,
)

In [17]:
# Reload the saved training embeddings and labels from disk.
X_train, y_train = (
    np.load(npy_path)
    for npy_path in (
        '/home/arunb/Abhijeet_2021509/WavLM/embeddings/train30%_features.npy',
        '/home/arunb/Abhijeet_2021509/WavLM/embeddings/train30%_labels.npy',
    )
)

In [14]:
# One worker per visible GPU. With no GPU, fall back to a single worker:
# process_audio_files selects the CPU itself when CUDA is unavailable, whereas
# splitting into 0 sections would raise.
num_gpus = torch.cuda.device_count()
num_workers = max(num_gpus, 1)

# Split by row position into near-equal contiguous chunks rather than passing
# the DataFrame itself to np.array_split.
row_chunks = np.array_split(np.arange(len(test_df)), num_workers)
data_splits = [test_df.iloc[rows] for rows in row_chunks]

manager = mp.Manager()
return_dict = manager.dict()  # Shared dictionary to store results from each process
processes = []

# Launch processes
for gpu_id, data_split in enumerate(data_splits):
    p = mp.Process(target=process_audio_files, args=(data_split, gpu_id, return_dict))
    p.start()
    processes.append(p)

# Wait for all processes to complete
for p in processes:
    p.join()

# A worker that crashed never writes its entry. Report that explicitly instead
# of failing later with a bare KeyError on return_dict.
failed_workers = [
    worker_id
    for worker_id, proc in enumerate(processes)
    if proc.exitcode != 0 or worker_id not in return_dict
]
if failed_workers:
    raise RuntimeError(
        f"Feature extraction failed in worker(s) {failed_workers}; see the worker output above."
    )

# Collect results in worker order so rows keep the order of `test_df`
X_test = []
y_test = []
for gpu_id in range(num_workers):
    X_split, y_split = return_dict[gpu_id]
    X_test.extend(X_split)
    y_test.extend(y_split)

# Convert X and y to numpy arrays
X_test = np.array(X_test)
y_test = np.array(y_test)

Starting processing on GPU 0 with 2205 samples.


Processing on GPU 0:   0%|          | 1/2205 [00:00<29:19,  1.25it/s]

Starting processing on GPU 1 with 2205 samples.


Processing on GPU 0:   0%|          | 2/2205 [00:01<21:15,  1.73it/s]

Starting processing on GPU 2 with 2205 samples.Starting processing on GPU 3 with 2205 samples.



Processing on GPU 0: 100%|██████████| 2205/2205 [45:21<00:00,  1.23s/it]
Processing on GPU 3:  98%|█████████▊| 2150/2205 [45:20<01:00,  1.10s/it]

Completed processing on GPU 0.


Processing on GPU 3: 100%|██████████| 2205/2205 [46:09<00:00,  1.26s/it]


Completed processing on GPU 3.


Processing on GPU 2: 100%|██████████| 2205/2205 [47:52<00:00,  1.30s/it]


Completed processing on GPU 2.


Processing on GPU 1: 100%|██████████| 2205/2205 [48:27<00:00,  1.32s/it]


Completed processing on GPU 1.


In [15]:
# Ensure ndarray form, then persist the test embeddings and labels.
X_test, y_test = np.array(X_test), np.array(y_test)

np.save(file='/home/arunb/Abhijeet_2021509/WavLM/embeddings/test30%_features.npy', arr=X_test)
np.save(file='/home/arunb/Abhijeet_2021509/WavLM/embeddings/test30%_labels.npy', arr=y_test)

In [21]:
# Same five candidate models as before, keyed by display name (insertion order
# is the evaluation order).
classifiers = dict([
    ("Random Forest", RandomForestClassifier(n_estimators=100, random_state=42)),
    ("Logistic Regression", LogisticRegression(max_iter=1000, random_state=42)),
    ("SVM", SVC(random_state=42)),
    ("K-Nearest Neighbors", KNeighborsClassifier()),
    ("Gradient Boosting", GradientBoostingClassifier(random_state=42)),
])

for name in classifiers:
    clf = classifiers[name]
    print(f"\nClassifier: {name}")

    # Train on the training split, then score the held-out test split
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Test Accuracy: {accuracy:.2f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print()


Classifier: Random Forest


Test Accuracy: 1.00
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      7451
           1       1.00      0.97      0.98      1369

    accuracy                           1.00      8820
   macro avg       1.00      0.99      0.99      8820
weighted avg       1.00      1.00      1.00      8820



Classifier: Logistic Regression
Test Accuracy: 1.00
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7451
           1       1.00      0.99      0.99      1369

    accuracy                           1.00      8820
   macro avg       1.00      1.00      1.00      8820
weighted avg       1.00      1.00      1.00      8820



Classifier: SVM
Test Accuracy: 1.00
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7451
           1       1.00      1.00      1.00      1369

 