## Human Call splitting

In [12]:
import os
from pydub import AudioSegment

In [9]:
def split_audio_files(input_folder, output_folder, chunk_duration=25 * 1000):
    """Cut every ``.mp3`` in ``input_folder`` into fixed-length 16 kHz WAV chunks.

    Args:
        input_folder: Directory scanned (non-recursively) for files whose name
            ends with ".mp3".
        output_folder: Directory that receives the chunks; created when missing.
        chunk_duration: Step and width of the slices taken from the loaded
            audio (pydub slices segments in milliseconds). Defaults to 25 * 1000.

    Each chunk is exported as ``<source stem>_chunk<k>.wav`` with ``k`` counting
    from 0. The final chunk of a file holds whatever audio remains, so it may be
    shorter than ``chunk_duration``.
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    mp3_names = [name for name in os.listdir(input_folder) if name.endswith(".mp3")]

    for mp3_name in mp3_names:
        stem = os.path.splitext(mp3_name)[0]
        recording = AudioSegment.from_mp3(os.path.join(input_folder, mp3_name))

        # One start offset per chunk; enumerate() supplies the chunk number.
        for chunk_no, start in enumerate(range(0, len(recording), chunk_duration)):
            # Slice first, then convert the slice to a 16 kHz frame rate.
            piece = recording[start:start + chunk_duration].set_frame_rate(16000)

            target_path = os.path.join(output_folder, f"{stem}_chunk{chunk_no}.wav")
            piece.export(target_path, format="wav")
            print(f"Processed and saved: {target_path}")


In [10]:
# Source folder with the CallHome .mp3 recordings and destination folder for the
# WAV chunks. These are plain (non-raw) literals on purpose: with an r'' prefix
# each escaped "\\" would stay as two backslashes and double every separator.
input_folder = 'D:\\IIIT Delhi\\7th Semester\\Courses\\BTP\\Dataset\\CallHome'  # Replace with your actual folder path
output_folder = 'D:\\IIIT Delhi\\7th Semester\\Courses\\BTP\\Dataset\\data\\human'

In [11]:
# Run the splitter over input_folder with the default chunk_duration,
# writing the WAV chunks into output_folder.
split_audio_files(input_folder, output_folder)

Processed and saved: D:\\IIIT Delhi\\7th Semester\\Courses\\BTP\\Dataset\\data\\human2\0638_chunk0.wav
Processed and saved: D:\\IIIT Delhi\\7th Semester\\Courses\\BTP\\Dataset\\data\\human2\0638_chunk1.wav
Processed and saved: D:\\IIIT Delhi\\7th Semester\\Courses\\BTP\\Dataset\\data\\human2\0638_chunk2.wav
Processed and saved: D:\\IIIT Delhi\\7th Semester\\Courses\\BTP\\Dataset\\data\\human2\0638_chunk3.wav
Processed and saved: D:\\IIIT Delhi\\7th Semester\\Courses\\BTP\\Dataset\\data\\human2\0638_chunk4.wav
Processed and saved: D:\\IIIT Delhi\\7th Semester\\Courses\\BTP\\Dataset\\data\\human2\0638_chunk5.wav
Processed and saved: D:\\IIIT Delhi\\7th Semester\\Courses\\BTP\\Dataset\\data\\human2\0638_chunk6.wav
Processed and saved: D:\\IIIT Delhi\\7th Semester\\Courses\\BTP\\Dataset\\data\\human2\0638_chunk7.wav
Processed and saved: D:\\IIIT Delhi\\7th Semester\\Courses\\BTP\\Dataset\\data\\human2\0638_chunk8.wav
Processed and saved: D:\\IIIT Delhi\\7th Semester\\Courses\\BTP\\Dataset\

## Creating Metafile

In [23]:
import os
import pandas as pd

In [24]:
def create_file_label_df(root_dir, output_csv):
    """Write a CSV listing every ``.wav`` file under ``root_dir`` with its class label.

    Files in ``<root_dir>/data/human`` are labelled "Human" and files in
    ``<root_dir>/data/robot`` are labelled "Robot". Human rows come first, and
    within each class the rows are ordered by file name, so the CSV (and any
    seeded sampling done on it later) does not depend on the arbitrary order
    returned by ``os.listdir``.

    Args:
        root_dir: Dataset root containing the ``data/human`` and ``data/robot`` folders.
        output_csv: Destination path of the CSV file (columns ``file_location``, ``label``).

    Raises:
        FileNotFoundError: If either class folder does not exist (raised by ``os.listdir``).
    """
    # (label, folder) pairs in the order their rows appear in the CSV.
    labelled_folders = [
        ("Human", os.path.join(root_dir, "data", "human")),
        ("Robot", os.path.join(root_dir, "data", "robot")),
    ]

    data = [
        {"file_location": os.path.join(folder, filename), "label": label}
        for label, folder in labelled_folders
        for filename in sorted(os.listdir(folder))
        if filename.endswith(".wav")  # Check for .wav files
    ]

    # Naming the columns explicitly keeps the header row even when no .wav file
    # was found; without it the CSV would be empty and pd.read_csv could not parse it.
    df = pd.DataFrame(data, columns=["file_location", "label"])

    # Save the DataFrame to a CSV file
    df.to_csv(output_csv, index=False)

    print(f"DataFrame created and saved to: {output_csv}")


In [25]:
# Dataset root; the metadata CSV is written into its data\ sub-folder.
root_dir = "D:\\IIIT Delhi\\7th Semester\\Courses\\BTP\\Dataset"
output_csv = root_dir + "\\data\\metadata.csv"

In [26]:
# Scan root_dir's data/human and data/robot folders and write the labelled file list to output_csv.
create_file_label_df(root_dir, output_csv)

DataFrame created and saved to: D:\IIIT Delhi\7th Semester\Courses\BTP\Dataset\data\metadata.csv


## Wav2Vec 2.0 Model

In [28]:
import pandas as pd
from IPython.display import Audio
import torch
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2Model
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
from tqdm import tqdm

### Embedding Creation

In [30]:
# Load the metadata table (one row per clip: file_location, label).
data = pd.read_csv("D:\\IIIT Delhi\\7th Semester\\Courses\\BTP\\Dataset\\data\\metadata.csv")
# Path of the first listed clip, used for a quick listening check.
test = data.iloc[0]['file_location']
# Last expression of the cell: the Audio object built from that path is the cell's displayed output.
Audio(test)

In [34]:
# Display the metadata table as the cell output.
data

Unnamed: 0,file_location,label
0,D:\IIIT Delhi\7th Semester\Courses\BTP\Dataset...,Human
1,D:\IIIT Delhi\7th Semester\Courses\BTP\Dataset...,Human
2,D:\IIIT Delhi\7th Semester\Courses\BTP\Dataset...,Human
3,D:\IIIT Delhi\7th Semester\Courses\BTP\Dataset...,Human
4,D:\IIIT Delhi\7th Semester\Courses\BTP\Dataset...,Human
...,...,...
12521,D:\IIIT Delhi\7th Semester\Courses\BTP\Dataset...,Robot
12522,D:\IIIT Delhi\7th Semester\Courses\BTP\Dataset...,Robot
12523,D:\IIIT Delhi\7th Semester\Courses\BTP\Dataset...,Robot
12524,D:\IIIT Delhi\7th Semester\Courses\BTP\Dataset...,Robot


In [8]:
# Separate the data into two classes
robot_data = data[data['label'] == 'Robot']
human_data = data[data['label'] == 'Human']

# Sample 250 entries from each class (fixed random_state so the draw is repeatable
# for a given row order of `data`)
robot_sample = robot_data.sample(n=250, random_state=42)
human_sample = human_data.sample(n=250, random_state=42)

# Combine the two samples into one DataFrame (Robot rows first, then Human; not shuffled)
sampled_data = pd.concat([robot_sample, human_sample])

In [9]:
# Shell escape: print the GPU, driver version and memory usage reported by nvidia-smi.
!nvidia-smi

Thu Oct 17 17:43:47 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.94                 Driver Version: 560.94         CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce GTX 1650 Ti   WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   59C    P8              3W /   50W |     170MiB /   4096MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [10]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


In [11]:
# Hugging Face checkpoint id shared by the processor and the model.
model_name = "facebook/wav2vec2-base-960h"
# Pre-processing object; extract_features calls it to turn a waveform into `input_values`.
processor = Wav2Vec2Processor.from_pretrained(model_name)
# Wav2Vec2Model loaded from the same checkpoint and moved to `device`. The load log
# printed by this cell reports the checkpoint's lm_head weights as unused.
model = Wav2Vec2Model.from_pretrained(model_name).to(device)

  WeightNorm.apply(module, name, dim)
Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2Model: ['lm_head.weight', 'lm_head.bias']
- This IS expected if you are initializing Wav2Vec2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Function to load audio
def load_audio(file_path, target_rate=16000):
    """Load an audio file and return its waveform at ``target_rate`` Hz.

    Args:
        file_path: Path handed to ``torchaudio.load``.
        target_rate: Desired sample rate in Hz (default 16000).

    Returns:
        The waveform tensor from ``torchaudio.load`` with its channel layout
        untouched, resampled only when the file's own rate differs from
        ``target_rate``.
    """
    waveform, sample_rate = torchaudio.load(file_path)
    if sample_rate != target_rate:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_rate)
        waveform = resampler(waveform)
    return waveform


# Function to extract features using Wav2Vec 2.0
def extract_features(audio_path):
    """Return one fixed-size Wav2Vec 2.0 embedding for an audio file.

    The clip is loaded at 16 kHz (resampled when necessary, so the rate declared
    to the processor is always the real one), down-mixed to mono, run through
    the module-level ``processor`` and ``model`` on ``device``, and the model's
    last hidden state is averaged over the time dimension.

    Args:
        audio_path: Path of the audio file to embed.

    Returns:
        A NumPy array holding the time-averaged last hidden state.
    """
    # Load through load_audio so files that are not already 16 kHz get resampled.
    waveform = load_audio(audio_path, target_rate=16000)

    # Down-mix any multi-channel audio to mono by averaging the channels
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)  # Shape becomes [1, length]

    # Process waveform and move input tensor to the selected device
    input_values = processor(waveform.squeeze(0), sampling_rate=16000, return_tensors="pt").input_values.to(device)

    # Extract hidden states without tracking gradients; move the result to CPU for further processing
    with torch.no_grad():
        hidden_states = model(input_values).last_hidden_state.cpu()

    # Take the mean of the hidden states across the time dimension to get a feature vector
    features = torch.mean(hidden_states, dim=1).squeeze().numpy()

    return features

In [13]:
# Feature vectors (X) and their integer class labels (y), filled clip by clip.
X, y = [], []

# Class name -> integer target: 'Human' -> 0, 'Robot' -> 1
label_mapping = {'Human': 0, 'Robot': 1}

# Walk the sampled clips behind a progress bar and embed each one.
clip_rows = zip(sampled_data['file_location'], sampled_data['label'])
for audio_file, label_name in tqdm(clip_rows, total=sampled_data.shape[0], desc="Processing Audio Files"):
    X.append(extract_features(audio_file))
    y.append(label_mapping.get(label_name))

Processing Audio Files: 100%|██████████| 200/200 [01:51<00:00,  1.80it/s]


In [14]:
X = np.array(X)
y = np.array(y)

# Save arrays to .npy files. The file names carry the actual number of samples,
# so a run with a different sample size cannot overwrite the arrays cached for
# another size (the testing cells load features200/400/500.npy separately).
n_samples = len(X)
np.save(f'D:\\IIIT Delhi\\7th Semester\\Courses\\BTP\\wav2vec2.0\\features{n_samples}.npy', X)
np.save(f'D:\\IIIT Delhi\\7th Semester\\Courses\\BTP\\wav2vec2.0\\labels{n_samples}.npy', y)

### Model Testing

200 samples (100 positive = Robot, 100 negative = Human)

In [16]:
# Reload the cached embeddings and labels saved for the 200-sample run.
X = np.load('D:\\IIIT Delhi\\7th Semester\\Courses\\BTP\\wav2vec2.0\\features200.npy')
y = np.load('D:\\IIIT Delhi\\7th Semester\\Courses\\BTP\\wav2vec2.0\\labels200.npy')

# Hold out 20% of the samples for testing. The split is seeded but not stratified,
# so the two classes need not be equally represented in the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest binary classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

In [17]:
# Predict labels for the held-out samples with the classifier fitted in the previous cell.
y_pred = clf.predict(X_test)

# Evaluate the classifier's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.2f}")

# Print the classification report (class 0 = Human, class 1 = Robot, per label_mapping)
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

Test Accuracy: 0.93
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.84      0.91        19
           1       0.88      1.00      0.93        21

    accuracy                           0.93        40
   macro avg       0.94      0.92      0.92        40
weighted avg       0.93      0.93      0.92        40



400 samples (200 positive = Robot, 200 negative = Human)

In [19]:
# Reload the cached embeddings and labels saved for the 400-sample run.
X = np.load('D:\\IIIT Delhi\\7th Semester\\Courses\\BTP\\wav2vec2.0\\features400.npy')
y = np.load('D:\\IIIT Delhi\\7th Semester\\Courses\\BTP\\wav2vec2.0\\labels400.npy')

# Hold out 20% of the samples for testing. The split is seeded but not stratified,
# so the two classes need not be equally represented in the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest binary classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

In [20]:
# Predict labels for the held-out samples with the classifier fitted in the previous cell.
y_pred = clf.predict(X_test)

# Evaluate the classifier's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.2f}")

# Print the classification report (class 0 = Human, class 1 = Robot, per label_mapping)
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

Test Accuracy: 0.97
Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.97        36
           1       1.00      0.95      0.98        44

    accuracy                           0.97        80
   macro avg       0.97      0.98      0.97        80
weighted avg       0.98      0.97      0.98        80



500 samples (250 positive = Robot, 250 negative = Human)

In [21]:
# Reload the cached embeddings and labels saved for the 500-sample run.
X = np.load('D:\\IIIT Delhi\\7th Semester\\Courses\\BTP\\wav2vec2.0\\features500.npy')
y = np.load('D:\\IIIT Delhi\\7th Semester\\Courses\\BTP\\wav2vec2.0\\labels500.npy')

# Hold out 20% of the samples for testing. The split is seeded but not stratified,
# so the two classes need not be equally represented in the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest binary classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

In [22]:
# Predict labels for the held-out samples with the classifier fitted in the previous cell.
y_pred = clf.predict(X_test)

# Evaluate the classifier's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.2f}")

# Print the classification report (class 0 = Human, class 1 = Robot, per label_mapping)
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

Test Accuracy: 0.99
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99        54
           1       0.98      1.00      0.99        46

    accuracy                           0.99       100
   macro avg       0.99      0.99      0.99       100
weighted avg       0.99      0.99      0.99       100

