In [1]:
import os
import librosa
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Parameters
SAMPLE_RATE = 16000
N_MFCC = 40
DURATION = 3  # seconds
MAX_LEN = SAMPLE_RATE * DURATION

In [3]:
# 1. Dataset Class
class VoiceDataset(Dataset):
    def __init__(self, file_paths, labels):
        self.file_paths = file_paths
        self.labels = labels

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        file_path = self.file_paths[idx]
        label = self.labels[idx]
        
        y, sr = librosa.load(file_path, sr=SAMPLE_RATE)
        y = librosa.util.fix_length(y,  size=MAX_LEN)  # pad or truncate
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=N_MFCC)
        mfcc = torch.tensor(mfcc, dtype=torch.float32)

        return mfcc.unsqueeze(0), torch.tensor(label, dtype=torch.long)

In [4]:
# 2. Load File Paths
def load_data(data_dir):
    file_paths = []
    labels = []
    for label_name in os.listdir(data_dir):
        label_path = os.path.join(data_dir, label_name)
        for file_name in os.listdir(label_path):
            if file_name.endswith(".wav"):
                file_paths.append(os.path.join(label_path, file_name))
                labels.append(label_name)
    return file_paths, labels

In [5]:
# 3. Prepare Data
file_paths, labels = load_data("data")
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

train_paths, test_paths, train_labels, test_labels = train_test_split(
    file_paths, encoded_labels, test_size=0.2, random_state=42
)

train_dataset = VoiceDataset(train_paths, train_labels)
test_dataset = VoiceDataset(test_paths, test_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

In [6]:
# 4. Define Model
class VoiceClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        
        # Placeholder to set later
        self.flatten_dim = None
        self.fc1 = None
        self.fc2 = nn.Linear(64, 2)

    def forward(self, x):
        x = self.pool(torch.relu(self.conv1(x)))  # e.g. [B, 16, 20, 44]
        x = self.pool(torch.relu(self.conv2(x)))  # e.g. [B, 32, 10, 22]

        if self.fc1 is None:
            self.flatten_dim = x.view(x.size(0), -1).size(1)
            self.fc1 = nn.Linear(self.flatten_dim, 64).to(x.device)

        x = x.view(x.size(0), -1)
        x = torch.relu(self.fc1(x))
        return self.fc2(x)


In [7]:
# 5. Training Loop
def train(model, loader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    for inputs, targets in loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

In [8]:
# 6. Evaluation
def evaluate(model, loader, device):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, targets in loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == targets).sum().item()
            total += targets.size(0)
    return correct / total

In [9]:
# 7. Run Training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('Device:', device)
model = VoiceClassifier().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(10):
    train_loss = train(model, train_loader, criterion, optimizer, device)
    acc = evaluate(model, test_loader, device)
    print(f"Epoch {epoch+1}, Loss: {train_loss:.4f}, Test Acc: {acc:.4f}")

Device: cpu
Epoch 1, Loss: 0.4575, Test Acc: 0.9783
Epoch 2, Loss: 0.0847, Test Acc: 1.0000
Epoch 3, Loss: 0.0317, Test Acc: 1.0000
Epoch 4, Loss: 0.0162, Test Acc: 1.0000
Epoch 5, Loss: 0.0101, Test Acc: 1.0000
Epoch 6, Loss: 0.0071, Test Acc: 1.0000
Epoch 7, Loss: 0.0057, Test Acc: 1.0000
Epoch 8, Loss: 0.0047, Test Acc: 1.0000
Epoch 9, Loss: 0.0040, Test Acc: 1.0000
Epoch 10, Loss: 0.0035, Test Acc: 1.0000


In [10]:
def predict_audio(file_path, model, label_encoder, device):
    y, sr = librosa.load(file_path, sr=SAMPLE_RATE)
    y = librosa.util.fix_length(y, size=MAX_LEN)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=N_MFCC)
    mfcc_tensor = torch.tensor(mfcc, dtype=torch.float32).unsqueeze(0).unsqueeze(0).to(device)

    model.eval()
    with torch.no_grad():
        outputs = model(mfcc_tensor)
        predicted_class = torch.argmax(outputs, dim=1).item()
        class_label = label_encoder.inverse_transform([predicted_class])[0]
        confidence = torch.softmax(outputs, dim=1)[0][predicted_class].item()

    return class_label, confidence


In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

file_path = "data/human/00f63fe8-8b98-4988-bbf4-a3c869e026ad.wav"
label, conf = predict_audio(file_path, model, label_encoder, device)
print(f"Prediction: {label} (confidence: {conf:.2f})")


Prediction: human (confidence: 1.00)


In [14]:
torch.save(model.state_dict, 'audo.pth')

In [19]:
model = VoiceClassifier()
model.load_state_dict(torch.load("voice_classifier.pth"))

<All keys matched successfully>

In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

file_path = "data/human/00f63fe8-8b98-4988-bbf4-a3c869e026ad.wav"
label, conf = predict_audio(file_path, model, label_encoder, device)
print(f"Prediction: {label} (confidence: {conf:.2f})")

Prediction: human (confidence: 0.96)


In [15]:
!ls

 Assamese.tar		   sample_voice_data
'Audio Classifier.ipynb'  'Speech Dataset of Human and AI-Generated Voices'
 audo.pth		  'Speech Dataset of Human and AI-Generated Voices.zip'
 data			   voice_classifier.pth
 ivr-audio-prompts


In [17]:
!ls -la

total 1430708
drwxr-xr-x 7 mithlesh.kumar2 domain users       4096 May  5 13:07  .
drwxr-xr-x 5 mithlesh.kumar2 domain users       4096 May  5 10:14  ..
-rw-r--r-- 1 mithlesh.kumar2 domain users          0 May  5 12:48  Assamese.tar
-rw-r--r-- 1 mithlesh.kumar2 domain users       9096 May  5 12:58 'Audio Classifier.ipynb'
-rw-r--r-- 1 mithlesh.kumar2 domain users    1909091 May  5 13:07  audo.pth
drwxr-xr-x 4 mithlesh.kumar2 domain users       4096 May  5 11:01  data
drwxr-xr-x 2 mithlesh.kumar2 domain users       4096 May  5 11:38  .ipynb_checkpoints
drwxr-xr-x 3 mithlesh.kumar2 domain users       4096 May  5 12:52  ivr-audio-prompts
drwxr-xr-x 5 mithlesh.kumar2 domain users       4096 May  5 12:36  sample_voice_data
drwxr-xr-x 3 mithlesh.kumar2 domain users       4096 May  5 11:42 'Speech Dataset of Human and AI-Generated Voices'
-rw-r--r-- 1 mithlesh.kumar2 domain users 1463059367 May  5 10:34 'Speech Dataset of Human and AI-Generated Voices.zip'
-rw-r--r-- 1 mithlesh.ku