In [2]:
import os
import glob
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from multiprocessing import Pool
import os, glob, numpy as np

ModuleNotFoundError: No module named 'torch'

In [2]:
# --- Select device and print it ---
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"[INFO] Using device: {device}")

[INFO] Using device: cuda


In [3]:
# First, make sure you have Polars installed:
# !pip install polars

import os, glob, time, threading
import numpy as np
import polars as pl
from concurrent.futures import ThreadPoolExecutor, as_completed

def _load_file_polars(fp_lbl):
    """
    Read one CSV with Polars (uses its own thread pool),
    extract 'magnitude' as a numpy float32 array.
    """
    fp, lbl = fp_lbl
    df = pl.read_csv(fp, columns=["magnitude"])
    df = df.with_columns(pl.col("magnitude").cast(pl.UInt32))
    return df["magnitude"].to_numpy(), lbl

def load_magnitude_data(base_dir, num_workers=None):
    """
    Notebook-compatible, fully parallel loader with 2-second status prints.
    
    Args:
      base_dir (str): root dir containing 'fake/' and 'real/' subfolders.
      num_workers (int, optional): max threads; defaults to all CPU cores.
    
    Returns:
      X (np.ndarray): stacked [n_samples, …] float32
      y (np.ndarray): [n_samples] int64 labels (0=fake,1=real)
    """
    cpu_cores = os.cpu_count() or 1
    workers = num_workers or cpu_cores

    # 1) discover files
    file_list = [
        (fp, lbl)
        for lbl, cls in enumerate(["fake", "real"])
        for fp in glob.glob(os.path.join(base_dir, cls, "*.csv"))
    ]
    total = len(file_list)
    print(f"[SETUP] Found {total} CSV files; using {workers} threads")

    # 2) progress printer
    count = 0
    stop_evt = threading.Event()
    def _printer():
        while not stop_evt.is_set():
            time.sleep(2)
            print(f"[PROGRESS] Loaded {count}/{total} files")
    threading.Thread(target=_printer, daemon=True).start()

    # 3) parallel load
    X_parts, y_parts = [], []
    t0 = time.time()
    with ThreadPoolExecutor(max_workers=workers) as exe:
        futures = {exe.submit(_load_file_polars, item): item for item in file_list}
        for future in as_completed(futures):
            arr, lbl = future.result()
            X_parts.append(arr)
            y_parts.append(lbl)
            count += 1

    # 4) wrap up
    stop_evt.set()
    elapsed = time.time() - t0
    X = np.vstack(X_parts)
    y = np.array(y_parts, dtype=np.int64)
    print(f"[DONE] {elapsed:.1f}s → X.shape={X.shape}, y.shape={y.shape}")
    return X, y

In [4]:
class VoiceClassifier(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 4096),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(4096, 1024),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(1024, 64),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )
    def forward(self, x):
        return self.net(x)

In [5]:
def make_loader(X, y, batch_size=32, shuffle=False):
    dataset = TensorDataset(torch.from_numpy(X), torch.from_numpy(y).unsqueeze(1).float())
    loader  = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
    print(f"[DATALOADER] Created loader with {len(loader)} batches of size {batch_size}")
    return loader

In [6]:
 print("=== LOADING DATA ===")
X_train, y_train = load_magnitude_data('./for-norm/for-norm/training', num_workers=50)
X_val,   y_val   = load_magnitude_data('./for-norm/for-norm/validation', num_workers=50)
X_test,  y_test  = load_magnitude_data('./for-norm/for-norm/testing', num_workers=50)

=== LOADING DATA ===
[SETUP] Found 53868 CSV files; using 50 threads
[PROGRESS] Loaded 1861/53868 files
[PROGRESS] Loaded 5989/53868 files
[PROGRESS] Loaded 10091/53868 files
[PROGRESS] Loaded 14193/53868 files
[PROGRESS] Loaded 18303/53868 files
[PROGRESS] Loaded 22338/53868 files
[PROGRESS] Loaded 26390/53868 files
[PROGRESS] Loaded 30436/53868 files
[PROGRESS] Loaded 34508/53868 files
[PROGRESS] Loaded 38475/53868 files
[PROGRESS] Loaded 42502/53868 files
[PROGRESS] Loaded 44736/53868 files
[PROGRESS] Loaded 45110/53868 files
[PROGRESS] Loaded 45503/53868 files
[PROGRESS] Loaded 45984/53868 files
[PROGRESS] Loaded 46322/53868 files
[PROGRESS] Loaded 46695/53868 files
[PROGRESS] Loaded 47033/53868 files
[PROGRESS] Loaded 47381/53868 files
[PROGRESS] Loaded 47711/53868 files
[PROGRESS] Loaded 48056/53868 files
[PROGRESS] Loaded 48476/53868 files
[PROGRESS] Loaded 48818/53868 files
[PROGRESS] Loaded 49259/53868 files
[PROGRESS] Loaded 49752/53868 files
[PROGRESS] Loaded 50260/53868 fil

In [12]:
print("=== SCALING FEATURES ===")
scaler   = StandardScaler()
X_train = scaler.fit_transform(X_train).astype(np.float32)
X_val   = scaler.transform(X_val).astype(np.float32)
X_test  = scaler.transform(X_test).astype(np.float32)
print(f"[SCALE] Feature means (first 5): {np.round(scaler.mean_[:5], 5)}")
print(f"[SCALE] Feature stddevs (first 5): {np.round(np.sqrt(scaler.var_[:5]), 5)}\n")

=== SCALING FEATURES ===
[SCALE] Feature means (first 5): [ 0.  0. -0.  0. -0.]
[SCALE] Feature stddevs (first 5): [1. 1. 1. 1. 1.]



In [27]:
print("=== PREPARING DATALOADERS ===")
train_loader = make_loader(X_train, y_train, batch_size=32, shuffle=True)
val_loader   = make_loader(X_val,   y_val,   batch_size=32, shuffle=False)
test_loader  = make_loader(X_test,  y_test,  batch_size=32, shuffle=False)
print()

=== PREPARING DATALOADERS ===
[DATALOADER] Created loader with 1684 batches of size 32
[DATALOADER] Created loader with 338 batches of size 32
[DATALOADER] Created loader with 145 batches of size 32



In [28]:
print("=== BUILDING MODEL ===")
model     = VoiceClassifier(X_train.shape[1]).to(device)
print(model)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
print()

=== BUILDING MODEL ===
VoiceClassifier(
  (net): Sequential(
    (0): Linear(in_features=39961, out_features=4096, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5, inplace=False)
    (3): Linear(in_features=4096, out_features=1024, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.5, inplace=False)
    (6): Linear(in_features=1024, out_features=64, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.5, inplace=False)
    (9): Linear(in_features=64, out_features=1, bias=True)
    (10): Sigmoid()
  )
)



In [29]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.backends.cudnn.benchmark = True   # enable cuDNN auto-tuner for best performance
print(f"[INFO] Using device: {device}")

[INFO] Using device: cuda


In [30]:
# Training loop with verbose printouts
epochs = 20
for epoch in range(1, epochs+1):
    print(f"\n=== Epoch {epoch}/{epochs} ===")
    model.train()
    running_loss, correct, total = 0.0, 0, 0

    # ---- Training ----
    for batch_idx, (xb, yb) in enumerate(train_loader, start=1):
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        preds = model(xb)
        loss  = criterion(preds, yb)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * xb.size(0)
        correct      += ((preds >= 0.5) == yb).sum().item()
        total        += yb.size(0)

        # Print batch progress every 64 batches
        if batch_idx % 10 == 0 or batch_idx == len(train_loader):
            avg_loss = running_loss / total
            avg_acc  = correct / total
            print(f"[TRAIN] Batch {batch_idx}/{len(train_loader)} - "
                  f"Avg Loss: {avg_loss:.4f}, Avg Acc: {avg_acc:.4f}")

    train_loss = running_loss / total
    train_acc  = correct / total
    print(f"[TRAIN] Epoch {epoch} summary: Loss: {train_loss:.4f}, Acc: {train_acc:.4f}")

    # ---- Validation ----
    model.eval()
    val_loss, correct, total = 0.0, 0, 0
    with torch.no_grad():
        for batch_idx, (xb, yb) in enumerate(val_loader, start=1):
            xb, yb = xb.to(device), yb.to(device)
            preds   = model(xb)
            loss    = criterion(preds, yb)
            val_loss += loss.item() * xb.size(0)
            correct  += ((preds >= 0.5) == yb).sum().item()
            total    += yb.size(0)

            # Print validation progress every 10 batches
            if batch_idx % 10 == 0 or batch_idx == len(val_loader):
                avg_vloss = val_loss / total
                avg_vacc  = correct / total
                print(f"[VALID] Batch {batch_idx}/{len(val_loader)} - "
                      f"Avg Loss: {avg_vloss:.4f}, Avg Acc: {avg_vacc:.4f}")

    val_loss /= total
    val_acc   = correct / total
    print(f"[VALID] Epoch {epoch} summary: Loss: {val_loss:.4f}, Acc: {val_acc:.4f}")


# Test
model.eval()
test_loss, correct, total = 0, 0, 0
with torch.no_grad():
    for xb, yb in test_loader:
        xb, yb = xb.to(device), yb.to(device)
        preds    = model(xb)
        loss     = criterion(preds, yb)
        test_loss += loss.item() * xb.size(0)
        correct   += ((preds >= 0.5) == yb).sum().item()
        total     += yb.size(0)
test_loss /= total
test_acc   = correct / total
print(f"\nTest Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}")

# Save
torch.save(model.state_dict(), 'voice_magnitude_classifier.pth')
print("Model saved to voice_magnitude_classifier.pth")


=== Epoch 1/20 ===
[TRAIN] Batch 10/1684 - Avg Loss: 21.1243, Avg Acc: 0.5469
[TRAIN] Batch 20/1684 - Avg Loss: 26.1742, Avg Acc: 0.5609
[TRAIN] Batch 30/1684 - Avg Loss: 30.8561, Avg Acc: 0.5531
[TRAIN] Batch 40/1684 - Avg Loss: 33.8437, Avg Acc: 0.5477
[TRAIN] Batch 50/1684 - Avg Loss: 35.7949, Avg Acc: 0.5406
[TRAIN] Batch 60/1684 - Avg Loss: 35.3140, Avg Acc: 0.5469
[TRAIN] Batch 70/1684 - Avg Loss: 34.8630, Avg Acc: 0.5500
[TRAIN] Batch 80/1684 - Avg Loss: 34.6031, Avg Acc: 0.5488
[TRAIN] Batch 90/1684 - Avg Loss: 35.8709, Avg Acc: 0.5417
[TRAIN] Batch 100/1684 - Avg Loss: 36.2780, Avg Acc: 0.5419
[TRAIN] Batch 110/1684 - Avg Loss: 36.2081, Avg Acc: 0.5437
[TRAIN] Batch 120/1684 - Avg Loss: 36.1214, Avg Acc: 0.5437
[TRAIN] Batch 130/1684 - Avg Loss: 35.1385, Avg Acc: 0.5474
[TRAIN] Batch 140/1684 - Avg Loss: 33.6574, Avg Acc: 0.5516
[TRAIN] Batch 150/1684 - Avg Loss: 32.1388, Avg Acc: 0.5600
[TRAIN] Batch 160/1684 - Avg Loss: 31.0232, Avg Acc: 0.5621
[TRAIN] Batch 170/1684 - Avg 

KeyboardInterrupt: 

In [None]:
print("=== EVALUATING ON TEST SET ===")
model.eval()
test_loss = 0.0
test_correct = 0
test_total = 0
with torch.no_grad():
    for batch_idx, (xb, yb) in enumerate(test_loader, 1):
        xb, yb = xb.to(device), yb.to(device)
        outputs = model(xb)
        loss = criterion(outputs, yb)
        test_loss += loss.item() * xb.size(0)
        preds = (outputs >= 0.5).float()
        test_correct += (preds == yb).sum().item()
        test_total += yb.size(0)
        print(f"[TEST] Batch {batch_idx}/{len(test_loader)} - Loss: {loss.item():.4f}")
test_loss /= test_total
test_acc = test_correct / test_total
print(f"[TEST] → Final Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}\n")

print("=== SAVING MODEL ===")
torch.save(model.state_dict(), 'voice_magnitude_classifier.pth')
print("[INFO] Model saved to voice_magnitude_classifier.pth")

In [53]:
import torch
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report
)

# assume:
#   model       – your trained PyTorch model
#   test_loader – DataLoader for your test set
#   device      – 'cuda' or 'cpu'

model.eval()
y_true = []
y_pred = []
y_prob = []

with torch.no_grad():
    for xb, yb in test_loader:
        xb, yb = xb.to(device), yb.to(device)
        out = model(xb).view(-1)       # shape (batch,)
        probs = out.cpu().numpy()      # probabilities
        preds = (probs >= 0.5).astype(int)
        y_true.extend(yb.cpu().numpy())
        y_pred.extend(preds)
        y_prob.extend(probs)

y_true = np.array(y_true)
y_pred = np.array(y_pred)
y_prob = np.array(y_prob)

# Calculate metrics
acc    = accuracy_score(y_true, y_pred)
prec   = precision_score(y_true, y_pred)
rec    = recall_score(y_true, y_pred)
f1     = f1_score(y_true, y_pred)
auc    = roc_auc_score(y_true, y_prob)
cm     = confusion_matrix(y_true, y_pred)
report = classification_report(y_true, y_pred, target_names=['fake','real'])

# Print them
print("=== Evaluation Metrics ===")
print(f"Accuracy      : {acc:.4f}")
print(f"Precision     : {prec:.4f}")
print(f"Recall        : {rec:.4f}")
print(f"F1-Score      : {f1:.4f}")
print(f"ROC AUC Score : {auc:.4f}\n")

print("Confusion Matrix:")
print(cm, "\n")

print("Classification Report:")
print(report)


=== Evaluation Metrics ===
Accuracy      : 0.7381
Precision     : 0.6650
Recall        : 0.9596
F1-Score      : 0.7856
ROC AUC Score : 0.9057

Confusion Matrix:
[[281 263]
 [ 22 522]] 

Classification Report:
              precision    recall  f1-score   support

        fake       0.93      0.52      0.66       544
        real       0.66      0.96      0.79       544

    accuracy                           0.74      1088
   macro avg       0.80      0.74      0.72      1088
weighted avg       0.80      0.74      0.72      1088



In [None]:
#!/usr/bin/env python3
import os, glob
import numpy as np, pandas as pd
from sklearn.preprocessing import StandardScaler
import torch, torch.nn as nn, torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, WeightedRandomSampler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# 1) MODEL DEFINITION
class VoiceClassifier(nn.Module):
    def __init__(self, input_dim=16000):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),

            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),

            nn.Linear(256, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),

            nn.Linear(64, 1)  # logits out
        )
    def forward(self, x):
        return self.net(x).squeeze(1)

# 2) DATA LOADING
def load_mag(dirpath):
    X, y = [], []
    for lbl, cls in enumerate(['fake','real']):
        for f in glob.glob(os.path.join(dirpath,cls,'*.csv')):
            df = pd.read_csv(f, usecols=['magnitude'])
            X.append(df['magnitude'].values)
            y.append(lbl)
    return np.vstack(X).astype(np.float32), np.array(y)

X_train, y_train = load_mag('./training')
X_val,   y_val   = load_mag('./validation')
X_test,  y_test  = load_mag('./testing')

# 3) SCALE
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val   = scaler.transform(X_val)
X_test  = scaler.transform(X_test)

# 4) BALANCED SAMPLER
class_counts = np.bincount(y_train)
class_weights= 1. / class_counts
sample_weights = class_weights[y_train]
sampler = WeightedRandomSampler(sample_weights, len(sample_weights), replacement=True)

def make_loader(X,y, batch, sampler=None):
    tx = torch.from_numpy(X)
    ty = torch.from_numpy(y).float()
    ds = TensorDataset(tx, ty)
    return DataLoader(ds, batch, shuffle=(sampler is None), sampler=sampler)

train_loader = make_loader(X_train, y_train, batch=64, sampler=sampler)
val_loader   = make_loader(X_val,   y_val,   batch=64)
test_loader  = make_loader(X_test,  y_test,  batch=64)

# 5) SETUP
device    = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model     = VoiceClassifier(X_train.shape[1]).to(device)
criterion = nn.BCEWithLogitsLoss() 
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max', factor=0.5, patience=3, verbose=True)

# 6) TRAIN/VAL LOOP WITH EARLY STOPPING
best_val_acc, patience, counter = 0.0, 5, 0
for epoch in range(1, 31):
    # — TRAIN —
    model.train()
    all_preds, all_labels = [], []
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        logits = model(xb)
        loss   = criterion(logits, yb)
        loss.backward(); optimizer.step()
        preds = (torch.sigmoid(logits)>=0.5).long()
        all_preds.append(preds.cpu().numpy()); all_labels.append(yb.cpu().numpy())
    train_acc = accuracy_score(np.concatenate(all_labels), 
                               np.concatenate(all_preds))

    # — VALIDATE —
    model.eval()
    v_preds, v_labels = [], []
    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(device), yb.to(device)
            logits = model(xb)
            preds  = (torch.sigmoid(logits)>=0.5).long()
            v_preds.append(preds.cpu().numpy()); v_labels.append(yb.cpu().numpy())
    v_preds = np.concatenate(v_preds); v_labels = np.concatenate(v_labels)
    val_acc = accuracy_score(v_labels, v_preds)

    print(f"Epoch {epoch:02d} | Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f}")

    scheduler.step(val_acc)
    if val_acc > best_val_acc:
        best_val_acc = val_acc; counter = 0
        torch.save(model.state_dict(),'best.pth')
    else:
        counter += 1
        if counter >= patience:
            print("Early stopping!") 
            break

# 7) TEST WITH BEST MODEL
model.load_state_dict(torch.load('best.pth'))
model.eval()
t_preds, t_labels, t_probs = [], [], []
with torch.no_grad():
    for xb, yb in test_loader:
        xb, yb = xb.to(device), yb.to(device)
        logits = model(xb)
        probs  = torch.sigmoid(logits).cpu().numpy()
        preds  = (probs>=0.5).astype(int)
        t_probs.append(probs); t_preds.append(preds); t_labels.append(yb.cpu().numpy())

t_preds  = np.concatenate(t_preds)
t_labels = np.concatenate(t_labels)
t_probs  = np.concatenate(t_probs)

print("\n=== Test Metrics ===")
print("Accuracy :", accuracy_score(t_labels, t_preds))
print("Precision:", precision_score(t_labels, t_preds))
print("Recall   :", recall_score(t_labels, t_preds))
print("F1-score :", f1_score(t_labels, t_preds))
