In [6]:
import numpy as np
import librosa
import torch
import torch.nn as nn
from torchvision.models import resnet18
import argparse

# Audio preprocessing parameters
SAMPLE_RATE = 16000
DURATION = 3.0
N_MELS = 128
HOP_LENGTH = 512
N_FFT = 2048

# Define the model architecture (must match training)
class AudioClassifier(nn.Module):
    def __init__(self, num_classes=2):
        super().__init__()
        self.resnet = resnet18(pretrained=False)
        self.resnet.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, num_classes)
    
    def forward(self, x):
        return self.resnet(x)

def preprocess_audio(file_path):
    # Load audio
    try:
        audio, sr = librosa.load(file_path, sr=SAMPLE_RATE, duration=DURATION)
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        audio = np.zeros(int(DURATION * SAMPLE_RATE))
        sr = SAMPLE_RATE
    
    # Pad or trim to fixed length
    target_length = int(DURATION * SAMPLE_RATE)
    if len(audio) < target_length:
        audio = np.pad(audio, (0, target_length - len(audio)))
    else:
        audio = audio[:target_length]
    
    # Convert to mel-spectrogram
    mel_spec = librosa.feature.melspectrogram(
        y=audio, sr=SAMPLE_RATE, n_mels=N_MELS, hop_length=HOP_LENGTH, n_fft=N_FFT
    )
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    
    # Normalize
    mel_spec_db = (mel_spec_db - mel_spec_db.mean()) / (mel_spec_db.std() + 1e-8)
    
    # Convert to tensor
    mel_spec_tensor = torch.tensor(mel_spec_db, dtype=torch.float32).unsqueeze(0).unsqueeze(0)  # [1, 1, n_mels, time]
    
    return mel_spec_tensor

def infer_audio(file_path, model_path="best_model.pt"):
    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    
    # Load model
    model = AudioClassifier()
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    model.eval()
    
    # Preprocess audio
    input_tensor = preprocess_audio(file_path)
    input_tensor = input_tensor.to(device)
    
    # Perform inference
    with torch.no_grad():
        output = model(input_tensor)
        pred = torch.argmax(output, dim=1).cpu().numpy()[0]
    
    # Return result
    return "FAKE" if pred == 1 else "REAL"

def main():
    parser = argparse.ArgumentParser(description="Perform inference on an audio file using a trained model.")
    parser.add_argument("/home/ub/Downloads/fake.wav", type=str, help="Path to the audio file (.wav)")
    parser.add_argument("--model_path", type=str, default="best_model.pt", help="Path to the trained model file")
    args = parser.parse_args()
    
    result = infer_audio(args.audio_file, args.model_path)
    print(f"Prediction for {args.audio_file}: {result}")

if __name__ == "__main__":
    main()

AttributeError: partially initialized module 'torchvision' has no attribute 'extension' (most likely due to a circular import)

In [8]:
import numpy as np
import librosa
import torch
import torch.nn as nn
from torchvision.models import resnet18
import argparse
import os

# Audio preprocessing parameters
SAMPLE_RATE = 16000
DURATION = 3.0
N_MELS = 128
HOP_LENGTH = 512
N_FFT = 2048

# Define the model architecture (must match training)
class AudioClassifier(nn.Module):
    def __init__(self, num_classes=2):
        super().__init__()
        self.resnet = resnet18(pretrained=False)
        self.resnet.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, num_classes)
    
    def forward(self, x):
        return self.resnet(x)

def preprocess_audio(file_path="/home/ub/Downloads/fake.wav"):
    # Validate file existence
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Audio file not found: {file_path}")
    
    # Load audio
    try:
        audio, sr = librosa.load(file_path, sr=SAMPLE_RATE, duration=DURATION)
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        audio = np.zeros(int(DURATION * SAMPLE_RATE))
        sr = SAMPLE_RATE
    
    # Pad or trim to fixed length
    target_length = int(DURATION * SAMPLE_RATE)
    if len(audio) < target_length:
        audio = np.pad(audio, (0, target_length - len(audio)))
    else:
        audio = audio[:target_length]
    
    # Convert to mel-spectrogram
    mel_spec = librosa.feature.melspectrogram(
        y=audio, sr=SAMPLE_RATE, n_mels=N_MELS, hop_length=HOP_LENGTH, n_fft=N_FFT
    )
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    
    # Normalize
    mel_spec_db = (mel_spec_db - mel_spec_db.mean()) / (mel_spec_db.std() + 1e-8)
    
    # Convert to tensor
    mel_spec_tensor = torch.tensor(mel_spec_db, dtype=torch.float32).unsqueeze(0).unsqueeze(0)  # [1, 1, n_mels, time]
    
    return mel_spec_tensor

def infer_audio(file_path, model_path="best_model.pt"):
    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    
    # Load model
    try:
        model = AudioClassifier()
        model.load_state_dict(torch.load(model_path, map_location=device))
        model.to(device)
        model.eval()
    except Exception as e:
        raise RuntimeError(f"Error loading model from {model_path}: {e}")
    
    # Preprocess audio
    input_tensor = preprocess_audio(file_path)
    input_tensor = input_tensor.to(device)
    
    # Perform inference
    with torch.no_grad():
        output = model(input_tensor)
        pred = torch.argmax(output, dim=1).cpu().numpy()[0]
    
    # Return result
    return "FAKE" if pred == 1 else "REAL"

def main():
    parser = argparse.ArgumentParser(description="Perform inference on an audio file using a trained model.")
    parser.add_argument("audio_file", type=str, help="Path to the audio file (.wav)")
    parser.add_argument("--model_path", type=str, default="best_model.pt", help="Path to the trained model file")
    args = parser.parse_args()
    
    try:
        result = infer_audio(args.audio_file, args.model_path)
        print(f"Prediction for {args.audio_file}: {result}")
    except Exception as e:
        print(f"Error during inference: {e}")

if __name__ == "__main__":
    main()

AttributeError: partially initialized module 'torchvision' has no attribute 'extension' (most likely due to a circular import)

In [1]:
import numpy as np
import librosa
import torch
import torch.nn as nn
from torchvision.models import resnet18
import argparse
import os
import sys

# Check PyTorch and Torchvision versions
try:
    print(f"PyTorch version: {torch.__version__}")
    import torchvision
    print(f"Torchvision version: {torchvision.__version__}")
except ImportError as e:
    print(f"Error importing torch or torchvision: {e}")
    print("Please ensure compatible versions are installed, e.g., torch==2.3.0 and torchvision==0.18.0")
    sys.exit(1)

# Audio preprocessing parameters
SAMPLE_RATE = 16000
DURATION = 3.0
N_MELS = 128
HOP_LENGTH = 512
N_FFT = 2048

# Define the model architecture (must match training)
class AudioClassifier(nn.Module):
    def __init__(self, num_classes=2):
        super().__init__()
        self.resnet = resnet18(pretrained=False)
        self.resnet.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, num_classes)
    
    def forward(self, x):
        return self.resnet(x)

def preprocess_audio(file_path):
    # Validate file existence
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Audio file not found: {file_path}")
    
    # Load audio
    try:
        audio, sr = librosa.load(file_path, sr=SAMPLE_RATE, duration=DURATION)
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        audio = np.zeros(int(DURATION * SAMPLE_RATE))
        sr = SAMPLE_RATE
    
    # Pad or trim to fixed length
    target_length = int(DURATION * SAMPLE_RATE)
    if len(audio) < target_length:
        audio = np.pad(audio, (0, target_length - len(audio)))
    else:
        audio = audio[:target_length]
    
    # Convert to mel-spectrogram
    mel_spec = librosa.feature.melspectrogram(
        y=audio, sr=SAMPLE_RATE, n_mels=N_MELS, hop_length=HOP_LENGTH, n_fft=N_FFT
    )
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    
    # Normalize
    mel_spec_db = (mel_spec_db - mel_spec_db.mean()) / (mel_spec_db.std() + 1e-8)
    
    # Convert to tensor
    mel_spec_tensor = torch.tensor(mel_spec_db, dtype=torch.float32).unsqueeze(0).unsqueeze(0)  # [1, 1, n_mels, time]
    
    return mel_spec_tensor

def infer_audio(file_path, model_path="best_model.pt"):
    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    
    # Load model
    try:
        model = AudioClassifier()
        model.load_state_dict(torch.load(model_path, map_location=device))
        model.to(device)
        model.eval()
    except Exception as e:
        raise RuntimeError(f"Error loading model from {model_path}: {e}")
    
    # Preprocess audio
    input_tensor = preprocess_audio(file_path)
    input_tensor = input_tensor.to(device)
    
    # Perform inference
    with torch.no_grad():
        output = model(input_tensor)
        pred = torch.argmax(output, dim=1).cpu().numpy()[0]
    
    # Return result
    return "FAKE" if pred == 1 else "REAL"

def main():
    parser = argparse.ArgumentParser(description="Perform inference on an audio file using a trained model.")
    parser.add_argument("audio_file", type=str, help="Path to the audio file (.wav)")
    parser.add_argument("--model_path", type=str, default="best_model.pt", help="Path to the trained model file")
    args = parser.parse_args()
    
    try:
        result = infer_audio(args.audio_file, args.model_path)
        print(f"Prediction for {args.audio_file}: {result}")
    except Exception as e:
        print(f"Error during inference: {e}")
        sys.exit(1)

if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm


PyTorch version: 2.3.0+cu121
Torchvision version: 0.18.0+cu121


usage: ipykernel_launcher.py [-h] [--model_path MODEL_PATH] audio_file
ipykernel_launcher.py: error: unrecognized arguments: -f


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [12]:
pip uninstall torch torchvision torchaudio
pip install torch==2.3.0 torchvision==0.18.0

SyntaxError: invalid syntax (3398162282.py, line 1)

In [None]:
import numpy as np
import librosa
import torch
import torch.nn as nn
from torchvision.models import resnet18
import argparse
import os
import sys

# Check PyTorch and Torchvision versions
try:
    print(f"PyTorch version: {torch.__version__}")
    import torchvision
    print(f"Torchvision version: {torchvision.__version__}")
except ImportError as e:
    print(f"Error importing torch or torchvision: {e}")
    print("Please ensure compatible versions are installed.")
    sys.exit(1)

# Audio preprocessing parameters
SAMPLE_RATE = 16000
DURATION = 3.0
N_MELS = 128
HOP_LENGTH = 512
N_FFT = 2048

# Define the model architecture (same as training)
class AudioClassifier(nn.Module):
    def __init__(self, num_classes=2):
        super().__init__()
        self.resnet = resnet18(pretrained=False)
        self.resnet.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, num_classes)
    
    def forward(self, x):
        return self.resnet(x)

def preprocess_audio(file_path):
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Audio file not found: {file_path}")
    
    try:
        audio, sr = librosa.load(file_path, sr=SAMPLE_RATE, duration=DURATION)
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        audio = np.zeros(int(DURATION * SAMPLE_RATE))
    
    target_length = int(DURATION * SAMPLE_RATE)
    if len(audio) < target_length:
        audio = np.pad(audio, (0, target_length - len(audio)))
    else:
        audio = audio[:target_length]
    
    mel_spec = librosa.feature.melspectrogram(
        y=audio, sr=SAMPLE_RATE, n_mels=N_MELS, hop_length=HOP_LENGTH, n_fft=N_FFT
    )
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    mel_spec_db = (mel_spec_db - mel_spec_db.mean()) / (mel_spec_db.std() + 1e-8)
    mel_spec_tensor = torch.tensor(mel_spec_db, dtype=torch.float32).unsqueeze(0).unsqueeze(0)
    return mel_spec_tensor

def infer_audio(file_path, model_path="best_model.pt"):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    
    model = AudioClassifier()
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    model.eval()
    
    input_tensor = preprocess_audio(file_path).to(device)
    
    with torch.no_grad():
        output = model(input_tensor)
        pred = torch.argmax(output, dim=1).cpu().item()
    
    return "FAKE" if pred == 1 else "REAL"

def main():
    parser = argparse.ArgumentParser(description="Audio inference using a trained model.")
    parser.add_argument("audio_file", type=str, help="Path to the input audio file (.wav)")
    parser.add_argument("--model_path", type=str, default="best_model.pt", help="Path to the model checkpoint")
    args = parser.parse_args()
    
    try:
        result = infer_audio(args.audio_file, args.model_path)
        print(f"Prediction for {args.audio_file}: {result}")
    except Exception as e:
        print(f"Error during inference: {e}")
        sys.exit(1)

# Main entry
if __name__ == "__main__":
    if "ipykernel" in sys.modules:
        # Inside Jupyter/IPython
        test_audio = "path/to/your_audio.wav"  # <-- Replace with real .wav file
        model_path = "best_model.pt"
        result = infer_audio(test_audio, model_path)
        print(f"Prediction for {test_audio}: {result}")
    else:
        # CLI mode
        main()
