In [None]:
!pip install torch torchaudio transformers librosa scikit-learn numpy

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
import os
import numpy as np
import librosa
import torch
from transformers import Wav2Vec2FeatureExtractor, HubertModel
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Load the pretrained HuBERT encoder together with its matching feature extractor.
model_name = "facebook/hubert-base-ls960"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = HubertModel.from_pretrained(model_name)
# The encoder is only used as a frozen feature extractor, so make inference
# mode explicit rather than relying on the loader's default.
model.eval()

# Dataset paths. The dataset root can be overridden through the
# PNEUMONIA_AUDIO_DIR environment variable; when it is unset the original
# Google Drive location is used, so existing runs behave as before.
DATASET_PATH = os.environ.get(
    "PNEUMONIA_AUDIO_DIR",
    "/content/drive/MyDrive/Pneumonia_Project/Datasets/Audio/Audio_Final_Dataset",
)
PNEUMONIA_PATH = os.path.join(DATASET_PATH, "Pneumonia")
NON_PNEUMONIA_PATH = os.path.join(DATASET_PATH, "Non-Pneumonia")

def extract_features(audio_file, min_samples=400):
    """Extract a mean-pooled HuBERT embedding from an audio file.

    The file is loaded with librosa at 16 kHz, run through the module-level
    ``feature_extractor`` and ``model``, and the model's ``last_hidden_state``
    is averaged over dim 1 to produce one fixed-size vector per file.

    Args:
        audio_file: Path of the audio file to embed.
        min_samples: Minimum number of 16 kHz samples a clip must contain to be
            embedded. Shorter (including empty) clips are skipped up front
            instead of failing inside the model's convolutional front end.
            NOTE(review): the default of 400 is assumed to match the receptive
            field of the hubert-base conv feature encoder -- confirm against
            the model config if a different checkpoint is used.

    Returns:
        A 1-D numpy array holding the pooled embedding, or ``None`` when the
        clip is too short or any error occurs while processing it.
    """
    try:
        audio, _ = librosa.load(audio_file, sr=16000)

        # Empty / near-empty recordings otherwise trigger NumPy mean/var
        # warnings during normalisation and an opaque "Kernel size can't be
        # greater than actual input size" error in the first conv layer.
        if audio.size < min_samples:
            print(
                f"Skipping {audio_file}: only {audio.size} samples "
                f"(need at least {min_samples})"
            )
            return None

        input_values = feature_extractor(audio, sampling_rate=16000, return_tensors="pt").input_values
        with torch.no_grad():
            outputs = model(input_values).last_hidden_state
        # Average pooling over dim 1; squeeze only the batch dimension so the
        # result is always a 1-D vector.
        return torch.mean(outputs, dim=1).squeeze(0).numpy()
    except Exception as e:
        # Best-effort: report the failing file and let the caller skip it.
        print(f"Error processing {audio_file}: {e}")
        return None

# Load dataset
def load_dataset():
    """Embed every WAV file in the two class folders.

    Files under ``NON_PNEUMONIA_PATH`` receive label 0 and files under
    ``PNEUMONIA_PATH`` receive label 1. Files for which ``extract_features``
    returns ``None`` are left out.

    Returns:
        A ``(data, labels)`` tuple of numpy arrays: one embedding per kept
        file and the matching integer class labels, in the same order.
    """
    data, labels = [], []

    for label, folder in enumerate([NON_PNEUMONIA_PATH, PNEUMONIA_PATH]):
        # os.listdir returns entries in arbitrary order; sorting keeps the row
        # order (and therefore any seeded train/test split) reproducible.
        for file in sorted(os.listdir(folder)):
            # Case-insensitive match so ".WAV" recordings are not dropped.
            if not file.lower().endswith(".wav"):
                continue
            embedding = extract_features(os.path.join(folder, file))
            if embedding is not None:
                data.append(embedding)
                labels.append(label)

    return np.array(data), np.array(labels)

# Prepare data
X, y = load_dataset()

# Train/test split. The classes are heavily imbalanced, so stratify on the
# labels to keep the class ratio the same in both partitions; an unstratified
# split can leave the test fold with very few (or no) pneumonia samples.
# Note: stratification requires at least two samples of every class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train classifier (linear SVM on the pooled HuBERT embeddings)
svm = SVC(kernel="linear", probability=True)
svm.fit(X_train, y_train)

# Evaluate model on the held-out split
y_pred = svm.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))


pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

  normed_input_values = [(x - x.mean()) / np.sqrt(x.var() + 1e-7) for x in input_values]
  ret = ret.dtype.type(ret / rcount)
  normed_input_values = [(x - x.mean()) / np.sqrt(x.var() + 1e-7) for x in input_values]
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


Error processing /content/drive/MyDrive/Pneumonia_Project/Datasets/Audio/Audio_Final_Dataset/Pneumonia/6T43bddKoKfG7MwnJWvrPZSsyrc2_breathing-shallow.wav: Calculated padded input size per channel: (0). Kernel size: (10). Kernel size can't be greater than actual input size
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.99       244
           1       0.82      0.64      0.72        14

    accuracy                           0.97       258
   macro avg       0.90      0.82      0.85       258
weighted avg       0.97      0.97      0.97       258

Accuracy: 0.9728682170542635
