In [None]:
# Colab-only step: `files.upload()` shows a file-picker widget in the cell output.
# The value it returns is kept in `uploaded` (presumably a mapping of
# filename -> file contents; confirm against the Colab documentation).
from google.colab import files
uploaded = files.upload()  # Click "Choose Files" and select your 4 WAV files


In [None]:
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification
import torch
import librosa
import numpy as np

# Pick the compute device first: GPU when CUDA is available, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Only a feature extractor is loaded here (rather than a full processor),
# together with the sequence-classification model from the same checkpoint.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-er")
model = Wav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-base-superb-er")

# Move the weights to the selected device and report where they ended up.
model.to(device)
print("Model loaded on", device)


In [None]:
def predict_emotion(audio_file, noise_level=0.02):
    """Classify the emotion of an audio clip after adding Gaussian noise.

    Args:
        audio_file: Path (or file-like object) accepted by ``librosa.load``.
        noise_level: Scale applied to standard-normal noise before it is
            added to the waveform. ``0`` leaves the signal untouched.

    Returns:
        str: Human-readable emotion name for the highest-scoring class.
    """
    # Load audio at the 16 kHz rate requested from librosa.
    speech, sr = librosa.load(audio_file, sr=16000)

    # Add noise dynamically, then clip back into the [-1, 1] range.
    noise = np.random.randn(len(speech))
    speech_noisy = np.clip(speech + noise_level * noise, -1.0, 1.0)

    # Prepare input
    inputs = feature_extractor(speech_noisy, sampling_rate=sr, return_tensors="pt").to(device)

    # Predict
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_id = torch.argmax(logits, dim=-1).item()

    # Take the class name from the checkpoint's own id -> label table instead
    # of a hand-written list: a hard-coded order can silently disagree with
    # the order the model was trained with and mislabel predictions.
    raw_label = model.config.id2label[predicted_id]

    # Expand the short codes to the display names this notebook has always
    # returned; unknown codes are passed through unchanged.
    display_names = {"neu": "neutral", "hap": "happy", "ang": "angry", "sad": "sad"}
    return display_names.get(raw_label, raw_label)


In [None]:
!pip install gradio --quiet


In [None]:
import gradio as gr
import matplotlib.pyplot as plt
import librosa
import librosa.display
import numpy as np
import torch
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification

# --------------------------
# 1️⃣ Load Pretrained Model
# --------------------------
# Choose the compute device: GPU when CUDA is available, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Feature extractor and classifier both come from the same checkpoint.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-er")
model = Wav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-base-superb-er")
model.to(device)

# --------------------------
# 2️⃣ Noise Function
# --------------------------
def add_noise(audio, noise_level=0.02, rng=None):
    """Return ``audio`` with scaled Gaussian noise added, clipped to [-1, 1].

    Args:
        audio: Array-like waveform. Noise is drawn with the same shape.
        noise_level: Multiplier applied to standard-normal noise.
        rng: Optional seed or ``numpy.random.Generator`` for reproducible
            noise. When ``None`` (the default) NumPy's global random state is
            used, exactly as before this parameter existed.

    Returns:
        numpy.ndarray: The noisy signal, same shape as ``audio``.
    """
    audio = np.asarray(audio)

    if rng is None:
        # Legacy path: honours np.random.seed() set elsewhere in the notebook.
        noise = np.random.standard_normal(audio.shape)
    else:
        # default_rng passes an existing Generator through and seeds from ints.
        noise = np.random.default_rng(rng).standard_normal(audio.shape)

    audio_noisy = np.clip(audio + noise_level * noise, -1.0, 1.0)
    return audio_noisy

# --------------------------
# 3️⃣ Prediction Function
# --------------------------
def predict_emotion(audio_file, noise_level):
    """Classify the emotion of an audio clip after adding noise.

    Args:
        audio_file: Path (or file-like object) accepted by ``librosa.load``.
        noise_level: Noise scale forwarded to ``add_noise``.

    Returns:
        tuple: ``(emotion, speech, sr)`` where ``emotion`` is the display name
        of the highest-scoring class, ``speech`` is the clean (un-noised)
        waveform as loaded, and ``sr`` is its sampling rate.
    """
    # Load audio at the 16 kHz rate requested from librosa.
    speech, sr = librosa.load(audio_file, sr=16000)
    speech_noisy = add_noise(speech, noise_level)

    # Prepare input
    inputs = feature_extractor(speech_noisy, sampling_rate=sr, return_tensors="pt").to(device)

    # Predict
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_id = torch.argmax(logits, dim=-1).item()

    # Take the class name from the checkpoint's own id -> label table instead
    # of a hand-written list: a hard-coded order can silently disagree with
    # the order the model was trained with and mislabel predictions.
    raw_label = model.config.id2label[predicted_id]

    # Expand the short codes to the display names this notebook has always
    # returned; unknown codes are passed through unchanged.
    display_names = {"neu": "neutral", "hap": "happy", "ang": "angry", "sad": "sad"}
    return display_names.get(raw_label, raw_label), speech, sr

# --------------------------
# 4️⃣ Waveform Plot Function
# --------------------------
def plot_waveform(speech, sr):
    """Draw the waveform of an audio signal and return the matplotlib figure.

    Args:
        speech: Audio samples to plot.
        sr: Sampling rate of ``speech``, used for the time axis.

    Returns:
        matplotlib.figure.Figure: The figure containing the waveform.
    """
    # Keep an explicit handle to the figure. Calling plt.show() and then
    # plt.gcf() is unreliable: with the notebook inline backend show() closes
    # the figure, so gcf() hands back a brand-new blank one.
    fig, ax = plt.subplots(figsize=(10, 3))
    librosa.display.waveshow(speech, sr=sr, ax=ax)
    ax.set_title("Audio Waveform")
    fig.tight_layout()

    # Drop the figure from pyplot's registry so repeated calls do not pile up
    # open figures; the returned Figure object can still be rendered.
    plt.close(fig)
    return fig

# --------------------------
# 5️⃣ Gradio Interface
# --------------------------
def gradio_interface(audio_file, noise_level):
    """Gradio callback: predict the emotion and plot the uploaded waveform.

    Args:
        audio_file: File path supplied by the audio input, or ``None`` when
            nothing has been uploaded yet.
        noise_level: Value of the noise slider.

    Returns:
        tuple: ``(emotion, fig)`` matching the interface's text and plot outputs.

    Raises:
        gr.Error: If no audio file was provided.
    """
    # Submitting without an upload passes None; fail with a readable message
    # instead of an opaque loader error.
    if not audio_file:
        raise gr.Error("Please upload an audio file first.")

    emotion, speech, sr = predict_emotion(audio_file, noise_level)
    fig = plot_waveform(speech, sr)
    return emotion, fig

# Build the demo: an audio upload plus a noise slider feed `gradio_interface`,
# whose two return values fill the text and plot outputs in that order.
ui = gr.Interface(
    title="Noise-Robust Speech Emotion Recognition",
    description="Upload a WAV file, adjust noise level, see predicted emotion and waveform.",
    fn=gradio_interface,
    inputs=[
        gr.Audio(type="filepath", label="Upload Audio"),
        gr.Slider(label="Noise Level", minimum=0.0, maximum=0.1, step=0.01, value=0.02),
    ],
    outputs=["text", "plot"],  # predicted emotion, then waveform figure
)

ui.launch()
