# Understanding the Interface class

Install the Transformers, Datasets, Evaluate, and Gradio libraries to run this notebook.

In [None]:
# Install required libraries for Transformers, datasets, evaluation, and Gradio interface
!uv pip install datasets evaluate transformers[sentencepiece]
!uv pip install gradio

In [None]:
# Audio processing interface - demonstrates how to work with audio inputs
# This function reverses audio data by flipping the numpy array
import numpy as np
import gradio as gr


def reverse_audio(audio):
    """Return the given recording played backwards.

    `audio` is unpacked as a (sample_rate, samples) pair. The sample rate is
    passed through untouched; the samples are flipped along their first axis
    so the last sample comes first.
    """
    sample_rate, samples = audio
    backwards = np.flipud(samples)
    return sample_rate, backwards


# Create an audio input component
# sources=["microphone"]: only offer real-time recording from the user's microphone
#   (Gradio 4+ takes a list via `sources=`; the older single-string `source=`
#   keyword is no longer accepted and raises a TypeError)
# type="numpy": the function receives the audio as a (sample_rate, numpy array) tuple
# label: descriptive text for the input
mic = gr.Audio(sources=["microphone"], type="numpy", label="Speak here...")

# Interface that takes audio input and outputs processed audio
gr.Interface(reverse_audio, mic, "audio").launch()

In [None]:
# Music tone generator - demonstrates multiple input types and audio synthesis
# This creates musical tones based on musical note, octave, and duration parameters
import numpy as np
import gradio as gr

# Musical notes in chromatic scale starting from C.
# The dropdown below is created with type="index", so generate_tone receives a
# note's position in this list (C = 0, ..., A = 9, ..., B = 11), not its name.
notes = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]


def generate_tone(note, octave, duration):
    """Synthesize a pure sine tone for one note of the chromatic scale.

    Args:
        note: Index of the note within the octave, counted in semitones from C
            (0 = C, ..., 9 = A, ..., 11 = B).
        octave: Octave number; octave 4 contains the 440 Hz reference A.
        duration: Length of the tone in seconds. Anything ``float()`` accepts
            works (int, float or numeric string), including fractions.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``samples`` is a 1-D int16
        NumPy array.

    Raises:
        ValueError: If no note was selected, or the duration is negative or
            not numeric.
    """
    if note is None:
        # Happens when the form is submitted without choosing a note; fail
        # with a readable message instead of an arithmetic TypeError.
        raise ValueError("Please select a note")

    # Audio settings
    sr = 48000  # Sample rate: 48kHz for high-quality audio

    # Calculate frequency using equal temperament tuning
    # A4 (A in 4th octave) = 440 Hz is the reference frequency
    a4_freq = 440
    tones_from_a4 = 12 * (octave - 4) + (note - 9)
    # Each semitone is 2^(1/12) times the frequency of the previous note
    frequency = a4_freq * 2 ** (tones_from_a4 / 12)

    # Keep fractional seconds instead of truncating them to an integer
    duration = float(duration)
    if duration < 0:
        raise ValueError("Duration must not be negative")
    num_samples = int(round(duration * sr))

    # Sample k is taken at exactly k / sr seconds. (A linspace that includes
    # the end point would space samples duration / (n - 1) apart and play the
    # tone slightly off pitch.)
    t = np.arange(num_samples) / sr

    # Generate sine wave tone at calculated frequency
    # 20000 amplitude for audible volume, convert to 16-bit integer format
    audio = (20000 * np.sin(t * (2 * np.pi * frequency))).astype(np.int16)
    return (sr, audio)


# Create interface with multiple input types:
gr.Interface(
    generate_tone,
    [
        # Dropdown for note selection, returns index of selected note
        gr.Dropdown(notes, type="index"),
        # Slider for octave selection (4-6 range covers most common octaves)
        gr.Slider(minimum=4, maximum=6, step=1),
        # Numeric input for the duration. gr.Number is used because
        # gr.Textbox's `type` only accepts "text", "password" or "email",
        # so type="number" is not a valid Textbox setting.
        gr.Number(value=1, label="Duration in seconds"),
    ],
    "audio",  # Output type is audio
).launch()

In [None]:
# Speech-to-text interface - demonstrates automatic speech recognition
# This combines audio processing with NLP pipelines for transcription
from transformers import pipeline
import gradio as gr

# Load automatic speech recognition model
# facebook/wav2vec2-base-960h is a popular ASR model trained on 960 hours of speech.
# The checkpoint is named explicitly so the code matches this description and
# does not depend on whichever default the pipeline would otherwise pick.
model = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")


def transcribe_audio(mic=None, file=None):
    """Transcribe whichever audio input was supplied.

    A microphone recording takes precedence; the uploaded file is only used
    when no recording is present. When neither is given, an explanatory
    message is returned instead of a transcription.
    """
    # Prefer the recording, fall back to the upload
    source = file if mic is None else mic
    if source is None:
        return "You must either provide a mic recording or a file"

    # The ASR pipeline result is a mapping; its "text" entry is the transcript
    return model(source)["text"]


# Create interface with two audio inputs; the user fills in either one.
# Gradio 4+ takes `sources=[...]` (a list) instead of `source=...`, and no
# longer accepts `optional=True`. transcribe_audio already checks both of its
# arguments for None, which covers an input that was left empty.
gr.Interface(
    fn=transcribe_audio,
    inputs=[
        # Microphone input: records audio and hands its file path to the function
        gr.Audio(sources=["microphone"], type="filepath"),
        # File upload input: allows users to upload existing audio files
        gr.Audio(sources=["upload"], type="filepath"),
    ],
    outputs="text",  # Output the transcribed text
).launch()