# In [None]:
import librosa
import numpy as np
import pandas as pd


# Function to extract features from an audio segment
def extract_audio_features(y, sr, n_mfcc=13):
    """Summarise an audio signal as a flat dictionary of scalar features.

    Every librosa feature matrix computed here is collapsed to one number
    (its mean, or the variance for ``MFCC_Variance``), so a single call
    produces a single dataset row.

    Args:
        y: Audio samples, passed unchanged to the librosa feature functions.
        sr: Sampling rate of ``y``.
        n_mfcc: Number of MFCC coefficients to compute. One ``MFCC_<i>``
            entry (1-based) is emitted per coefficient.

    Returns:
        dict mapping feature name to a scalar value. Insertion order is
        stable, so it also fixes the column order of a DataFrame built
        from these dicts.
    """
    features = {}

    # Temporal features
    features["Zero_Crossing_Rate"] = np.mean(librosa.feature.zero_crossing_rate(y=y))
    features["RMS"] = np.mean(librosa.feature.rms(y=y))

    # Frequency-domain features
    features["Spectral_Centroid"] = np.mean(
        librosa.feature.spectral_centroid(y=y, sr=sr)
    )
    features["Spectral_Bandwidth"] = np.mean(
        librosa.feature.spectral_bandwidth(y=y, sr=sr)
    )
    features["Spectral_Rolloff"] = np.mean(
        librosa.feature.spectral_rolloff(y=y, sr=sr)
    )
    features["Spectral_Flatness"] = np.mean(librosa.feature.spectral_flatness(y=y))

    # Time-frequency features: one mean per MFCC coefficient row.
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    for index, coefficient in enumerate(mfccs, start=1):
        features[f"MFCC_{index}"] = np.mean(coefficient)

    chroma = librosa.feature.chroma_stft(y=y, sr=sr)
    features["Chroma_Mean"] = np.mean(chroma)

    # Perceptual features
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
    # piptrack stores 0 in every bin that is not a spectral peak, so a plain
    # mean over the whole matrix is dominated by zeros. Keep, for each frame,
    # the pitch of the strongest peak and average only the frames where a
    # pitch was actually found.
    strongest_bin = np.expand_dims(np.argmax(magnitudes, axis=-2), axis=-2)
    frame_pitch = np.take_along_axis(pitches, strongest_bin, axis=-2)
    detected = frame_pitch[frame_pitch > 0]
    features["Pitch_Mean"] = np.mean(detected) if detected.size else 0.0

    # Statistical features on MFCCs
    features["MFCC_Variance"] = np.var(mfccs)

    return features


# Function to generate augmented audio segments
def augment_audio(y, sr):
    """Return four altered copies of ``y``.

    The list order is fixed: time stretch at rate 1.1, time stretch at
    rate 0.9, pitch shift by +2 steps, pitch shift by -2 steps.

    Args:
        y: Audio samples to transform.
        sr: Sampling rate of ``y``; only the pitch shift uses it.

    Returns:
        list of the four transformed signals, in the order above.
    """
    stretched = [
        librosa.effects.time_stretch(y, rate=factor) for factor in (1.1, 0.9)
    ]
    shifted = [
        librosa.effects.pitch_shift(y, sr=sr, n_steps=steps) for steps in (2, -2)
    ]
    return stretched + shifted


# Function to create dataset from a single audio file
def create_dataset_from_single_audio(audio_file, target_rows=1000):
    """Build a feature table of up to ``target_rows`` rows from one audio file.

    The signal is cut into equal, non-overlapping segments. Every full
    segment contributes one row for itself plus one row for each variant
    returned by ``augment_audio``. A trailing partial segment is ignored.

    Args:
        audio_file: Path handed to ``librosa.load`` (called with ``sr=None``).
        target_rows: Maximum number of rows in the result; must be >= 1.

    Returns:
        pandas.DataFrame of feature rows, truncated to ``target_rows`` rows.

    Raises:
        ValueError: If ``target_rows`` is below 1, or the audio has fewer
            samples than the number of segments required.
    """
    if target_rows < 1:
        raise ValueError(f"target_rows must be at least 1, got {target_rows}")

    y, sr = librosa.load(audio_file, sr=None)

    # One row for the segment itself plus the four variants from augment_audio.
    rows_per_segment = 5
    # Ceiling division: flooring would divide by zero for target_rows < 5 and
    # would under-produce rows whenever target_rows is not a multiple of 5.
    segments_needed = -(-target_rows // rows_per_segment)
    segment_length = len(y) // segments_needed
    if segment_length == 0:
        raise ValueError(
            f"audio has {len(y)} samples, too few to split into "
            f"{segments_needed} segments"
        )

    dataset = []

    # Generate segments and augment
    for start in range(0, len(y), segment_length):
        if len(dataset) >= target_rows:
            break

        segment = y[start : start + segment_length]
        if len(segment) < segment_length:
            continue

        # Original segment features
        dataset.append(extract_audio_features(segment, sr))

        # Augmented features; stop before doing work on rows that would
        # be truncated anyway.
        for augmented_segment in augment_audio(segment, sr):
            if len(dataset) >= target_rows:
                break
            dataset.append(extract_audio_features(augmented_segment, sr))

    return pd.DataFrame(dataset[:target_rows])


# Example usage
if __name__ == "__main__":
    audio_file = "./Audio.mp3"  # Replace with your audio file path

    # Generate dataset
    audio_df = create_dataset_from_single_audio(audio_file, target_rows=1000)

    # Save to CSV
    audio_df.to_csv("audio_features_dataset.csv", index=False)

    # Display the DataFrame. A bare expression nested inside an ``if`` body
    # is never echoed (not even by a notebook cell), so print it explicitly.
    print(audio_df)