# In [None]:
# Colab-only step: attach Google Drive so the dataset archive is reachable
# from the notebook's filesystem.
from google.colab import drive

mount_point = "/content/drive"
drive.mount(mount_point)

# Location of the zipped dataset inside the mounted Drive.
zip_path = "/content/drive/MyDrive/L2_DATASET.zip"

import zipfile,os,glob
import numpy as np
import librosa, librosa.display
import matplotlib.pyplot as plt
from IPython.display import Audio, display

# --- Unzip the dataset archive into a local working folder ---
extract_path = "/content/voices_dataset"
os.makedirs(extract_path, exist_ok=True)
with zipfile.ZipFile(zip_path, "r") as zip_ref:
    zip_ref.extractall(extract_path)

print("Dataset unzipped to:", extract_path)
print("Files:", os.listdir(extract_path))

# Print every directory with its files so an unexpected archive layout
# (e.g. a missing or renamed top-level folder) is easy to spot.
print("Extracted folder content:")
for r, ds, fs in os.walk(extract_path):
    print(r, "→", fs)

# --- Collect WAV files (sorted so the processing order is deterministic) ---
audio_dir = os.path.join(extract_path, "L2_DATASET")
all_audio = sorted(glob.glob(os.path.join(audio_dir, "*.wav")))
print("Available audio files:")
for f in all_audio:
    print(os.path.basename(f))

# Fail with an explicit exception instead of `assert`: assert statements are
# removed when Python runs with -O, which would let an empty file list slip
# through and surface later as an IndexError on all_audio[0].
if not all_audio:
    raise FileNotFoundError(
        "No WAV files found. Check /content/voices_dataset/L2_DATASET"
    )

# Helper: analyze and plot pitch histogram ----
def plot_pitch_histogram(file_path, show=False, save_path=None):
    """Plot a histogram of the pitch candidates detected in one audio file.

    The file is loaded as mono with ``sr=None`` (no resampling requested),
    pitch candidates are extracted with ``librosa.piptrack``, and only the
    candidates whose magnitude is above the median magnitude are kept.
    After plotting, the median of the kept values is printed.

    Args:
        file_path: Path of the audio file to analyse.
        show: Display the figure with ``plt.show()``. Has no effect when
            ``save_path`` is given.
        save_path: If truthy, the figure is written to this path at 150 dpi
            and then closed instead of being displayed.

    Returns:
        None. If no candidate survives the magnitude filter, a message is
        printed and nothing is plotted or saved.
    """
    file_label = os.path.basename(file_path)
    signal, sample_rate = librosa.load(file_path, sr=None, mono=True)

    # piptrack returns two equally shaped arrays; the magnitude array is
    # used as a mask to keep only the stronger-than-median candidates.
    pitch_grid, mag_grid = librosa.piptrack(y=signal, sr=sample_rate)
    strong_mask = mag_grid > np.median(mag_grid)
    pitch_values = pitch_grid[strong_mask]

    # Guard: nothing to plot, and np.median of an empty array would be NaN.
    if len(pitch_values) == 0:
        print(f" No pitch values found for: {file_label}")
        return

    fig, ax = plt.subplots(figsize=(8, 4))
    ax.hist(pitch_values, bins=50, color='teal', alpha=0.7)
    ax.set_title(f"Estimated Fundamental Frequencies — {file_label}")
    ax.set_xlabel("Frequency [Hz]")
    ax.set_ylabel("Count")
    ax.grid(True, alpha=0.3)
    fig.tight_layout()

    # Saving takes precedence over showing; the figure is closed in every
    # case except when it is displayed.
    if save_path:
        fig.savefig(save_path, dpi=150, bbox_inches='tight')
    if show and not save_path:
        plt.show()
    else:
        plt.close(fig)

    median_pitch = np.median(pitch_values)
    print(f"{file_label} → Median Pitch: {median_pitch:.1f} Hz")

# ---- Preview: play the first recording and show its pitch histogram ----
first_file = all_audio[0]
y, sr = librosa.load(first_file, sr=None, mono=True)
player = Audio(y, rate=sr)
display(player)
plot_pitch_histogram(first_file, show=True)

# ---- Batch: write one pitch-histogram PNG per recording ----
out_dir = os.path.join(extract_path, "pitch_plots")
os.makedirs(out_dir, exist_ok=True)

for fp in all_audio:
    # "<name>.wav" -> "<name>_pitch.png" inside the output folder.
    stem, _ext = os.path.splitext(os.path.basename(fp))
    png_name = f"{stem}_pitch.png"
    save_to = os.path.join(out_dir, png_name)
    plot_pitch_histogram(fp, save_path=save_to)

print(f"\n Saved {len(all_audio)} pitch histograms to: {out_dir}")
