# Mini OpenL3 + Pinecone Demo
This is a sanitized demo notebook. The full pipeline (large dataset, extended evaluation) is private.

In [None]:
import os, numpy as np, pandas as pd
import openl3, librosa
from dotenv import load_dotenv
from src.preprocess import Preprocess
from src.pinecone_client import PineConeVDB
from src.vector_utils import create_vectors
from src.viz import tsne_plot, similarity_heatmap
# Load environment settings via python-dotenv before reading the Pinecone key.
load_dotenv()

# Fall back to a sentinel when the variable is unset or empty so that both
# cases are caught by the single check below.
API_KEY = os.getenv('PINECONE_API_KEY') or 'REPLACE_ME'

# Validate with an explicit exception instead of `assert`: assert statements
# are removed when Python runs with -O, which would let the placeholder key
# slip through to the Pinecone client.
if API_KEY == 'REPLACE_ME':
    raise RuntimeError('Set PINECONE_API_KEY in .env')

## 1. Prepare sample audio & preprocessing

In [None]:
# Folder containing the demo clips; the preprocessor is constructed on the
# same path.
sample_dir = 'data/sample_audio'
pre = Preprocess(sample_dir)

# Keep only .wav files (case-insensitive extension match). os.listdir returns
# entries in arbitrary order, so sort the names to make the row order of the
# embedding table -- and the "first row" query later in the notebook --
# reproducible across machines and runs.
sample_files = sorted(
    f for f in os.listdir(sample_dir) if f.lower().endswith('.wav')
)
print('Found files:', sample_files)

## 2. Extract embeddings (OpenL3)

In [None]:
# Embedding configuration shared by the model loader and the embedding call.
embed_kwargs = dict(input_repr='mel256', content_type='music', embedding_size=512)

# Load the OpenL3 model once, outside the loop. When no model is passed,
# get_audio_embedding has to load the network itself on every call, which
# dominates the runtime when embedding several files.
openl3_model = openl3.models.load_audio_embedding_model(**embed_kwargs)

rows = []
for f in sample_files:
    path = os.path.join(sample_dir, f)
    y, sr = pre.preprocess_audio_file(path)
    if y is None:
        # The preprocessor returned no audio for this file; skip it.
        continue
    # The second return value (frame timestamps) is not needed here.
    emb, _ = openl3.get_audio_embedding(y, sr, model=openl3_model, **embed_kwargs)
    # Collapse the per-frame embeddings into a single vector for the clip.
    vec = emb.mean(axis=0)
    rows.append({
        # Clip name is the filename prefix before the first underscore.
        'name': f.split('_')[0],
        # The demo stores 0 for both surah and ayah numbers.
        'surah_number': 0,
        'ayah_number': 0,
        'features': vec,
    })

# Pass the columns explicitly so the frame keeps its schema even when no file
# produced an embedding; otherwise later column selections raise KeyError.
df = pd.DataFrame(rows, columns=['name', 'surah_number', 'ayah_number', 'features'])
df.head()

## 3. Create vectors and upsert to Pinecone

In [None]:
# These two names are also imported in the first cell; they are repeated here.
from src.pinecone_client import PineConeVDB
from src.vector_utils import create_vectors

# Client for the demo index; 512 matches the embedding_size requested from
# OpenL3 in the extraction cell.
pc_vdb = PineConeVDB(api_key=API_KEY, index_name='demo-openl3', dimensions=512)

# Split the table into the embedding column and the metadata columns, both
# kept as DataFrames, which is the form create_vectors is called with.
feature_frame = df[['features']]
metadata_frame = df[['name', 'surah_number', 'ayah_number']]

vectors = create_vectors(feature_frame, metadata_frame)
pc_vdb.upsert_vectors(vectors)
print('Upserted', len(vectors), 'vectors')

## 4. Simple query

In [None]:
# Query the index with the embedding stored in the first row of the table.
test_vec = df['features'].iloc[0]

# The vector is converted to a plain Python list before being handed to
# the search call.
query_values = test_vec.tolist()
res = pc_vdb.search(query_values, top_k=3)

# Last expression of the cell: displays the search result.
res

## 5. Visualization (t-SNE & similarity matrix)

In [None]:
# Draw both plots only when the table holds at least three rows.
row_count = df.shape[0]
if row_count >= 3:
    tsne_plot(df)
    similarity_heatmap(df)

## 6. Notes
- The real dataset and the full evaluation are withheld.
- Replace the sample audio with your own short clips to explore.
- For access to the full pipeline, contact the maintainer.