In [2]:
from datasets import load_dataset
import re
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pandas as pd

In [6]:
# Load the precomputed lyric table: each row holds the cleaned lyric text
# ("clean lyrics") and its stored embedding vector ("embedded").
# Optionally pass engine='pyarrow' or engine='fastparquet' to read_parquet.
df = pd.read_parquet('vscode_vectors_bge.parquet')
df.head()

Unnamed: 0,clean lyrics,embedded
0,are you alright i m alright i m quite alright ...,"[-0.09510867297649384, -0.052341993898153305, ..."
1,we pick up in new york city i m tryna find a m...,"[-0.020572615787386894, -0.07993195205926895, ..."
2,baby you a song you make me wanna roll my wind...,"[-0.027859799563884735, -0.03636902943253517, ..."
3,baby last night was hands down one of the best...,"[-0.02230849675834179, -0.0188309233635664, 0...."
4,you make me feel so young you make me feel as ...,"[-0.03834763541817665, -0.03280093893408775, 0..."


In [8]:
# Stack the per-row embedding vectors into one (n_rows, dim) float32 matrix,
# the layout the FAISS index below is fed with.
array = np.vstack(df["embedded"].to_numpy()).astype(np.float32)
array

array([[-9.5108673e-02, -5.2341994e-02,  2.9881362e-02, ...,
         4.3037690e-02, -1.4177671e-02, -2.1796750e-02],
       [-2.0572616e-02, -7.9931952e-02,  4.0240899e-02, ...,
         1.6816258e-02, -7.6525152e-02, -1.5770495e-02],
       [-2.7859800e-02, -3.6369029e-02,  7.4195705e-02, ...,
         3.5594774e-03, -3.9827251e-03,  1.5726753e-02],
       ...,
       [-1.1820935e-04,  1.6659580e-02,  2.2597142e-02, ...,
         8.7751560e-02, -9.5417341e-03, -2.7034365e-02],
       [-7.4566141e-02, -1.4757029e-02,  3.7393596e-02, ...,
         3.9951306e-02,  2.7222736e-03, -4.2417862e-02],
       [-3.5246432e-02, -2.8395848e-02,  2.8976936e-02, ...,
         1.2137246e-01, -4.1237038e-02, -1.7751783e-02]], dtype=float32)

In [10]:
# The index built below scores by inner product, which equals cosine
# similarity only for unit-length vectors, so check every row's L2 norm.
# Tolerance is atol=1e-6 on top of np.allclose's default relative tolerance.
norms = np.linalg.norm(array, ord=2, axis=1)
all_normalized = np.allclose(norms, 1.0, atol=1e-6)
print("All vectors normalized:", all_normalized)


All vectors normalized: True


In [12]:
# Embedding width = size of the last axis of the (n_rows, dim) matrix.
dim = array.shape[-1]
dim

384

In [14]:
# Build an (initially empty) HNSW index over uncompressed vectors of width
# `dim`, scored by inner product. The second argument (48) is FAISS's `M`,
# the number of graph links kept per node, per the FAISS documentation.
index = faiss.IndexHNSWFlat(dim, 48, faiss.METRIC_INNER_PRODUCT)
index

<faiss.swigfaiss_avx2.IndexHNSWFlat; proxy of <Swig Object of type 'faiss::IndexHNSWFlat *' at 0x000001AC05F2FC90> >

In [16]:
# HNSW tuning knobs, set here before any vectors are added to the index.
# Per the FAISS documentation, efConstruction is the candidate-list size used
# while the graph is built and efSearch the one used at query time; larger
# values trade speed for recall.
index.hnsw.efConstruction = 500
index.hnsw.efSearch = 64


In [18]:
# Insert every embedding row into the index; a vector's id is its row position
# in `array`. Re-running this cell appends the same vectors again, so rebuild
# the index first if it needs to be re-executed.
index.add(array)
# Number of vectors now stored in the index.
index.ntotal

25000

In [20]:
def clean_text(text):
    """Normalise a lyric string to lowercase ASCII letters and single spaces.

    Steps, applied in order after lowercasing:
      1. drop bracketed section tags such as ``[Verse]`` / ``[Chorus]``;
      2. delete every character that is not an ASCII letter or whitespace
         (punctuation and digits are removed outright, not replaced by a
         space, so ``i'm`` becomes ``im``);
      3. collapse whitespace runs to one space and trim both ends.

    NOTE(review): the corpus sample shown in this notebook contains forms
    like ``i m`` and ``money s``, which suggests its apostrophes became
    spaces rather than being deleted -- confirm the corpus was cleaned with
    this same function.
    """
    substitutions = (
        (r"\[.*?\]", ""),       # [Verse], [Chorus], ...
        (r"[^a-zA-Z\s]", ""),   # punctuation, digits, non-ASCII characters
        (r"\s+", " "),          # runs of whitespace
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

In [26]:
# Load the sentence-embedding model used to encode queries.
# NOTE(review): presumably the same model that produced the stored "embedded"
# vectors (the parquet file name mentions bge) -- confirm, since query and
# corpus vectors must share one embedding space.
model_bge = SentenceTransformer("BAAI/bge-small-en-v1.5")

In [28]:
def query_model_bge(text, k=5):
    """Return the ``k`` corpus entries most similar to ``text``.

    The text is normalised with ``clean_text``, embedded with the
    module-level ``model_bge`` and searched against the module-level FAISS
    ``index``.

    Args:
        text: Raw query string.
        k: Number of neighbours to retrieve. Defaults to 5, the value that
            used to be hard-coded.

    Returns:
        Tuple ``(I, D)`` -- indices first, then similarity scores -- exactly
        as produced by ``index.search`` for a single query row.
    """
    # Previously clean_text(text) was called but its result was thrown away,
    # so the raw string was embedded; keep and use the cleaned text.
    cleaned = clean_text(text)

    # The index scores by inner product over unit-length corpus vectors, so
    # normalise the query as well to make the score a cosine similarity.
    query = model_bge.encode(cleaned, convert_to_numpy=True, normalize_embeddings=True)

    # FAISS expects a contiguous float32 matrix of shape (n_queries, dim).
    query = np.ascontiguousarray(query, dtype="float32").reshape(1, -1)

    D, I = index.search(query, k=k)
    return I, D

In [30]:
# Sanity check: query the index with a lyric that is already in the corpus and
# expect that same row back at rank 1 with a similarity of ~1.0.
#
# This cell used to paste the lyric as one very long string literal that was
# split across two physical lines (an unterminated string as written). The
# pasted text opens with the same words as row 0 of `df`, and the recorded
# output ranks index 0 first with score 1.0, so the row is looked up directly.
# NOTE(review): confirm the pasted text was exactly row 0's "clean lyrics".
I, D = query_model_bge(df["clean lyrics"].iloc[0])
print("Top 5 Similar Indices:", I)
print("Similarity Scores:", D)

Top 5 Similar Indices: [[    0  9169 10696 18536  8237]]
Similarity Scores: [[1.         0.86578244 0.8650054  0.8633137  0.8606238 ]]


In [44]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import random
# Retrieval experiment: cut a short snippet out of the middle of each sampled
# lyric, query the index with it, and record the neighbours that come back.
N_SAMPLES = 1000        # lyrics drawn from the corpus
MIN_LYRIC_WORDS = 20    # lyrics shorter than this are skipped
SNIPPET_WORDS = 10      # words per query snippet
SNIPPET_MIN_START = 5   # snippets never start within the first 5 words
SEED = 31

# Step 1: sample rows from the dataframe (seeded, so the same rows every run).
sample_df = df.sample(n=min(N_SAMPLES, len(df)), random_state=SEED).reset_index(drop=True)

# Dedicated seeded generator for the snippet offsets. The module-level
# `random` functions were unseeded, so every run produced different snippets
# and a different score even though the row sample itself was fixed.
snippet_rng = random.Random(SEED)

# Step 2: build one record per (query snippet, retrieved neighbour) pair.
results = []

for i, full_lyric in tqdm(enumerate(sample_df["clean lyrics"]), total=len(sample_df)):
    words = full_lyric.split()

    # Skip if lyric is too short to hold an interior snippet.
    if len(words) < MIN_LYRIC_WORDS:
        continue

    # randint is inclusive on both ends; the upper bound keeps the snippet
    # strictly inside the lyric.
    start_idx = snippet_rng.randint(SNIPPET_MIN_START, len(words) - SNIPPET_WORDS - 1)
    mid_lyric = " ".join(words[start_idx:start_idx + SNIPPET_WORDS])

    I_1, D_1 = query_model_bge(mid_lyric)

    for rank, (score, idx) in enumerate(zip(D_1[0], I_1[0]), start=1):
        # FAISS pads with id -1 when it finds fewer neighbours than requested;
        # df.iloc[-1] would silently return the last row, so drop those slots.
        if idx < 0:
            continue

        hit = df.iloc[int(idx)]  # one positional lookup per neighbour
        results.append({
            "query_index": i,  # position within sample_df, not within df
            "query_snippet": mid_lyric,
            "original_full_lyric": full_lyric,
            "result_rank": rank,
            "result_index": int(idx),
            "result_lyric": hit["clean lyrics"],
            "result_artist": hit.get("artist_name", "Unknown"),
            "similarity_score": float(score),
        })

results_df_bge = pd.DataFrame(results)
results_df_bge.to_csv("bge_sim.csv", index=False)


100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:20<00:00, 49.05it/s]


In [46]:
def average_top_similarity(results_df):
    """Average the similarity score of the best-ranked hit of every query.

    Args:
        results_df: DataFrame with one row per (query, hit) pair and the
            columns ``query_index``, ``result_rank`` and ``similarity_score``.

    Returns:
        Mean ``similarity_score`` over the lowest-``result_rank`` row of each
        ``query_index``; ``nan`` when ``results_df`` has no rows.
    """
    # An empty frame (e.g. every lyric was skipped) may not even have the
    # expected columns; return NaN instead of raising.
    if results_df.empty:
        return float("nan")

    # Sort once by rank and keep the first row per query. This replaces a
    # per-group Python callback (sort + iloc inside groupby.apply), which is
    # slow and triggers pandas' grouping-column deprecation warning.
    top_hits = (
        results_df
        .sort_values("result_rank", kind="stable")
        .drop_duplicates("query_index", keep="first")
    )
    return top_hits["similarity_score"].mean()
# Mean rank-1 similarity over all evaluated snippets of the BGE run.
# NOTE(review): this averages raw scores only; it does not check whether the
# top hit is the song the snippet was cut from (query_index is the position in
# the sample, not the corpus row), so it is not a retrieval-accuracy figure.
avg_score = average_top_similarity(results_df_bge)
print(f"Average Top Similarity Score: {avg_score:.4f}")


Average Top Similarity Score: 0.7438
