In [1]:
from datasets import load_dataset
import re
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np


In [2]:
def read_data():
    ds = load_dataset("brunokreiner/genius-lyrics")
    return ds

In [3]:
new_ds = read_data()

In [7]:
def load_training_testing_into_panda(ds):    
    ds_df = ds["train"].to_pandas()
    new_ds_df = ds_df[["lyrics","artist_name"]]
    new_ds_df.dropna(subset=["lyrics","artist_name"], inplace=True)
    return new_ds_df

In [9]:
df = load_training_testing_into_panda(new_ds)
df.head()
len(df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_ds_df.dropna(subset=["lyrics","artist_name"], inplace=True)


49985

In [11]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"\[.*?\]", "", text)  # remove [Verse], [Chorus]
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # remove punctuation
    text = re.sub(r"\s+", " ", text).strip()  # remove extra spaces
    return text

In [13]:
df["clean lyrics"] = df["lyrics"].apply(clean_text)
len(df)

49985

In [14]:
def number_of_entries(df, number):
    df_num = df.head(number).copy()
    return df_num
df_25k = number_of_entries(df, 25000)


In [17]:
len(df_25k)

25000

In [25]:
df_25k.reset_index(inplace=True)

df_25k

Unnamed: 0,index,lyrics,artist_name,clean lyrics
0,2,are you alright i m alright i m quite alright ...,Lil Uzi Vert,are you alright i m alright i m quite alright ...
1,6,we pick up in new york city i m tryna find a m...,John Mayer,we pick up in new york city i m tryna find a m...
2,24,baby you a song you make me wanna roll my wind...,Florida Georgia Line,baby you a song you make me wanna roll my wind...
3,26,baby last night was hands down one of the best...,Thomas Rhett,baby last night was hands down one of the best...
4,30,you make me feel so young you make me feel as ...,Michael Bublé,you make me feel so young you make me feel as ...
...,...,...,...,...
24995,240248,ronnie talk to russia before its too late befo...,Prince,ronnie talk to russia before its too late befo...
24996,240251,it took five women to get you off of my mind i...,Prince,it took five women to get you off of my mind i...
24997,240252,sex temptation lust pop go mama everybody on t...,Prince,sex temptation lust pop go mama everybody on t...
24998,240260,lay down your funky weapon yeah y all here we ...,Prince,lay down your funky weapon yeah y all here we ...


In [31]:
df_25k.drop(columns=['clean lyrics'], inplace = True)
df_25k.head()

Unnamed: 0,lyrics,artist_name
0,are you alright i m alright i m quite alright ...,Lil Uzi Vert
1,we pick up in new york city i m tryna find a m...,John Mayer
2,baby you a song you make me wanna roll my wind...,Florida Georgia Line
3,baby last night was hands down one of the best...,Thomas Rhett
4,you make me feel so young you make me feel as ...,Michael Bublé


In [33]:
df_25k.to_parquet("df_25k.parquet")

In [21]:
model = SentenceTransformer("all-MiniLM-L6-v2")

In [27]:
model_bge = SentenceTransformer("BAAI/bge-small-en-v1.5")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [37]:
df_25k_bge = df_25k[["clean lyrics"]]
df_25k_bge.head()

Unnamed: 0,clean lyrics
2,are you alright i m alright i m quite alright ...
6,we pick up in new york city i m tryna find a m...
24,baby you a song you make me wanna roll my wind...
26,baby last night was hands down one of the best...
30,you make me feel so young you make me feel as ...


In [39]:
df_25k_bge.reset_index(drop=True, inplace=True)
df_25k_bge.head()

Unnamed: 0,clean lyrics
0,are you alright i m alright i m quite alright ...
1,we pick up in new york city i m tryna find a m...
2,baby you a song you make me wanna roll my wind...
3,baby last night was hands down one of the best...
4,you make me feel so young you make me feel as ...


In [41]:
def embed_bge(df):
    texts = df["clean lyrics"].tolist()
    embeddings = model_bge.encode(texts, batch_size=32, show_progress_bar=True)
    df["embedded"] = embeddings.tolist()
    return df


In [43]:
df_vector_bge = embed_bge(df_25k_bge)
df_vector_bge.head()

Batches:   0%|          | 0/782 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["embedded"] = embeddings.tolist()


Unnamed: 0,clean lyrics,embedded
0,are you alright i m alright i m quite alright ...,"[-0.09510867297649384, -0.052341993898153305, ..."
1,we pick up in new york city i m tryna find a m...,"[-0.020572615787386894, -0.07993195205926895, ..."
2,baby you a song you make me wanna roll my wind...,"[-0.027859799563884735, -0.03636902943253517, ..."
3,baby last night was hands down one of the best...,"[-0.02230849675834179, -0.0188309233635664, 0...."
4,you make me feel so young you make me feel as ...,"[-0.03834763541817665, -0.03280093893408775, 0..."


In [45]:
df_vector_bge.to_parquet("vscode_vectors_bge.parquet", index=False)
df_vector_bge.to_csv("vscode_vectors_bge.csv", index=False, encoding="utf-8")


In [47]:
array = np.vstack(df_vector_bge["embedded"].values).astype("float32")
array

array([[-9.5108673e-02, -5.2341994e-02,  2.9881362e-02, ...,
         4.3037690e-02, -1.4177671e-02, -2.1796750e-02],
       [-2.0572616e-02, -7.9931952e-02,  4.0240899e-02, ...,
         1.6816258e-02, -7.6525152e-02, -1.5770495e-02],
       [-2.7859800e-02, -3.6369029e-02,  7.4195705e-02, ...,
         3.5594774e-03, -3.9827251e-03,  1.5726753e-02],
       ...,
       [-1.1820935e-04,  1.6659580e-02,  2.2597142e-02, ...,
         8.7751560e-02, -9.5417341e-03, -2.7034365e-02],
       [-7.4566141e-02, -1.4757029e-02,  3.7393596e-02, ...,
         3.9951306e-02,  2.7222736e-03, -4.2417862e-02],
       [-3.5246432e-02, -2.8395848e-02,  2.8976936e-02, ...,
         1.2137246e-01, -4.1237038e-02, -1.7751783e-02]], dtype=float32)

In [49]:
norms = np.linalg.norm(array, axis=1)
all_normalized = np.allclose(norms, 1.0, atol=1e-6)  # Tolerance of 1e-6
print("All vectors normalized:", all_normalized)


All vectors normalized: True


In [51]:
dim = array.shape[1]
dim

384

In [53]:
index = faiss.IndexHNSWFlat(dim, 48, faiss.METRIC_INNER_PRODUCT)
index

<faiss.swigfaiss_avx2.IndexHNSWFlat; proxy of <Swig Object of type 'faiss::IndexHNSWFlat *' at 0x000002459BFF64F0> >

In [57]:
index.hnsw.efConstruction = 500
index.hnsw.efSearch = 64


In [59]:
index.add(array)  
index.ntotal

25000

In [67]:
def query_model_bge(text):
    clean_text(text)
    query = model_bge.encode(text, convert_to_numpy=True)
    D, I = index.search(np.expand_dims(query, axis=0), k=5)
    return I, D

In [69]:
I, D = query_model_bge("are you alright i m alright i m quite alright and my money s right countin them bands all way to the top til they be fallin over countin them bands on my way to the top til we fallin over i don t really care if you cry on the real you shoulda never lied shoulda saw the way she looked me in my eyes she said baby i am not afraid to die push me to the edge all my friends are dead push me to the edge all my friends are dead push me to the edge all my friends are dead push me to the edge phantom that s all red inside all white like somethin you ride a sled down i just want that head my brittany got mad i m barely her man now everybody got the same swag now watch the way that i tear it down stackin my bands all the way to the top all the way til my bands fallin over every time that you leave your spot your girlfriend call me like come on over i like the way that she treat me gon leave you won t leave me i call it that casanova she say i m insane yeah i might blow my brain out xanny help the pain yeah please xanny make it go away i m committed not addicted but it keep control of me all the pain now i can t feel it i swear that it s slowin me yeah i don t really care if you cry on the real you shoulda never lied saw the way she looked me in my eyes she said i am not afraid to die all my friends are dead push me to the edge all my friends are dead yeah ooh push me to the edge all my friends are dead yeah all my friends are dead yeah that is not your swag i swear you fake hard now these niggas wanna take my cadence rain on em thunderstorm rain on em medicine lil nigga take some fast car nascar race on em in the club ain t got no ones then we would beg them clothes from overseas got the racks and they all c notes you is not a g though lookin at you stackin all your money it all green though i was countin that and these all twenties that s a g roll she say you re the worst you re the worst i cannot die because this my universe i don t really care if you cry on the real you shoulda never lied shoulda saw the way she looked me in my eyes she said baby i am not afraid to die push me to the edge all my friends are dead push me to the edge all my friends are dead push me to the edge all my friends are dead push me to the edge")
print("Top 5 Similar Indices:", I)
print("Similarity Scores:", D)

Top 5 Similar Indices: [[    0  9169 10696 18536  8237]]
Similarity Scores: [[1.         0.86578244 0.8650054  0.8633137  0.8606238 ]]


In [73]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import random
# Step 1: Sample 200 random rows from your dataframe
sample_df = df_vector_bge.sample(n=500, random_state=36).reset_index(drop=True)

# Step 2: Create a results list


results = []

for i, row in tqdm(sample_df.iterrows(), total=sample_df.shape[0]):
    full_lyric = row["clean lyrics"]
    words = full_lyric.split()

    # Skip if lyric is too short
    if len(words) < 20:
        continue

    # Random 15-word segment from the middle
    start_idx = random.randint(5, len(words) - 15 - 1)
    mid_lyric = " ".join(words[start_idx:start_idx + 15])

    I_1, D_1 = query_model_bge(mid_lyric)

    for rank, (score, idx) in enumerate(zip(D_1[0], I_1[0]), start=1):
        result = {
            "query_index": i,
            "query_snippet": mid_lyric,
            "original_full_lyric": full_lyric,
            "result_rank": rank,
            "result_index": idx,
            "result_lyric": df_vector_bge.iloc[idx]["clean lyrics"],
            "result_artist": df_vector_bge.iloc[idx].get("artist_name", "Unknown"),
            "similarity_score": float(score),
        }
        results.append(result)

results_df_bge = pd.DataFrame(results)
results_df_bge.to_csv("bge_middle_snippet_similarity_results_bge.csv", index=False)


100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [00:14<00:00, 33.90it/s]


In [75]:
def average_top_similarity(results_df):
    # Group by each query
    grouped = results_df.groupby("query_index")
    
    # For each group, take the top-ranked result (rank 1) and get its score
    top_scores = grouped.apply(lambda x: x.sort_values("result_rank").iloc[0]["similarity_score"])
    
    # Return the average
    return top_scores.mean()
avg_score = average_top_similarity(results_df_bge)
print(f"Average Top Similarity Score: {avg_score:.4f}")


Average Top Similarity Score: 0.7554


In [39]:
def embed(df):
    texts = df["clean lyrics"].tolist()
    embeddings = model.encode(texts, batch_size=32, show_progress_bar=True)
    df["embedded"] = embeddings.tolist()
    return df


In [43]:
df_vector.to_parquet("vscode_vectors.parquet", index=False)


In [47]:
df_vector.to_csv("vscode_vectors.csv", index=False, encoding="utf-8")


In [None]:
#Inconsolata

In [266]:
array = np.vstack(df_vector["embedded"].values).astype("float32")
array

array([[-0.02152613, -0.1130375 ,  0.00474123, ...,  0.00775055,
         0.02242729, -0.06069924],
       [-0.03519551, -0.09525009,  0.03990062, ..., -0.05164747,
        -0.05297656, -0.09399048],
       [-0.1212237 , -0.04789452, -0.0203167 , ...,  0.06081615,
         0.02501846, -0.07551719],
       ...,
       [-0.07979702, -0.16962808,  0.05023401, ..., -0.02455066,
         0.01703537, -0.02705456],
       [-0.00831627, -0.02866414,  0.0432986 , ..., -0.0284607 ,
         0.01383475, -0.08256452],
       [-0.06405386, -0.06251999,  0.02244229, ...,  0.05117697,
        -0.09302013, -0.06531898]], dtype=float32)

In [277]:
norms = np.linalg.norm(array, axis=1)
all_normalized = np.allclose(norms, 1.0, atol=1e-6)  # Tolerance of 1e-6
print("All vectors normalized:", all_normalized)


All vectors normalized: True


In [279]:
dim = array.shape[1]
dim

384

In [323]:
index = faiss.IndexHNSWFlat(dim, 48, faiss.METRIC_INNER_PRODUCT)
index

<faiss.swigfaiss_avx2.IndexHNSWFlat; proxy of <Swig Object of type 'faiss::IndexHNSWFlat *' at 0x0000020A5FE78150> >

In [325]:
index.hnsw.efConstruction = 500
index.hnsw.efSearch = 64


In [327]:
index.add(array)  


In [328]:
index.ntotal

25000

In [332]:
def query_model(text):
    clean_text(text)
    query = model.encode(text, convert_to_numpy=True)
    D, I = index.search(np.expand_dims(query, axis=0), k=5)
    return I, D

In [317]:
I, D = query_model("gals hes a fine fine thang lord knows he sure is fine gals i cant help myself i m going to do it til the cows come home and when my goose gets loose she s gonna know i m still be getting it on with him around now im gonna move it slow like a mule im gonna love him funky free and foolish im gonna do my best and try hard to get him just take a look at that child hes enough to drive a poor gal wild take him in check him out im gonna im gonna shoo b doop all night im gonna try him out until the sunrise and when clock strikes twelve gals hes a fine fine thang lord knows he sure is fine gals i cant help myself  yes im gonna do it im gonna do it til the cows come home im gonna do it til the chicken croaks im gonna move it slow like a mule im gonna do it yes im gonna do it gals hes a fine fine thang lord knows lord knows that boy hes sure enough fine")
print("Top 5 Similar Indices:", I)
print("Similarity Scores:", D)

Top 5 Similar Indices: [[13254 21182 23699   453 23032]]
Similarity Scores: [[1.         0.62504894 0.62033015 0.61631763 0.61248696]]


In [297]:
df_25k.reset_index(drop=True, inplace=True)


In [299]:
df_25k.iloc[I[0][0]]

lyrics          ooh ooh ooh keep on truckin baby i got to keep...
artist_name                                       Eddie Kendricks
clean lyrics    ooh ooh ooh keep on truckin baby i got to keep...
embedded        [-0.12439936399459839, -0.13273802399635315, 0...
Name: 12750, dtype: object

In [309]:
df_25k.iloc[12750]
df_vector.iloc[12750]

lyrics          ooh ooh ooh keep on truckin baby i got to keep...
artist_name                                       Eddie Kendricks
clean lyrics    ooh ooh ooh keep on truckin baby i got to keep...
embedded        [-0.12439936399459839, -0.13273802399635315, 0...
Name: 12750, dtype: object

In [334]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import random
# Step 1: Sample 200 random rows from your dataframe
sample_df = df_vector.sample(n=500, random_state=36).reset_index(drop=True)

# Step 2: Create a results list


results = []

for i, row in tqdm(sample_df.iterrows(), total=sample_df.shape[0]):
    full_lyric = row["clean lyrics"]
    words = full_lyric.split()

    if len(words) < 20:
        continue

    start_idx = random.randint(5, len(words) - 15 - 1)
    mid_lyric = " ".join(words[start_idx:start_idx + 15])

    I_1, D_1 = query_model(mid_lyric)

    for rank, (score, idx) in enumerate(zip(D_1[0], I_1[0]), start=1):
        result = {
            "query_index": i,
            "query_snippet": mid_lyric,
            "original_full_lyric": full_lyric,
            "result_rank": rank,
            "result_index": idx,
            "result_lyric": df_vector.iloc[idx]["clean lyrics"],
            "result_artist": df_vector.iloc[idx].get("artist_name", "Unknown"),
            "similarity_score": float(score),
        }
        results.append(result)

results_df = pd.DataFrame(results)
results_df.to_csv("middle_snippet_similarity_results.csv", index=False)


100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [00:10<00:00, 46.67it/s]


In [336]:
def average_top_similarity(results_df):
    
    grouped = results_df.groupby("query_index")
    
    top_scores = grouped.apply(lambda x: x.sort_values("result_rank").iloc[0]["similarity_score"])
    
    return top_scores.mean()
avg_score = average_top_similarity(results_df)
print(f"Average Top Similarity Score: {avg_score:.4f}")


Average Top Similarity Score: 0.5368
