In [2]:
!pip install torch
!pip install sentence-transformers
!pip install transformers
!pip install numpy
!pip install pandas
!pip install scikit-learn
!pip install pinecone
!pip install pinecone-client
!pip install langchain


Collecting pinecone
  Downloading pinecone-5.4.2-py3-none-any.whl.metadata (19 kB)
Collecting pinecone-plugin-inference<4.0.0,>=2.0.0 (from pinecone)
  Downloading pinecone_plugin_inference-3.1.0-py3-none-any.whl.metadata (2.2 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Downloading pinecone-5.4.2-py3-none-any.whl (427 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 427.3/427.3 kB 13.7 MB/s eta 0:00:00
Downloading pinecone_plugin_inference-3.1.0-py3-none-any.whl (87 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 87.5/87.5 kB 10.0 MB/s eta 0:00:00
Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl (6.2 kB)
Installing collected packages: pinecone-plugin-interface, pinecone-plugin-inference, pinecone
Successfully installed pinecone-5.4.2 pinecone-plugin-inference-3.1.0 pinecone-plugin-interface-0.0.7

In [3]:
import numpy as np
import pandas as pd
import os
import torch
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics.pairwise import cosine_similarity
import math
import csv
import random


In [11]:
from pinecone import Pinecone, ServerlessSpec, Index

# Initialize Pinecone.
# The API key is read from the environment instead of being hard-coded, so the
# secret is never saved inside the notebook. Any key that was previously stored
# in this cell must be considered leaked and should be rotated.
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
if not pinecone_api_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set. Export it (or use `%env PINECONE_API_KEY=...`) "
        "before running this cell."
    )
pc = Pinecone(api_key=pinecone_api_key)

# Define index name
index_name = "song-lyrics-index"

# Check if the index exists; create it if it doesn't
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=768,  # Must equal the embedding size of the encoder used to fill/query the index
        metric="cosine",  # Similarity metric
        spec=ServerlessSpec(
            cloud="aws",  # Cloud provider
            region="us-east-1"  # Region
        )
    )

# Get a handle to the index; later cells use it for upserts and queries
index = pc.Index(index_name)

print("Pinecone setup complete!")


Pinecone setup complete!


In [5]:
def load_lyrics_dataset(file_path):
    """Read the lyrics CSV found at ``file_path`` and return it as a DataFrame."""
    return pd.read_csv(file_path)

def retrieve_top_k_songs_pinecone(query, index, bi_encoder, k=10, oversample=2):
    """Return up to ``k`` distinct songs whose stored lyric chunks best match ``query``.

    Args:
        query (str): Free-text query, e.g. a lyric fragment.
        index: Index handle exposing
            ``query(vector=..., top_k=..., include_metadata=...)`` and returning
            an object with a ``matches`` sequence.
        bi_encoder: Encoder exposing ``encode(list_of_texts, convert_to_tensor=False)``.
        k (int): Maximum number of unique songs to return.
        oversample (int): Multiplier applied to ``k`` when querying, because
            several matches can belong to the same song. The default of 2 keeps
            the previous behaviour.

    Returns:
        list[dict]: Dicts with ``Title``, ``Artist``, ``Lyric`` (the matching
        chunk) and ``score``, sorted by descending score. May hold fewer than
        ``k`` items when the fetched matches contain too many duplicates.
    """
    # encode() receives a one-element list, so element 0 is the embedding of
    # `query`. The index expects a flat list of floats, not a nested
    # [[...]] batch, hence the explicit flattening to Python floats.
    query_embedding = bi_encoder.encode([query], convert_to_tensor=False)[0]
    query_vector = [float(value) for value in query_embedding]

    # Fetch more matches than requested to leave room for duplicate songs
    results = index.query(vector=query_vector, top_k=k * oversample, include_metadata=True)

    # Keep only the first (best-scoring) match per (Title, Artist) pair
    unique_songs = {}
    for match in results.matches:
        metadata = match["metadata"]
        key = (metadata["Title"], metadata["Artist"])  # Unique identifier
        if key not in unique_songs:
            unique_songs[key] = {
                "Title": metadata["Title"],
                "Artist": metadata["Artist"],
                "Lyric": metadata["Lyric"],
                "score": match.score,
            }

        # Stop once we have the top k unique songs
        if len(unique_songs) >= k:
            break

    # Sort by score and return top k unique songs
    return sorted(unique_songs.values(), key=lambda song: song["score"], reverse=True)


def preprocess_lyrics(lyrics, min_segment_size=3, max_segments=10):
    """Split lyrics into consecutive word segments.

    Args:
        lyrics (str): Full lyrics text; words are separated on whitespace.
        min_segment_size (int): Minimum number of words per segment.
        max_segments (int): Upper bound used to derive the segment length
            (``ceil(total_words / max_segments)`` words, but never fewer than
            ``min_segment_size``).

    Returns:
        list[str]: The segments in original order. Lyrics with at most
        ``min_segment_size`` words are returned unchanged as a single item.
    """
    words = lyrics.split()
    total_words = len(words)

    if total_words <= min_segment_size:
        return [lyrics]

    segment_size = max(min_segment_size, math.ceil(total_words / max_segments))
    word_groups = [words[start:start + segment_size] for start in range(0, total_words, segment_size)]

    # The last slice holds only the remainder and can be shorter than
    # min_segment_size (e.g. a single word). Fold it into the previous segment
    # so every returned segment honours the minimum size.
    if len(word_groups) > 1 and len(word_groups[-1]) < min_segment_size:
        remainder = word_groups.pop()
        word_groups[-1].extend(remainder)

    return [" ".join(group) for group in word_groups]

def create_finetuning_dataset(df, num_queries=2, num_negative_pairs=10, min_segment_size=3, max_segments=10, qrels_path="qrels.csv"):
    """Build queries, a corpus and a qrels CSV for fine-tuning from a lyrics frame.

    For every song with lyrics, up to ``num_queries`` lyric segments are sampled
    as queries. Each query is written to ``qrels_path`` with score 1 for its
    song of origin and score 0 for randomly sampled other songs.

    Args:
        df (pandas.DataFrame): Frame with ``Title``, ``Lyric`` and ``Artist`` columns.
        num_queries (int): Maximum number of queries sampled per song.
        num_negative_pairs (int): Number of non-relevant songs sampled per
            query; capped at the number of other songs available.
        min_segment_size (int): Forwarded to ``preprocess_lyrics``.
        max_segments (int): Forwarded to ``preprocess_lyrics``.
        qrels_path (str): Destination CSV (overwritten) with the columns
            ``_query_id``, ``song_id`` and ``score``.

    Returns:
        tuple[list[dict], list[dict]]: ``(queries, corpus)``. Query dicts hold
        ``_query_id`` and ``query``; corpus dicts hold ``_id``, ``Title``,
        ``lyrics`` and ``Artist``.
    """
    queries = []
    corpus = []

    # Song ids are derived from the row POSITION (1-based), not from the index
    # label, so they stay consistent for frames whose index is not 0..n-1.
    # Rows without lyrics are skipped because they cannot be segmented.
    for position, (_, row) in enumerate(df.iterrows()):
        lyrics = row['Lyric']
        if pd.isna(lyrics):
            continue
        corpus.append({"_id": f"{position+1}", "Title": row['Title'], "lyrics": str(lyrics), "Artist": row['Artist']})

    song_ids = [song["_id"] for song in corpus]

    with open(qrels_path, mode="w", newline="", encoding="utf-8") as qrels_file:
        qrels_writer = csv.DictWriter(qrels_file, fieldnames=["_query_id", "song_id", "score"])
        qrels_writer.writeheader()

        for song in corpus:
            # We split the lyrics into multiple parts and then randomly sample queries from it.
            segments = preprocess_lyrics(song["lyrics"], min_segment_size, max_segments)
            selected_queries = random.sample(segments, min(len(segments), num_queries))

            # Candidate negatives are all other songs; built once per song
            # rather than once per query.
            negative_pool = [other_id for other_id in song_ids if other_id != song["_id"]]
            # Cap the sample size so small datasets do not make random.sample raise.
            negatives_per_query = min(num_negative_pairs, len(negative_pool))

            for query in selected_queries:
                query_id = f"q{len(queries)+1}"
                queries.append({"_query_id": query_id, "query": query})

                # The song of origin for the specific query will have label 1 (meaning the query is relevant for that song).
                qrels_writer.writerow({"_query_id": query_id, "song_id": song["_id"], "score": 1})

                # Due to size limitations, only a random subset of the other songs gets label 0
                # (meaning the query is not relevant for that song).
                for negative_id in random.sample(negative_pool, negatives_per_query):
                    qrels_writer.writerow({"_query_id": query_id, "song_id": negative_id, "score": 0})

    return queries, corpus

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def chunk_lyrics(lyrics, chunk_size=100, overlap=50):
    """
    Split lyrics into overlapping chunks with LangChain's
    RecursiveCharacterTextSplitter.

    Args:
        lyrics (str): Complete lyrics of one song.
        chunk_size (int): Passed to the splitter as ``chunk_size``.
        overlap (int): Passed to the splitter as ``chunk_overlap``.

    Returns:
        List[str]: The chunks produced by ``split_text``.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
    ).split_text(lyrics)


In [7]:
def preprocess_and_store_embeddings(data, index, chunk_size=100, overlap=50, batch_size=100,
                                    bi_encoder_model='sentence-transformers/all-mpnet-base-v2'):
    """Chunk every song's lyrics, embed the chunks and upsert them into ``index``.

    Args:
        data (pandas.DataFrame): Frame with ``Lyric``, ``Title`` and ``Artist`` columns.
        index: Index handle exposing ``upsert(vectors=...)``.
        chunk_size (int): ``chunk_size`` given to the text splitter.
        overlap (int): ``chunk_overlap`` given to the text splitter.
        batch_size (int): Number of chunks embedded and upserted per request.
        bi_encoder_model (str): SentenceTransformer model used for embedding.
            It should be the same model that is used at query time.

    Each stored vector has the id ``"<row index>-<chunk number>"`` and the
    metadata keys ``Title``, ``Artist`` and ``Lyric`` (the chunk text).
    """
    # Kept local so the function does not depend on another cell's import.
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    bi_encoder = SentenceTransformer(bi_encoder_model)

    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
    rows = []

    for idx, row in data.iterrows():
        lyrics = row['Lyric']

        # Skip songs without lyrics. pd.isna() catches NaN/None directly; a
        # comparison of str(lyrics) with 'nan' would also drop a real lyric
        # "nan" and would embed a missing None value as the text "None".
        if pd.isna(lyrics):
            continue
        lyrics = str(lyrics)  # Ensure lyrics is a string

        # Metadata values are stored as plain strings; missing values become ""
        # instead of NaN.
        title = "" if pd.isna(row['Title']) else str(row['Title'])
        artist = "" if pd.isna(row['Artist']) else str(row['Artist'])

        # Use LangChain chunker
        for chunk_number, chunk in enumerate(splitter.split_text(lyrics)):
            rows.append((f"{idx}-{chunk_number}", chunk, title, artist))

    total_batches = (len(rows) + batch_size - 1) // batch_size
    for start in range(0, len(rows), batch_size):
        batch = rows[start:start + batch_size]

        # Embed the chunk texts of this batch
        embeddings = bi_encoder.encode([entry[1] for entry in batch], convert_to_tensor=False)

        # Prepare (id, values, metadata) tuples for upsert
        vectors = [
            (vector_id, embedding.tolist(), {"Title": title, "Artist": artist, "Lyric": chunk})
            for (vector_id, chunk, title, artist), embedding in zip(batch, embeddings)
        ]

        # Upsert the batch to Pinecone
        index.upsert(vectors=vectors)
        print(f"Upserted batch {start // batch_size + 1}/{total_batches}")

    print("Embeddings stored in Pinecone!")


In [8]:
class BiEncoder:
    """Thin wrapper around a SentenceTransformer model."""

    def __init__(self, model_name='sentence-transformers/all-mpnet-base-v2'):
        # Load the sentence-transformers model identified by `model_name`
        self.model = SentenceTransformer(model_name)

    def encode_texts(self, texts):
        """Encode ``texts`` with the wrapped model, returning tensors and showing a progress bar."""
        encode_options = {"convert_to_tensor": True, "show_progress_bar": True}
        return self.model.encode(texts, **encode_options)

class CrossEncoder:
    """Scores (query, candidate) pairs with a sequence-classification model."""

    def __init__(self, model_name='cross-encoder/ms-marco-MiniLM-L-6-v2'):
        # Tokenizer and model are both loaded from the same checkpoint name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)

    def rank_candidates(self, query, candidates):
        """Return candidate indices ordered from highest to lowest model score.

        Each candidate is tokenized together with ``query`` (truncated to 512
        tokens) and scored individually; the single logit of every pair is used
        as its score.
        """
        # Phase 1: tokenize every (query, candidate) pair
        encoded_pairs = []
        for candidate in candidates:
            encoded_pairs.append(
                self.tokenizer(query, candidate, return_tensors='pt', truncation=True, max_length=512, padding=True)
            )

        # Phase 2: score the pairs one by one without tracking gradients
        with torch.no_grad():
            scores = [self.model(**pair).logits.item() for pair in encoded_pairs]

        # argsort is ascending, so reverse it to put the best score first
        return np.argsort(scores)[::-1]

In [9]:
def song_retrieval_pipeline(query, index, bi_encoder_model='sentence-transformers/all-mpnet-base-v2', k=5):
    """Retrieve the top ``k`` unique songs for ``query`` from the vector index.

    Args:
        query (str): Free-text query, e.g. a lyric fragment.
        index: Index handle passed through to ``retrieve_top_k_songs_pinecone``.
        bi_encoder_model: Either a SentenceTransformer model name (str), which
            is loaded on every call, or an already-loaded encoder object with an
            ``encode`` method. Passing a loaded encoder avoids re-loading the
            model for each query.
        k (int): Maximum number of unique songs to return.

    Returns:
        The list produced by ``retrieve_top_k_songs_pinecone``.
    """
    # Initialize Bi-Encoder (only when a model name, not an instance, was given)
    if isinstance(bi_encoder_model, str):
        bi_encoder = SentenceTransformer(bi_encoder_model)
    else:
        bi_encoder = bi_encoder_model

    # Retrieve top-k unique songs using Pinecone
    return retrieve_top_k_songs_pinecone(query, index, bi_encoder, k)


In [28]:
dataset_path = "/content/drive/MyDrive/Colab Notebooks/dataset.csv"
query = "I live for the applause"

# Embeddings are computed and stored only when the index holds no vectors yet;
# otherwise the existing vectors are reused.
stored_vector_count = index.describe_index_stats()["total_vector_count"]
if stored_vector_count:
    print("Embeddings already exist in Pinecone.")
elif os.path.exists(dataset_path):
    data = load_lyrics_dataset(dataset_path)
    preprocess_and_store_embeddings(data, index, chunk_size=100, overlap=50)  # Use chunking params
else:
    print("Dataset path does not exist.")

# Run the song retrieval pipeline (this happens regardless of the branch above)
results = song_retrieval_pipeline(query, index)
print("Top retrieved songs:")
for idx, song in enumerate(results):
    print(f"{idx + 1}. {song['Title']} by {song['Artist']}")


Dataset obtained.
Embeddings already exist in Pinecone.
Top retrieved songs:
1. Bad romance / Applause / Swine / Gyspy (artRAVE paris version) by Lady Gaga
2. Applause - Come To Mama - Edge Of Glory - Born This Way (ACT IV) by Lady Gaga
3. Applause / Come To Mama / The edge Of Glory / Born This Way by Lady Gaga
4. Applause (Demo) by Lady Gaga
5. Applause (Purity Ring Remix) by Lady Gaga
6. Applause (Empire of the Sun Remix) by Lady Gaga
7. Applause by Lady Gaga
