In [13]:
!pip install torch
!pip install sentence-transformers
!pip install transformers
!pip install numpy
!pip install pandas
!pip install scikit-learn
!pip install pinecone
!pip install pinecone-client
!pip install langchain



In [3]:
import numpy as np
import pandas as pd
import os
import torch
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics.pairwise import cosine_similarity
import math
import csv
import random
from openai import OpenAI


In [50]:
# Build the OpenAI client used by refine_query_with_chatgpt().
# An OPENAI_API_KEY that is already exported is respected instead of being
# overwritten with a literal; when it is missing the key is requested
# interactively so that the secret is never saved inside the notebook.
import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

client = OpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
)

In [51]:
from pinecone import Pinecone, ServerlessSpec, Index

# Initialize Pinecone
# The key comes from PINECONE_API_KEY when it is set; otherwise it is requested
# interactively, so the secret is never written into the notebook source.
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
if not pinecone_api_key:
    import getpass
    pinecone_api_key = getpass.getpass("Pinecone API key: ")

pc = Pinecone(api_key=pinecone_api_key)

index_name = "song-lyrics-index"

# Check if the index exists; create it if it doesn't
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Must equal the length of the vectors upserted later
        # (presumably 768 for the all-mpnet-base-v2 encoder used below - confirm).
        dimension=768,
        metric="cosine",  # Similarity metric
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

# Handle used by every later cell to upsert and query vectors.
index = pc.Index(index_name)

print("Pinecone setup complete!")


Pinecone setup complete!


In [43]:

def load_lyrics_dataset(file_path):
    """Read the lyrics CSV and keep only rows that are usable downstream.

    Args:
        file_path: Location of a CSV file containing at least the columns
            'Artist', 'Song' and 'Lyrics'. It is decoded as latin1 and
            malformed lines are skipped.

    Returns:
        A DataFrame without rows that miss any of the three key columns and
        without rows whose lyrics contain fewer than three whitespace-separated
        words. The original index labels are kept. Returns None (after printing
        the error) when the file cannot be read.
    """
    try:
        raw = pd.read_csv(file_path, encoding='latin1', on_bad_lines='skip')
    except Exception as e:
        print(f"Failed to read file: {e}")
        return None

    def count_words(text):
        # Lyrics are coerced to str so numeric-looking cells are counted too.
        return len(str(text).split())

    complete = raw.dropna(subset=['Artist', 'Song', 'Lyrics'])
    # The helper column only exists for the duration of the filter.
    counted = complete.assign(lyrics_word_count=complete['Lyrics'].apply(count_words))
    long_enough = counted[counted['lyrics_word_count'] >= 3]
    return long_enough.drop(columns=['lyrics_word_count'])



# def retrieve_top_k_songs_pinecone(query, index, bi_encoder, k=10):
#     query_embedding = bi_encoder.encode([query], convert_to_tensor=False)
#     results = index.query(vector=query_embedding.tolist(), top_k=k * 2, include_metadata=True)  # Fetch more to account for duplicates

#     # Use a set to keep track of unique songs
#     unique_songs = {}
#     for match in results.matches:
#         key = (match["metadata"]["Song"], match["metadata"]["Artist"])  # Unique identifier
#         if key not in unique_songs:
#             unique_songs[key] = {
#                 "Song": match["metadata"]["Song"],
#                 "Artist": match["metadata"]["Artist"],
#                 "Lyric": match["metadata"]["Lyric"],
#                 "score": match.score,
#             }

#         # Stop once we have the top k unique songs
#         if len(unique_songs) >= k:
#             break

#     # Sort by score and return top k unique songs
#     return sorted(unique_songs.values(), key=lambda x: x["score"], reverse=True)


def preprocess_lyrics(lyrics, min_segment_size=3, max_segments=10):
    """Split lyrics into consecutive word segments usable as synthetic queries.

    Args:
        lyrics: Lyrics text; words are separated on whitespace.
        min_segment_size: Smallest number of words a segment may contain.
        max_segments: Upper bound on the number of segments produced.

    Returns:
        A list of strings. Lyrics with at most `min_segment_size` words are
        returned unchanged as a single-element list. Otherwise the words are
        cut into chunks of max(min_segment_size, ceil(total / max_segments))
        words, joined with single spaces. A trailing chunk that would be
        shorter than `min_segment_size` is merged into the chunk before it, so
        every returned segment honours the minimum.
    """
    words = lyrics.split()
    total_words = len(words)

    if total_words <= min_segment_size:
        return [lyrics]

    segment_size = max(min_segment_size, math.ceil(total_words / max_segments))
    segments = [words[i:i + segment_size] for i in range(0, total_words, segment_size)]

    # The remainder of the division can leave a tail of only one or two words,
    # which would make a useless query; fold it into the previous segment.
    if len(segments) > 1 and len(segments[-1]) < min_segment_size:
        tail = segments.pop()
        segments[-1].extend(tail)

    return [" ".join(segment) for segment in segments]

def create_finetuning_dataset(df, num_queries=2, num_negative_pairs=10, min_segment_size=3, max_segments=10, qrels_path="qrels.csv"):
    """Build queries, corpus and a qrels file for fine-tuning a retriever.

    For every song, up to `num_queries` lyric segments are sampled as queries.
    Each query gets one positive qrels row (its own song, score 1) and up to
    `num_negative_pairs` negative rows (other songs, score 0).

    Args:
        df: DataFrame with 'Song', 'Lyrics' and 'Artist' columns. Its index
            labels must support `label + 1`, which is used as the song id.
        num_queries: Maximum number of queries sampled per song.
        num_negative_pairs: Number of negative songs per query; capped at the
            number of other songs available.
        min_segment_size: Forwarded to preprocess_lyrics.
        max_segments: Forwarded to preprocess_lyrics.
        qrels_path: CSV file that is (over)written with the columns
            `_query_id`, `song_id`, `score`.

    Returns:
        (queries, corpus): two lists of dicts. Query dicts hold `_query_id`
        and `query`; corpus dicts hold `_id`, `Song`, `lyrics` and `Artist`.
    """
    queries = []
    corpus = []

    # Song ids are derived from index *labels*. Negatives therefore have to be
    # drawn from the labels as well: positions (range(len(df))) stop matching
    # the labels as soon as rows have been filtered out of the frame.
    song_labels = list(df.index)
    # One extra label is drawn so that discarding the positive song still
    # leaves num_negative_pairs negatives; never ask for more than exist.
    draw_size = min(len(song_labels), num_negative_pairs + 1)

    with open(qrels_path, mode="w", newline="", encoding="utf-8") as qrels_file:
        qrels_writer = csv.DictWriter(qrels_file, fieldnames=["_query_id", "song_id", "score"])
        qrels_writer.writeheader()

        for idx, row in df.iterrows():
            Song = row['Song']
            lyrics = row['Lyrics']
            artist = row['Artist']

            # We split the lyrics into multiple parts and then randomly sample queries from it.
            segments = preprocess_lyrics(lyrics, min_segment_size, max_segments)
            corpus.append({"_id": f"{idx+1}", "Song": Song, "lyrics": lyrics, "Artist": artist})

            selected_queries = random.sample(segments, min(len(segments), num_queries))
            for query in selected_queries:
                query_id = f"q{len(queries)+1}"
                queries.append({"_query_id": query_id, "query": query})

                # The song of origin for the specific query will have label 1 (meaning the query is relevant for that song).
                qrels_writer.writerow({"_query_id": query_id, "song_id": f"{idx+1}", "score": 1})

                # Labelling every other song is too large, so only a random
                # subset of other songs gets label 0 (not relevant).
                drawn = random.sample(song_labels, draw_size)
                negative_samples = [label for label in drawn if label != idx][:num_negative_pairs]

                for neg_idx in negative_samples:
                    qrels_writer.writerow({"_query_id": query_id, "song_id": f"{neg_idx+1}", "score": 0})

    return queries, corpus

In [44]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def chunk_lyrics(lyrics, chunk_size=100, overlap=50):
    """Cut lyrics into overlapping pieces with RecursiveCharacterTextSplitter.

    Args:
        lyrics: Text to split.
        chunk_size: Passed to the splitter as `chunk_size`.
        overlap: Passed to the splitter as `chunk_overlap`.

    Returns:
        Whatever the splitter's `split_text` returns for `lyrics`.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": overlap}
    lyric_splitter = RecursiveCharacterTextSplitter(**splitter_options)
    pieces = lyric_splitter.split_text(lyrics)
    return pieces


In [45]:
def preprocess_and_store_embeddings(data, index, chunk_size=100, overlap=50, batch_size=100):
    """Chunk every song's lyrics, embed the chunks and upsert them to Pinecone.

    Args:
        data: DataFrame with 'Lyrics', 'Song' and 'Artist' columns.
        index: Object exposing `upsert(vectors)`; receives the embedded chunks.
        chunk_size: `chunk_size` given to the text splitter.
        overlap: `chunk_overlap` given to the text splitter.
        batch_size: Number of chunks encoded and upserted per request.

    Each vector is upserted as `(id, values, metadata)` where the id is
    "<row label>-<chunk number>" and the metadata holds the keys "Song",
    "Artist" and "Lyrics" (the chunk text). Progress is printed per batch.
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    encoder = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)

    # Phase 1: flatten the frame into (vector id, chunk text, song, artist) records.
    records = []
    for row_label, song_row in data.iterrows():
        raw_lyrics, title, performer = song_row['Lyrics'], song_row['Song'], song_row['Artist']

        # Missing lyrics stringify to 'nan'; such rows contribute no chunks.
        lyrics_text = str(raw_lyrics)
        if lyrics_text.lower() == 'nan':
            continue

        records.extend(
            (f"{row_label}-{chunk_no}", piece, title, performer)
            for chunk_no, piece in enumerate(text_splitter.split_text(lyrics_text))
        )

    # Phase 2: encode and upsert one batch at a time.
    batch_starts = range(0, len(records), batch_size)
    total_batches = len(batch_starts)
    for batch_no, start in enumerate(batch_starts, start=1):
        batch = records[start:start + batch_size]
        embeddings = encoder.encode([record[1] for record in batch], convert_to_tensor=False)

        payload = [
            (vector_id, embedding.tolist(), {"Song": title, "Artist": performer, "Lyrics": piece})
            for (vector_id, piece, title, performer), embedding in zip(batch, embeddings)
        ]

        index.upsert(payload)
        print(f"Upserted batch {batch_no}/{total_batches}")

    print("Embeddings stored in Pinecone!")


In [46]:
class BiEncoder:
    """Thin wrapper around a SentenceTransformer model used to embed text."""

    def __init__(self, model_name='sentence-transformers/all-mpnet-base-v2'):
        """Load the SentenceTransformer identified by `model_name`."""
        self.model = SentenceTransformer(model_name)

    def encode_texts(self, texts):
        """Embed `texts` with the wrapped model.

        The model is asked for tensor output and to display a progress bar;
        its return value is handed back unchanged.
        """
        encode_options = {"convert_to_tensor": True, "show_progress_bar": True}
        return self.model.encode(texts, **encode_options)

class CrossEncoder:
    """Scores (query, candidate) pairs with a sequence-classification model."""

    def __init__(self, model_name='cross-encoder/ms-marco-MiniLM-L-6-v2'):
        """Load the tokenizer and the classification model for `model_name`."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)

    def rank_candidates(self, query, candidates):
        """Order candidates from most to least relevant for `query`.

        Every candidate is tokenized together with the query (truncated to
        512 tokens) and scored on its own; the model's single logit is the
        relevance score.

        Returns:
            Array of candidate positions sorted by descending score.
        """
        # Tokenize every pair up front, then score them one by one.
        encoded_pairs = []
        for text in candidates:
            encoded_pairs.append(
                self.tokenizer(query, text, return_tensors='pt', truncation=True, max_length=512, padding=True)
            )

        # Inference only, so no gradients are tracked.
        with torch.no_grad():
            relevance = [self.model(**pair).logits.item() for pair in encoded_pairs]

        ascending = np.argsort(relevance)
        return ascending[::-1]

In [47]:
def refine_query_with_chatgpt(query):
    """Ask the chat model to isolate and spell-correct the lyric in `query`.

    A few-shot prompt instructs the model to answer with the corrected lyric
    wrapped in <start_refined_query>...<end_refined_query>. The text between
    those tags is extracted and returned.

    Args:
        query: Raw user input that contains (possibly misspelled) lyrics.

    Returns:
        The refined query string. When the reply is empty or does not contain
        a usable tagged section, the original `query` is returned so callers
        never receive an empty search string.
    """
    instruction = (
        "Please correct any misspellings in the lyric portion of "
        "the following user input and remove any unrelated text. "
    )

    # Few-shot examples to guide the model, followed by the actual request
    messages = [
        {
            "role": "system",
            "content": "You are a helpful assistant that extracts and corrects the lyric portion from user input. Format output as:\nquery: <start_query>some_query<end_query>\nrefined_query: <start_refined_query>refined_query<end_refined_query>"
        },
        {
            "role": "user",
            "content": "Please correct any misspellings in the lyric portion of the following user input and remove any unrelated text. User input: 'Helllo fromm the othr siide (song lyric). Random text not part of lyrics.'"
        },
        {
            "role": "assistant",
            "content": "query: <start_query>Helllo fromm the othr siide<end_query>\nrefined_query: <start_refined_query>Hello from the other side<end_refined_query>"
        },
        {
            "role": "user",
            "content": "Please correct any misspellings in the lyric portion of the following user input and remove any unrelated text. User input: 'Twinkl twinkl litl star how I wondr wht u ar'"
        },
        {
            "role": "assistant",
            "content": "query: <start_query>Twinkl twinkl litl star how I wondr wht u ar<end_query>\nrefined_query: <start_refined_query>Twinkle twinkle little star how I wonder what you are<end_refined_query>"
        },
        {
            "role": "user",
            "content": f"{instruction}User input: {query}"
        },
    ]

    # Call the API
    chat_completion = client.chat.completions.create(
        messages=messages,
        model="gpt-4o",
    )

    # The message content can be None; treat that like an empty reply.
    reply = chat_completion.choices[0].message.content
    output = (reply or "").strip()

    # Manually parse the refined query from the output
    start_tag = "<start_refined_query>"
    end_tag = "<end_refined_query>"
    refined_query = ""
    if start_tag in output and end_tag in output:
        refined_query = output.split(start_tag)[-1].split(end_tag)[0].strip()

    # An unparseable or empty answer must not erase the user's query.
    if not refined_query:
        refined_query = query

    print("Refined query:\n", refined_query)
    return refined_query


In [60]:
def song_retrieval_pipeline(query, index, bi_encoder_model='sentence-transformers/all-mpnet-base-v2',
                            cross_encoder_model='cross-encoder/ms-marco-MiniLM-L-6-v2', k=5):
    """Retrieve and re-rank songs whose lyrics match a free-text query.

    Steps: refine the query, embed it with the bi-encoder, fetch `2 * k`
    nearest lyric chunks from the vector index, keep the first chunk of each
    distinct (song, artist) up to `k` songs, then re-rank those chunks with
    the cross-encoder.

    Args:
        query: Raw user query.
        index: Vector index exposing `query(vector=..., top_k=..., include_metadata=True)`.
        bi_encoder_model: Model name handed to BiEncoder.
        cross_encoder_model: Model name handed to CrossEncoder.
        k: Maximum number of songs returned.

    Returns:
        A list of at most `k` dicts with the keys "track_name", "artist_name",
        "lyrics_chunk" and "score" (the retrieval score reported by the index),
        ordered by the cross-encoder. Empty when the index returns no matches.
    """
    # If refinement produced nothing, search with the user's own words rather
    # than embedding an empty string.
    refined_query = refine_query_with_chatgpt(query) or query

    # Bi-Encoder
    bi_encoder = BiEncoder(bi_encoder_model)
    query_embedding = bi_encoder.encode_texts([refined_query])[0].tolist()

    # Query Pinecone; over-fetch because several chunks can belong to one song.
    results = index.query(vector=query_embedding, top_k=k * 2, include_metadata=True)

    # Initial retrieval with duplicate removal
    candidates = []
    seen_songs = set()

    for match in results["matches"]:
        metadata = match["metadata"]
        song_key = (metadata["Song"], metadata["Artist"])
        if song_key in seen_songs:
            continue
        seen_songs.add(song_key)
        candidates.append({
            "track_name": metadata["Song"],
            "artist_name": metadata["Artist"],
            "lyrics_chunk": metadata["Lyrics"],
            "score": match.score
        })
        if len(candidates) >= k:  # Stop if we have enough unique songs
            break

    # Nothing to re-rank: skip loading the cross-encoder altogether.
    if not candidates:
        return []

    lyrics_chunks = [candidate["lyrics_chunk"] for candidate in candidates]

    # Cross-Encoder: score against the same refined query that was used for
    # retrieval, so spelling mistakes in the raw input do not distort ranking.
    cross_encoder = CrossEncoder(model_name=cross_encoder_model)
    ranked_indices = cross_encoder.rank_candidates(refined_query, lyrics_chunks)

    # Re-ranking
    re_ranked_songs = [candidates[i] for i in ranked_indices[:k]]
    return re_ranked_songs


#song_retrieval_pipeline("something about an applouse", index)


Refined query:
 something about an applause


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[{'track_name': 'superstar',
  'artist_name': 'lupe fiasco featuring matthew santos',
  'lyrics_chunk': 'everybody claps cause everybody is pleased and then they all take the stage and start performin for',
  'score': 0.566725552},
 {'track_name': 'applause',
  'artist_name': 'lady gaga',
  'lyrics_chunk': 'applause i live for the applauseplause live for the applauseplause live for the way that you cheer',
  'score': 0.612568259},
 {'track_name': 'take a bow',
  'artist_name': 'rihanna',
  'lyrics_chunk': 'of applause standin ovationbut you put on quite a show you really had me goin but now its time to',
  'score': 0.637156844},
 {'track_name': 'i write sins not tragedies',
  'artist_name': 'panic at the disco',
  'lyrics_chunk': 'help but to hear an exchanging of wordswhat a beautiful wedding what a beautiful wedding says a',
  'score': 0.550227821}]

In [59]:
# Main program
# Runs the whole flow: load the dataset, (re)create the fine-tuning files when
# any of them is missing, fill the vector index when it is empty, and finally
# execute one example retrieval.
dataset_path = "/content/drive/MyDrive/Colab Notebooks/dataset.csv"
corpus_path = "/content/drive/MyDrive/Colab Notebooks/corpus.csv"
queries_path = "/content/drive/MyDrive/Colab Notebooks/queries.csv"
qrels_path = "/content/drive/MyDrive/Colab Notebooks/qrels.csv"
query = "songs about broken heart"

if not os.path.exists(dataset_path):
    print("Dataset path does not exist.")
else:
    print("Dataset obtained.")

    data = load_lyrics_dataset(dataset_path)

    # The fine-tuning files are only reused when all three are present.
    finetuning_files = (corpus_path, queries_path, qrels_path)
    if all(os.path.exists(path) for path in finetuning_files):
        print("Datasets already exist. Skipping dataset generation.")
    else:
        print("Required files not found. Generating fine-tuning datasets...")

        # qrels are written by the generator itself; queries and corpus are saved here.
        queries, corpus = create_finetuning_dataset(data, num_queries=5, num_negative_pairs=100, qrels_path=qrels_path)

        queries_df = pd.DataFrame(queries)
        corpus_df = pd.DataFrame(corpus)
        queries_df.to_csv(queries_path, index=False)
        corpus_df.to_csv(corpus_path, index=False)
        print("Datasets generated and saved.")

    # Only embed and upload the lyrics when the index holds no vectors yet.
    stored_vectors = index.describe_index_stats()["total_vector_count"]
    if stored_vectors:
        print("Embeddings already exist in Pinecone.")
    else:
        preprocess_and_store_embeddings(data, index, chunk_size=100, overlap=50)

    results = song_retrieval_pipeline(query, index)
    print("Top retrieved songs:")
    for rank, song in enumerate(results, start=1):
        print(f"{rank}. {song['track_name']} by {song['artist_name']}")
        print(f"Lyrics Chunk: {song['lyrics_chunk']}\n")



Dataset obtained.
Datasets already exist. Skipping dataset generation.
Embeddings already exist in Pinecone.
Refined query:
 songs about broken heart


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Top retrieved songs:
1. it was almost like a song by ronnie milsap
Lyrics Chunk: my broken heart cries for you each night and its almost like a song but its much too sad to

2. broken hearted me by anne murray
Lyrics Chunk: hear this song i hope that you will see that time wont heal a brokenhearted me

3. therell be sad songs to make you cry by billy ocean
Lyrics Chunk: metherell be sad songs to make you cry love songs often do they can touch the heart of someone new

4. brokenhearted by brandy featuring wanya morris
Lyrics Chunk: brokenhearted lifes not over i can start again while im lonely brokenhearted its a hurting thing to

5. hey wont you play another somebody done somebody wrong song by bj thomas
Lyrics Chunk: a love thas gone wrong cause i dont wanna cry all alonehey wont you play another somebody done

