## Installing

In [None]:
# Colab environment setup: install the CLI tooling and the Python libraries
# used by the cells below.
!pip install pipx
!apt install python3.10-venv # it is required for whisper
!pipx install insanely-fast-whisper
!pip install pytubefix # alternative to pytube
!pip install kaggle
!pip install torch torchvision torchaudio
!pip install transformers torch faiss-cpu
!pip install openai-whisper

Collecting pipx
  Downloading pipx-1.7.1-py3-none-any.whl.metadata (18 kB)
Collecting argcomplete>=1.9.4 (from pipx)
  Downloading argcomplete-3.5.1-py3-none-any.whl.metadata (16 kB)
Collecting userpath!=1.9,>=1.6 (from pipx)
  Downloading userpath-1.9.2-py3-none-any.whl.metadata (3.0 kB)
Downloading pipx-1.7.1-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading argcomplete-3.5.1-py3-none-any.whl (43 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading userpath-1.9.2-py3-none-any.whl (9.1 kB)
Installing collected packages: userpath, argcomplete, pipx
Successfully installed argcomplete-3.5.1 pipx-1.7.1 userpath-1.9.2
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  python3-pip-whl python3-setu



# Development


### Import all the necessary libraries

In [None]:
from pytubefix import YouTube
import json
import subprocess
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pandas as pd
import kagglehub
import os
from sklearn.preprocessing import normalize
import whisper
import torch
import torchvision
import torchaudio

  from tqdm.autonotebook import tqdm, trange


## Function to get lyrics from youtube video

A function that extracts the lyrics of a YouTube video, given its URL, as a list of transcribed text chunks using `insanely-fast-whisper`.

In [None]:
def get_lyrics(youtube_url):
  """
    Extract the lyrics from a YouTube video using insanely-fast-whisper.

    The audio-only stream is downloaded to a temporary file, transcribed by
    the `insanely-fast-whisper` command line tool (which writes its result
    to a JSON file) and the text of every transcribed chunk is collected.
    Both temporary files are removed afterwards, even when a step fails.

    Args:
        youtube_url: The URL of the YouTube video.

    Returns:
        A list of strings, one per transcribed chunk, or None if the video
        has no audio-only stream to download.

    Raises:
        subprocess.CalledProcessError: if the transcription command exits
            with a non-zero status.
    """
  yt = YouTube(youtube_url)
  song = yt.streams.filter(only_audio=True).first()
  if not song:
    # Nothing to transcribe; previously this fell through and failed on an
    # unbound `path_song`.
    return None

  transcript_path = "/content/output.json"
  path_song = song.download(filename="test_audio.mp3")
  try:
    # Pass the transcript location explicitly so reading it back does not
    # depend on the current working directory of the kernel.
    subprocess.run(
        ['/root/.local/bin/insanely-fast-whisper',
         '--file-name', path_song,
         '--transcript-path', transcript_path],
        capture_output=True, text=True, check=True)

    # Read the output json
    with open(transcript_path, "r", encoding='utf-8') as f:
      data = json.load(f)
    return [chunk["text"] for chunk in data["chunks"]]
  finally:
    # Remove temporary files whether or not the transcription succeeded.
    for temp_file in (path_song, transcript_path):
      if temp_file and os.path.exists(temp_file):
        os.remove(temp_file)



## A function able to extract lyrics as a list of text segments from a YouTube URL using `whisper` from the `openai-whisper` library

In [None]:
def get_lyrics_from_youtube_url(youtube_url):
  """
    Extract the lyrics from a YouTube video using openai-whisper.

    The audio-only stream is downloaded to a temporary file, transcribed
    with the Whisper "base" model and the temporary file is removed again,
    even when the transcription fails.

    Args:
        youtube_url: The URL of the YouTube video.

    Returns:
        A list of strings, one per transcribed segment, or None if the
        video has no audio-only stream to download.
    """
  yt = YouTube(youtube_url)
  song = yt.streams.filter(only_audio=True).first()
  if not song:
    # No audio stream: bail out before paying for the model load.
    return None

  audio_path = song.download(filename="audio.mp3")
  try:
    # Use the GPU when one is present, otherwise fall back to CPU instead of
    # failing on CPU-only runtimes.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = whisper.load_model("base", device=device) #large-v2, large

    transcription = model.transcribe(audio_path)
    # Keep the per-segment texts; callers that need one string can join them.
    return [segment["text"] for segment in transcription["segments"]]
  finally:
    # Remove the temporary audio file whether or not transcription succeeded.
    if audio_path and os.path.exists(audio_path):
      os.remove(audio_path)


In [None]:
# Try the transcription helper on a sample video and keep the returned chunks.
result = get_lyrics("https://youtu.be/TWX0SAh3T1I")

  checkpoint = torch.load(fp, map_location=device)


## Embeddings extractor

Function able to extract embeddings from a given text.


In [None]:
# Sentence Transformer models already loaded in this session, keyed by model
# name, so the slow model load happens once instead of on every call.
_EMBEDDING_MODELS = {}


def extract_embeddings(texts, model_name="all-MiniLM-L6-v2", batch_size=32):
  """
  Extracts L2-normalised sentence embeddings for a list of texts.

  The texts are encoded in batches by a pretrained Sentence Transformer
  model, which is loaded on first use and reused by later calls.

  Args:
      texts: A list of texts. A single string is treated as a one-item list.
      model_name: Name of the pretrained Sentence Transformer model to use.
      batch_size: Number of texts encoded per batch.

  Returns:
      A NumPy array of shape (num_texts, embedding_dimension) whose rows
      have unit L2 norm. An empty input yields an array with zero rows.
  """
  if isinstance(texts, str):
    # A bare string would be encoded as a 1-D vector, which `normalize`
    # (and Faiss) reject; always work with a 2-D batch.
    texts = [texts]
  texts = list(texts)

  model = _EMBEDDING_MODELS.get(model_name)
  if model is None:
    model = SentenceTransformer(model_name)
    _EMBEDDING_MODELS[model_name] = model

  if not texts:
    empty_shape = (0, model.get_sentence_embedding_dimension())
    return np.empty(empty_shape, dtype=np.float32)

  embeddings = model.encode(texts, batch_size=batch_size)
  # Unit-length rows make the inner product equal to cosine similarity.
  return normalize(embeddings, "l2")


In [None]:
# Test with one sentence: wrap it in a list so the result is a 2-D array.
text = "This is a test sentence."
embeddings = extract_embeddings([text])
# Display the (num_texts, embedding_dimension) shape as the cell output.
embeddings.shape

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


(1, 384)

## A vector database

Function able to create an index using `faiss`

Note that the input to `faiss` must be numpy arrays with proper shape, typically: `(num_items, embedding_dimension)`. For querying only one string, it might require `(1, embedding_dimension)`.



In [None]:
def createVDB(embeddings):
  """
    Create an index using Faiss (similarity search in high-dimensional vector spaces)

    The index is a flat inner-product index (`faiss.IndexFlatIP`); when the
    stored and query vectors are L2-normalised, the returned scores are
    cosine similarities.

    Args:
        embeddings: NumPy matrix of embeddings with shape
            (num_items, embedding_dimension).

    Returns:
        A `faiss.IndexFlatIP` index containing the given embeddings.
  """
  # Faiss only accepts C-contiguous float32 matrices; coerce instead of
  # failing on float64 or non-contiguous input.
  vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
  dim = vectors.shape[1]
  index = faiss.IndexFlatIP(dim)
  index.add(vectors)

  return index

## Search within the vector database

Function able to search the nearest neighbors from the database given a query string.

In [None]:
def searchTest(vectorDB,texts,embeddings,k,query_text=None):
  """
  Performs a nearest neighbor search and prints the results.

  Only the first row of `embeddings` is reported.

  Args:
      vectorDB: The Faiss index.
      texts: The original texts, in the order they were added to the index.
      embeddings: The query embeddings (NumPy array). Nothing is printed
          when this is None.
      k: The number of nearest neighbors to retrieve.
      query_text: Optional query to show in the "Query:" header line. When
          omitted, a notebook-level `query_text` variable is used if one
          exists; otherwise the header line is skipped.
  """
  if embeddings is None:
    return

  if query_text is None:
    # Backward compatibility: earlier cells define `query_text` globally and
    # call this function without passing it.
    query_text = globals().get("query_text")

  # Search nearest neighbors
  D, I = vectorDB.search(embeddings, k)

  if query_text is not None:
    print(f"Query: {query_text}")

  # Faiss pads the result with -1 ids when the index holds fewer than k
  # items; skip those instead of wrapping around to texts[-1].
  hits = [(score, idx) for score, idx in zip(D[0], I[0]) if idx >= 0]
  for rank, (score, idx) in enumerate(hits, start=1):
    print(f"Neighbor {rank}: {texts[idx]}, Score: {score*100:.1f}%")

In [None]:
def searchVDB(vectorDB,embeddings,titles,artists,k):
  """
  Searches the index and returns the matching songs for the first query row.

  Args:
      vectorDB: The Faiss index.
      embeddings: The query embeddings (NumPy array). Only the first row is
          used to build the result.
      titles: Song titles, in the order their lyrics were added to the index.
      artists: Song artists, in the same order as `titles`.
      k: The number of nearest neighbors to retrieve.

  Returns:
      A list of dicts with the keys 'title', 'artist' and 'score' (the
      similarity multiplied by 100 and formatted with one decimal), ordered
      as returned by the index. The list is empty when `embeddings` is None.
  """
  result = []
  if embeddings is None:
    # Always hand back a list so callers can iterate over the result.
    return result

  # Search nearest neighbors
  D, I = vectorDB.search(embeddings, k)

  for score, idx in zip(D[0], I[0]):
    if idx < 0:
      # Faiss pads with -1 when fewer than k items are indexed; a negative
      # id would otherwise wrap around to the last title/artist.
      continue
    result.append({
      'title':titles[idx],
      'artist':artists[idx],
      'score': f"{score*100:.1f}"
    })

  return result

## Test the Faiss index and search with a few embeddings

In [None]:
# A few texts to embed and index for the toy search test
texts = [
    "The quick brown fox jumps over the lazy dog.",
    "A fox jumped over four dog.",
    "A fast brown fox leaps over a sleeping dog.",
    "The agile fox cleared the lazy canine.",
    "A dog was jumped over by a quick fox.",
    "The fox swiftly evaded the slumbering dog.",
    "A lazy dog slept while a quick fox jumped.",
    "A fox jumped over a dog.",
    "Cats are also very agile animals."
]

# The query is kept as a one-item list; searchTest reads this global when it
# prints the "Query:" line.
query_text = ["A fox jumped over a dog."]
k=3

# Embed the query and the candidate texts with the same model.
query_embedding = extract_embeddings(query_text)
embeddings_list = extract_embeddings(texts)

In [None]:
# Build a Faiss index from the toy embeddings to test the nearest neighbor search.
vectorTestDB = createVDB(embeddings_list)

# Search nearest neighbors and print the k best matches for the query
searchTest(vectorTestDB,texts,query_embedding,k)

Query: ['A fox jumped over a dog.']
Neighbor 1: A fox jumped over a dog., Score: 100.0%
Neighbor 2: A dog was jumped over by a quick fox., Score: 90.6%
Neighbor 3: A fox jumped over four dog., Score: 84.3%


## Load and save lyrics database

From the database in `song_lyrics.csv`, we will extract the top-1000 songs according to views. We will build our vector database with them.

Important: This file is huge, and does not fit in RAM so it has to be processed in batches




In [None]:
# Download latest version of the Genius lyrics dataset through kagglehub;
# `path` is the local directory returned by the download call.
path = kagglehub.dataset_download("carlosgdcj/genius-song-lyrics-with-language-information")
# Full path of the lyrics CSV inside the downloaded dataset directory.
file_path = path+"/song_lyrics.csv"

Downloading from https://www.kaggle.com/api/v1/datasets/download/carlosgdcj/genius-song-lyrics-with-language-information?dataset_version_number=1...


100%|██████████| 3.04G/3.04G [00:57<00:00, 56.6MB/s]

Extracting files...





In [None]:
# Top-N selection by views, reading only the needed columns and one chunk
# at a time so the full CSV never has to fit in memory.
def get_top_1000(file_path,chunksize=500000,top_n=1000,output_path='top_songs.csv'):
  """
  Selects the `top_n` most viewed songs from a large lyrics CSV.

  The file is read in chunks; after each chunk only the running top `top_n`
  rows are kept. Rows without lyrics or without a view count are ignored,
  because they cannot be embedded or ranked later on.

  Args:
      file_path: Path to a CSV with at least the columns 'title', 'artist',
          'lyrics' and 'views'.
      chunksize: Number of CSV rows read per chunk.
      top_n: Number of songs to keep.
      output_path: Where the selected rows are saved as CSV.

  Returns:
      A DataFrame with the selected rows, sorted by descending views. It is
      empty (but has the four columns) when the CSV has no usable rows.
  """
  columns = ['title', 'artist', 'lyrics', 'views']
  top_views_df = None
  # The function assumes the file has the same column names as the example csv
  for chunk in pd.read_csv(file_path, chunksize=chunksize, usecols=columns):
    usable = chunk.dropna(subset=['lyrics', 'views'])
    candidates = usable.nlargest(top_n, 'views')
    if top_views_df is not None:
      # Merge with the running top-N; both frames are at most top_n rows.
      candidates = pd.concat([top_views_df, candidates])
    top_views_df = candidates.nlargest(top_n, 'views')

  if top_views_df is None:
    top_views_df = pd.DataFrame(columns=columns)

  top_views_df.to_csv(output_path, index=False) # Save data in CSV file
  return top_views_df

In [None]:
# Build the top-songs table from the downloaded CSV (get_top_1000 also saves it as 'top_songs.csv').
top_views_df = get_top_1000(file_path) # It is advisable to download the file locally

## Extract embeddings for lyrics database

Extract embeddings for the 1000 lyrics in the database.

* We can use top_views_df or we can load the saved database if it is a new session without the in-memory dataframe, which would be useful for other sessions as well



In [None]:
# Loading the database saved by get_top_1000, so a new session can start here
top_views_df = pd.read_csv('top_songs.csv')

In [None]:
# Extracting one embedding per song from the 'lyrics' column
lyrics_embeddings = extract_embeddings(list(top_views_df['lyrics']))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


## Create a `faiss` index with lyrics

A `faiss` index with those 1000 lyrics, and test it with some example text.


In [None]:
# Example free-text queries used to probe the lyrics index
text = [
    "A song about heartbreak and loss",
    "Dancing in the moonlight",
    "Quiero acariciar tu cuerpo despacito, hasta que",
    "Lady Gaga",
    "Bohemian Rhapsody",
    "A joyful song about summer"
    ]

In [None]:
# Title and artist lists in the same row order as the lyrics that were
# embedded, so an index position can be mapped back to a song.
titles = list(top_views_df['title'])
artists = list(top_views_df['artist'])

In [None]:
# Index the lyrics embeddings once, then print the 2 best matches per query.
vectorDB = createVDB(lyrics_embeddings)
for query in text:
  print("-------"+query+"-------")
  # Each query is wrapped in a list so its embedding is a 2-D array.
  query_embedding = extract_embeddings([query])
  data = searchVDB(vectorDB,query_embedding,titles,artists,2)
  for item in data:
    print(f"Title: {item['title']}\nArtist: {item['artist']}\nScore: {item['score']}%\n")

-------A song about heartbreak and loss-------
Title: ​ghostin
Artist: Ariana Grande
Score: 53.5%

Title: Photograph
Artist: Ed Sheeran
Score: 49.2%

-------Dancing in the moonlight-------
Title: Moonlight
Artist: XXXTENTACION
Score: 41.9%

Title: BTS - 소우주 Mikrokosmos English Translation
Artist: Genius English Translations
Score: 39.5%

-------Quiero acariciar tu cuerpo despacito, hasta que-------
Title: Despacito
Artist: Luis Fonsi
Score: 55.3%

Title: Veleno 6
Artist: MadMan
Score: 48.0%

-------Lady Gaga-------
Title: Shallow
Artist: Lady Gaga & Bradley Cooper
Score: 45.7%

Title: Formation
Artist: Beyonc
Score: 37.9%

-------Bohemian Rhapsody-------
Title: Pound Cake / Paris Morton Music 2
Artist: Drake
Score: 40.5%

Title: ​i Album Version
Artist: Kendrick Lamar
Score: 39.5%

-------A joyful song about summer-------
Title: Happy
Artist: Pharrell Williams
Score: 48.7%

Title: Summertime Sadness
Artist: Lana Del Rey
Score: 47.4%




## A general function

Function that covers all of the above processes, able to download the dataset, extract the embeddings, create a vector database with them, also extract the embeddings of the song's lyrics from the query, and compare with those stored in the database to get the songs with similar lyrics

In [None]:
# Search structures built from a lyrics database, keyed by its CSV path:
# (Faiss index, titles, artists). Embedding the whole database is slow, so it
# is done once per path instead of once per query. Re-run this cell to reset
# the cache after the CSV changes.
_COVERS_DB_CACHE = {}


def get_covers(youtube_url, k, database_path='top_songs.csv'):
  """
  Finds the songs in the lyrics database most similar to a YouTube video.

  The video's audio is transcribed, the transcription is embedded as one
  text (the same way each database entry is embedded from its full lyrics)
  and the `k` nearest database entries are returned.

  Args:
      youtube_url: The URL of the YouTube video.
      k: The number of similar songs to return.
      database_path: CSV file with the columns 'title', 'artist' and
          'lyrics', as written by get_top_1000.

  Returns:
      A list of dicts with the keys 'title', 'artist' and 'score', as
      produced by searchVDB. The list is empty when nothing could be
      transcribed from the video.
  """
  cached = _COVERS_DB_CACHE.get(database_path)
  if cached is None:
    # Load the saved lyrics database; rows without lyrics cannot be embedded.
    top_views_df = pd.read_csv(database_path).dropna(subset=['lyrics'])

    # Extract embeddings for each song's lyrics
    lyrics_embeddings = extract_embeddings(list(top_views_df['lyrics']))

    # Create a vector index (database) for fast retrieval of similar lyrics
    cached = (
        createVDB(lyrics_embeddings),
        list(top_views_df['title']),
        list(top_views_df['artist']),
    )
    _COVERS_DB_CACHE[database_path] = cached
  vectorDB, titles, artists = cached

  # Download the youtube video in a temporary file
  # And Transcribe the lyrics using the Whisper model
  lyrics = get_lyrics_from_youtube_url(youtube_url)
  if isinstance(lyrics, str):
    lyrics = [lyrics]

  # searchVDB only looks at the first query row, so passing one embedding per
  # segment would match on the first segment alone. Join the segments and
  # embed the transcription as a single text instead.
  segments = [segment.strip() for segment in (lyrics or []) if segment]
  full_text = " ".join(segment for segment in segments if segment)
  if not full_text:
    return []

  # Extract the embedding of the transcribed lyrics (one row)
  embeddings_song = extract_embeddings([full_text])

  # Search the top-k similar entries in the vector database and return the
  # song title and artist of each
  return searchVDB(vectorDB,embeddings_song,titles,artists,k)

## Evaluation

Note: To evaluate the system fully (that is, to judge how accurate the scores are), we need ground truth for each link: the title and artist of the song that each video covers. Only with those answers, or at least part of them, can evaluation metrics be computed.

In [None]:
# Results per video: maps each URL to the list returned by get_covers.
all_covers = {}

# YouTube videos to look up in the lyrics database.
urls=[
    "https://www.youtube.com/watch?v=BDC8Jr-gp_4",
    "https://www.youtube.com/watch?v=W_97b97G5ds",
    "https://www.youtube.com/watch?v=L53MZzuE0QY",
    "https://www.youtube.com/watch?v=9vmrPrYJPqI",
    "https://www.youtube.com/watch?v=R6ATpAr7rQU",
    "https://www.youtube.com/watch?v=RmtP8X4ZErs",
    "https://www.youtube.com/watch?v=DfMnRP0pk3A",
    "https://www.youtube.com/watch?v=1BVP72VrGQs"

]
# Keep the 2 most similar songs for every video.
for url in urls:
  covers = get_covers(url,2)
  all_covers[url] = covers

  checkpoint = torch.load(fp, map_location=device)
  checkpoint = torch.load(fp, map_location=device)
  checkpoint = torch.load(fp, map_location=device)
  checkpoint = torch.load(fp, map_location=device)
  checkpoint = torch.load(fp, map_location=device)
  checkpoint = torch.load(fp, map_location=device)
  checkpoint = torch.load(fp, map_location=device)
  checkpoint = torch.load(fp, map_location=device)


In [None]:
# Print, for every evaluated URL, the title, artist and score of each match.
for url, covers_list in all_covers.items():
    print("-------" + url + "-------")
    for song in covers_list:
      print(f"Title: {song['title']}\nArtist: {song['artist']}\nScore: {song['score']}%")

-------https://www.youtube.com/watch?v=BDC8Jr-gp_4-------
Title: NO BYSTANDERS
Artist: Travis Scott
Score: 33.9%
Title: Shape of You
Artist: Ed Sheeran
Score: 32.9%
-------https://www.youtube.com/watch?v=W_97b97G5ds-------
Title: About Genius
Artist: Genius
Score: 48.7%
Title: Despacito Remix
Artist: Luis Fonsi & Daddy Yankee
Score: 32.7%
-------https://www.youtube.com/watch?v=L53MZzuE0QY-------
Title: Love Sosa
Artist: Chief Keef
Score: 33.0%
Title: Ms. Jackson
Artist: OutKast
Score: 31.6%
-------https://www.youtube.com/watch?v=9vmrPrYJPqI-------
Title: A Whole New World
Artist: Lea Salonga & Brad Kane
Score: 33.0%
Title: The Seven Ages of Man All the worlds a stage
Artist: William Shakespeare
Score: 30.9%
-------https://www.youtube.com/watch?v=R6ATpAr7rQU-------
Title: The Seven Ages of Man All the worlds a stage
Artist: William Shakespeare
Score: 34.9%
Title: Something Just Like This
Artist: The Chainsmokers & Coldplay
Score: 26.7%
-------https://www.youtube.com/watch?v=RmtP8X4ZErs-