<a href="https://colab.research.google.com/github/FlanagG21/DCC-Capstone/blob/main/Genius_API_Connection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Single Song Recommendation

This file is for finding reccommendations for a specific song. In order to run the program, you will need to get an api key from kaggle (https://www.youtube.com/watch?v=s9O6soJES74) and upload it to Colab. Cells at the top only need to be run once per session.

In [42]:
#mount to drive
from google.colab import drive
drive.mount('/content/drive')
root = '/content/drive/MyDrive/DCC Project/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [43]:
#DIRECT FROM KAGGLE DB
!pip install opendatasets
import opendatasets as od
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from transformers import AutoTokenizer, AutoModel
import torch.nn.functional as F



In [44]:
movies_df = pd.read_parquet(os.path.join(root, 'movies_embeddings.parquet'), engine='pyarrow')
movies_df.head()

Unnamed: 0,id,title,overview,genres,embedding
0,27205,Inception,Cobb a skilled thief who commits corporate esp...,"Action, Science Fiction, Adventure","[0.01589059643447399, 0.11273891478776932, -0...."
1,157336,Interstellar,The adventures of a group of explorers who mak...,"Adventure, Drama, Science Fiction","[0.037922028452157974, -0.005655079614371061, ..."
2,155,The Dark Knight,Batman raises the stakes in his war on crime W...,"Drama, Action, Crime, Thriller","[0.011266704648733139, 0.032755907624959946, -..."
3,19995,Avatar,In the 22nd century a paraplegic Marine is dis...,"Action, Adventure, Fantasy, Science Fiction","[0.01744804158806801, 0.03436880186200142, 0.0..."
4,24428,The Avengers,When an unexpected enemy emerges and threatens...,"Science Fiction, Action, Adventure","[0.027801260352134705, -0.019952325150370598, ..."


In [45]:
#in order to download the songs db, you'll need to get a kaggle.json file by getting an api key from your kaggle account

od.download("https://www.kaggle.com/datasets/carlosgdcj/genius-song-lyrics-with-language-information")

Skipping, found downloaded files in "./genius-song-lyrics-with-language-information" (use force=True to force download)


In [46]:
songs = pd.read_csv("/content/genius-song-lyrics-with-language-information/song_lyrics.csv")
songs.head()

Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
0,Killa Cam,rap,Cam'ron,2004,173166,"{""Cam\\'ron"",""Opera Steve""}","[Chorus: Opera Steve & Cam'ron]\nKilla Cam, Ki...",1,en,en,en
1,Can I Live,rap,JAY-Z,1996,468624,{},"[Produced by Irv Gotti]\n\n[Intro]\nYeah, hah,...",3,en,en,en
2,Forgive Me Father,rap,Fabolous,2003,4743,{},Maybe cause I'm eatin\nAnd these bastards fien...,4,en,en,en
3,Down and Out,rap,Cam'ron,2004,144404,"{""Cam\\'ron"",""Kanye West"",""Syleena Johnson""}",[Produced by Kanye West and Brian Miller]\n\n[...,5,en,en,en
4,Fly In,rap,Lil Wayne,2005,78271,{},"[Intro]\nSo they ask me\n""Young boy\nWhat you ...",6,en,en,en


In [47]:
import re
def clean_lyrics(lyrics):
    # Remove section tags like [Intro], [Verse 1], etc.
    cleaned = re.sub(r'\[.*?\]', '', lyrics)

    # Remove credits or text after "---"
    cleaned = re.split(r'---', cleaned)[0]

    # Remove symbols except for line breaks (\n) and alphanumeric characters
    cleaned = re.sub(r'[^\w\s\n]', '', cleaned)

    # Remove extra whitespace and blank lines
    cleaned = re.sub(r'\n\s*\n', '\n', cleaned).strip()

    return cleaned

In [48]:
# Set up device for GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cpu


In [49]:
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

In [50]:
# Load model from HuggingFace Hub and move to gpu
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-mpnet-base-v2')
model = torch.nn.DataParallel(model)
model.to(device)
model.eval()  # Set model to evaluation mode

DataParallel(
  (module): MPNetModel(
    (embeddings): MPNetEmbeddings(
      (word_embeddings): Embedding(30527, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): MPNetEncoder(
      (layer): ModuleList(
        (0-11): 12 x MPNetLayer(
          (attention): MPNetAttention(
            (attn): MPNetSelfAttention(
              (q): Linear(in_features=768, out_features=768, bias=True)
              (k): Linear(in_features=768, out_features=768, bias=True)
              (v): Linear(in_features=768, out_features=768, bias=True)
              (o): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
      

In [51]:
# Function to compute embedding
def compute_embedding(text):
    encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**encoded_input)
    sentence_embedding = mean_pooling(model_output, encoded_input['attention_mask'])
    sentence_embedding = F.normalize(sentence_embedding, p=2, dim=1)
    # Convert to list for Parquet compatibility
    return sentence_embedding.squeeze().numpy().tolist()

In [52]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def recommend_movies(query_song_embedding, movies_df, top_k=5):
    # Convert movie embeddings to an array
    movie_embeddings = np.vstack(movies_df['embedding'].values)

    # Compute cosine similarity between the query song and all movie embeddings
    similarities = cosine_similarity([query_song_embedding], movie_embeddings)[0]

    # Get the indices of the top_k most similar movies
    top_indices = np.argsort(similarities)[::-1][:top_k]

    # Return the recommended movies with their similarity scores
    recommended_movies = movies_df.iloc[top_indices].copy()
    recommended_movies['similarity'] = similarities[top_indices]

    return recommended_movies

RUN FROM HERE IF YOU JUST WANT TO CHANGE THE INPUT SONG:

In [53]:
#USER INPUT
title = "Souvenir"
artist = "Selena Gomez"

song = songs[(songs['title'] == title) & (songs['artist'] == artist)]
if not song.empty:
    print("Song found:")
    print(song)
else:
    print("Song not found.")

Song found:
            title  tag        artist  year   views features  \
3491193  Souvenir  pop  Selena Gomez  2020  272037       {}   

                                                    lyrics       id  \
3491193  [Intro]\nChills\n\n[Verse 1]\nNew York back in...  5279050   

        language_cld3 language_ft language  
3491193            en          en       en  


In [54]:
# Apply the cleaning function
columns_to_drop = ['artist', 'views', 'features', 'language', 'language_cld3', 'language_ft', 'id','year']
song_clean = song.drop(columns=columns_to_drop)
song_clean.head()
song_clean['lyrics'] = song_clean['lyrics'].apply(clean_lyrics)
print(song_clean['lyrics'])

3491193    Chills\nNew York back in August tenth floor ba...
Name: lyrics, dtype: object


In [55]:
#get embedding
song_clean['embedding'] = song_clean['lyrics'].apply(compute_embedding)
print(song_clean)

            title  tag                                             lyrics  \
3491193  Souvenir  pop  Chills\nNew York back in August tenth floor ba...   

                                                 embedding  
3491193  [-0.02880687266588211, 0.0381150059401989, 0.0...  


In [56]:
song_embedding = np.array(song_clean['embedding'].iloc[0])
recommendations = recommend_movies(song_embedding, movies_df)
print(f"Recommendations for song '{title}' by {artist}':")
print(recommendations[['title', 'similarity']])

Recommendations for song 'Souvenir' by Selena Gomez':
                                                    title  similarity
128929       The White Stripes: Under Nova Scotian Lights    0.594400
440448                  George Benson - Give me the night    0.575736
477660        Dream Theater: [1999] Triport Rock Festival    0.573916
320506  Bryan Adams - Live from the Royal Albert Hall ...    0.569729
469125         Bruce Springsteen - Paris Bercy 05/07/2012    0.563538
