In [4]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
%load_ext autoreload
%autoreload 2

In [5]:
import os
import sys

# Appends the entire brainstation_capstone project folder to the path.
# This allows us to make a relative import of our scripts in brainstation_capstone/scripts
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from utilities import utils
from utilities.vectorizer_pipeline import VectorizerPipeline

In [6]:
DATA_PATH = utils.get_datapath('data')

# **Transforming Lyrics** 

This notebook is the preliminary work of vectorizing the data. A class was created to vectorize a train set and transform the validation and test set. This class would then store transformed train, validation and test set for preliminary modeling. The ideology for this method was to stay true to the Agile methodology, where we will try to try as many vectorizer combinations and narrow down one to tune a model for after.

Specifically, we will look at the following transformations:
- CountVectorizer (N-grams = 1, 2 and 3)
- TF-IDF
- Averaging LexVec Embeddings
- OpenAI Ada Embeddings

In [4]:
df = pd.read_csv(DATA_PATH / 'clean_lyrics.csv')

In [12]:
display(df.head())
df.shape

Unnamed: 0.1,Unnamed: 0,song,lyrics,views,cleaned_lyrics,log_scaled_views,popular,popularity_rating
0,0,Kendrick-lamar-swimming-pools-drank-lyrics,\n\n[Produced by T-Minus]\n\n[Intro]\nPour up ...,5589280.0,pour up drank head shot drank sit down drank ...,15.536361,1,2
1,1,Kendrick-lamar-money-trees-lyrics,\n\n[Produced by DJ Dahi]\n\n[Verse 1: Kendric...,4592003.0,uh me and my niggas tryna get it ya bish ya b...,15.339827,1,2
2,2,Kendrick-lamar-xxx-lyrics,"\n\n[Intro: Bēkon & Kid Capri]\nAmerica, God b...",4651514.0,america god bless you if its good to you amer...,15.352703,1,2
3,3,A-ap-rocky-fuckin-problems-lyrics,"\n\n[Chorus: 2 Chainz, Drake & Both (A$AP Rock...",7378309.0,i love bad bitches thats my fuckin problem an...,15.814055,1,2
4,4,Kendrick-lamar-dna-lyrics,"\n\n[Verse 1]\nI got, I got, I got, I got—\nLo...",5113687.0,i got i got i got i got loyalty got royalty i...,15.447431,1,2


(37905, 8)

After loading the dataset we stem the words and we also prepare the english stop words. 

In [14]:
import nltk

# Initialize stemmer.
stemmer = nltk.stem.PorterStemmer()

In [5]:
from sklearn.feature_extraction import text 

stop_words = list(text.ENGLISH_STOP_WORDS.union(['im','nigga', 'anywh', 'becau', 'el', 'elsewh', 'everywh', 'ind', 'otherwi', 'plea', 'somewh']))

We also need to stem the stop words so that they will get recognized during the vectorization process.

In [24]:
# Stem the stop words so that they will be removed when stop words in the original are stemmed. 
stop_words = [stemmer.stem(i) for i in stop_words]


# Vectorizing Lyrics for Classification

Here we prepare a vectorizer pipeline for a CountVectorizer with varying lengths of n_grams, along with TF-IDF. We do this for both the binary and multi-class problem.

In [16]:
y_popular = df.popular
y_popularity = df.popularity_rating

y_popular.shape, y_popularity.shape

((37905,), (37905,))

In [18]:
for vectorizer_name in [
    'bag_of_words_two_class', 'tf_idf_two_class', '2_grams_two_class', '3_grams_two_class'
    ]:
    X = df.cleaned_lyrics
    
    if vectorizer_name == 'bag_of_words_two_class':
        vectorizer = CountVectorizer(max_df=0.9, min_df=0.01, stop_words=stop_words, tokenizer = lambda x: [stemmer.stem(i) for i in x.split(' ')])
    elif vectorizer_name == 'tf_idf_two_class':
        vectorizer = TfidfVectorizer(max_df=0.9, min_df=0.01, stop_words=stop_words, tokenizer = lambda x: [stemmer.stem(i) for i in x.split(' ')])
    elif vectorizer_name == '2_grams_two_class':
        vectorizer = CountVectorizer(max_df=0.9, min_df=0.01, stop_words=stop_words, tokenizer = lambda x: [stemmer.stem(i) for i in x.split(' ')], ngram_range=(1,2))
    elif vectorizer_name == '3_grams_two_class':
        vectorizer = CountVectorizer(max_df=0.9, min_df=0.01, stop_words=stop_words, tokenizer = lambda x: [stemmer.stem(i) for i in x.split(' ')], ngram_range=(1,3))
    
    VectorizerPipeline(
        vectorizer_name, vectorizer, X, y_popular
    ).run_vectorizer_pipeline()



Train shape: (22743, 2198)             
Validation shape: (7581, 2198)             
Test shape: (7581, 2198)


PicklingError: Can't pickle <function <lambda> at 0x7fb5aa327700>: it's not found as __main__.<lambda>

In [25]:
for vectorizer_name in [
    'bag_of_words_three_class', 'tf_idf_three_class', '2_grams_three_class', '3_grams_three_class'
    ]:
    X = df.cleaned_lyrics
    
    if vectorizer_name == 'bag_of_words_three_class':
        vectorizer = CountVectorizer(max_df=0.9, min_df=0.01, stop_words=stop_words, tokenizer = lambda x: [stemmer.stem(i) for i in x.split(' ')])
    elif vectorizer_name == 'tf_idf_three_class':
        vectorizer = TfidfVectorizer(max_df=0.9, min_df=0.01, stop_words=stop_words, tokenizer = lambda x: [stemmer.stem(i) for i in x.split(' ')])
    elif vectorizer_name == '2_grams_three_class':
        vectorizer = CountVectorizer(max_df=0.9, min_df=0.01, stop_words=stop_words, tokenizer = lambda x: [stemmer.stem(i) for i in x.split(' ')], ngram_range=(1,2))
    elif vectorizer_name == '3_grams_three_class':
        vectorizer = CountVectorizer(max_df=0.9, min_df=0.01, stop_words=stop_words, tokenizer = lambda x: [stemmer.stem(i) for i in x.split(' ')], ngram_range=(1,3))
    
    VectorizerPipeline(
        vectorizer_name, vectorizer, X, y_popularity
    ).run_vectorizer_pipeline()



Train shape: (22743, 2152)             
Validation shape: (7581, 2152)             
Test shape: (7581, 2152)
Transformed train test split dumped at /home/jng/projects/brainstation_capstone/vectorizer_data/bag_of_words_three_class/data.pkl as a dictionary.




Train shape: (22743, 2143)             
Validation shape: (7581, 2143)             
Test shape: (7581, 2143)
Transformed train test split dumped at /home/jng/projects/brainstation_capstone/vectorizer_data/tf_idf_three_class/data.pkl as a dictionary.




Train shape: (22743, 2591)             
Validation shape: (7581, 2591)             
Test shape: (7581, 2591)
Transformed train test split dumped at /home/jng/projects/brainstation_capstone/vectorizer_data/2_grams_three_class/data.pkl as a dictionary.




Train shape: (22743, 2587)             
Validation shape: (7581, 2587)             
Test shape: (7581, 2587)
Transformed train test split dumped at /home/jng/projects/brainstation_capstone/vectorizer_data/3_grams_three_class/data.pkl as a dictionary.


# Combining all Vectorizers

We also try to combine all the vectorizers as another representation of the lyrics.

In [26]:
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

X = df.cleaned_lyrics

# Instantiate a list of tuples - each tuple is the name of the transform + the transformer
vectorizers = [
    ('count_vect', CountVectorizer(max_df=0.9, min_df=0.01, stop_words=stop_words, tokenizer = lambda x: [stemmer.stem(i) for i in x.split(' ')], ngram_range=(1,3))), 
    ('tfidf', TfidfVectorizer(max_df=0.9, min_df=0.01, stop_words=stop_words, tokenizer = lambda x: [stemmer.stem(i) for i in x.split(' ')]))]

# Create feature union
featunion = FeatureUnion(vectorizers)

VectorizerPipeline(
    'all_vectorizers_three_class', featunion, X, y_popularity
).run_vectorizer_pipeline()



Train shape: (22743, 4778)             
Validation shape: (7581, 4778)             
Test shape: (7581, 4778)
Transformed train test split dumped at /home/jng/projects/brainstation_capstone/vectorizer_data/all_vectorizers_three_class/data.pkl as a dictionary.


# Converting Into LexVec Word Embeddings

Here we provide the process of averaging LexVec Word Embeddings for each word in the lyrics to form a single document embedding. 

In [6]:
stop_words = text.ENGLISH_STOP_WORDS

In [7]:

import gensim

# Instantiate the LexVec Embeddings.
model = gensim.models.KeyedVectors.load_word2vec_format(
    DATA_PATH / 'lexvec-wikipedia-word-vectors', binary=False
)

def lyric2vec(lyric):
    """
    Embed a lyric by averaging the word vectors of the lyrics for each song. 
    Out-of-vocabulary words are replaced by a zero-vector.
    -----
    
    Input: lyric (string)
    Output: document embedding vector (np.array)
    """
    

    word_embeddings = [np.zeros(300)]
    for word in lyric:
        # If word is in stop words ignore it.
        if word in stop_words:
            continue
        # if the word is in the model then embed
        elif word in model:
            vector = model[word]
        # add zeros for out-of-vocab words
        else:
            vector = np.zeros(300)
            
        word_embeddings.append(vector)
    
    # average the word vectors
    sentence_embedding = np.stack(word_embeddings).mean(axis=0)
    sentence_embedding.reshape(1,300)
    
    return sentence_embedding


# Average the word embeddings in each lyric to get the document embedding. 
word_embedding_lyrics = [
    lyric2vec(lyric)
    for lyric in df['cleaned_lyrics']
]

final_lexvec = np.array(word_embedding_lyrics)

In [17]:
final_lexvec.shape

(37905, 300)

After we have all the embeddings we dump the embeddings using `joblib`.

In [18]:
import joblib

LEXVEC_PATH = utils.get_datapath('lexVec_data')

with open(
    LEXVEC_PATH / 'lexVec.pkl',
    'wb'
) as f:
    joblib.dump(final_lexvec, f)
    
    print(f"LexVec data dumped at {LEXVEC_PATH / 'lexVec.pkl'}")

# Open AI Embeddings

The final transformation of the lyrics to try is using an Open AI embedding model. 

To do this we need to first initialize our connection to the Open AI api. 

In [7]:
import tiktoken
import openai

# Reading the OpenAI api key. 
with open(DATA_PATH / 'open_ai_key.txt', 'r') as file:
    openai_api_key = file.readline()

openai.api_key = openai_api_key

# Setting the model parameters for the embeddings.
embedding_model = 'text-embedding-ada-002'
embedding_encoding = 'cl100k_base' # Tokenizer for the above ada embedding model. 
max_tokens = 8000 # This is the max token limit for the Ada Embedding.

df = pd.read_csv(DATA_PATH / 'clean_lyrics_and_spotify.csv')

In [23]:
df.shape

(35908, 14)

After intializing our connection to the Open AI api, we need to make sure all our lyrics stay below the 8000 token limit. 

In [24]:
encoding = tiktoken.get_encoding(embedding_encoding)

In [25]:
df['n_tokens'] = df['cleaned_lyrics'].apply(lambda x: len(encoding.encode(x)))

In [26]:
# Check to limit lyrics to those below the max token limit for the second generation ada embedding model.
df = df[
    df['n_tokens'] <= max_tokens
]

In [28]:
df.shape

(35901, 15)

Now that we have filtered out all the songs that are above the token limit we can proceed to access the api and get the document embeddings for our dataset. 

In [12]:
from tenacity import retry, stop_after_attempt, wait_exponential
# These packages are used to prevent us from reaching the rate limit when accessing the API. 

In [13]:
# This function creates an Ada embedding given text as input. 
# The decorator allows us to pause the number of requests that we are making to the api
# if we reach the rate limit. 
@retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=4, max=30))
def get_embedding(text, engine="text-embedding-ada-002"):
    response = openai.Embedding.create(input=[text], model=engine)
    embeddings = response['data'][0]['embedding']
    return embeddings

In [29]:
from tqdm import tqdm
tqdm.pandas()

In [30]:
df['ada_embeddings'] = df['cleaned_lyrics'].progress_apply(lambda x: get_embedding(x))

100%|██████████| 35901/35901 [2:07:10<00:00,  4.70it/s]  


In [35]:
df.to_csv(DATA_PATH / 'clean_lyrics_spotify_ada.csv')

# Conclusion

Now that we have all our representations we can go into some preliminary modeling. Note the Ada embeddings were not included in the preliminary modeling in `notebooks/3_prelim_modelling.ipynb`, as they were added after the preliminary modeling was already completed. 