# Feature Engineering / NLP

In [74]:
import re
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
_
# NLP
import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk import pos_tag
from unicodedata import normalize
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
_
# LLM
from openai import OpenAI
from ast import literal_eval
_
# Web scraping
from bs4 import BeautifulSoup
import requests
_
# Misc
import sys
sys.path.append('/Users/********')
import cred

In [75]:
df = pd.read_csv('raw_data/lyrics_raw.csv')
_
df.head()

Unnamed: 0,track_name,artist,duration_ms,is_explicit,track_uri,track_bpm,raw_lyrics,artist_verses
0,You Broke My Heart,Drake,230709,False,spotify:track:2FBo7mg3It3YOefMhBKk60,123.872,"Yeah Yeah Oh, baby, don't go Don't go Look Loo...","Yeah\nYeah\nOh, baby, don't go\nDon't go\nLook..."
1,First Person Shooter (feat. J. Cole),Drake,247444,False,spotify:track:6ltWTSzsV2TaSsOhoA8eBC,164.005,"[Intro: J. Cole & Adonis] (Pew, pew-pew) Firs...",Big as the what? (Ah)\nBig as the what? (Mm)\n...
2,MELTDOWN (feat. Drake),Drake,246133,True,spotify:track:67nepsnrcZkowTxMWigSbb,111.975,[Intro: Drake] Yeah Tensions is definitely ri...,Yeah\nTensions is definitely rising\nT'd up ri...
3,Slime You Out (feat. SZA),Drake,310490,True,spotify:track:3RaCGXCiiMufRPoexXxGkV,88.88,I don't know I don't know what's wrong with yo...,I don't know\nI don't know what's wrong with y...
4,On The Radar Freestyle,Drake,275280,True,spotify:track:1L0OmJV0LnPDWuHCIGmBV6,150.126,"Yeah On The Radar, 2023 6 G-O-D and my broski ...","Yeah\nOn The Radar, \n G-O-D and my broski \nC..."


Convert `duration_ms` to minutes:

In [76]:
df['duration_min'] = round(df['duration_ms'] / 60000)

Featured on track:

In [77]:
df['is_featured'] = df.apply(lambda x: x['artist'] in x['track_name'], axis=1)

Bar count:

In [78]:
# A line counts as a bar — if the line contains than 4 words
df['bar_count'] = df['artist_verses'].apply(
    lambda x: len([bar for bar in x.split('\n') if len(bar.split(' ')) > 4])
)

Word count and words per min:

In [79]:
df['word_count'] = df['artist_verses'].apply(lambda x: len(" ".join(x.split('\n')).split(' ')))
df['words_per_min'] = round(df['word_count'] / df['duration_min'], 1)

Stop words used and stop word count:

In [80]:
for index, row in tqdm(df.iterrows(), total=len(df)):
    lyrics = row['artist_verses']
    # Format text:
    lyrics = lyrics.lower()
    lyrics = lyrics.replace(',', '').replace('\n', ' ')
    lyrics = lyrics.split(' ')
    
    stop_word_count = 0
    
    for word in lyrics:
        if word in set(stopwords.words('english')):
            stop_word_count += 1

    df.loc[index, 'stop_word_count'] = stop_word_count

  0%|          | 0/530 [00:00<?, ?it/s]

In [81]:
df.sample(3)

Unnamed: 0,track_name,artist,duration_ms,is_explicit,track_uri,track_bpm,raw_lyrics,artist_verses,duration_min,is_featured,bar_count,word_count,words_per_min,stop_word_count
239,Drankin N Smokin,Future,213520,True,spotify:track:0k7wmahjkn389wAZdz19Cv,151.951,"Saggin' out the malls, we do the most, oh Had ...","Saggin' out the malls, we do the most, oh\nHad...",4.0,False,41,459,114.8,233.0
124,Woman Like Me (feat. Nicki Minaj),Nicki Minaj,229720,True,spotify:track:222B3aljXx6fFwKmFjuBED,75.002,I always say what I'm feeling I was born witho...,"Yo, woman like me, yes, a woman like me (Oh)\n...",4.0,True,24,233,58.2,85.0
4,On The Radar Freestyle,Drake,275280,True,spotify:track:1L0OmJV0LnPDWuHCIGmBV6,150.126,"Yeah On The Radar, 2023 6 G-O-D and my broski ...","Yeah\nOn The Radar, \n G-O-D and my broski \nC...",5.0,False,43,567,113.4,239.0


### Scraping Wikipedia

Get a list of all English swear words:

In [82]:
URL = 'https://en.wiktionary.org/wiki/Category:English_swear_words'

In [83]:
r = requests.get(URL)
soup = BeautifulSoup(r.content, 'html.parser')
swear_words = [
    word[:-1] if word[-1].isupper() else word \
    for word in soup.find_all('div', {'class': 'mw-category mw-category-columns'})[0].text.split('\n')[2:]
] + ['libs']

### NLP

Remove all symbols / special characters from `artist_verses` and lemmatize text:

In [84]:
def get_wordnet_pos(treebank_tag):
    """
    Converts part-of-speech tag into wordnet tag if able.
    """
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None


def lemmatize_text(text: str) -> str:
    """
    Tokenize, lemmatize, and remove stop words from text.
    """
    global swear_words
    
    lem = WordNetLemmatizer()
    stop_words = set(
        stopwords.words('english') + \
        ["n't", "v'e", "r'e", "'re", "'ve", "'", "'s", "'ll", "'m", "can't", "ain't", "i'm", "i'd", "ll"]
    )

    # # Remove stop words before lemmatization:
    text = " ".join(word for word in text.split(' ') if word not in stop_words)
    text = text.replace("'", '')
    text = re.sub("\s+", ' ', text)  # Remove additional whitespaces

    # Run spell corrections:
    text = " ".join(
        str(TextBlob(word).correct()) if word not in swear_words else word \
        for word in text.split(' ')
    )
    
    # Lemmatize and remove stop words:
    word_tags = pos_tag(word_tokenize(text))
    text = [
        lem.lemmatize(word[0], pos=get_wordnet_pos(word[1])) \
        if word[1].startswith(('J', 'V', 'N', 'R')) else word[0] for word in word_tags
    ]

    # Remove stop words after lemmatization:
    text = [word for word in text if word not in stop_words]
    text = " ".join(text)
    return text
    

def clean_text(text: str) -> str:
    """
    Remove punctuation, numbers, and special characters.
    """
    text = text.lower()

    text = re.sub("[^\w']+", ' ', text)  # Remove special symbols, excluding apostrophes
    text = re.sub("[0-9]", '', text)  # Removes numbers
    text = re.sub("\s+", ' ', text)  # Remove additional whitespaces
    return text

In [85]:
for index, row in tqdm(df.iterrows(), total=len(df)):
    lyrics = row['artist_verses']
    lyrics_cleaned = clean_text(lyrics)
    lyrics_lemmatized = lemmatize_text(lyrics_cleaned)
    
    df.loc[index, 'verses_cleaned'] = lyrics_cleaned
    df.loc[index, 'verses_lemmatized'] = lyrics_lemmatized

  0%|          | 0/530 [00:00<?, ?it/s]

### Sentiment analysis

Get positive / negative polarity scores for each track:

In [86]:
sia = SentimentIntensityAnalyzer()
_
df['pos_sentiment'] = df['verses_lemmatized'].apply(lambda x: sia.polarity_scores(x)['pos'])
df['neg_sentiment'] = df['verses_lemmatized'].apply(lambda x: sia.polarity_scores(x)['neg'])

### OpenAI API

Categorise lyrics using Chat-GPT-4:

In [87]:
def categorise_lyrics(text: str) -> str:
    """
    Use OpenAI to classfify lyrics into a specific category. 
    """
    client = OpenAI(api_key=cred.OPENAPI_KEY)
    
    system_message = """
    Your job is to use probability to categorize lyrics into one of these categories: Love, Violence, 
    Drugs, Competitive, Sex, Hopeful. 
    
    IMPORTANT: Only return the category with the highest probability.
    """

    parsed_output = client.chat.completions.create(
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": text}
        ],
        model="gpt-4",
    )

    return parsed_output.choices[0].message.content

In [89]:
for index, row in tqdm(df.iterrows(), total=len(df)):
    try:
        category = categorise_lyrics(row['artist_verses'])
        df.loc[index, 'topic'] = category
    except Exception as e:
        print(f"ERROR at index pos: {index} — {e}")

  0%|          | 0/530 [00:00<?, ?it/s]

ERROR at index pos: 338 — Error code: 429 - {'error': {'message': 'Request too large for gpt-4 in organization org-AIsSZrr9c0qDVeRXnBwtK8Bp on tokens per min (TPM): Limit 10000, Requested 18712. The input or output tokens must be reduced in order to run successfully. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
ERROR at index pos: 436 — Error code: 429 - {'error': {'message': 'Request too large for gpt-4 in organization org-AIsSZrr9c0qDVeRXnBwtK8Bp on tokens per min (TPM): Limit 10000, Requested 18903. The input or output tokens must be reduced in order to run successfully. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
ERROR at index pos: 486 — Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens. However, your messages resulted in 8428 tokens. Please reduce the length 

Drop any NaN values:

In [90]:
df = df.dropna()

### Compile lyric corpus

Gather each artist's vocabulary:

In [92]:
corpus_dict = {}

for artist in df['artist'].unique():
    artist_vocab = " ".join(df[df['artist'] == artist]['verses_cleaned'])
    corpus_dict[artist] = [artist_vocab]

# Convert to dataframe:
corpus_df = pd.DataFrame(corpus_dict).T.reset_index().rename(columns={'index': 'artist', 0: 'vocab'})
_
corpus_df

Unnamed: 0,artist,vocab
0,Drake,yeah yeah oh baby don't go don't go look look ...
1,Eminem,yeah it's been a minute this probably should h...
2,Nicki Minaj,she's alright that girl s alright with me yeah...
3,Nas,yeah yeah ayo black it's time word word it's t...
4,Future,gotta fire my joint up on this bitch young met...
5,Dave,take a look at these diamonds wrong it's a lif...
6,2Pac,pac changes feat talent pac dear mama pac do ...
7,Kendrick Lamar,love let's talk about love is it anything and ...
8,Rapsody,yeah reading sufi say time is money coining ru...
9,Skepta,who are you a carbon copy cause a fire hot hib...


### Save data

Reorder columns:

In [93]:
cols = [
    'track_name', 'artist', 'raw_lyrics', 'artist_verses', 'verses_cleaned', 'verses_lemmatized', 'topic',
    'pos_sentiment','neg_sentiment', 'word_count', 'words_per_min', 'bar_count', 'stop_word_count', 'is_featured', 
    'track_bpm', 'duration_min', 'is_explicit',
]

df = df[cols]
_
df.head(1)

Unnamed: 0,track_name,artist,raw_lyrics,artist_verses,verses_cleaned,verses_lemmatized,topic,pos_sentiment,neg_sentiment,word_count,words_per_min,bar_count,stop_word_count,is_featured,track_bpm,duration_min,is_explicit
0,You Broke My Heart,Drake,"Yeah Yeah Oh, baby, don't go Don't go Look Loo...","Yeah\nYeah\nOh, baby, don't go\nDon't go\nLook...",yeah yeah oh baby don't go don't go look look ...,yeah yeah oh baby go go look look note catch m...,Love,0.23,0.392,556,139.0,58,203.0,False,123.872,4.0,False


In [94]:
df.to_csv('processed_data/lyrics_processed.csv', index=False)
corpus.to_csv('processed_data/artist_corupus.csv', index=False)