In [2]:
import pandas as pd

# reading text-processed data
# df=pd.read_csv("https://raw.githubusercontent.com/Alex-Mak-MCW/SpotifyDataScienceProject/main/Data/NLP_processed_text.csv", encoding='utf-8')

# df=pd.read_csv("https://raw.githubusercontent.com/Alex-Mak-MCW/SpotifyDataScienceProject/main/Data/text_processed_data.csv", encoding='utf-8')

# read test dataset
# df=pd.read_csv("https://raw.githubusercontent.com/Alex-Mak-MCW/SpotifyDataScienceProject/main/Data/PROTO_text.csv", encoding='utf-8')

# reading data directly from textpreprocessing.ipynb
# df=pd.read_csv("https://raw.githubusercontent.com/Alex-Mak-MCW/SpotifyDataScienceProject/main/Sentiment_Analysis/Final_processed_text.csv", encoding='utf-8')

# Active data source: test_English.csv from the project's Data/Sentiment_Analysis folder.
df = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/Alex-Mak-MCW/SpotifyDataScienceProject/main/Data/Sentiment_Analysis/test_English.csv",
    encoding='Utf-8',
)

In [38]:
# verify dataset
print(df.shape)
df.head()

(1621, 21)


Unnamed: 0,track_name,artist,album,release_date,duration,popularity,explicit,lyrics,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,mood
0,Blinding Lights,The Weeknd,After Hours,2020-03-20,200040,90,0,yeah tryna call long enough mayb show love may...,0.514,0.73,...,-5.934,1.0,0.0598,0.00146,9.5e-05,0.0897,0.334,171.005,4.0,0
1,Shape of You,Ed Sheeran,÷ (Deluxe),2017-03-03,233712,86,0,club best place find lover bar go mm friend ta...,0.825,0.652,...,-3.183,0.0,0.0802,0.581,0.0,0.0931,0.931,95.977,4.0,1
2,Someone You Loved,Lewis Capaldi,Divinely Uninspired To A Hellish Extent,2019-05-17,182160,89,0,go time fear one save noth realli got way driv...,0.501,0.405,...,-5.679,1.0,0.0319,0.751,0.0,0.105,0.446,109.891,4.0,0
3,Sunflower - Spider-Man: Into the Spider-Verse,Post Malone,Hollywood's Bleeding,2019-09-06,157560,85,0,ayi ayi ayi ayi ooh ooh ooh ooh ooh ooh ayi ay...,0.755,0.522,...,-4.368,1.0,0.0575,0.533,0.0,0.0685,0.925,89.96,4.0,1
4,As It Was,Harry Styles,Harry's House,2022-05-20,167303,91,0,come harri want say goodnight holdin back grav...,0.52,0.731,...,-5.338,0.0,0.0557,0.342,0.00101,0.311,0.662,173.93,4.0,1


## Technique 1: 
### Count-vectorizer (CV) + Term Frequency-Inverse document Frequency (TF-IDF)

#### Count Vectorizer (CV)
* Convert text to array of token counts
* Create a vocabulary of all unique words, then count the frequency for each word.
* Helps to learn patterns associated with positive or negative sentiment based on word frequency.
##### Pros
* Easy, fast, basic
##### Cons
* Ignores word context, produces a large feature space, and weights common words too heavily.

#### Term Frequency-Inverse document Frequency (TF-IDF)
* Extends CV by considering the importance of a word across the entire corpus.
* Determines how unique or common a term is across all texts.
* Shifts weight away from common words and toward rare words.
##### Pros
* Balance word frequency and importance
##### Cons
* Still ignore word context, computationally expensive

In [39]:
# make copy of df

df_CV_TF_IDF=df.copy()

In [40]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import numpy as np

# Custom transformer that selects the 'lyrics' column from the input DataFrame
get_text_data = FunctionTransformer(lambda x: x['lyrics'], validate=False)

# Pipeline for text processing: select lyrics -> token counts -> TF-IDF weights.
# The FeatureUnion currently wraps a single branch ('tfidf').
text_pipeline = Pipeline([
    ('selector', get_text_data),
    ('features', FeatureUnion([
        ('tfidf', Pipeline([
            ('count', CountVectorizer()),
            ('tfidf', TfidfTransformer())
        ]))
    ]))
])

# First fit: learn the vocabulary, then read it back from the fitted 'count' step
text_features = text_pipeline.fit_transform(df_CV_TF_IDF)
feature_names = text_pipeline.named_steps['features'].transformer_list[0][1].named_steps['count'].get_feature_names_out()

# Vocabulary filtering hook: as written, NO terms are dropped (the full vocabulary is kept).
# The commented alternative below would drop the last 25 feature names.
clean_feature_names = feature_names
# clean_feature_names = feature_names[:-25]

# Create a new CountVectorizer restricted to the (currently unfiltered) vocabulary
clean_count_vectorizer = CountVectorizer(vocabulary=clean_feature_names)

# Swap the 'tfidf' branch of the FeatureUnion for one using the fixed-vocabulary CountVectorizer
text_pipeline.named_steps['features'].transformer_list[0] = ('tfidf', Pipeline([
    ('count', clean_count_vectorizer),
    ('tfidf', TfidfTransformer())
]))

# Second fit with the updated pipeline.
# NOTE(review): because clean_feature_names equals feature_names, this refit should
# reproduce the first result; it only matters once the vocabulary is actually filtered.
text_features = text_pipeline.fit_transform(df_CV_TF_IDF)

In [42]:
# Fetch the 'count' step from the pipeline (a CountVectorizer, despite the variable name);
# its vocabulary supplies the column labels for the TF-IDF matrix.
tfidf_vectorizer = text_pipeline.named_steps['features'].transformer_list[0][1].named_steps['count']
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

tfidf_feature_names

array(['aa', 'aaaah', 'aaaall', ..., 'zuli', 'zz', 'zzzzzzz'],
      dtype=object)

In [43]:
# Access CountVectorizer
count_vectorizer = text_pipeline.named_steps['features'].transformer_list[0][1].named_steps['count']
print(count_vectorizer.get_feature_names_out())

['aa' 'aaaah' 'aaaall' ... 'zuli' 'zz' 'zzzzzzz']


In [44]:
# Densify the TF-IDF matrix and label each column with its vocabulary term
text_features_df = pd.DataFrame(text_features.toarray(), columns=tfidf_feature_names)

# Give the song table a fresh 0..n-1 index, then left-merge the TF-IDF columns on the index.
# suffixes=('', '_text'): when a vocabulary term clashes with an existing column name,
# the original column keeps its name and the TF-IDF column is stored as '<term>_text'.
df_CV_TF_IDF = df_CV_TF_IDF.reset_index(drop=True).merge(
    text_features_df,
    how='left',
    left_index=True,
    right_index=True,
    suffixes=('', '_text'),
)

In [45]:
print(df_CV_TF_IDF.shape)
df_CV_TF_IDF.head()

(1621, 13104)


Unnamed: 0,track_name,artist,album,release_date,duration,popularity,explicit,lyrics,danceability,energy,...,zoo,zoom,zoomin,zoowap,ztrip,zucchini,zulema,zuli,zz,zzzzzzz
0,Blinding Lights,The Weeknd,After Hours,2020-03-20,200040,90,0,yeah tryna call long enough mayb show love may...,0.514,0.73,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Shape of You,Ed Sheeran,÷ (Deluxe),2017-03-03,233712,86,0,club best place find lover bar go mm friend ta...,0.825,0.652,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Someone You Loved,Lewis Capaldi,Divinely Uninspired To A Hellish Extent,2019-05-17,182160,89,0,go time fear one save noth realli got way driv...,0.501,0.405,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Sunflower - Spider-Man: Into the Spider-Verse,Post Malone,Hollywood's Bleeding,2019-09-06,157560,85,0,ayi ayi ayi ayi ooh ooh ooh ooh ooh ooh ayi ay...,0.755,0.522,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,As It Was,Harry Styles,Harry's House,2022-05-20,167303,91,0,come harri want say goodnight holdin back grav...,0.52,0.731,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
# export feature engineered dataset
df_CV_TF_IDF.to_csv('CV_and_TFIDF.csv', index=False)

## Technique 2: 
### Character Level N-Grams (Chosen N=3)

* Sequence of N items in a text (bigrams, trigrams etc)
##### Pros
* Capture word context and sequence, and capture common phrases that indicate sentiment
##### Cons
* Expensive with large N, prone to overfit

In [56]:
# Build a per-document count table of character-level n-grams
def generate_char_ngrams(text_series, n):
    """Count character n-grams of exactly length ``n`` in every document.

    Parameters
    ----------
    text_series : iterable of str
        The documents to vectorize (e.g. a pandas Series of lyrics).
    n : int
        N-gram length; used as both the lower and upper bound of ``ngram_range``.

    Returns
    -------
    pandas.DataFrame
        One row per document (in input order, with a default integer index) and
        one column per distinct n-gram found, holding the raw occurrence counts.
    """
    char_vectorizer = CountVectorizer(analyzer='char', ngram_range=(n, n))
    # Learn the n-gram vocabulary and count occurrences in a single pass
    counts = char_vectorizer.fit_transform(text_series)
    # Densify the sparse count matrix and label the columns with the n-grams
    return pd.DataFrame(
        counts.toarray(),
        columns=char_vectorizer.get_feature_names_out(),
    )

In [58]:
df_N_Grams=df.copy()

In [59]:
print(df_N_Grams.shape)
df_N_Grams.head()

(1621, 21)


Unnamed: 0,track_name,artist,album,release_date,duration,popularity,explicit,lyrics,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,mood
0,Blinding Lights,The Weeknd,After Hours,2020-03-20,200040,90,0,yeah tryna call long enough mayb show love may...,0.514,0.73,...,-5.934,1.0,0.0598,0.00146,9.5e-05,0.0897,0.334,171.005,4.0,0
1,Shape of You,Ed Sheeran,÷ (Deluxe),2017-03-03,233712,86,0,club best place find lover bar go mm friend ta...,0.825,0.652,...,-3.183,0.0,0.0802,0.581,0.0,0.0931,0.931,95.977,4.0,1
2,Someone You Loved,Lewis Capaldi,Divinely Uninspired To A Hellish Extent,2019-05-17,182160,89,0,go time fear one save noth realli got way driv...,0.501,0.405,...,-5.679,1.0,0.0319,0.751,0.0,0.105,0.446,109.891,4.0,0
3,Sunflower - Spider-Man: Into the Spider-Verse,Post Malone,Hollywood's Bleeding,2019-09-06,157560,85,0,ayi ayi ayi ayi ooh ooh ooh ooh ooh ooh ayi ay...,0.755,0.522,...,-4.368,1.0,0.0575,0.533,0.0,0.0685,0.925,89.96,4.0,1
4,As It Was,Harry Styles,Harry's House,2022-05-20,167303,91,0,come harri want say goodnight holdin back grav...,0.52,0.731,...,-5.338,0.0,0.0557,0.342,0.00101,0.311,0.662,173.93,4.0,1


#### 2.1 N=1 gram (Unigram)

In [60]:
# Generate 1-grams (Unigrams)

# call generate function
char_1grams_df = generate_char_ngrams(df_N_Grams['lyrics'], 1)
# Concatenate the n-grams DataFrame with the original DataFrame
df_with_1grams = pd.concat([df_N_Grams.reset_index(drop=True), char_1grams_df], axis=1)

In [61]:
print(df_with_1grams.shape)
df_with_1grams.head()

(1621, 48)


Unnamed: 0,track_name,artist,album,release_date,duration,popularity,explicit,lyrics,danceability,energy,...,q,r,s,t,u,v,w,x,y,z
0,Blinding Lights,The Weeknd,After Hours,2020-03-20,200040,90,0,yeah tryna call long enough mayb show love may...,0.514,0.73,...,0,18,22,30,18,5,7,0,16,0
1,Shape of You,Ed Sheeran,÷ (Deluxe),2017-03-03,233712,86,0,club best place find lover bar go mm friend ta...,0.825,0.652,...,0,48,59,66,23,44,35,2,15,2
2,Someone You Loved,Lewis Capaldi,Divinely Uninspired To A Hellish Extent,2019-05-17,182160,89,0,go time fear one save noth realli got way driv...,0.501,0.405,...,0,23,33,32,28,10,8,0,11,1
3,Sunflower - Spider-Man: Into the Spider-Verse,Post Malone,Hollywood's Bleeding,2019-09-06,157560,85,0,ayi ayi ayi ayi ooh ooh ooh ooh ooh ooh ayi ay...,0.755,0.522,...,2,29,48,40,31,9,26,0,20,0
4,As It Was,Harry Styles,Harry's House,2022-05-20,167303,91,0,come harri want say goodnight holdin back grav...,0.52,0.731,...,0,17,13,21,4,5,23,0,7,0


In [62]:
# export unigram feature engineered dataset
df_with_1grams.to_csv('1_Grams.csv', index=False)

#### 2.2 N=2 gram (Bigram)

In [63]:
# Generate character 2-grams (bigrams): one count column per distinct bigram
char_2grams_df = generate_char_ngrams(df_N_Grams['lyrics'], 2)
# Append the bigram counts to the song columns; the index is reset first so rows
# line up positionally with the 0..n-1 index of char_2grams_df
df_with_2grams = pd.concat([df_N_Grams.reset_index(drop=True), char_2grams_df], axis=1)


In [64]:
print(df_with_2grams.shape)
df_with_2grams.head()

(1621, 675)


Unnamed: 0,track_name,artist,album,release_date,duration,popularity,explicit,lyrics,danceability,energy,...,zl,zm,zn,zo,zt,zu,zv,zw,zy,zz
0,Blinding Lights,The Weeknd,After Hours,2020-03-20,200040,90,0,yeah tryna call long enough mayb show love may...,0.514,0.73,...,0,0,0,0,0,0,0,0,0,0
1,Shape of You,Ed Sheeran,÷ (Deluxe),2017-03-03,233712,86,0,club best place find lover bar go mm friend ta...,0.825,0.652,...,0,0,0,0,0,0,0,0,0,0
2,Someone You Loved,Lewis Capaldi,Divinely Uninspired To A Hellish Extent,2019-05-17,182160,89,0,go time fear one save noth realli got way driv...,0.501,0.405,...,0,0,0,0,0,0,0,0,0,0
3,Sunflower - Spider-Man: Into the Spider-Verse,Post Malone,Hollywood's Bleeding,2019-09-06,157560,85,0,ayi ayi ayi ayi ooh ooh ooh ooh ooh ooh ayi ay...,0.755,0.522,...,0,0,0,0,0,0,0,0,0,0
4,As It Was,Harry Styles,Harry's House,2022-05-20,167303,91,0,come harri want say goodnight holdin back grav...,0.52,0.731,...,0,0,0,0,0,0,0,0,0,0


In [65]:
# export bigram feature engineered dataset
df_with_2grams.to_csv('2_Grams.csv', index=False)

#### 2.3 N=3 gram (Trigram)

In [66]:
# Generate character 3-grams (trigrams): one count column per distinct trigram
char_3grams_df = generate_char_ngrams(df_N_Grams['lyrics'], 3)

# print(char_3grams_df)

# Append the trigram counts to the song columns. The index is reset first so the rows
# line up positionally with the 0..n-1 index of char_3grams_df.
# NOTE(review): pd.concat does not rename clashing labels, so a trigram that equals an
# existing column name would produce duplicate column labels in the result -- verify.
df_with_3grams = pd.concat([df_N_Grams.reset_index(drop=True), char_3grams_df], axis=1)

In [67]:
print(df_with_3grams.shape)
df_with_3grams.head()

(1621, 6835)


Unnamed: 0,track_name,artist,album,release_date,duration,popularity,explicit,lyrics,danceability,energy,...,zyf,zyn,zz,zza,zzc,zze,zzi,zzl,zzo,zzz
0,Blinding Lights,The Weeknd,After Hours,2020-03-20,200040,90,0,yeah tryna call long enough mayb show love may...,0.514,0.73,...,0,0,0,0,0,0,0,0,0,0
1,Shape of You,Ed Sheeran,÷ (Deluxe),2017-03-03,233712,86,0,club best place find lover bar go mm friend ta...,0.825,0.652,...,0,0,0,0,0,0,0,0,0,0
2,Someone You Loved,Lewis Capaldi,Divinely Uninspired To A Hellish Extent,2019-05-17,182160,89,0,go time fear one save noth realli got way driv...,0.501,0.405,...,0,0,0,0,0,0,0,0,0,0
3,Sunflower - Spider-Man: Into the Spider-Verse,Post Malone,Hollywood's Bleeding,2019-09-06,157560,85,0,ayi ayi ayi ayi ooh ooh ooh ooh ooh ooh ayi ay...,0.755,0.522,...,0,0,0,0,0,0,0,0,0,0
4,As It Was,Harry Styles,Harry's House,2022-05-20,167303,91,0,come harri want say goodnight holdin back grav...,0.52,0.731,...,0,0,0,0,0,0,0,0,0,0


In [68]:
# export feature engineered dataset
df_with_3grams.to_csv('3_Grams.csv', index=False)

## Technique 3: 
### Word Embeddings

* Represent words in a continuous vector space to capture their semantic and syntactic meaning, so that words with similar meanings are placed close together in the vector space

#### Contextual Relationships
* Train word embedding on large corpora where each word's context is considered
* Words with similar context -> similar vectors 
* Application: Word2Vec, FastText, GloVe (mention later)


#### Word2Vec uses...
* Continuous Bag of Words (CBOW): predicts a target word from its surrounding context words
* Skip-gram: predicts context words from a target word


In [17]:
df_Embeddings=df.copy()

#### 3.1 FastText (currently only on google Colab)
* Extension of Word2Vec by Facebook
* Considers subword information (N-grams)
* Effective for morphologically rich languages and rare words.

##### Pros
* Capture subword info, better handle rare words
* Generates meaningful embeddings even for misspelled or morphologically rich words

##### Cons
* May not capture deep semantic context
* Pre-trained models may not fit domain-specific vocabulary without fine-tuning.

In [None]:
# Working copy for the FastText section. Named df_fast_text because that is the name
# the FastText training and export cells below read and write.
df_fast_text = df_Embeddings.copy()

In [None]:
from gensim.models import FastText

# Assuming df_fast_text is a pandas DataFrame containing the lyrics
# Tokenize text column by splitting on whitespace
df_fast_text['tokenized_lyrics'] = df_fast_text['lyrics'].apply(lambda x: x.split())

# Prepare corpus: one list of tokens per song
corpus = df_fast_text['tokenized_lyrics'].tolist()

# Initialize and train FastText model (100-dimensional vectors, context window of 5,
# min_count=1 so no token is dropped for being rare)
fasttext_model = FastText(vector_size=100, window=5, min_count=1)
fasttext_model.build_vocab(corpus_iterable=corpus)
fasttext_model.train(corpus_iterable=corpus, total_examples=len(corpus), epochs=10)

# Example: Get vector for a word
word_vector = fasttext_model.wv['document']

# Example: Get vector for a sentence (average of word vectors)
def get_sentence_vector(sentence):
    """Return the mean FastText vector of the whitespace-separated words in ``sentence``.

    Parameters
    ----------
    sentence : str
        Text to embed; it is split on whitespace.

    Returns
    -------
    numpy.ndarray
        The element-wise average of the word vectors. When no word yields a vector
        (e.g. an empty string) a zero vector of the model's dimensionality is returned,
        so every row of the resulting column has the same type and length.
    """
    words = sentence.split()
    # NOTE(review): with subword n-grams enabled the membership test is presumably
    # always true for FastText vectors -- confirm against the gensim version in use.
    word_vectors = [fasttext_model.wv[word] for word in words if word in fasttext_model.wv]
    if not word_vectors:
        # Previously returned [] here, which mixed lists and arrays in one column
        return np.zeros(fasttext_model.wv.vector_size, dtype=np.float32)
    return sum(word_vectors) / len(word_vectors)

# Embed every song's lyrics
df_fast_text['fasttext_vector'] = df_fast_text['lyrics'].apply(get_sentence_vector)

# Print the resulting vectors
print(df_fast_text['fasttext_vector'])

In [None]:
df_fast_text.to_csv("FastText.csv", index=False)

#### 3.2 GloVe (Global Vectors for Word Representation)
##### (currently not available on local machine, can try Colab)

* Build word vectors by aggregate global word-word co-occurrence statistics from a corpus.
* Provides fixed-size dense vectors that capture semantic relationships between words. 
* Convert words into meaningful numerical representations that reflect their sentiment-related properties.

##### Pros
* Pre-trained on large corpora, offering good generalization.
* Computationally efficient for downstream tasks.

##### Cons
* Static embeddings (same vector for a word regardless of context).
* May not handle polysemy (words with multiple meanings) well.
* Computationally expensive to train from scratch.

In [None]:
# To be added


#### 3.3 BERT (Bidirectional Encoder Representations from Transformers):

##### Uses Contextual Embeddings
* Represent words in a way that depends on the context they appear.
* Provide different representation for words based on their surrounding words
* Capture dynamic meaning of words

##### What it does...

* Built on the transformer architecture
* Bidirectional approach (looks at context to the left and right) to learn the representation of each word.

##### Pros
* Generates context-aware embeddings, improving understanding of word meaning in context.
* State-of-the-art performance in sentiment analysis and other NLP tasks.
* Fine-tunable for specific tasks, enhancing adaptability.

##### Cons
* Computationally expensive to train and apply.
* Requires large amounts of memory and processing power.
* Pre-trained models might need substantial fine-tuning for specific domains.

In [8]:
df_BERT=df.copy()

In [9]:
print(df_BERT.shape)
df_BERT.head()

(1621, 21)


Unnamed: 0,track_name,artist,album,release_date,duration,popularity,explicit,lyrics,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,mood
0,Blinding Lights,The Weeknd,After Hours,2020-03-20,200040,90,0,yeah tryna call long enough mayb show love may...,0.514,0.73,...,-5.934,1.0,0.0598,0.00146,9.5e-05,0.0897,0.334,171.005,4.0,0
1,Shape of You,Ed Sheeran,÷ (Deluxe),2017-03-03,233712,86,0,club best place find lover bar go mm friend ta...,0.825,0.652,...,-3.183,0.0,0.0802,0.581,0.0,0.0931,0.931,95.977,4.0,1
2,Someone You Loved,Lewis Capaldi,Divinely Uninspired To A Hellish Extent,2019-05-17,182160,89,0,go time fear one save noth realli got way driv...,0.501,0.405,...,-5.679,1.0,0.0319,0.751,0.0,0.105,0.446,109.891,4.0,0
3,Sunflower - Spider-Man: Into the Spider-Verse,Post Malone,Hollywood's Bleeding,2019-09-06,157560,85,0,ayi ayi ayi ayi ooh ooh ooh ooh ooh ooh ayi ay...,0.755,0.522,...,-4.368,1.0,0.0575,0.533,0.0,0.0685,0.925,89.96,4.0,1
4,As It Was,Harry Styles,Harry's House,2022-05-20,167303,91,0,come harri want say goodnight holdin back grav...,0.52,0.731,...,-5.338,0.0,0.0557,0.342,0.00101,0.311,0.662,173.93,4.0,1


In [10]:
from transformers import BertTokenizer, BertModel
import torch

# Load the pre-trained 'bert-base-uncased' tokenizer and encoder
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Sentence embedding = mean of the final-layer token embeddings
def get_bert_embedding(sentence):
    """Embed ``sentence`` with BERT and return one fixed-size NumPy vector.

    The text is tokenized (with truncation enabled), passed through the model
    without gradient tracking, and the last hidden state is averaged over the
    token axis (dim=1).

    NOTE(review): truncation=True presumably caps the input at the tokenizer's
    maximum length, so long lyrics would be only partially embedded -- confirm.
    """
    encoded = tokenizer(sentence, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    # Average over tokens, drop the singleton batch axis, and convert to NumPy
    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().numpy()

df_BERT['bert_vector'] = df_BERT['lyrics'].apply(get_bert_embedding)
print(df_BERT['bert_vector'])


0       [-0.051606975, 0.059196685, 0.8179265, -0.1613...
1       [-0.25514305, -0.26647133, 0.80487204, -0.1589...
2       [-0.20082025, -0.05428033, 0.6784714, 0.129567...
3       [-0.1918617, 0.11497808, 0.6076808, -0.1166371...
4       [-0.05787892, 0.21525346, 1.042814, -0.1600381...
                              ...                        
1616    [-0.16968827, 0.0020094733, 0.61205995, -0.145...
1617    [0.04698473, 0.037607603, 0.89777786, -0.12191...
1618    [-0.45085692, 0.19648445, 1.0196003, -0.113810...
1619    [-0.3800811, -0.15651153, 0.7316858, -0.078895...
1620    [-0.24994217, 0.067083925, 0.606919, -0.025780...
Name: bert_vector, Length: 1621, dtype: object


In [11]:
# export BERT dataset
df_BERT.to_csv("BERT_Data.csv", index=False)

## Technique 4: 
### Sentiment Intensity Analyzer (VADER)

#### Valence Aware Dictionary and Sentiment Reasoner

* Lexicon & rule-based sentiment analysis tool.
* Provide sentiment score (Positive, negative, neutral, compound) based on intensity of words & their context.
* Can be used alone or with other feature engineering tools for NLP tasks.

##### Pros
* Designed for sentiment analysis, especially in social media contexts.
* Easy to use, integrate, and interpret.
* Effective for short and informal texts.

##### Cons
* Rule-based approach may not generalize well to all types of text.
* Limited in handling complex sentences and sarcasm.
* Performance may degrade on domain-specific or longer texts without additional tuning.


In [3]:
df_VADER=df.copy()

In [4]:
print(df_VADER.shape)
df_VADER.head()

(1621, 21)


Unnamed: 0,track_name,artist,album,release_date,duration,popularity,explicit,lyrics,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,mood
0,Blinding Lights,The Weeknd,After Hours,2020-03-20,200040,90,0,yeah tryna call long enough mayb show love may...,0.514,0.73,...,-5.934,1.0,0.0598,0.00146,9.5e-05,0.0897,0.334,171.005,4.0,0
1,Shape of You,Ed Sheeran,÷ (Deluxe),2017-03-03,233712,86,0,club best place find lover bar go mm friend ta...,0.825,0.652,...,-3.183,0.0,0.0802,0.581,0.0,0.0931,0.931,95.977,4.0,1
2,Someone You Loved,Lewis Capaldi,Divinely Uninspired To A Hellish Extent,2019-05-17,182160,89,0,go time fear one save noth realli got way driv...,0.501,0.405,...,-5.679,1.0,0.0319,0.751,0.0,0.105,0.446,109.891,4.0,0
3,Sunflower - Spider-Man: Into the Spider-Verse,Post Malone,Hollywood's Bleeding,2019-09-06,157560,85,0,ayi ayi ayi ayi ooh ooh ooh ooh ooh ooh ayi ay...,0.755,0.522,...,-4.368,1.0,0.0575,0.533,0.0,0.0685,0.925,89.96,4.0,1
4,As It Was,Harry Styles,Harry's House,2022-05-20,167303,91,0,come harri want say goodnight holdin back grav...,0.52,0.731,...,-5.338,0.0,0.0557,0.342,0.00101,0.311,0.662,173.93,4.0,1


In [5]:
# VADER

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Initialize VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Score a single text with VADER
def get_vader_sentiment(text):
    """Return VADER's polarity scores for ``text`` as a pandas Series.

    The Series is built directly from the dict returned by
    ``analyzer.polarity_scores``, so it carries one entry per score.
    """
    return pd.Series(analyzer.polarity_scores(text))

# Score every lyric; applying a Series-returning function yields one column per score.
# NOTE(review): the four target names are matched to those columns by position, so this
# relies on VADER returning negative, neutral, positive, compound in that order -- confirm.
vader_scores = df_VADER['lyrics'].apply(get_vader_sentiment)
df_VADER[['negative', 'neutral', 'positive', 'compound']] = vader_scores

In [6]:
print(df_VADER.shape)
df_VADER.head()

(1621, 25)


Unnamed: 0,track_name,artist,album,release_date,duration,popularity,explicit,lyrics,danceability,energy,...,instrumentalness,liveness,valence,tempo,time_signature,mood,negative,neutral,positive,compound
0,Blinding Lights,The Weeknd,After Hours,2020-03-20,200040,90,0,yeah tryna call long enough mayb show love may...,0.514,0.73,...,9.5e-05,0.0897,0.334,171.005,4.0,0,0.141,0.726,0.133,-0.3182
1,Shape of You,Ed Sheeran,÷ (Deluxe),2017-03-03,233712,86,0,club best place find lover bar go mm friend ta...,0.825,0.652,...,0.0,0.0931,0.931,95.977,4.0,1,0.009,0.574,0.417,0.9996
2,Someone You Loved,Lewis Capaldi,Divinely Uninspired To A Hellish Extent,2019-05-17,182160,89,0,go time fear one save noth realli got way driv...,0.501,0.405,...,0.0,0.105,0.446,109.891,4.0,0,0.135,0.58,0.285,0.9852
3,Sunflower - Spider-Man: Into the Spider-Verse,Post Malone,Hollywood's Bleeding,2019-09-06,157560,85,0,ayi ayi ayi ayi ooh ooh ooh ooh ooh ooh ayi ay...,0.755,0.522,...,0.0,0.0685,0.925,89.96,4.0,1,0.208,0.648,0.144,-0.8979
4,As It Was,Harry Styles,Harry's House,2022-05-20,167303,91,0,come harri want say goodnight holdin back grav...,0.52,0.731,...,0.00101,0.311,0.662,173.93,4.0,1,0.0,0.753,0.247,0.9538


In [7]:
# Export the full VADER-scored dataset. The previous .head() call wrote only the first
# five rows, unlike every other export in this notebook.
df_VADER.to_csv("VADER.csv", index=False)

## Technique 5: 
### LDA (topic modeling)
* Identifies topics in a collection of documents by observing the distribution of words.
* Discover underlying themes or topics within text data, then use those themes/ topics as additional features.
* Those themes/ topics might be correlated to sentiments.

##### Pros
* Unsupervised method to discover hidden topics in text.
* Provide insights into the thematic structure of a corpus.
* Useful for exploring and summarizing large text datasets.

##### Cons
* Assumes a fixed number of topics, hence requiring domain knowledge or trial-and-error to set.
* Topics may not always correspond to clear or coherent themes.
* Computationally intensive especially for large text.
* Not specifically tailored for sentiment analysis, requiring additional steps to link topics to sentiment.

In [31]:
df_LDA=df.copy()

In [32]:
print(df_LDA.shape)
df_LDA.head()

(1621, 21)


Unnamed: 0,track_name,artist,album,release_date,duration,popularity,explicit,lyrics,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,mood
0,Blinding Lights,The Weeknd,After Hours,2020-03-20,200040,90,0,yeah tryna call long enough mayb show love may...,0.514,0.73,...,-5.934,1.0,0.0598,0.00146,9.5e-05,0.0897,0.334,171.005,4.0,0
1,Shape of You,Ed Sheeran,÷ (Deluxe),2017-03-03,233712,86,0,club best place find lover bar go mm friend ta...,0.825,0.652,...,-3.183,0.0,0.0802,0.581,0.0,0.0931,0.931,95.977,4.0,1
2,Someone You Loved,Lewis Capaldi,Divinely Uninspired To A Hellish Extent,2019-05-17,182160,89,0,go time fear one save noth realli got way driv...,0.501,0.405,...,-5.679,1.0,0.0319,0.751,0.0,0.105,0.446,109.891,4.0,0
3,Sunflower - Spider-Man: Into the Spider-Verse,Post Malone,Hollywood's Bleeding,2019-09-06,157560,85,0,ayi ayi ayi ayi ooh ooh ooh ooh ooh ooh ayi ay...,0.755,0.522,...,-4.368,1.0,0.0575,0.533,0.0,0.0685,0.925,89.96,4.0,1
4,As It Was,Harry Styles,Harry's House,2022-05-20,167303,91,0,come harri want say goodnight holdin back grav...,0.52,0.731,...,-5.338,0.0,0.0557,0.342,0.00101,0.311,0.662,173.93,4.0,1


In [33]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import gensim
import gensim.corpora as corpora
from gensim.models.ldamodel import LdaModel

# Download the NLTK tokenizer model and stop-word list
nltk.download('punkt')
nltk.download('stopwords')

# Load the English stop words once, as a set: calling stopwords.words('english') for
# every token (as before) re-reads the list each time and does a linear membership scan.
english_stopwords = set(stopwords.words('english'))

# Preprocessing function
def preprocess(text):
    """Lower-case and tokenize ``text``, keeping alphanumeric, non-stop-word tokens.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Tokens that pass ``str.isalnum`` and are not English stop words.
    """
    tokens = word_tokenize(text.lower())
    return [word for word in tokens if word.isalnum() and word not in english_stopwords]

# Apply preprocessing to the text column
df_LDA['processed_lyrics'] = df_LDA['lyrics'].apply(preprocess)

# Create Dictionary (token -> integer id)
id2word = corpora.Dictionary(df_LDA['processed_lyrics'])

# Create Corpus: Term Document Frequency (bag-of-words per document)
corpus = [id2word.doc2bow(text) for text in df_LDA['processed_lyrics']]

# Build LDA model with 10 topics; random_state is fixed for repeatable results
lda_model = LdaModel(corpus=corpus, id2word=id2word, num_topics=10, random_state=42, update_every=1, chunksize=10, passes=25, alpha='auto', per_word_topics=True)

# Print the topics
topics = lda_model.print_topics()
for topic in topics:
    print(topic)

# Assign topics to each document
def assign_topic(doc):
    """Return the id of the most probable topic for a tokenized document.

    Parameters
    ----------
    doc : list of str
        Preprocessed tokens of one document.

    Returns
    -------
    int
        Topic id with the highest probability; on ties the first such topic in the
        model's output order wins (same result as the previous stable sort).
    """
    bow = id2word.doc2bow(doc)
    # minimum_probability=0.0 so that every topic is reported, never an empty list
    doc_topics = lda_model.get_document_topics(bow, minimum_probability=0.0)
    best_topic_id, _ = max(doc_topics, key=lambda pair: pair[1])
    return best_topic_id

df_LDA['topic'] = df_LDA['processed_lyrics'].apply(assign_topic)

[nltk_data] Downloading package punkt to /Users/alexmak/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alexmak/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


(0, '0.107*"ooh" + 0.059*"danc" + 0.046*"life" + 0.028*"cri" + 0.024*"whole" + 0.024*"done" + 0.022*"move" + 0.022*"lord" + 0.020*"god" + 0.020*"water"')
(1, '0.052*"know" + 0.034*"say" + 0.030*"feel" + 0.025*"ey" + 0.020*"time" + 0.020*"think" + 0.020*"one" + 0.019*"would" + 0.018*"like" + 0.018*"could"')
(2, '0.219*"hey" + 0.063*"heaven" + 0.036*"sick" + 0.035*"forc" + 0.023*"wave" + 0.020*"silver" + 0.015*"dollar" + 0.015*"fresh" + 0.014*"lot" + 0.014*"rhythm"')
(3, '0.102*"long" + 0.035*"cloth" + 0.030*"gaga" + 0.024*"moonlight" + 0.023*"hoo" + 0.020*"reckless" + 0.018*"cheap" + 0.015*"kitchen" + 0.014*"nanana" + 0.011*"poison"')
(4, '0.143*"okay" + 0.078*"somebodi" + 0.060*"fire" + 0.023*"stori" + 0.022*"town" + 0.020*"messag" + 0.020*"beneath" + 0.020*"somewher" + 0.019*"blow" + 0.017*"flower"')
(5, '0.079*"yeah" + 0.076*"want" + 0.060*"got" + 0.049*"go" + 0.035*"babi" + 0.032*"like" + 0.029*"girl" + 0.025*"get" + 0.015*"uh" + 0.014*"know"')
(6, '0.161*"love" + 0.062*"let" + 0.05

In [34]:
print(df_LDA[['lyrics', 'topic']])

                                                 lyrics  topic
0     yeah tryna call long enough mayb show love may...      8
1     club best place find lover bar go mm friend ta...      8
2     go time fear one save noth realli got way driv...      1
3     ayi ayi ayi ayi ooh ooh ooh ooh ooh ooh ayi ay...      1
4     come harri want say goodnight holdin back grav...      1
...                                                 ...    ...
1616  call phone today ask speak normal somehow stil...      8
1617  complic alway way goe feel like wait long wond...      5
1618  realli rope time fight life never said goodby ...      1
1619  hard part alway seem last forev sometim forget...      1
1620  johanna drove slowli citi hudson river fill sn...      8

[1621 rows x 2 columns]


In [35]:
print(df_LDA.shape)
df_LDA.head()

(1621, 23)


Unnamed: 0,track_name,artist,album,release_date,duration,popularity,explicit,lyrics,danceability,energy,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,mood,processed_lyrics,topic
0,Blinding Lights,The Weeknd,After Hours,2020-03-20,200040,90,0,yeah tryna call long enough mayb show love may...,0.514,0.73,...,0.0598,0.00146,9.5e-05,0.0897,0.334,171.005,4.0,0,"[yeah, tryna, call, long, enough, mayb, show, ...",8
1,Shape of You,Ed Sheeran,÷ (Deluxe),2017-03-03,233712,86,0,club best place find lover bar go mm friend ta...,0.825,0.652,...,0.0802,0.581,0.0,0.0931,0.931,95.977,4.0,1,"[club, best, place, find, lover, bar, go, mm, ...",8
2,Someone You Loved,Lewis Capaldi,Divinely Uninspired To A Hellish Extent,2019-05-17,182160,89,0,go time fear one save noth realli got way driv...,0.501,0.405,...,0.0319,0.751,0.0,0.105,0.446,109.891,4.0,0,"[go, time, fear, one, save, noth, realli, got,...",1
3,Sunflower - Spider-Man: Into the Spider-Verse,Post Malone,Hollywood's Bleeding,2019-09-06,157560,85,0,ayi ayi ayi ayi ooh ooh ooh ooh ooh ooh ayi ay...,0.755,0.522,...,0.0575,0.533,0.0,0.0685,0.925,89.96,4.0,1,"[ayi, ayi, ayi, ayi, ooh, ooh, ooh, ooh, ooh, ...",1
4,As It Was,Harry Styles,Harry's House,2022-05-20,167303,91,0,come harri want say goodnight holdin back grav...,0.52,0.731,...,0.0557,0.342,0.00101,0.311,0.662,173.93,4.0,1,"[come, harri, want, say, goodnight, holdin, ba...",1


In [36]:
df_LDA.to_csv("LDA.csv", index=False)