### *Code Reference Note*
*This code snippet primarily draws from the core structure and pipeline design of Professor **Carly Bobak**'s Spotify Project. The sentiment analysis component references the NRC Emotion Lexicon, as detailed [here](https://github.com/Franck-Dernoncourt/NRC_Emotion_Lexicon).*

# Lyrics Analysis-NLP

## Merge Data

In [1]:
import pandas as pd
import os
import re
# Load the dataset
playlist_df = pd.read_csv('unique_playlist_info.csv')

# Display the first few rows of the dataframe to verify it's loaded correctly
playlist_df.head()

Unnamed: 0,id,album,name,artist,popularity,genre,danceability,energy,key,loudness,...,instrumentalness,liveness,valence,tempo,type,uri,track_href,analysis_url,duration_ms,time_signature
0,5dutmcNWUzaMvPMxK6hqfz,Night Visions (Expanded Edition / Super Deluxe),Demons,Imagine Dragons,44,"modern rock, pop, rock",0.488,0.706,3,-3.127,...,0.00035,0.297,0.367,90.009,audio_features,spotify:track:5dutmcNWUzaMvPMxK6hqfz,https://api.spotify.com/v1/tracks/5dutmcNWUzaM...,https://api.spotify.com/v1/audio-analysis/5dut...,2.95845,4
1,3JpTsmtxg3lEHmQMkVYW2y,yyyyyyyyyyyyyyyyyyy,yyyyyyyyyyyyyyyyyyy,Collectively Adrift,0,,0.532,0.151,9,-19.673,...,0.000189,0.097,0.701,132.85,audio_features,spotify:track:3JpTsmtxg3lEHmQMkVYW2y,https://api.spotify.com/v1/tracks/3JpTsmtxg3lE...,https://api.spotify.com/v1/audio-analysis/3JpT...,1.421067,5
2,37F0uwRSrdzkBiuj0D5UHI,Starboy,Reminder,The Weeknd,80,"canadian contemporary r&b, canadian pop, pop",0.705,0.505,8,-6.923,...,0.0,0.164,0.388,160.053,audio_features,spotify:track:37F0uwRSrdzkBiuj0D5UHI,https://api.spotify.com/v1/tracks/37F0uwRSrdzk...,https://api.spotify.com/v1/audio-analysis/37F0...,3.648,4
3,3pv7Q5v2dpdefwdWIvE7yH,1989 (Taylor's Version),Shake It Off (Taylor's Version),Taylor Swift,72,pop,0.632,0.805,7,-5.707,...,2.5e-05,0.156,0.903,160.052,audio_features,spotify:track:3pv7Q5v2dpdefwdWIvE7yH,https://api.spotify.com/v1/tracks/3pv7Q5v2dpde...,https://api.spotify.com/v1/audio-analysis/3pv7...,3.653483,4
4,0ApE8Ao0xAcvbSrIcRlMQz,SNAP PACK,SNAP,Rosa Linn,43,alt z,0.565,0.636,0,-8.198,...,1e-05,0.447,0.526,170.027,audio_features,spotify:track:0ApE8Ao0xAcvbSrIcRlMQz,https://api.spotify.com/v1/tracks/0ApE8Ao0xAcv...,https://api.spotify.com/v1/audio-analysis/0ApE...,2.992517,4


In [2]:
import string
# Add all lyrics texts' filenames to a dataframe
directory = 'lyrics'

# List all files in the directory
files = [f for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f))]

# Create a DataFrame with these filenames
nrc_df = pd.DataFrame(files, columns=['filename'])

# Display the DataFrame
# print(nrc_df)

# Function to tokenize and clean text
def tokenize(text):
    lower_text=text.lower()
    # Replace non-alphabet characters with a space
    cleaned_text = ''.join([char if char in string.ascii_lowercase else ' ' for char in lower_text])

    text_tokens = cleaned_text.split()
    
    return text_tokens

# Tokenize file names in nrc_df
nrc_df['filename_tokens'] = nrc_df['filename'].apply(lambda x: tokenize(x.replace('txt', '')))

# Tokenize artist and name columns in playlist_df
playlist_df['artist_tokens'] = playlist_df['artist'].apply(tokenize)
playlist_df['name_tokens'] = playlist_df['name'].apply(tokenize)
playlist_df['tokens']=playlist_df['artist_tokens']+playlist_df['name_tokens']

# Function to check if all tokens are present in another list of tokens
def tokens_in_filename(playlist_tokens, filename_tokens):
    if filename_tokens==[] or playlist_tokens==[]:
        return False
    else:
        return all(token in playlist_tokens for token in filename_tokens) 

In [3]:

merged_data = []
for index, row in playlist_df.iterrows():
    playlist_tokens=row['tokens']
    for idx, nrc_row in nrc_df.iterrows():
        if tokens_in_filename(playlist_tokens, nrc_row['filename_tokens']):
            merged_row = {**nrc_row.to_dict(), **row.to_dict()}
            merged_data.append(merged_row)


merged_df = pd.DataFrame(merged_data)


merged_df = merged_df.drop(columns=['artist_tokens', 'name_tokens','filename_tokens','tokens'])


merged_df.to_csv('merged_sentiment_playlist.csv', index=False)


merged_df.head()

Unnamed: 0,filename,id,album,name,artist,popularity,genre,danceability,energy,key,...,instrumentalness,liveness,valence,tempo,type,uri,track_href,analysis_url,duration_ms,time_signature
0,Imagine_Dragons_Demons.txt,5dutmcNWUzaMvPMxK6hqfz,Night Visions (Expanded Edition / Super Deluxe),Demons,Imagine Dragons,44,"modern rock, pop, rock",0.488,0.706,3,...,0.00035,0.297,0.367,90.009,audio_features,spotify:track:5dutmcNWUzaMvPMxK6hqfz,https://api.spotify.com/v1/tracks/5dutmcNWUzaM...,https://api.spotify.com/v1/audio-analysis/5dut...,2.95845,4
1,The_Weeknd_Reminder.txt,37F0uwRSrdzkBiuj0D5UHI,Starboy,Reminder,The Weeknd,80,"canadian contemporary r&b, canadian pop, pop",0.705,0.505,8,...,0.0,0.164,0.388,160.053,audio_features,spotify:track:37F0uwRSrdzkBiuj0D5UHI,https://api.spotify.com/v1/tracks/37F0uwRSrdzk...,https://api.spotify.com/v1/audio-analysis/37F0...,3.648,4
2,Taylor_Swift_Shake_It_Off_(Taylor's_Version).txt,3pv7Q5v2dpdefwdWIvE7yH,1989 (Taylor's Version),Shake It Off (Taylor's Version),Taylor Swift,72,pop,0.632,0.805,7,...,2.5e-05,0.156,0.903,160.052,audio_features,spotify:track:3pv7Q5v2dpdefwdWIvE7yH,https://api.spotify.com/v1/tracks/3pv7Q5v2dpde...,https://api.spotify.com/v1/audio-analysis/3pv7...,3.653483,4
3,Rosa_Linn_SNAP.txt,0ApE8Ao0xAcvbSrIcRlMQz,SNAP PACK,SNAP,Rosa Linn,43,alt z,0.565,0.636,0,...,1e-05,0.447,0.526,170.027,audio_features,spotify:track:0ApE8Ao0xAcvbSrIcRlMQz,https://api.spotify.com/v1/tracks/0ApE8Ao0xAcv...,https://api.spotify.com/v1/audio-analysis/0ApE...,2.992517,4
4,Ed_Sheeran_Shape_of_You.txt,7qiZfU4dY1lWllzX7mPBI3,÷ (Deluxe),Shape of You,Ed Sheeran,86,"pop, singer-songwriter pop, uk pop",0.825,0.652,1,...,0.0,0.0931,0.931,95.977,audio_features,spotify:track:7qiZfU4dY1lWllzX7mPBI3,https://api.spotify.com/v1/tracks/7qiZfU4dY1lW...,https://api.spotify.com/v1/audio-analysis/7qiZ...,3.895217,4


## NRC Sentiment Analysis

### **1.Categorize the songs**

In [4]:
# Simplize the genre data
merged_df['genre_simple'] = merged_df['genre'].apply(lambda x: re.split(r'[,\s]+', x) if isinstance(x, str) else [])

# Function to categorize lyrics by genre
def genre_lyric(genre):
    # Use DataFrame directly to filter the rows where the genre is in genre_simple
    genre_data = [row for _, row in merged_df.iterrows() if genre in row['genre_simple']]
    genre_df = pd.DataFrame(genre_data)
    
    # Select specific columns and add a final genre column
    genre_df = genre_df[['filename', 'name', 'artist']]
    genre_df['genre_final'] = genre
    
    return genre_df

#generate the 3 song lists accordingly
rb_df=genre_lyric("r&b")
pop_df=genre_lyric("pop")
rap_df=genre_lyric("rap")

### **2.Read in lyric files by Genre**

In [5]:
import os
import re

# Function to read in the text file
def read_text_files_with_names(df,folder_path):
    lyrics = []
    filenames=df['filename']
    for filename in filenames:
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            lyrics.append(file.read())
    return lyrics

#read in the lyris text file
folder_path = 'lyrics'
rb_raw_lyrics= read_text_files_with_names(rb_df,folder_path)
pop_raw_lyrics= read_text_files_with_names(pop_df,folder_path)
rap_raw_lyrics= read_text_files_with_names(rap_df,folder_path)

# Function to clean the text
def clean_text(text):
    text = ' '.join(text.split())
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    return text

# Function to clean and split the lyrics
def lyric_tokenize(raw_lyrics):
    print(f"Number of songs read: {len(raw_lyrics)}")
    cleaned_lyrics = [clean_text(lyric) for lyric in raw_lyrics]
    tokenized_lyrics=[]
    for lyric in cleaned_lyrics:
        tokenized_lyrics = [*tokenized_lyrics, *lyric.split()]
    return tokenized_lyrics

rb_token=lyric_tokenize(rb_raw_lyrics)
pop_token=lyric_tokenize(pop_raw_lyrics)
rap_token=lyric_tokenize(rap_raw_lyrics)

Number of songs read: 23
Number of songs read: 110
Number of songs read: 38


### **3.Remove Stop Words**

In [6]:
import nltk
from nltk.corpus import stopwords

# Download the list of stopwords
#nltk.download('stopwords')

# Get English stop words
all_stop_words = set(stopwords.words('english'))

# Function to remove stop words from a list of tokens
def remove_stop_words(tokens, stop_words_set):
    return [token for token in tokens if token not in stop_words_set]

# Apply the function to our tokenized lyrics
cleaned_rb_token = remove_stop_words(rb_token, all_stop_words)
cleaned_pop_token = remove_stop_words(pop_token, all_stop_words)
cleaned_rap_token = remove_stop_words(rap_token, all_stop_words)

# Check the number of tokens in the r&b songs after removing stop words
print(f"Number of tokens in the R&B songs after removing stop words: {len(cleaned_rb_token)}")
print(f"Number of tokens in the  songs after removing stop words: {len(cleaned_pop_token)}")
print(f"Number of tokens in the r&b songs after removing stop words: {len(cleaned_rap_token)}")

Number of tokens in the R&B songs after removing stop words: 5441
Number of tokens in the  songs after removing stop words: 28255
Number of tokens in the r&b songs after removing stop words: 20916


### **4.Sentiment Analysis**


In [7]:
# Load NRC lexicon
nrc_lexicon = pd.read_csv('NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt', sep='\t', header=None, names=['word', 'emotion', 'association'])
nrc_lexicon = nrc_lexicon.pivot(index='word', columns='emotion', values='association').fillna(0)

In [11]:
from collections import defaultdict

# Function to do NRC sentiment analysis
def nrc_full_spectrum_analysis(tokens):
    emotion_scores = defaultdict(int)
    token_emotions = defaultdict(list)
    
    for token in tokens:
        if token in nrc_lexicon.index:
            for emotion in nrc_lexicon.columns:
                score = nrc_lexicon.loc[token, emotion]
                if score > 0:
                    emotion_scores[emotion] += score
                    token_emotions[emotion].append(token)
    
    
    filtered_emotion_scores = {emotion: score for emotion, score in emotion_scores.items() if emotion not in ['positive', 'negative']}
    if filtered_emotion_scores:
        top_emotion = max(filtered_emotion_scores, key=filtered_emotion_scores.get)
    else:
        top_emotion = None  # Handle case where no valid emotions are detected
    
    return emotion_scores, top_emotion, token_emotions


# Apply NRC analysis to each cleaned token set
nrc_rb = nrc_full_spectrum_analysis(cleaned_rb_token)
nrc_pop = nrc_full_spectrum_analysis(cleaned_pop_token)
nrc_rap = nrc_full_spectrum_analysis(cleaned_rap_token)

# Function to generate datasets for each genre
def nrc_dataframe(nrc_result):
    emotion_scores, top_emotion, token_emotions = nrc_result
    top_emotion_words = ', '.join(set(token_emotions[top_emotion])) if top_emotion else ''
    data = {'top_emotion': [top_emotion], 'top_emotion_words': [top_emotion_words]}
    data.update({k: [v] for k, v in emotion_scores.items()})
    return pd.DataFrame(data)

# Create DataFrames
df_rb = nrc_dataframe(nrc_rb)
df_pop = nrc_dataframe(nrc_pop)
df_rap = nrc_dataframe(nrc_rap)
df_rb

Unnamed: 0,top_emotion,top_emotion_words,positive,negative,sadness,anticipation,joy,trust,surprise,anger,disgust,fear
0,joy,"erotic, comfort, art, romance, sweet, buss, pr...",484,377,236,302,317,239,129,197,167,187


### **Visulization: Word Cloud by Genre**
To analyze the lyrics of the most streamed songs we generate word cloud plot according to the genre label.
Here we choose 3 genres of "R&B","POP","RAP" and analyze the lyrics of a,b,c songs belonging to these genres respectively.

In [9]:
from wordcloud import WordCloud, get_single_color_func
import matplotlib.pyplot as plt

class SimpleGroupedColorFunc(object):
    """Create a color function object which assigns EXACT colors
       to certain words based on the position of the word."""

    def __init__(self, color_to_words, default_color):
        self.color_to_words = color_to_words
        self.default_color = default_color

    def __call__(self, word, **kwargs):
        return self.color_to_words.get(word, self.default_color)

# Define color schemes for each genre
colors_rb = ['#0C0F0A', '#003B46', '#07575B', '#66A5AD', '#C4DFE6']  # Cool blues and greens
colors_pop = ['#FF6F61', '#6B5B95', '#88B04B', '#F7CAC9', '#92A8D1'] # Vibrant and soft pastels
colors_rap = ['#FFD700', '#FF8C00', '#FFFFF0', '#FF6347', '#B22222'] # Bold and fiery tones

def generate_word_cloud(dataframe, genre, colors):
    # Join all the words in the top_emotion_words into a single string
    text = ' '.join(dataframe['top_emotion_words'].dropna().tolist())
    
    # Create a color function with single tone|
    color_func = get_single_color_func(colors[0])

    # Generate the word cloud
    wordcloud = WordCloud(width=800, height=400, background_color='white', color_func=color_func).generate(text)

    base_dir="C:/Users/86139/PycharmProjects/pythonProject1/wordclouds/"
    save_path=f"{base_dir}{genre}_wordcloud.png"
    
    # Display the word cloud using matplotlib
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f'Word Cloud for {genre} Genre')
    plt.axis("off")
    plt.savefig(save_path, format='png', bbox_inches='tight')  # Save as PNG with tight bounding box
    plt.close()

# Generate word clouds for each genre with specific colors
generate_word_cloud(df_rb, 'R&B', colors_rb)
generate_word_cloud(df_pop, 'Pop', colors_pop)
generate_word_cloud(df_rap, 'Rap', colors_rap)


### **Visulization: Pie Chart**

In [10]:
#!pip install seaborn
import seaborn as sns

# Combine the data into a single DataFrame for visualization
df_rb['genre'] = 'R&B'
df_pop['genre'] = 'Pop'
df_rap['genre'] = 'Rap'

# Concatenate into a single DataFrame
df_combined = pd.concat([df_rb, df_pop, df_rap], ignore_index=True)

# Aggregate data to get mean scores per genre for each emotion
emotion_cols = [col for col in df_combined.columns if col not in ['top_emotion', 'top_emotion_words', 'genre']]
df_emotion_avg = df_combined.groupby('genre')[emotion_cols].mean().reset_index()

# Define color schemes for each genre
color_schemes = {
    'R&B': ['#006769', '#40A578', '#9DDE8B', '#E6FF94','#D2FF72'],  # Cool blues and greens
    'Pop': ['#944E63', '#B47B84', '#CAA6A6', '#FFE7E7', '#D76C82'],  # Vibrant and soft pastels
    'Rap': ['#DC6B19','#C08B5C','#FFC94A', '#FFD95A', '#FFF7D4']   # Bold and fiery tones
}



# Create a function to plot a pie chart for a given genre with specific colors
def plot_pie_chart(data, genre, color_scheme,save_path):
    # Filter data for the selected genre
    genre_data = data[data['genre'] == genre].iloc[0]
    # Remove the 'genre' column to only have numerical values
    emotions = genre_data.drop('genre')
    
    # Generate pie chart using the specified color scheme
    fig, ax = plt.subplots()
    ax.pie(emotions, labels=emotions.index, autopct='%1.1f%%', startangle=90, colors=color_scheme)
    ax.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
    plt.title(f'Emotion Distribution for {genre} Genre')
    fig.savefig(save_path, format='png', bbox_inches='tight')  # Save as PNG with tight bounding box
    plt.close(fig)
    
base_dir ="C:/Users/86139/PycharmProjects/pythonProject1/piecharts/"

# Plot pie charts for each genre using the defined color schemes
genres = df_emotion_avg['genre']
for genre in genres:
    save_path=f"{base_dir}{genre}_pie_chart.png"
    plot_pie_chart(df_emotion_avg, genre, color_schemes[genre],save_path)
