Import Libraries

In [1]:
import pandas as pd
import re
import unicodedata

#Data Collection
Load the first CSV file containing years 2000-2024

In [2]:
try:
    df_spotify = pd.read_csv('billboard_24years_lyrics_spotify.csv', encoding='utf-8')
except UnicodeDecodeError:
    df_spotify = pd.read_csv('billboard_24years_lyrics_spotify.csv', encoding='latin1')

Load the second CSV file containing years 1964-2015

In [3]:
try:
    df_1964_2015 = pd.read_csv('billboard_lyrics_1964-2015.csv', encoding='utf-8')
except UnicodeDecodeError:
    df_1964_2015 = pd.read_csv('billboard_lyrics_1964-2015.csv', encoding='latin1')

#Data Preprocessing

Filter for years 2016 to 2024

In [4]:
df_spotify_filtered = df_spotify[df_spotify['year'].between(2016, 2024)].copy()

Remove duplicate records based on all columns

In [5]:
df_spotify_filtered = df_spotify_filtered.drop_duplicates()

Function to clean lyrics

In [6]:
def clean_lyrics_ascii(text):
    if pd.isna(text):  # Handle missing values
        return ''
    text = text.lower() # Convert to lowercase

    text = re.sub(r'[^\x00-\x7F]+', ' ', text) # Remove non-ASCII characters

    text = re.sub(r'\s*\'\s*', '', text) # Remove apostrophes and merge contractions

    text = re.sub(r'[^a-z0-9\s]', ' ', text) # Remove all punctuation except spaces, keep letters and numbers

    text = re.sub(r'\d+embed$', '', text) # Remove number followed by "embed" at the end

    text = re.sub(r'\s+', ' ', text).strip() # Replace multiple spaces with a single space and remove leading/trailing whitespace

    return text

df_spotify_filtered['lyrics_cleaned'] = df_spotify_filtered['lyrics'].apply(clean_lyrics_ascii)

Select and rename columns for consistency

In [7]:
df_spotify_cleaned = df_spotify_filtered[['song', 'band_singer', 'year', 'ranking', 'lyrics_cleaned']]

df_spotify_cleaned = df_spotify_cleaned.rename(columns={
    'band_singer': 'artist'
})

Rename columns to match the first dataset

In [8]:
df_1964_2015 = df_1964_2015.rename(columns={
    'Song': 'song',
    'Artist': 'artist',
    'Year': 'year',
    'Rank': 'ranking',
    'Lyrics': 'lyrics_cleaned'
})

Select only the required columns

In [9]:
df_1964_2015_cleaned = df_1964_2015[['song', 'artist', 'year', 'ranking', 'lyrics_cleaned']]

Combine the datasets

In [10]:
df_combined = pd.concat([df_1964_2015_cleaned, df_spotify_cleaned], ignore_index=True)

Remove duplicates across the combined dataset (based on all columns)

In [11]:
df_combined = df_combined.drop_duplicates()

Inspect the first few rows of the combined DataFrame

In [12]:
print("Combined Cleaned DataFrame (First 5 rows):")
print(df_combined.head())
print("\nOriginal vs Cleaned Lyrics (First 5 rows from 2016-2024 data):")
for i in range(min(5, len(df_spotify_filtered))):
    print(f"Song: {df_spotify_filtered['song'].iloc[i]} (Year: {df_spotify_filtered['year'].iloc[i]}, Rank: {df_spotify_filtered['ranking'].iloc[i]})")
    print(f"Original: {df_spotify_filtered['lyrics'].iloc[i]}")
    print(f"Cleaned: {df_spotify_filtered['lyrics_cleaned'].iloc[i]}")
    print("-" * 50)

Combined Cleaned DataFrame (First 5 rows):
                                       song                         artist  \
0                               wooly bully  sam the sham and the pharaohs   
1  i cant help myself sugar pie honey bunch                      four tops   
2                i cant get no satisfaction             the rolling stones   
3                       you were on my mind                        we five   
4              youve lost that lovin feelin         the righteous brothers   

   year  ranking                                     lyrics_cleaned  
0  1965        1  sam the sham miscellaneous wooly bully wooly b...  
1  1965        2   sugar pie honey bunch you know that i love yo...  
2  1965        3                                                     
3  1965        4   when i woke up this morning you were on my mi...  
4  1965        5   you never close your eyes anymore when i kiss...  

Original vs Cleaned Lyrics (First 5 rows from 2016-2024 data):
Song

Save the combined cleaned data to a new CSV file

In [13]:
df_combined.to_csv('billboard_1964_2024_lyrics_cleaned.csv', index=False)
print("Combined cleaned data saved to 'billboard_1964_2024_lyrics_cleaned.csv'")
print(f"Number of unique records: {len(df_combined)}")

Combined cleaned data saved to 'billboard_1964_2024_lyrics_cleaned.csv'
Number of unique records: 6280


Generating a basic score for judging a song's performance

In [14]:
import pandas as pd


df = pd.read_csv('billboard_1964_2024_lyrics_cleaned.csv')

# Group by song, artist, and year to ensure uniqueness
grouped = df.groupby(['song', 'artist']).agg({
    'ranking': list,   # List of rankings across years
    'year': list,      # List of years charted
    'lyrics_cleaned': 'first'  # Keep first instance of lyrics
}).reset_index()


def calculate_mycs(rankings, alpha=0.2):
    """
    Compute Multi-Year Chart Score for a song.

    Args:
        rankings (list): List of rankings (1-100) across years
        alpha (float): Longevity weight (default: 0.2)

    Returns:
        float: MYCS score
    """
    # Yearly rank scores: (101 - rank)/100
    yearly_scores = [(101 - rank)/100 for rank in rankings]
    total_rank_score = sum(yearly_scores)

    # Longevity multiplier
    years = len(rankings)
    longevity_multiplier = 1 + alpha * (years - 1)

    return total_rank_score * longevity_multiplier

grouped['MYCS'] = grouped['ranking'].apply(
    lambda x: calculate_mycs(x, alpha=0.2)
)

grouped = grouped.sort_values('MYCS', ascending=False)


output = grouped[['song', 'artist', 'year', 'ranking', 'MYCS', 'lyrics_cleaned']]

# Save to CSV
output.to_csv('song_mycs_scores.csv', index=False)
print("Saved results to song_mycs_scores.csv")


print("\nTop 5 Songs by MYCS:")
print(output.head(5)[['song', 'artist', 'year', 'ranking', 'MYCS']])


Saved results to song_mycs_scores.csv

Top 5 Songs by MYCS:
                                 song         artist  \
156                   Blinding Lights     The Weeknd   
2788                    how do i live    leann rimes   
861                              Stay  Justin Bieber   
862                              Stay  The Kid Laroi   
71    All I Want for Christmas Is You   Mariah Carey   

                          year           ranking   MYCS  
156               [2020, 2021]            [1, 3]  2.376  
2788              [1997, 1998]            [9, 5]  2.256  
861               [2021, 2022]           [12, 3]  2.244  
862               [2021, 2022]           [12, 3]  2.244  
71    [2020, 2021, 2022, 2023]  [67, 78, 65, 55]  2.224  


A. Basic Statistical Features
Length of lyrics (word count, character count)

Unique word count (lexical richness)

Average word length

Stopword ratio (percentage of common words like "the", "and")

In [15]:

# Load the MYCS file
df_mycs = pd.read_csv('song_mycs_scores.csv')

df_mycs['lyrics_cleaned'] = df_mycs['lyrics_cleaned'].astype(str)

df_mycs['word_count'] = df_mycs['lyrics_cleaned'].apply(lambda x: len(x.split()))
df_mycs['char_count'] = df_mycs['lyrics_cleaned'].apply(len)

print(df_mycs[['song', 'word_count', 'char_count', 'MYCS']].head())


                              song  word_count  char_count   MYCS
0                  Blinding Lights         261        1174  2.376
1                    how do i live         279        1212  2.256
2                             Stay         423        1790  2.244
3                             Stay         423        1790  2.244
4  All I Want for Christmas Is You         391        1843  2.224


In [16]:
#Unique word count (lexical richness)
from collections import Counter

In [17]:
from collections import Counter


# Unique word count: words that appear only once
df_mycs['unique_word_count'] = df_mycs['lyrics_cleaned'].apply(
    lambda x: sum(1 for count in Counter(x.split()).values() if count == 1)
)

# Optional: Store the actual unique words
df_mycs['unique_words'] = df_mycs['lyrics_cleaned'].apply(
    lambda x: [word for word, count in Counter(x.split()).items() if count == 1]
)

# Distinct word count: total number of different words used (vocabulary size)
df_mycs['distinct_word_count'] = df_mycs['lyrics_cleaned'].apply(
    lambda x: len(set(x.split()))
)

# Preview the updated DataFrame
print(df_mycs[['song', 'unique_word_count', 'distinct_word_count', 'word_count', 'MYCS']].head())


                              song  unique_word_count  distinct_word_count  \
0                  Blinding Lights                 43                   96   
1                    how do i live                 27                   66   
2                             Stay                 42                   88   
3                             Stay                 42                   88   
4  All I Want for Christmas Is You                 61                  113   

   word_count   MYCS  
0         261  2.376  
1         279  2.256  
2         423  2.244  
3         423  2.244  
4         391  2.224  


In [18]:
import numpy as np


# Calculate average word length
df_mycs['avg_word_length'] = df_mycs['lyrics_cleaned'].apply(
    lambda x: np.mean([len(word) for word in x.split()]) if x else 0
)

# Preview the result
print(df_mycs[['song', 'avg_word_length']].head(10))


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


                              song  avg_word_length
0                  Blinding Lights         3.501916
1                    how do i live         3.340502
2                             Stay         3.234043
3                             Stay         3.234043
4  All I Want for Christmas Is You         3.716113
5                       Heat Waves         4.190955
6                        As It Was         3.529167
7                           Closer         3.985994
8                           Closer         3.985994
9                           smooth         3.788856


In [19]:
import nltk
from nltk.corpus import stopwords
import numpy as np

# Download stopwords if not already downloaded
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Calculate stopword ratio for df_mycs
df_mycs['stopword_ratio'] = df_mycs['lyrics_cleaned'].apply(
    lambda x: np.round(
        sum(1 for word in str(x).split() if word.lower() in stop_words) / max(len(str(x).split()), 1),
        2  # Round to 2 decimal places
    )
)

# Preview result
print(df_mycs[['song', 'stopword_ratio']].head())


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


                              song  stopword_ratio
0                  Blinding Lights            0.43
1                    how do i live            0.56
2                             Stay            0.47
3                             Stay            0.47
4  All I Want for Christmas Is You            0.48


B. Linguistic & Sentiment Features
Sentiment Analysis (positive/negative sentiment score)

Emotion detection (anger, joy, sadness, etc.)

Lexical diversity (ratio of unique words to total words)


In [20]:
#Sentiment Analysis (positive/negative sentiment score)
from nltk.sentiment import SentimentIntensityAnalyzer
#Sentiment Analysis (positive/negative sentiment score)
nltk.download('vader_lexicon')

# Initialize the sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Function to calculate sentiment ratios
def sentiment_ratios(text):
    scores = sia.polarity_scores(str(text))  # Get sentiment scores
    total = scores['pos'] + scores['neg'] + scores['neu']  # Total sentiment score sum

    if total == 0:  # Avoid division by zero
        return (0, 0)

    pos_ratio = scores['pos'] / total  # Positive sentiment ratio
    neg_ratio = scores['neg'] / total  # Negative sentiment ratio
    neu_ratio = scores ['neu'] / total
    return pos_ratio, neg_ratio ,neu_ratio

# Apply function to each row
df_mycs[['pos_ratio', 'neg_ratio','neu_ratio']] = df_mycs['lyrics_cleaned'].apply(
    lambda x: pd.Series(sentiment_ratios(x))
)

print(df_mycs[['lyrics_cleaned', 'pos_ratio', 'neg_ratio', 'neu_ratio']].head())


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


                                      lyrics_cleaned  pos_ratio  neg_ratio  \
0  yeah ive been tryna call ive been on my own fo...      0.077      0.094   
1   how do i get through one night without you if...      0.039      0.071   
2  i do the same thing i told you that i never wo...      0.071      0.095   
3  i do the same thing i told you that i never wo...      0.071      0.095   
4  i dont want a lot for christmas there is just ...      0.103      0.086   

   neu_ratio  
0      0.829  
1      0.890  
2      0.834  
3      0.834  
4      0.811  


In [21]:
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

df_mycs['compound'] = df_mycs['lyrics_cleaned'].apply(
    lambda x: sia.polarity_scores(str(x))['compound']
)
print(df_mycs[['lyrics_cleaned', 'compound']].head())

                                      lyrics_cleaned  compound
0  yeah ive been tryna call ive been on my own fo...   -0.1068
1   how do i get through one night without you if...   -0.1195
2  i do the same thing i told you that i never wo...   -0.8523
3  i do the same thing i told you that i never wo...   -0.8523
4  i dont want a lot for christmas there is just ...    0.9672


Syllable count

In [46]:
!pip install phonemizer

Collecting phonemizer
  Using cached phonemizer-3.3.0-py3-none-any.whl.metadata (48 kB)
Collecting segments (from phonemizer)
  Using cached segments-2.3.0-py2.py3-none-any.whl.metadata (3.5 kB)
Collecting dlinfo (from phonemizer)
  Using cached dlinfo-2.0.0-py3-none-any.whl.metadata (1.1 kB)
Collecting csvw>=1.5.6 (from segments->phonemizer)
  Using cached csvw-3.5.1-py2.py3-none-any.whl.metadata (10 kB)
Collecting isodate (from csvw>=1.5.6->segments->phonemizer)
  Using cached isodate-0.7.2-py3-none-any.whl.metadata (11 kB)
Collecting colorama (from csvw>=1.5.6->segments->phonemizer)
  Using cached colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Using cached phonemizer-3.3.0-py3-none-any.whl (103 kB)
Using cached dlinfo-2.0.0-py3-none-any.whl (3.7 kB)
Using cached segments-2.3.0-py2.py3-none-any.whl (15 kB)
Using cached csvw-3.5.1-py2.py3-none-any.whl (59 kB)
Using cached colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Using cached isodate-0.7.2-py3-none-any.whl (22 kB)
Installing c

In [23]:
!pip install syllapy

Collecting syllapy
  Downloading syllapy-0.7.2-py3-none-any.whl.metadata (854 bytes)
Downloading syllapy-0.7.2-py3-none-any.whl (24 kB)
Installing collected packages: syllapy
Successfully installed syllapy-0.7.2


In [48]:
!pip install espeak-ng

[31mERROR: Could not find a version that satisfies the requirement espeak-ng (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for espeak-ng[0m[31m
[0m

In [24]:
import syllapy

# Function to count syllables in a text
def count_syllables(text):
    # Split text into words, count syllables for each word, and sum them
    return sum(syllapy.count(word) for word in text.split())

# Ensure 'lyrics_cleaned' is treated as a string
df_mycs['lyrics_cleaned'] = df_mycs['lyrics_cleaned'].astype(str)

# Calculate syllable count
df_mycs['syllable_count'] = df_mycs['lyrics_cleaned'].apply(count_syllables)

# Preview result
print(df_mycs[['song', 'syllable_count']].head())


                              song  syllable_count
0                  Blinding Lights             299
1                    how do i live             335
2                             Stay             467
3                             Stay             467
4  All I Want for Christmas Is You             473


In [26]:
!pip install pronouncing



In [29]:
import pronouncing


def rhyme_count(lyrics):
    words = str(lyrics).lower().split()
    if not words or lyrics is None or pd.isna(lyrics):
        return 0

    # Get unique words to avoid overcounting repeats
    unique_words = set(words)
    rhyme_pairs = 0

    # Check each word against others for rhymes
    for i, word1 in enumerate(sorted(unique_words)):
        rhymes = pronouncing.rhymes(word1)
        for word2 in sorted(unique_words)[i+1:]:

            if word2 in rhymes:
                rhyme_pairs += 1

    return rhyme_pairs

df_mycs['rhyme_pairs'] = df_mycs['lyrics_cleaned'].apply(rhyme_count)

# Preview the results
print("First 5 rows with rhyme_pairs:")
print(df_mycs[['lyrics_cleaned', 'rhyme_pairs']].head())


First 5 rows with rhyme_pairs:
                                      lyrics_cleaned  rhyme_pairs
0  yeah ive been tryna call ive been on my own fo...           50
1   how do i get through one night without you if...           23
2  i do the same thing i told you that i never wo...           42
3  i do the same thing i told you that i never wo...           42
4  i dont want a lot for christmas there is just ...           42
Correlation between MYCS and rhyme_pairs: 0.0040362425268622775


In [33]:
df_mycs['rhyme_density'] = df_mycs['rhyme_pairs'] / df_mycs['distinct_word_count']

# Handle cases where distinct_word_count is 0 to avoid division-by-zero
df_mycs['rhyme_density'] = df_mycs['rhyme_density'].fillna(0).replace([float('inf')], 0)

# Preview the results
print("First 5 rows with rhyme_density:")
print(df_mycs[['lyrics_cleaned', 'rhyme_pairs', 'distinct_word_count', 'rhyme_density']].head())

First 5 rows with rhyme_density:
                                      lyrics_cleaned  rhyme_pairs  \
0  yeah ive been tryna call ive been on my own fo...           50   
1   how do i get through one night without you if...           23   
2  i do the same thing i told you that i never wo...           42   
3  i do the same thing i told you that i never wo...           42   
4  i dont want a lot for christmas there is just ...           42   

   distinct_word_count  rhyme_density  
0                   96       0.520833  
1                   66       0.348485  
2                   88       0.477273  
3                   88       0.477273  
4                  113       0.371681  


Rhyme pattern

TOPICS?

what words correlate most with mycs?

In [34]:
print(df_mycs.columns)


Index(['song', 'artist', 'year', 'ranking', 'MYCS', 'lyrics_cleaned',
       'word_count', 'char_count', 'unique_word_count', 'unique_words',
       'distinct_word_count', 'avg_word_length', 'stopword_ratio', 'pos_ratio',
       'neg_ratio', 'neu_ratio', 'compound', 'syllable_count', 'rhyme_pairs',
       'rhyme_density'],
      dtype='object')
