In [None]:
import kagglehub

# Fetch the most recent version of the dataset through kagglehub.
# `path` holds whatever location kagglehub reports for the downloaded files.
dataset_handle = "markkorvin/large-metal-lyrics-archive-228k-songs"
path = kagglehub.dataset_download(dataset_handle)

print(f"Path to dataset files: {path}")

In [None]:
# Locate the file
import os

# Reuse the location returned by kagglehub.dataset_download (`path`, set in the
# download cell) instead of a hard-coded absolute cache path: the cache
# directory and the "versions/N" suffix differ between machines, users and
# dataset releases, so a literal path breaks on any other setup.
dataset_path = str(path)

# List all files in the folder (sorted so the output is stable between runs;
# os.listdir returns entries in arbitrary order)
for file in sorted(os.listdir(dataset_path)):
    print(file)

In [None]:
# Load dataset and check it
import pandas as pd

# Build the CSV location inside the dataset folder and read it into a DataFrame
file_path = f"{dataset_path}/metal_lyrics.csv"
df = pd.read_csv(file_path)

# Show the first rows as a quick sanity check of what was loaded
df.head()

In [None]:
# Let's clean the dataset

# Minimum number of characters a cleaned lyric must exceed to be kept
MIN_LYRIC_CHARS = 30

# Build the cleaned text in one pass:
#   1. remove punctuation and special characters (anything that is not a
#      word character or whitespace)
#   2. convert to lowercase
#   3. strip leading/trailing whitespace
# Stripping happens BEFORE the length filter below so that padding or
# whitespace-only lyrics cannot slip past the "short lyrics" check.
df['lyrics_clean'] = (
    df['Lyric']
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.lower()
    .str.strip()
)

# Remove empty or short lyrics (missing values compare as False and are dropped too)
has_text = df['lyrics_clean'].notnull()
long_enough = df['lyrics_clean'].str.len() > MIN_LYRIC_CHARS
df = df[has_text & long_enough].copy()

In [None]:
# Now that the dataset is clean, we can do Sentiment Scoring with TextBlob

# Install the sentiment analysis library
!pip install textblob

# Load the class which will be used to analyze text
from textblob import TextBlob

# Create a sentiment scoring function
def get_sentiment(text):
    """Return the TextBlob sentiment polarity of ``text``.

    Parameters
    ----------
    text : str
        The text to score (here: one cleaned lyric).

    Returns
    -------
    The ``polarity`` component of ``TextBlob(text).sentiment``. TextBlob
    documents this as a float in [-1.0, 1.0], negative to positive.
    """
    return TextBlob(text).sentiment.polarity

# Score every cleaned lyric and store the result in a new 'sentiment' column
cleaned_lyrics = df['lyrics_clean']
df['sentiment'] = cleaned_lyrics.apply(get_sentiment)

# Summary statistics of the scores (displayed as the cell output)
df['sentiment'].describe()

In [None]:
# Let's build a histogram of Sentiment Scores
import matplotlib.pyplot as plt

# Set the figure size for better readability
plt.figure(figsize=(10, 6))

# Plot histogram for the 'sentiment' column
plt.hist(df['sentiment'], bins=50, color='darkred', edgecolor='black')

# Add labels and title
plt.title('Distribution of Sentiment in Metal Lyrics', fontsize=14)
plt.xlabel('Sentiment Score (-1 = Negative, +1 = Positive)', fontsize=12)
# fontsize must be a keyword argument; it was previously inside the label
# string, so the axis read "Number of Songs, fontsize=12".
plt.ylabel('Number of Songs', fontsize=12)

# Show grid for clarity
plt.grid(axis='y', alpha=0.5)

# Display the plot
plt.show()

In [None]:
# Validate trends numerically: print summary statistics of the sentiment scores
sentiment_summary = df['sentiment'].describe()
print(sentiment_summary)

In [None]:
# Calculate average sentiment per artist, sorted ascending
# (most negative first, most positive last)
avg_sentiment_by_artist = df.groupby('Artist')['sentiment'].mean().sort_values()

# The first 10 entries of the ascending series are the most negative artists
print("\nTop 10 Artists by Average Negative Sentiment:")
print(avg_sentiment_by_artist.head(10))

# The last 10 entries are the most positive artists, but still in ascending
# order; reverse them so the most positive artist is listed first, matching
# the "rank 1 on top" layout of the negative list above.
print("\nTop 10 Artists by Average Positive Sentiment:")
print(avg_sentiment_by_artist.tail(10).iloc[::-1])

In [None]:
# Save the current DataFrame to a CSV file, leaving out the row index
df.to_csv(path_or_buf='metal_lyrics_cleaned.csv', index=False)