In [None]:
!pip install bertopic sentence-transformers umap-learn hdbscan spacy nltk
!python -m spacy download en_core_web_sm

In [None]:
pip install memory_profiler

Collecting memory_profiler
  Downloading memory_profiler-0.61.0-py3-none-any.whl.metadata (20 kB)
Downloading memory_profiler-0.61.0-py3-none-any.whl (31 kB)
Installing collected packages: memory_profiler
Successfully installed memory_profiler-0.61.0


In [None]:
# pandas is imported in this cell because it runs before the notebook's main
# import cell; without it `pd.read_csv` fails with NameError on a fresh kernel.
import pandas as pd
from google.colab import drive

# Mount Google Drive so that files under /content/drive can be read.
drive.mount('/content/drive')

# NOTE(review): a later cell reloads `df` from FILE_PATH, which points at a
# different file — confirm which CSV is the intended input.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/test_wikipedia_metadata_updated.csv')  # Update path


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


NameError: name 'pd' is not defined

In [None]:
import pandas as pd
import numpy as np
import random
import string
import spacy
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tqdm import tqdm
from bertopic import BERTopic
from umap import UMAP
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
import warnings

# Suppress warnings for cleaner output
# (this silences every warning category for the rest of the session)
warnings.filterwarnings("ignore")

# Set seed for reproducibility (NumPy global RNG and Python's `random`)
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# NLTK setup
nltk.download('punkt')
# Recent NLTK releases make word_tokenize load the 'punkt_tab' resource instead
# of the pickled 'punkt' model; fetch it too so a fresh kernel can tokenize.
# On older releases that do not know this id, the call just reports the
# failure and returns False.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Extended stop words list: NLTK's English stop words unioned with a
# hand-picked set of extra terms to exclude from the lyrics.
stop_words = set(stopwords.words('english')) | {
    # contraction fragments and informal spellings
    "'s", "'ve", "'re", "'m", "'d", "'ll", "ca", "wo", "ill", "don", "doesn", "isn",
    "wan", "gon", "na", "gonna", "wanna", "ya", "im", "cause", "dont", "goin",
    # interjections and fillers
    "yeah", "uh", "oh", "ah", "yo", "ooh", "woo", "uhh", "hey", "like",
    # very common verbs
    "be", "make", "come", "tell", "put", "take", "go", "let", "give",
    "know", "want", "think", "find", "try", "live", "move",
    "stand", "run", "show", "call", "watch", "hear", "walk",
    "break", "prove", "use", "drive", "turn", "say",
    # generic adverbs and adjectives
    "really", "well", "good", "even", "still", "little", "something",
    "nothing", "much", "true", "bad", "big", "sweet", "fine", "nice",
    "pretty", "hard", "free", "open", "glad",
    # prepositions and place adverbs
    "out", "in", "on", "off", "around", "through", "there", "here",
    # generic nouns
    "thing", "way", "world", "baby", "name", "heart", "day", "night",
    "fire", "money", "people", "place", "word", "friend",
    "town", "color", "wind", "lady", "guy", "time",
    # additional excluded terms
    "chorus", "round", "lovin", "swing", "radio", "boogie", "next",
    "train", "minute",
    # time-related words
    "always", "never", "ever", "long", "old", "young", "past", "present", "future", "sometimes",
    # small numbers
    "two", "three", "four", "five",
}

# SpaCy model loading
# The parser and NER components are disabled; only the remaining pipeline
# components run when `nlp` is called.
try:
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
except OSError:
    # spacy.load raised OSError (model not found): download it from Python so the
    # package is installed for the interpreter that is running this kernel, and
    # so the cell contains no shell-escape syntax.
    print("Downloading SpaCy model 'en_core_web_sm'...")
    import importlib
    from spacy.cli import download as spacy_download

    spacy_download("en_core_web_sm")
    # Make the freshly installed package visible to the import system.
    importlib.invalidate_caches()
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# --- Helper functions ---

def clean_text(text):
    """
    Cleans the input text by lowercasing, removing punctuation,
    tokenizing, and removing stopwords and non-alphabetic tokens.

    Args:
        text: The raw text of one document. Missing values (None/NaN) and
            list-like values yield an empty token list; any other non-string
            scalar is converted with ``str()`` before cleaning.

    Returns:
        list[str]: Lowercased alphabetic tokens that are not in the
        module-level ``stop_words`` set.
    """
    if not isinstance(text, str):
        try:
            if pd.isna(text):
                return []
        except ValueError:
            # pd.isna returned an array (list-like input) whose truth value is
            # ambiguous; such a cell carries no usable text.
            return []
        # A non-missing scalar (e.g. a cell parsed as a number) would otherwise
        # fail on `.lower()`.
        text = str(text)

    # Punctuation is stripped before tokenizing, so apostrophes never reach
    # the stop-word check below.
    strip_punctuation = str.maketrans("", "", string.punctuation)
    tokens = word_tokenize(text.lower().translate(strip_punctuation))
    return [word for word in tokens if word not in stop_words and word.isalpha()]

def lemmatize(tokens_list, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV'), remove_stopwords=True):
    """
    Lemmatizes a list of tokenized documents, keeping specified POS tags.

    Each document's tokens are joined with spaces and passed through the
    module-level spaCy pipeline ``nlp``. Because the joined text is tokenized
    again by spaCy, the lemmas need not correspond one-to-one to the input tokens.

    Stop words are filtered again after lemmatization. ``clean_text`` only
    checks surface forms, so an inflected form that is not itself a stop word
    can be reduced to a lemma that is one (e.g. "come"), and would otherwise
    end up in the topic representations.

    Args:
        tokens_list: Iterable of documents, each a list of string tokens.
            Entries that are not a list of strings produce an empty result.
        allowed_postags: Coarse POS tags (``token.pos_``) whose lemmas are kept.
        remove_stopwords: When True (default), lemmas found in the module-level
            ``stop_words`` set are dropped.

    Returns:
        list[list[str]]: One list of lemmas per input document, in order.
    """
    results = []
    kept_pos = set(allowed_postags)
    for tokens in tqdm(tokens_list, desc="Lemmatizing documents"):
        # Ensure tokens is a list of strings before joining
        if not isinstance(tokens, list) or not all(isinstance(word, str) for word in tokens):
            results.append([])
            continue
        doc = nlp(" ".join(tokens))
        lemmas = [token.lemma_ for token in doc if token.pos_ in kept_pos]
        if remove_stopwords:
            lemmas = [lemma for lemma in lemmas if lemma not in stop_words]
        results.append(lemmas)
    return results

def assign_generation(year):
    """
    Assigns a generational label based on the release year.

    Args:
        year: Release year as an int, float, or numeric string. Numeric strings
            with a fractional part (e.g. "1999.0") are accepted and truncated.

    Returns:
        str: "Gen X" (1965-1980), "Millennial" (1981-2000), "Gen Z" (2001-2010),
        "Gen Alpha" (2011 and later), "Other" for years before 1965, or
        "Unknown" when the value is missing, infinite, or not numeric.
    """
    try:
        year = int(year)
    except (ValueError, TypeError, OverflowError):
        # int() rejects strings such as "1999.0"; retry through float(), which
        # parses them. NaN, infinity, None and non-numeric text still fail here.
        try:
            year = int(float(year))
        except (ValueError, TypeError, OverflowError):
            return "Unknown" # Handle non-numeric or missing years

    if 1965 <= year <= 1980:
        return "Gen X"
    if 1981 <= year <= 2000:
        return "Millennial"
    if 2001 <= year <= 2010:
        return "Gen Z"
    if year >= 2011:
        return "Gen Alpha"
    return "Other" # For years outside defined ranges

# --- Load your CSV file ---
# Point FILE_PATH at the CSV that holds the lyrics and release years.
FILE_PATH = "/content/drive/MyDrive/test_wikipedia_metadata.csv"

print("--- Starting BERTopic Topic Modeling with Coherence Scores ---")

try:
    df = pd.read_csv(FILE_PATH)
except Exception as e:
    # Report the problem, then re-raise so the cell stops here.
    print(f"Failed to load file: {e}")
    raise e
else:
    print(f"Data loaded from: {FILE_PATH}")

# Both columns are needed downstream: the lyrics text and the release year.
if not {'Cleaned_Lyrics', 'Release Year'}.issubset(df.columns):
    raise ValueError("Required columns 'Cleaned_Lyrics' or 'Release Year' missing in the DataFrame.")

# Label every row with a generation derived from its release year.
df['Generation'] = df['Release Year'].apply(assign_generation)

# Tokenize/filter the lyrics, then lemmatize the resulting token lists.
print("Preprocessing text...")
df["Processed"] = df["Cleaned_Lyrics"].apply(clean_text)
df["Lemmatized"] = lemmatize(df["Processed"])

# Keep only rows that still have at least one lemma; .copy() gives an
# independent frame and avoids SettingWithCopyWarning on later assignments.
df = df.loc[df["Lemmatized"].str.len().gt(0)].copy()

# --- Setup BERTopic Model ---
# Dimensionality reduction: UMAP with cosine distance, seeded with SEED so
# repeated runs use the same random state.
umap_model = UMAP(
    n_neighbors=10,
    n_components=3,
    metric='cosine',
    low_memory=True,
    random_state=SEED,
)

# Topic model built on the UMAP reducer above; per-document topic
# probabilities are not computed, and progress logging is enabled.
topic_model = BERTopic(umap_model=umap_model, calculate_probabilities=False, verbose=True)

# --- Apply BERTopic per Generation ---
# For every generation label: fit BERTopic on that generation's documents,
# print the largest topics, and score the topics with c_v coherence.
generations = df['Generation'].unique()

for gen in sorted(generations): # Sort generations for consistent output order
    # Rows labelled 'Other' / 'Unknown' are excluded from the analysis.
    if gen in ['Other', 'Unknown']:
        print(f"\n--- Skipping Generation: {gen} as requested ---")
        continue

    print(f"\n--- Processing Generation: {gen} ---")
    gen_df = df[df['Generation'] == gen].copy()

    # Token lists are kept for the coherence computation; BERTopic itself is
    # fitted on the same documents joined back into strings.
    tokenized_docs_for_coherence = gen_df["Lemmatized"].tolist()
    docs = [" ".join(tokens) for tokens in tokenized_docs_for_coherence]

    if not docs:
        print(f"No documents available for {gen} after preprocessing. Skipping.")
        continue

    try:
        # Fit BERTopic model (the shared `topic_model` instance is refitted
        # for each generation)
        topics, probs = topic_model.fit_transform(docs)
        print(f"Model built for {gen}")

        # Get topic info and count (excluding -1 outlier topic)
        topic_info = topic_model.get_topic_info()
        valid_topic_info = topic_info[topic_info['Topic'] != -1]
        num_valid_topics = len(valid_topic_info)
        print(f"Number of topics: {num_valid_topics}")

        # Display top topics (at most 5, outlier topic excluded)
        print(f"\nTop 5 topics for {gen}:")
        valid_topics_display = valid_topic_info.head(5)
        print(valid_topics_display[['Topic', 'Count', 'Name']])

        # All valid topic IDs, excluding the -1 outlier topic
        coherence_topic_ids = valid_topic_info['Topic'].tolist()

        # Gensim Dictionary over the same token lists used as coherence texts
        dictionary = Dictionary(tokenized_docs_for_coherence)
        known_tokens = dictionary.token2id

        # Top words per topic, as a list of word lists for CoherenceModel.
        # Only words present in the dictionary are kept: empty padding strings
        # or terms absent from the token lists cannot be scored and may make
        # CoherenceModel reject the topic.
        bertopic_topics_words = []
        for topic_id in coherence_topic_ids:
            # get_topic returns a list of (word, score) tuples; a falsy result
            # is treated as a topic without words.
            topic_terms = topic_model.get_topic(topic_id) or []
            words_in_topic = [word for word, _ in topic_terms if word in known_tokens]
            if words_in_topic:
                bertopic_topics_words.append(words_in_topic)

        # Calculate Coherence Score (c_v) only when there is something to score
        if bertopic_topics_words and tokenized_docs_for_coherence:
            coherence_model = CoherenceModel(
                topics=bertopic_topics_words,
                texts=tokenized_docs_for_coherence,
                dictionary=dictionary,
                coherence='c_v'
            )
            coherence_score = coherence_model.get_coherence()
            print(f"\nCoherence Score (c_v) for {gen}: {coherence_score:.4f}")
        else:
            print(f"Not enough topics or documents for coherence calculation for {gen}.")

    except Exception as e:
        # Best-effort: report the failure and carry on with the next generation.
        print(f"Error processing {gen}: {e}")

print("\n--- BERTopic Analysis Complete with Coherence Scores ---")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


--- Starting BERTopic Topic Modeling with Coherence Scores ---
Data loaded from: /content/drive/MyDrive/test_wikipedia_metadata.csv
Preprocessing text...


Lemmatizing documents: 100%|██████████| 11578/11578 [02:09<00:00, 89.21it/s]
2025-06-12 14:25:59,373 - BERTopic - Embedding - Transforming documents to embeddings.



--- Processing Generation: Gen Alpha ---


Batches:   0%|          | 0/82 [00:00<?, ?it/s]

2025-06-12 14:28:24,465 - BERTopic - Embedding - Completed ✓
2025-06-12 14:28:24,467 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-12 14:28:36,684 - BERTopic - Dimensionality - Completed ✓
2025-06-12 14:28:36,686 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-12 14:28:36,871 - BERTopic - Cluster - Completed ✓
2025-06-12 14:28:36,880 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-06-12 14:28:37,302 - BERTopic - Representation - Completed ✓


Model built for Gen Alpha
Number of topics: 2

Top 5 topics for Gen Alpha:
   Topic  Count                   Name
1      0   2054     0_love_get_re_feel
2      1     24  1_dream_ve_sleep_away


2025-06-12 14:28:38,207 - BERTopic - Embedding - Transforming documents to embeddings.



Coherence Score (c_v) for Gen Alpha: 0.4241

--- Processing Generation: Gen X ---


Batches:   0%|          | 0/25 [00:00<?, ?it/s]

2025-06-12 14:29:07,505 - BERTopic - Embedding - Completed ✓
2025-06-12 14:29:07,509 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-12 14:29:09,384 - BERTopic - Dimensionality - Completed ✓
2025-06-12 14:29:09,385 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-12 14:29:09,414 - BERTopic - Cluster - Completed ✓
2025-06-12 14:29:09,419 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-06-12 14:29:09,498 - BERTopic - Representation - Completed ✓


Model built for Gen X
Number of topics: 7

Top 5 topics for Gen X:
   Topic  Count                       Name
1      0    122        0_get_dance_re_look
2      1     70      1_love_get_leave_come
3      2     22  2_love_lose_get_celebrate
4      3     19    3_love_need_woman_thank
5      4     17     4_feel_love_guilt_hurt

Coherence Score (c_v) for Gen X: 0.3300

--- Processing Generation: Gen Z ---


2025-06-12 14:29:09,835 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/132 [00:00<?, ?it/s]

2025-06-12 14:32:14,531 - BERTopic - Embedding - Completed ✓
2025-06-12 14:32:14,534 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-12 14:32:23,250 - BERTopic - Dimensionality - Completed ✓
2025-06-12 14:32:23,255 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-12 14:32:23,434 - BERTopic - Cluster - Completed ✓
2025-06-12 14:32:23,440 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-06-12 14:32:23,785 - BERTopic - Representation - Completed ✓


Model built for Gen Z
Number of topics: 2

Top 5 topics for Gen Z:
   Topic  Count                   Name
1      0   4056       0_love_get_re_ve
2      1     17  1_hold_re_tight_piece


2025-06-12 14:32:24,794 - BERTopic - Embedding - Transforming documents to embeddings.



Coherence Score (c_v) for Gen Z: 0.3571

--- Processing Generation: Millennial ---


Batches:   0%|          | 0/68 [00:00<?, ?it/s]

2025-06-12 14:33:55,549 - BERTopic - Embedding - Completed ✓
2025-06-12 14:33:55,551 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-12 14:34:05,519 - BERTopic - Dimensionality - Completed ✓
2025-06-12 14:34:05,521 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-12 14:34:05,602 - BERTopic - Cluster - Completed ✓
2025-06-12 14:34:05,608 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-06-12 14:34:05,790 - BERTopic - Representation - Completed ✓


Model built for Millennial
Number of topics: 3

Top 5 topics for Millennial:
   Topic  Count                        Name
1      0   1682            0_love_get_re_ve
2      1     17  1_dance_let_strike_feeling
3      2     12         2_get_love_tha_game

Coherence Score (c_v) for Millennial: 0.3492

--- Skipping Generation: Other as requested ---

--- Skipping Generation: Unknown as requested ---

--- BERTopic Analysis Complete with Coherence Scores ---


In [None]:
!pip install contractions

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyahocorasick-2.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (118 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.3/118.3 kB[0m 