# **Prepare data**


In [None]:
from google.colab import drive
import os

# Following snippet to be used with gdrive:
# gdrive_path='/content/gdrive/MyDrive/Bertopic/shared_work/'

# # This will mount your google drive under 'MyDrive'
# drive.mount('/content/gdrive', force_remount=True)
# # In order to access the files in this notebook we have to navigate to the correct folder
# os.chdir(gdrive_path)
# # Check manually if all files are present
# print(sorted(os.listdir()))


# Active setup: the dataset is read through an absolute path inside the mounted Drive
# (no os.chdir into a shared working folder, unlike the commented-out variant above).

# Absolute location of the lyrics CSV; consumed later by pd.read_csv
dataset_path = '/content/drive/MyDrive/Praktikum - NLP Applications/Datasets/genius_and_wasabi/concatenated_chunks.csv'

# Mount Google Drive under /content/drive so that dataset_path resolves
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Common Drive instructions:

!pip install pandas numpy bertopic
!pip install gensim nltk matplotlib seaborn

In [None]:
pip install pandas bertopic

Collecting bertopic
  Downloading bertopic-0.16.0-py2.py3-none-any.whl (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.1/154.1 kB[0m [31m704.2 kB/s[0m eta [36m0:00:00[0m
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.33.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap-learn-0.5.5.tar.gz (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.9/90.9 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentence-transformers>=0.4.1 (from bertopic)
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━

In [None]:
!pip install nltk



In [None]:
import nltk
# Fetch the NLTK 'stopwords' and 'punkt' resources.
# NOTE(review): no other visible cell uses nltk directly (stop words are handled by CountVectorizer) — confirm these downloads are still needed.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
!pip install --upgrade tensorflow


Collecting tensorflow
  Downloading tensorflow-2.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (475.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m475.2/475.2 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Collecting tensorboard<2.16,>=2.15 (from tensorflow)
  Downloading tensorboard-2.15.1-py3-none-any.whl (5.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m57.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tensorflow-estimator<2.16,>=2.15.0 (from tensorflow)
  Downloading tensorflow_estimator-2.15.0-py2.py3-none-any.whl (441 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m442.0/442.0 kB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting keras<2.16,>=2.15.0 (from tensorflow)
  Downloading keras-2.15.0-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
Collecting google-auth-oauthlib

In [None]:
!pip install umap-learn



In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from umap import UMAP

# Relative file name; only used by the commented-out read below, which expects the
# working directory to be the shared Drive folder
csv_file_path = 'concatenated_chunks.csv'
# df = pd.read_csv(csv_file_path) # -> Uncomment for gdrive
# Default: read the dataset through the absolute dataset_path defined in the Drive-mount cell
df = pd.read_csv(dataset_path)

# Raw lyrics as a plain list; `documents` is reassigned from the cleaned lyrics before the model is fitted
documents = df['lyrics'].tolist()  # Convert the text column to a list

# **Data Preprocessing**
This involves removing the explicit song structure from the lyrics column and initialising the CountVectorizer so that stop word removal is handled internally by BERTopic

In [None]:
# Removing explicit song structure as it is not important information
import re

def remove_explicit_song_structure(lyrics):
  """Return `lyrics` with every bracketed section marker removed.

  Each shortest ``[...]`` span that contains at least one character and does
  not cross a line break (e.g. ``[Chorus]``) is replaced by the empty string.
  Everything else, including surrounding newlines, is left untouched.
  """
  return re.sub(r'\[.+?\]', '', lyrics)

# Strip the section markers from every song; astype(str) guarantees the cleaner always receives a string
df['cleaned_lyrics'] = df['lyrics'].astype(str).map(remove_explicit_song_structure)
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,title,genre,artist,year,views,features,lyrics,language,gender,cleaned_lyrics
0,0,0,Killa Cam,rap,Cam'ron,2004,173166,"{""Cam\\'ron"",""Opera Steve""}","[Chorus: Opera Steve & Cam'ron]\nKilla Cam, Ki...",en,Male,"\nKilla Cam, Killa Cam, Cam\nKilla Cam, Killa ..."
1,1,1,Down and Out,rap,Cam'ron,2004,144404,"{""Cam\\'ron"",""Kanye West"",""Syleena Johnson""}",[Produced by Kanye West and Brian Miller]\n\n[...,en,Male,"\n\n\nUgh, Killa!\nBaby!\nKanye, this that 197..."
2,2,2,Family Ties,rap,Cam'ron,2004,41960,"{""Cam\\'ron"",""Lady Wray""}","[Verse 1: Cam'ron]\nKilla, Dipset\nMan I spit ...",en,Male,"\nKilla, Dipset\nMan I spit that pimp talk, yo..."
3,3,3,Rockin and Rollin,rap,Cam'ron,1998,6399,"{""Cam\\'ron""}",[Verse 1]\nAy yo you wonder who I are\nI guzzl...,en,Male,\nAy yo you wonder who I are\nI guzzle up at t...
4,4,4,Lord You Know,rap,Cam'ron,2004,11882,"{""Cam\\'ron"",""Juelz Santana"",Jaheim}","[Chorus: Jaheim]\nNow Lord you know, just how ...",en,Male,"\nNow Lord you know, just how hard I try\nTo l..."


In [None]:
# Vectorizer handed to BERTopic: unigrams and bigrams, with English stop words removed
count_vectorizer = CountVectorizer(stop_words="english", ngram_range=(1, 2))

# UMAP instance handed to BERTopic; the fixed random_state makes the results reproducible
umap = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    low_memory=False,
    random_state=42,
)

# **Create Topics**
We select "english" as the main language for our documents. If you want a multilingual model that supports 50+ languages, select "multilingual" instead.

In [None]:
# Check if there are missing lyrics.
# The raw column is inspected on purpose: 'cleaned_lyrics' was built with astype(str),
# which turns a missing value into the literal string 'nan', so isna() on the cleaned
# column would report 0 even when lyrics are missing.
df['lyrics'].isna().sum()

0

In [None]:
# Fitting and saving the BERTopic model

model = BERTopic(vectorizer_model=count_vectorizer, language="english", umap_model=umap)
model_path = '/content/drive/MyDrive/Praktikum - NLP Applications/Models/bertopic_initial'
# One document per row of df; astype(str) guards against non-string entries
documents = df['cleaned_lyrics'].astype(str).tolist()

# Fitting BERTopic (a stray trailing character on this line used to make the cell a syntax error)
topic_model = model.fit(documents)
# Saving it using safetensors; the embedding model is stored as a reference to its name, and the c-TF-IDF data is included
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
topic_model.save(model_path, serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model)

In [None]:
model = BERTopic(vectorizer_model=count_vectorizer, language="english", umap_model=umap)
# Fit BERTopic and get one topic id per document; the second return value (probabilities) is not used.
# NOTE: the document embeddings can be obtained from the fitted model afterwards, or computed with the embedding model directly.
documents = df['cleaned_lyrics'].astype(str).tolist()
topics, _ = model.fit_transform(documents)

# `documents` holds exactly one entry per row of df, in row order, so the topics line up positionally.
assert len(topics) == len(df), "expected exactly one topic per row of df"
# Plain column assignment: df.loc[df['cleaned_lyrics'], 'topic'] would treat the lyric
# strings as row labels and fail instead of selecting every row.
df['topic'] = topics

NameError: ignored

In [None]:
# Save the topics using pickle (written to topics.pkl in the current working directory)
import pickle
with open('topics.pkl', 'wb') as topics_file:
    pickle.dump(topics, topics_file)

# Now you can load the topics from the file in future runs.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
# with open('topics.pkl', 'rb') as topics_file:
#     topics = pickle.load(topics_file)
# df['topic'] = topics

***To measure gender bias per topic in a dataset using BERTopic and WEAT (Word Embedding Association Test), you need to follow a series of steps. These include topic modeling with BERTopic, creating target and attribute word sets for WEAT, and then performing the WEAT analysis for each topic. To integrate WEAT analysis with BERTopic, you need a separate word embeddings model.***

Step 1 Analyzing topic distribution across genders

In [None]:
# Count the songs of every (topic, gender) pair and pivot gender into columns;
# combinations that never occur are filled with 0
topic_gender_distribution = (
    df.groupby(['topic', 'gender'])
      .size()
      .unstack(fill_value=0)
)

# Divide every topic row by its own total so that proportions, not raw counts, are compared
topic_gender_distribution_normalized = topic_gender_distribution.div(
    topic_gender_distribution.sum(axis=1),
    axis=0,
)

In [None]:
# One row per (topic, gender) group; its 'lyrics' value is every lyric of the group joined by single spaces
aggregated_lyrics = (
    df.groupby(['topic', 'gender'])['lyrics']
      .apply(' '.join)
      .reset_index()
)
# Embed every entry of `documents` (the input is the per-document list, not the aggregated groups).
# _extract_embeddings is an internal BERTopic method, so it is not recommended for standard operations.
embeddings = model._extract_embeddings(documents)

In [None]:
# Attach the document embeddings to df as a new column (converted to a list first).
# NOTE(review): assumes `embeddings` has exactly one entry per row of df, in row order — confirm
df['embedding'] = list(embeddings)

 Extract and Analyze Top Words per Topic-Gender Group
For each topic and gender group, extract the most representative words. These words will be used to measure bias.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def get_top_words(text, n=20):
    """Return the `n` most frequent non-stop-word tokens of `text`.

    Parameters
    ----------
    text : str
        Document whose tokens are counted.
    n : int, default 20
        Maximum number of (word, count) pairs to return.

    Returns
    -------
    list of (str, count) tuples
        Pairs sorted by descending count. The list is empty when no token of
        `text` survives tokenisation and English stop-word removal.
    """
    vec = CountVectorizer(stop_words='english')
    try:
        # fit_transform learns the vocabulary and produces the counts in one pass
        bag_of_words = vec.fit_transform([text])
    except ValueError:
        # scikit-learn raises ValueError (empty vocabulary) when nothing is left to count,
        # e.g. for an empty string or a text made only of stop words
        return []
    sum_words = bag_of_words.sum(axis=0)
    # Iterating vocabulary_ keeps the original tie order between equally frequent words
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]
# Top (word, count) pairs of each topic/gender group's concatenated lyrics
aggregated_lyrics['top_words'] = aggregated_lyrics['lyrics'].apply(get_top_words)

In [None]:
!pip install numpy gensim

In [None]:
!pip install matplotlib seaborn

Cosine Similarity: This function computes the cosine similarity between two word embeddings.

Mean Embedding Similarity: This function computes the average cosine similarity between each word in a target set and an attribute set.

Differential Association: This calculates the WEAT score, which is the differential association between two sets of target words and two sets of attribute words.

WEAT Effect Size: This calculates the effect size, a measure of how large the difference in associations is.

Target and Attribute Sets: You need to define these sets based on your specific analysis goal.

In [None]:
# manually define weat since the lib did not work
import numpy as np
from gensim.models import KeyedVectors
def cosine_similarity(embedding1, embedding2):
    """Cosine of the angle between two embedding vectors.

    Returns 0 when either vector has a Euclidean norm of zero, so the
    division is never attempted for a zero-vector.
    """
    first_norm = np.linalg.norm(embedding1)
    second_norm = np.linalg.norm(embedding2)
    if first_norm != 0 and second_norm != 0:
        return np.dot(embedding1, embedding2) / (first_norm * second_norm)
    return 0  # at least one of the vectors is a zero-vector

def mean_embedding_similarity(target_set, attribute_set, embeddings_model):
    """Average cosine similarity over all (target word, attribute word) pairs.

    Words that are missing from ``embeddings_model.key_to_index`` are skipped.
    When no pair is left to compare, 0 is returned.
    """
    similarity_sum = 0
    pair_count = 0

    for target_word in target_set:
        if target_word not in embeddings_model.key_to_index:
            continue
        target_vector = embeddings_model[target_word]
        for attribute_word in attribute_set:
            if attribute_word not in embeddings_model.key_to_index:
                continue
            attribute_vector = embeddings_model[attribute_word]
            similarity_sum += cosine_similarity(target_vector, attribute_vector)
            pair_count += 1

    if pair_count > 0:
        return similarity_sum / pair_count
    return 0


def differential_association(target_set_1, target_set_2, attribute_set_1, attribute_set_2, embeddings_model):
    """WEAT score: difference between the two target sets' attribute preferences.

    For each target set, the preference is its mean similarity to
    `attribute_set_1` minus its mean similarity to `attribute_set_2`.
    The result is the preference of `target_set_1` minus that of `target_set_2`.
    """
    first_to_attr_1 = mean_embedding_similarity(target_set_1, attribute_set_1, embeddings_model)
    first_to_attr_2 = mean_embedding_similarity(target_set_1, attribute_set_2, embeddings_model)
    second_to_attr_1 = mean_embedding_similarity(target_set_2, attribute_set_1, embeddings_model)
    second_to_attr_2 = mean_embedding_similarity(target_set_2, attribute_set_2, embeddings_model)

    first_preference = first_to_attr_1 - first_to_attr_2
    second_preference = second_to_attr_1 - second_to_attr_2
    return first_preference - second_preference


def average_embedding(attribute_set, embeddings_model):
    """Element-wise mean of the embeddings of all known words in `attribute_set`.

    Words missing from ``embeddings_model.key_to_index`` are ignored. If no
    word is known, a zero vector of length ``embeddings_model.vector_size``
    is returned instead.
    """
    known_vectors = []
    for word in attribute_set:
        if word in embeddings_model.key_to_index:
            known_vectors.append(embeddings_model[word])

    if not known_vectors:
        return np.zeros(embeddings_model.vector_size)  # nothing to average
    return np.mean(known_vectors, axis=0)

def weat_effect_size(target_set_1, target_set_2, attribute_set_1, attribute_set_2, embeddings_model, verbose=False):
    """WEAT effect size between two target word sets.

    For every known target word the association difference is
    ``cos(word, mean(attribute_set_1)) - cos(word, mean(attribute_set_2))``.
    The effect size is the difference between the mean association of the two
    target sets, divided by the standard deviation of the association over all
    target words of both sets.

    NOTE(review): each word is compared with the *mean attribute vector*, whereas
    differential_association averages pairwise similarities; the two measures are
    therefore not computed the same way — confirm this is intended.

    Parameters
    ----------
    target_set_1, target_set_2 : iterable of str
        Target words (in this notebook: male and female words).
    attribute_set_1, attribute_set_2 : iterable of str
        Attribute words; words missing from the embedding vocabulary are ignored.
    embeddings_model
        Object exposing ``key_to_index``, ``vector_size`` and word lookup via ``[]``.
    verbose : bool, default False
        When True, print the association difference of every target word.
        This per-word debugging output used to be printed unconditionally.

    Returns
    -------
    float
        The effect size. 0.0 is returned when either target set has no word in
        the embedding vocabulary or when all association differences are
        identical (zero standard deviation), instead of failing or dividing by zero.
    """
    vocabulary = embeddings_model.key_to_index
    target_set_1_embeddings = [embeddings_model[word] for word in target_set_1 if word in vocabulary]
    target_set_2_embeddings = [embeddings_model[word] for word in target_set_2 if word in vocabulary]
    if not target_set_1_embeddings or not target_set_2_embeddings:
        # Nothing to compare; mirrors mean_embedding_similarity, which also yields 0 without data
        return 0.0

    attribute_set_1_avg_embedding = average_embedding(attribute_set_1, embeddings_model)
    attribute_set_2_avg_embedding = average_embedding(attribute_set_2, embeddings_model)

    def _association_diff(word_embedding):
        # How much closer the word is to attribute set 1 than to attribute set 2
        return (cosine_similarity(word_embedding, attribute_set_1_avg_embedding)
                - cosine_similarity(word_embedding, attribute_set_2_avg_embedding))

    # Each difference is computed once and reused for the output, the means and the deviation
    diffs_1 = [_association_diff(word_embedding) for word_embedding in target_set_1_embeddings]
    diffs_2 = [_association_diff(word_embedding) for word_embedding in target_set_2_embeddings]

    if verbose:
        for diff in diffs_1:
            print(f"Word embedding diff for target_set_1: {diff}")
        for diff in diffs_2:
            print(f"Word embedding diff for target_set_2: {diff}")

    std_dev = np.std(diffs_1 + diffs_2)
    if std_dev == 0:
        return 0.0
    return (np.mean(diffs_1) - np.mean(diffs_2)) / std_dev


Target words are typically chosen to represent two different groups that you want to compare for bias. WEAT calculates how strongly each set of target words is associated with each set of attribute words. If one set of target words is more closely associated with positive attribute words (e.g., 'joy', 'peace', 'love') than the other, this might indicate a bias in the embedding space. Quantitative analysis: the strength of these associations is quantified using cosine similarity in the embedding space. This provides a numerical measure of bias, which is the WEAT score.

In [None]:
# Download the GloVe 6B archive (-c continues a partially downloaded file) and unpack it into the working directory
!wget -c "http://nlp.stanford.edu/data/glove.6B.zip"
!unzip glove.6B.zip
from gensim.scripts.glove2word2vec import glove2word2vec

# Convert the 100-dimensional GloVe text file to word2vec text format; the converted file is what gets loaded afterwards.
# NOTE(review): newer gensim releases may be able to load the GloVe file directly, making this conversion unnecessary — confirm against the installed version.
glove_input_file = 'glove.6B.100d.txt'  # Adjust the file name as needed
word2vec_output_file = 'glove.6B.100d.txt.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)

In [None]:
# Load the converted GloVe vectors (text format, hence binary=False); this is the word-embedding model used by the WEAT functions
word_embeddings_model = KeyedVectors.load_word2vec_format('glove.6B.100d.txt.word2vec', binary=False)

In [None]:
# Target sets for the WEAT analysis: words explicitly associated with each gender
male_words = ['he', 'him', 'his', 'father', 'papa', 'dad', 'son', 'uncle', 'grandfather', 'grandpa', 'man', 'male', 'brother', 'husband', 'boyfriend', 'sir', 'king', 'guy', 'father-in-law', 'son-in-law', 'nephew', 'boy']
female_words = ['she', 'her', 'hers', 'mother', 'mama', 'daughter', 'aunt', 'auntie', 'grandmother', 'woman', 'female', 'sister', 'mom', 'wife', 'girlfriend', 'madam', 'queen', 'gal', 'niece', 'grandmother-in-law', 'daughter-in-law', 'lady', 'miss', 'sis', 'girl']
# Second attribute set for the WEAT analysis: gender-neutral words that serve as the basis for
# comparison against the attribute words extracted from each topic
other_attribute_words = ['they', 'them', 'their', 'person', 'individual', 'someone', 'other', 'human', 'somebody', 'citizen']

# Keep only the words that exist in the embedding vocabulary, so every remaining word can be looked up
male_words = [word for word in male_words if word in word_embeddings_model.key_to_index]
female_words = [word for word in female_words if word in word_embeddings_model.key_to_index]
other_attribute_words = [word for word in other_attribute_words if word in word_embeddings_model.key_to_index]

If you want to display the topics as words instead of numeric labels in your WEAT analysis, you need to map the numeric topic labels to their corresponding words. You can do this by creating a mapping dictionary that associates each topic label with a list of words representing that topic.

In [None]:
def flatten(lst):
    """Concatenate the items of every sub-iterable of `lst` into one flat list (one level deep)."""
    flat_items = []
    for sublist in lst:
        flat_items.extend(sublist)
    return flat_items

# Report helper for the per-topic WEAT results
def print_weat_results(topic, male_association, female_association, weat_score, effect_size):
    """Print one topic's WEAT metrics as an indented report followed by an empty line."""
    report_lines = (
        f"Topic: {topic}",
        f"  WEAT Score: {weat_score}",
        f"  Effect Size: {effect_size}",
        f"  Male Association: {male_association}",
        f"  Female Association: {female_association}",
        "",
    )
    for report_line in report_lines:
        print(report_line)

# Create a mapping dictionary to associate topic labels with words:
# topic label -> top words of that topic, collected over all of its gender groups
topic_words_mapping = {}

for topic_label in aggregated_lyrics['topic'].unique():
    top_words = aggregated_lyrics[(aggregated_lyrics['topic'] == topic_label)]['top_words'].tolist()
    top_words = flatten(top_words)
    # top_words holds (word, count) pairs; keep only the words
    top_words_cleared = [item[0] for item in top_words]
    topic_words_mapping[topic_label] = top_words_cleared

# Compute and print WEAT score and effect size for each topic.
#
# Purpose: quantify potential bias in the word embeddings by measuring how much more strongly one set of
# target words (male_words) is associated with a topic's words (attribute_set) than the other set of
# target words (female_words), and vice versa.
#
# - Topic words (attribute_set): the top words of the topic's 'Male' and 'Female' groups.
# - Other attribute words (other_attribute_words): the comparison/control attribute set.
# - Effect size: quantifies the difference in association strengths; it is computed from the cosine
#   similarity between each target word's embedding and the average embedding of each attribute set.
#   A larger effect size suggests a more pronounced bias, i.e. one target set is more strongly
#   associated with the topic words than the other.
for topic in aggregated_lyrics['topic'].unique():
    attribute_set = []
    for gender in ['Male', 'Female']:
        top_words = aggregated_lyrics[(aggregated_lyrics['topic'] == topic) &
                                      (aggregated_lyrics['gender'] == gender)]['top_words'].tolist()
        top_words = flatten(top_words)
        top_words_cleared = [item[0] for item in top_words]
        attribute_set.extend(top_words_cleared)

    if attribute_set:
        weat_score = differential_association(male_words, female_words, attribute_set, other_attribute_words, word_embeddings_model)
        effect_size = weat_effect_size(male_words, female_words, attribute_set, other_attribute_words, word_embeddings_model)

        male_association = mean_embedding_similarity(male_words, attribute_set, word_embeddings_model)
        female_association = mean_embedding_similarity(female_words, attribute_set, word_embeddings_model)
        # Print the topic as words instead of the numeric label.
        # The lookup must use the current `topic`: `topic_label` is the leftover variable of the
        # mapping loop above and always refers to the last topic.
        topic_words = ', '.join(topic_words_mapping.get(topic, []))
        print_weat_results(topic_words, male_association, female_association, weat_score, effect_size)
    else:
        print(f"Topic {topic} - Not enough data for WEAT analysis")

In [None]:
!pip install torch
import torch

In [None]:
# A BERTopic model is not a torch.nn.Module and has no state_dict(), so it is persisted with BERTopic's own save API.
# With safetensors serialization the given path is created as a directory of model files.
# NOTE(review): this path lives under /content/gdrive, while the active mount cell mounts /content/drive — confirm the target location.
model.save('/content/gdrive/MyDrive/Bertopic/shared_work/model_checkpoint.pth',
           serialization="safetensors",
           save_ctfidf=True,
           save_embedding_model="sentence-transformers/all-MiniLM-L6-v2")

In [None]:
# A BERTopic model has no load_state_dict(); BERTopic.load restores a model previously written with BERTopic's save method
model = BERTopic.load('/content/gdrive/MyDrive/Bertopic/shared_work/model_checkpoint.pth')