# **Prepare data**


In [1]:
from google.colab import drive
import os

# Working folder for model artefacts.
# NOTE(review): this path is rooted at '/content/gdrive', which is only mounted by the
# commented-out call below; the active mount at the end of this cell uses '/content/drive'.
# Confirm which mount point is intended before saving/loading under this folder.
gdrive_path='/content/gdrive/MyDrive/Bertopic/shared_work/'
# Folder that holds the dataset CSV files (re-assigned with the same value further down).
dataset_path = '/content/drive/MyDrive/Praktikum - NLP Applications/Datasets/genius_and_wasabi/'

# Alternative setup (disabled): mount under '/content/gdrive' and work from gdrive_path.
# # This will mount your google drive under 'MyDrive'
# drive.mount('/content/gdrive', force_remount=True)
# # In order to access the files in this notebook we have to navigate to the correct folder
# os.chdir(gdrive_path)
# # Check manually if all files are present
# print(sorted(os.listdir()))


# To run from the common drive:
dataset_path = '/content/drive/MyDrive/Praktikum - NLP Applications/Datasets/genius_and_wasabi/'
# Mount Google Drive at '/content/drive'; force_remount=True is passed so the call also works on re-runs.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
pip install pandas bertopic

Collecting bertopic
  Downloading bertopic-0.16.0-py2.py3-none-any.whl (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.1/154.1 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.33.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap-learn-0.5.5.tar.gz (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.9/90.9 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentence-transformers>=0.4.1 (from bertopic)
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━

In [3]:
pip install pandas nltk



In [4]:
import nltk
# Fetch the NLTK 'stopwords' corpus and the 'punkt' tokenizer data.
# NOTE(review): no later cell visible here uses nltk — confirm these downloads are still needed.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
!pip install --upgrade tensorflow


Collecting tensorflow
  Downloading tensorflow-2.15.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (475.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m475.2/475.2 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflow
  Attempting uninstall: tensorflow
    Found existing installation: tensorflow 2.15.0
    Uninstalling tensorflow-2.15.0:
      Successfully uninstalled tensorflow-2.15.0
Successfully installed tensorflow-2.15.0.post1


In [6]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from umap import UMAP

# Build the CSV path from the dataset folder configured in the first cell
# (dataset_path already ends with '/', so plain concatenation is enough).
csv_file_path = dataset_path + 'rock_genre_songs.csv'
# Load the songs into the DataFrame every later cell works on.
df = pd.read_csv(csv_file_path)

# **Data Preprocessing**
This involves removing the explicit song structure from the lyrics column and initialising the CountVectorizer so that stop word removal is handled internally by BERTopic

In [7]:
# Removing explicit song structure as it is not important information
import re

def remove_explicit_song_structure(lyrics):
  """Strip bracketed section markers such as ``[Intro]`` or ``[Verse 1: X]`` from lyrics.

  Every non-empty ``[...]`` span that sits on a single line is deleted; the
  surrounding text, including line breaks, is kept as-is.

  Args:
    lyrics: The lyrics text (must be a ``str``).

  Returns:
    The lyrics with all bracketed markers removed.
  """
  section_marker = re.compile(r'\[.+?\]')
  return section_marker.sub('', lyrics)

# Cast lyrics to str first so the regex helper never receives a non-string value
# (missing lyrics therefore become the literal text 'nan' in cleaned_lyrics).
df['cleaned_lyrics'] = df['lyrics'].astype(str).apply(remove_explicit_song_structure)
# Preview the first rows to check the new column.
df.head()

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,title,genre,artist,year,views,features,lyrics,language,gender,cleaned_lyrics
0,568,568,568,Knockout,rock,Lil Wayne,2010,66980,"{""Nicki Minaj""}",[Intro]\nJ.U.S.T.I.C.E. League\n\n[Verse 1: Li...,en,Male,"\nJ.U.S.T.I.C.E. League\n\n\nHey, Barbie, are-..."
1,783,783,783,Talk 2 Me,rock,Lil Wayne,2011,16363,{},[Verse 1: Lil Wayne]\n\nNow shorty lets get do...,en,Male,\n\nNow shorty lets get down to business\nIf y...
2,1065,1065,1065,Girls Forever,rock,Lil Wayne,2009,1036,{},"[Verse 1]\nShe said, ""Oh""\nShe said, ""Woo""\nI ...",en,Male,"\nShe said, ""Oh""\nShe said, ""Woo""\nI said, ""So..."
3,8465,8465,9698,Solar Midnite,rock,Lupe Fiasco,2009,4691,{},"[Verse 1]\nSimplified love-sick, taking no pri...",en,Male,"\nSimplified love-sick, taking no prisoners\nS..."
4,10063,10063,11412,From the very bottom of your tailbone,rock,Aesop Rock,2014,22,{},Practice at the same time keep your bottom rea...,en,Male,Practice at the same time keep your bottom rea...


In [8]:
# Initializing the CountVectorizer with English stop words to pass as a parameter to BERTopic.
# ngram_range=(1, 2) makes it count both single words and two-word phrases.
count_vectorizer = CountVectorizer(ngram_range=(1, 2), stop_words="english")
# Initialising the UMAP constructor to set the random seed - this is so the results are reproducible

# UMAP settings: 5 output components, cosine metric, fixed random_state=42.
umap = UMAP(n_neighbors=15,
            n_components=5,
            min_dist=0.0,
            metric='cosine',
            low_memory=False,
            random_state=42)

# **Create Topics**
We select "english" as the main language for our documents. If you want a multilingual model that supports 50+ languages, select "multilingual" instead.

In [9]:
# Fitting and saving the BERTopic model
# The model uses the custom CountVectorizer and the seeded UMAP instance defined above.
model = BERTopic(vectorizer_model=count_vectorizer, language="english", umap_model=umap) #nr_topics=50 parameters find the most similar topics and merge them
# Location used by the later save/export cells.
# NOTE(review): rooted at '/content/gdrive', while the first cell mounts Drive at '/content/drive' — confirm.
model_path = '/content/gdrive/MyDrive/Bertopic/shared_work/mymodel'
df['has_lyrics'] = ~df['lyrics'].isna() # flag the rows that have lyrics
# Only rows that actually have lyrics become documents, so len(documents) can be smaller than len(df).
documents = df[df['has_lyrics']]['cleaned_lyrics'].astype(str).tolist()
#topics, _ = model.fit_transform(documents)

# Fitting BERTopic
topic_model = model.fit(documents)
# Saving it using safetensors
# embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
# topic_model.save(model_path, serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model)

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

KeyboardInterrupt: ignored

In [None]:
# Saving it using safetensors
# The embedding model is passed as a model-name string via save_embedding_model.
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
# save_ctfidf=True is passed so the c-TF-IDF data is written alongside the model.
model.save(model_path, serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model)

 If you are loading a pre-trained model, use transform() to get the topics for new documents; use fit_transform() only if you want to re-fit (update) the model on the documents.

In [None]:
# Load the model from the same location the save cell writes to. Reusing model_path
# (instead of repeating the literal path) keeps save and load from drifting apart.
loaded_model = BERTopic.load(model_path)
# Assign a topic (and probability) to every document with the already-trained model.
topics, prob = loaded_model.transform(documents)
print(topics)

In [None]:
# # Save the topics using pickle
# import pickle
# with open('topics.pkl', 'wb') as topics_file:
#     pickle.dump(topics, topics_file)

# # Now you can load the topics from the file in future runs
# with open('topics.pkl', 'rb') as topics_file:
#     loaded_topics = pickle.load(topics_file)
# df.loc[df['has_lyrics'], 'topic'] = loaded_topics

***To measure gender bias per topic in a dataset using BERTopic and WEAT (Word Embedding Association Test), you need to follow a series of steps. These include topic modeling with BERTopic, creating target and attribute word sets for WEAT, and then performing the WEAT analysis for each topic. To integrate WEAT analysis with BERTopic, you need a separate word embeddings model.***

Step 1 Analyzing topic distribution across genders

In [None]:
# Checking what kind of labels are returned
# topic_labels is read later by generate_topic_label_dictionary(), which splits each label on '_'.
topic_labels = loaded_model.generate_topic_labels()
topic_labels

In [None]:
# Visualizing the intertopic distance (before any topic reduction is applied)

loaded_model.visualize_topics()

In [None]:
# Visualizing the topic similarities as a heatmap (before any topic reduction is applied)

loaded_model.visualize_heatmap()

In [None]:
# Bar charts for the topics of the loaded model.
loaded_model.visualize_barchart()

We can also reduce the number of topics after a BERTopic model has been trained. The advantage is that you can choose the number of topics once you know how many were created. Before training, it is difficult to predict how many topics are in your documents and how many will be extracted; instead, we can decide afterwards how many topics seem realistic:

In [None]:
def generate_topic_label_dictionary(labels=None):
  """Map topic ids to their descriptive label text.

  Each label is expected to look like ``"<id>_<word>_<word>..."``; the part
  before the first underscore becomes the key (as a string) and the rest
  becomes the value.

  Args:
    labels: Iterable of label strings. When omitted, the notebook-level
      ``topic_labels`` variable is used, which keeps existing no-argument
      calls working. Passing the labels explicitly avoids depending on
      whichever value ``topic_labels`` currently holds.

  Returns:
    dict mapping topic id (str) to label text (str). A label without any
    underscore maps to an empty string instead of raising.
  """
  if labels is None:
    labels = topic_labels

  topic_label_dictionary = {}
  for label in labels:
    # partition splits on the first '_' only, so multi-word labels stay intact.
    topic_id, _, description = label.partition('_')
    topic_label_dictionary[topic_id] = description
  return topic_label_dictionary

# Build the id -> label lookup from the current topic_labels.
# NOTE(review): this runs before reduce_topics() in a later cell; the lookup is not rebuilt
# there, so it describes the unreduced topics unless it is regenerated after the reduction.
topic_label_dictionary = generate_topic_label_dictionary()
print(topic_label_dictionary)

In [None]:
new_nr_topics = 5  # the new desired number of topics
# Reduces the loaded model's topics in place; later cells read the result from loaded_model.topics_.
loaded_model.reduce_topics(documents, nr_topics=new_nr_topics)
# merge the topic based on their similarity based on the distance between their centroids in the embedding space.

In [None]:
# Per-document topic assignments after the reduction step.
topics = loaded_model.topics_
len(topics)
# Labels of the reduced topics.
topic_labels = loaded_model.generate_topic_labels()
# Rebuild the id -> label lookup as well: it was created before reduce_topics(), so without
# this line the SC-WEAT report would print the labels of the old, unreduced topics.
topic_label_dictionary = generate_topic_label_dictionary()
topic_labels

In [None]:
#model = BERTopic(language="english")
df['has_lyrics'] = ~df['lyrics'].isna() # flag the rows that have lyrics
# Fit BERTopic
# documents = df[df['has_lyrics']]['cleaned_lyrics'].astype(str).tolist()
# topics, _ = model.fit_transform(documents) # you can obtain the embedding used for bertopic after fitting the model, NOTE: you can either do this or use embedding model directly
# Rows without lyrics were never passed to the model, so they get a placeholder label.
df.loc[~df['has_lyrics'], 'topic'] = "No Lyrics"  # e.g., -1 or "No Lyrics"
# Assign topics only to rows where 'has_lyrics' is True
# (topics must contain exactly one entry per such row, in the same order as documents).
df.loc[df['has_lyrics'], 'topic'] = topics

In [None]:
# Intertopic distance again, now for the reduced set of topics.
loaded_model.visualize_topics()

In [None]:
# Topic similarity heatmap again, now for the reduced set of topics.
loaded_model.visualize_heatmap()

In [None]:
# Count songs per (topic, gender) pair and pivot gender into columns;
# combinations that never occur are filled with 0.
topic_gender_distribution = df.groupby(['topic', 'gender']).size().unstack(fill_value=0)

# Normalize the counts row-wise (each topic sums to 1) to compare proportions rather than raw counts
topic_gender_distribution_normalized = topic_gender_distribution.div(topic_gender_distribution.sum(axis=1), axis=0)

In [None]:
# Aggregate lyrics by topic and gender: group the DataFrame by topic and gender and
# concatenate all lyrics within each group into one string.
# Missing lyrics are dropped before joining; ' '.join() raises TypeError on NaN, and rows
# without lyrics are still present in df (they carry the "No Lyrics" topic).
aggregated_lyrics = (
    df.groupby(['topic', 'gender'])['lyrics']
      .apply(lambda x: ' '.join(x.dropna().astype(str)))
      .reset_index()
)
# Extract embeddings for the documents (one per row that has lyrics).
embeddings = loaded_model._extract_embeddings(documents) # not recommended to use this method since it is an internal method and its use is not for standard operations, this functions also use internal sentence transformers

In [None]:
# embeddings has one entry per document, i.e. per row with lyrics, which can be fewer than
# len(df). Aligning on the index of those rows avoids a length-mismatch error; rows without
# lyrics end up with NaN in the 'embedding' column.
df['embedding'] = pd.Series(list(embeddings), index=df.index[df['has_lyrics']])

 Extract and Analyze Top Words per Topic-Gender Group
For each topic and gender group, extract the most representative words. These words will be used to measure bias.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def get_top_words(text, n=20):
    """Return the ``n`` most frequent non-stop-words in ``text``.

    Args:
        text: The text to analyse (a single string).
        n: Maximum number of (word, count) pairs to return.

    Returns:
        A list of ``(word, count)`` tuples sorted by descending count. An empty
        list is returned when the text yields no vocabulary (empty text or
        stop words only).
    """
    vec = CountVectorizer(stop_words='english')
    try:
        bag_of_words = vec.fit_transform([text])
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when nothing is left
        # after tokenisation and stop-word removal; treat that as "no top words".
        return []
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    # Stable sort: words with equal counts keep their vocabulary order.
    words_freq.sort(key=lambda x: x[1], reverse=True)
    return words_freq[:n]
aggregated_lyrics['top_words'] = aggregated_lyrics['lyrics'].apply(get_top_words)

In [None]:
pip install numpy gensim

In [None]:
pip install matplotlib seaborn

Cosine Similarity: This function computes the cosine similarity between two word embeddings.

Mean Embedding Similarity: This function computes the average cosine similarity between each word in a target set and an attribute set.

Differential Association: This calculates the WEAT score, which is the differential association between two sets of target words and two sets of attribute words.

WEAT Effect Size: This calculates the effect size, a measure of how large the difference in associations is.

Target and Attribute Sets: You need to define these sets based on your specific analysis goal.

Target words are typically chosen to represent two different groups that you want to compare for bias. WEAT calculates how strongly each set of target words is associated with each set of attribute words. If one set of target words is more closely associated with positive attribute words (e.g., 'joy', 'peace', 'love') than the other, this might indicate a bias in the embedding space. Quantitative Analysis: The strength of these associations is quantified using cosine similarity in the embedding space. This provides a numerical measure of bias, which is the WEAT score.

In [None]:
!wget -c "http://nlp.stanford.edu/data/glove.6B.zip"
!unzip glove.6B.zip
from gensim.scripts.glove2word2vec import glove2word2vec

glove_input_file = 'glove.6B.100d.txt'  # Adjust the file name as needed
word2vec_output_file = 'glove.6B.100d.txt.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)

If you want to display the topics as words instead of numeric labels in your WEAT analysis, you need to map the numeric topic labels to their corresponding words. You can achieve this by creating a mapping dictionary that associates each topic label with a list of words representing that topic.

In [None]:
from gensim.models import KeyedVectors

# Load the converted GloVe vectors (word2vec text format, hence binary=False).
# This is the separate word-embedding model used by the WEAT helper functions.
word_embeddings_model = KeyedVectors.load_word2vec_format('glove.6B.100d.txt.word2vec', binary=False)

These word lists are used as the target set here, although in the literature they would be the attribute set; this will be corrected later, once the code is fully developed.

In [None]:
# Gendered word lists plus a gender-neutral reference list.
male_words = ['he', 'him', 'his', 'father', 'papa', 'dad', 'son', 'uncle', 'grandfather', 'grandpa', 'man', 'male', 'brother', 'husband', 'boyfriend', 'sir', 'king', 'guy', 'father-in-law', 'son-in-law', 'nephew', 'boy']
female_words = ['she', 'her', 'hers', 'mother', 'mama', 'daughter', 'aunt', 'auntie', 'grandmother', 'woman', 'female', 'sister', 'mom', 'wife', 'girlfriend', 'madam', 'queen', 'gal', 'niece', 'grandmother-in-law', 'daughter-in-law', 'lady', 'miss', 'sis', 'girl']
# other_attribute_words is the second attribute set for the WEAT analysis: the baseline
# that the words extracted from each topic are compared against.
other_attribute_words = ['they', 'them', 'their', 'person', 'individual', 'someone', 'other', 'human', 'somebody', 'citizen']

# Keep only words that exist in the embedding vocabulary, so later vector lookups cannot fail.
male_words = [word for word in male_words if word in word_embeddings_model.key_to_index]
female_words = [word for word in female_words if word in word_embeddings_model.key_to_index]
other_attribute_words = [word for word in other_attribute_words if word in word_embeddings_model.key_to_index]

In [None]:
import numpy as np
from gensim.models import KeyedVectors

def cosine_similarity(embedding1, embedding2):
    """Cosine of the angle between two embedding vectors.

    Args:
        embedding1: First vector (array-like).
        embedding2: Second vector (array-like, same length).

    Returns:
        The dot product divided by the product of the two Euclidean norms,
        or ``0`` when either vector has zero length.
    """
    magnitudes = (np.linalg.norm(embedding1), np.linalg.norm(embedding2))
    # A zero vector has no direction; report "no similarity" instead of dividing by zero.
    if any(magnitude == 0 for magnitude in magnitudes):
        return 0
    first_magnitude, second_magnitude = magnitudes
    return np.dot(embedding1, embedding2) / (first_magnitude * second_magnitude)

def mean_embedding_similarity(target_set, attribute_set, embeddings_model):
    """Average cosine similarity over all (target word, attribute word) pairs.

    Words missing from the embedding vocabulary are skipped.

    Args:
        target_set: Iterable of target words.
        attribute_set: Iterable of attribute words.
        embeddings_model: Object exposing ``key_to_index`` and vector lookup by word.

    Returns:
        The mean similarity over all in-vocabulary pairs, or ``0`` if there are none.
    """
    pair_similarities = [
        cosine_similarity(embeddings_model[target_word], embeddings_model[attribute_word])
        for target_word in target_set
        if target_word in embeddings_model.key_to_index
        for attribute_word in attribute_set
        if attribute_word in embeddings_model.key_to_index
    ]
    if not pair_similarities:
        return 0
    return sum(pair_similarities) / len(pair_similarities)

def sc_weat_effect_size(target_set, attribute_set_1, attribute_set_2, embeddings_model):
    """Effect size of the association of target words with two attribute sets.

    For every in-vocabulary target word, the difference between its cosine
    similarity to the mean embedding of ``attribute_set_1`` and to the mean
    embedding of ``attribute_set_2`` is computed. The effect size is the mean
    of those differences divided by their (population) standard deviation.

    Args:
        target_set: Iterable of target words.
        attribute_set_1: First attribute word set.
        attribute_set_2: Second attribute word set.
        embeddings_model: Object exposing ``key_to_index`` and vector lookup by word.

    Returns:
        The effect size, or ``nan`` when it is undefined: no target word is in
        the vocabulary, or all differences are identical (zero deviation).
    """
    attribute_set_1_avg_embedding = average_embedding(attribute_set_1, embeddings_model)
    attribute_set_2_avg_embedding = average_embedding(attribute_set_2, embeddings_model)

    target_set_embeddings = [embeddings_model[word] for word in target_set if word in embeddings_model.key_to_index]

    differences = [
        cosine_similarity(word_embedding, attribute_set_1_avg_embedding)
        - cosine_similarity(word_embedding, attribute_set_2_avg_embedding)
        for word_embedding in target_set_embeddings
    ]

    # Without any usable target word there is nothing to average.
    if not differences:
        return float('nan')

    std_dev = np.std(differences)
    # Zero spread would mean dividing by zero (inf/nan plus a RuntimeWarning).
    if std_dev == 0:
        return float('nan')

    return np.mean(differences) / std_dev

def average_embedding(attribute_set, embeddings_model):
    """Element-wise mean of the embeddings of all in-vocabulary words.

    Args:
        attribute_set: Iterable of words.
        embeddings_model: Object exposing ``key_to_index``, ``vector_size`` and
            vector lookup by word.

    Returns:
        The mean vector, or a zero vector of length ``vector_size`` when none of
        the words is in the vocabulary.
    """
    known_vectors = [
        embeddings_model[token]
        for token in attribute_set
        if token in embeddings_model.key_to_index
    ]
    if not known_vectors:
        return np.zeros(embeddings_model.vector_size)
    return np.mean(known_vectors, axis=0)

def flatten(lst):
    """Concatenate the items of every sub-iterable in ``lst`` into one flat list.

    Only one level of nesting is removed.
    """
    flat_items = []
    for sublist in lst:
        flat_items.extend(sublist)
    return flat_items

def print_sc_weat_results(topic, association, score):
    """Print a short three-line report for one topic, followed by a blank line.

    Args:
        topic: Topic name or id shown in the header line.
        association: Association value shown on the last line.
        score: Score shown on the second line.
    """
    report_lines = (
        f"Topic: {topic}",
        f"  SSWEAT Score: {score}",
        f"  Association: {association}",
        "",
    )
    print("\n".join(report_lines))

# Define your target and attribute word sets.
# Keep only words the embedding model knows (harmless to repeat if already filtered).
male_words = [word for word in male_words if word in word_embeddings_model.key_to_index]
other_attribute_words = [word for word in other_attribute_words if word in word_embeddings_model.key_to_index]

# Create a mapping dictionary to associate topic labels with words
topic_words_mapping = {}
# One result record per topic that had enough words to analyse.
sc_weat_results = []

for topic_label in aggregated_lyrics['topic'].unique():
    # Collect the top words of every gender group belonging to this topic.
    top_words = aggregated_lyrics[(aggregated_lyrics['topic'] == topic_label)]['top_words'].tolist()
    top_words = flatten(top_words)
    # top_words holds (word, count) pairs; only the words are needed.
    top_words_cleared = [item[0] for item in top_words]
    topic_words_mapping[topic_label] = top_words_cleared

    attribute_set = top_words_cleared
    if attribute_set:
        effect_size = sc_weat_effect_size(male_words, attribute_set, other_attribute_words, word_embeddings_model)
        association = mean_embedding_similarity(male_words, attribute_set, word_embeddings_model)

        # Fall back to the raw topic value when the lookup has no entry for it
        # (e.g. the "No Lyrics" placeholder), instead of failing with KeyError.
        readable_label = topic_label_dictionary.get(str(topic_label), str(topic_label))
        print_sc_weat_results(readable_label, association, effect_size)
        sc_weat_results.append({
            "topic_number": str(topic_label),
            "topic_label": topic_label,
            "effect_size": effect_size,
            "association": association,
        })
    else:
        # The loop variable is topic_label; the previous name `topic` was undefined here.
        print(f"Topic {topic_label} - Not enough data for SC-WEAT analysis")



In [None]:
# Display the collected records; the analysis loop stores them in sc_weat_results
# (a variable named weat_results is never defined).
sc_weat_results

In [None]:
# Storing WEAT results

# Build the table from sc_weat_results, the list filled by the analysis loop
# (a variable named weat_results is never defined).
weat_results_df = pd.DataFrame(sc_weat_results)
# The CSV is written inside the model directory.
weat_results_df.to_csv(model_path + '/weat_results_for_rock_genre.csv')