WEAT Analysis for Bias Measurements of Lyrics

Dictionary of attribute and target sets obtained from: https://github.com/Loreb92/sexism_and_bias_in_song_lyrics/blob/main/

Path to dictionary of words: data/Data_WEAT/weat_attrib_target_3.json

In [None]:
from google.colab import drive
import os

  # gdrive_path='/content/gdrive/MyDrive/Bertopic/shared_work/'

  # # This will mount your google drive under 'MyDrive'
# drive.mount('/content/gdrive', force_remount=True)
# # In order to access the files in this notebook we have to navigate to the correct folder
# os.chdir(gdrive_path)
# dataset_path = ''
# # Check manually if all files are present
# print(sorted(os.listdir()))

# Mount Google Drive, then point at the shared project folder whose
# results are read further down in the notebook.
drive.mount('/content/drive', force_remount=True)
dataset_path = '/content/drive/MyDrive/Praktikum - NLP Applications/Models/bertopic_concatenated_chunks_stratified'

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import os
import re
import time
import nltk
nltk.download('stopwords')
nltk.download('punkt')

from gensim.models import Word2Vec, KeyedVectors

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Folder with the trained embedding models, and the topic-annotated lyrics
# table (relative to dataset_path).
embeddings_path = '/content/drive/MyDrive/Praktikum - NLP Applications/WEAT/embeddings/'
csv_file_path = '/results/topics_for_bias_analysis.csv'

# Load the table and keep only rows whose topic is not -1 and whose genre is
# not 'misc' (-1 is presumably the topic model's outlier label -- confirm).
df = pd.read_csv(dataset_path + csv_file_path)
keep_rows = (df['topic'] != -1) & (df['genre'] != 'misc')
data = df[keep_rows]

In [None]:
# CSV with the named WEAT word sets; the cells below read its 'set_name' and
# 'words' columns.
wordset_path = '/content/drive/MyDrive/Praktikum - NLP Applications/WEAT/betti2023_weat_wordsets.csv'
wordsets = pd.read_csv(wordset_path)

In [None]:
# Preview the first rows; each 'words' entry is a list stored as a string
# literal (it is parsed with ast.literal_eval in later cells).
wordsets.head()

Unnamed: 0,set_name,words
0,African American names,"['shavonn', 'yolanda', 'torrance', 'jerome', '..."
1,Appearance words,"['sensual', 'thin', 'handsome', 'feeble', 'bal..."
2,Arts words,"['shakespeare', 'symphony', 'literature', 'poe..."
3,Career words,"['business', 'corporation', 'career', 'salary'..."
4,European American names,"['jay', 'melanie', 'justin', 'stephanie', 'ada..."


In [None]:
# Hand-written gendered terms; they are unioned with the 'Male attributes' /
# 'Female attributes' sets from the word-set CSV in the following cells.
male_words = ['he', 'him', 'his', 'father', 'papa', 'dad', 'son', 'uncle', 'grandfather', 'grandpa', 'man', 'male', 'brother', 'husband', 'boyfriend', 'sir', 'king', 'guy', 'father-in-law', 'son-in-law', 'nephew', 'boy']
female_words = ['she', 'her', 'hers', 'mother', 'mama', 'daughter', 'aunt', 'auntie', 'grandmother', 'woman', 'female', 'sister', 'mom', 'wife', 'girlfriend', 'madam', 'queen', 'gal', 'niece', 'grandmother-in-law', 'daughter-in-law', 'lady', 'miss', 'sis', 'girl']

In [None]:
import ast

# Parse the stringified 'Male attributes' list from the word-set table and
# merge it with the hand-written male terms.
is_male_row = wordsets['set_name'] == 'Male attributes'
male_set = ast.literal_eval(wordsets.loc[is_male_row, 'words'].iloc[0])
male_attribute_set = set(male_set) | set(male_words)

male_attribute_set

{'boy',
 'boyfriend',
 'brother',
 'dad',
 'father',
 'father-in-law',
 'grandfather',
 'grandpa',
 'guy',
 'he',
 'him',
 'his',
 'husband',
 'king',
 'male',
 'man',
 'nephew',
 'papa',
 'sir',
 'son',
 'son-in-law',
 'uncle'}

In [None]:

# Parse the stringified 'Female attributes' list from the word-set table and
# merge it with the hand-written female terms.
is_female_row = wordsets['set_name'] == 'Female attributes'
female_set = ast.literal_eval(wordsets.loc[is_female_row, 'words'].iloc[0])
female_attribute_set = set(female_set) | set(female_words)

female_attribute_set

{'aunt',
 'auntie',
 'daughter',
 'daughter-in-law',
 'female',
 'gal',
 'girl',
 'girlfriend',
 'grandmother',
 'grandmother-in-law',
 'her',
 'hers',
 'lady',
 'madam',
 'mama',
 'miss',
 'mom',
 'mother',
 'niece',
 'queen',
 'she',
 'sis',
 'sister',
 'wife',
 'woman'}

In [None]:
# Names of the target word sets to score. Each name is used as an exact-match
# lookup key against wordsets['set_name'].
# NOTE(review): 'Strenght words' looks misspelled but is presumably the
# spelling used in the CSV -- confirm before changing it.
target_sets = ['Pleasant', 'Unpleasant', 'Appearance words', 'Intelligence words', 'Strenght words', 'Weakness words']

# Sanity check: parse and display the words of the first target set.
ast.literal_eval(wordsets[wordsets['set_name'] == target_sets[0]]['words'].tolist()[0])

['friend',
 'joy',
 'wonderful',
 'vacation',
 'love',
 'honest',
 'honor',
 'pleasure',
 'loyal',
 'family',
 'peace',
 'heaven',
 'cheer',
 'freedom',
 'diploma',
 'gentle',
 'happy',
 'paradise',
 'diamond',
 'laughter',
 'sunrise',
 'gift',
 'health',
 'rainbow',
 'caress',
 'lucky',
 'miracle']

In [None]:
import numpy as np
from gensim.models import KeyedVectors
from nltk.corpus import wordnet as wn

import numpy as np
from nltk.corpus import wordnet as wn

# Ensure you have downloaded the necessary NLTK data
import nltk
nltk.download('wordnet')

def cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity of two vectors.

    Args:
        embedding1: First vector (array-like of numbers).
        embedding2: Second vector, same length as ``embedding1``.

    Returns:
        ``dot(embedding1, embedding2) / (norm(embedding1) * norm(embedding2))``,
        or ``0.0`` when either vector has zero norm, in which case the
        similarity is undefined and the plain formula would divide by zero.
    """
    denominator = np.linalg.norm(embedding1) * np.linalg.norm(embedding2)
    if denominator == 0:
        # average_embedding() falls back to an all-zero vector when none of
        # the requested words is in the vocabulary; avoid producing NaN here.
        return 0.0
    return np.dot(embedding1, embedding2) / denominator

def get_concept_words(word):
    """Collect WordNet lemma names for the noun and verb synsets of ``word``.

    Underscores in lemma names are replaced by spaces. Duplicates are kept and
    the order follows the synsets returned by ``wn.synsets``.
    """
    concept_words = []
    for synset in wn.synsets(word):
        # Only noun ('n') and verb ('v') senses contribute lemmas.
        if synset.pos() not in {'n', 'v'}:
            continue
        for lemma in synset.lemmas():
            concept_words.append(lemma.name().replace('_', ' '))
    return concept_words

def flatten(list_of_lists):
    """Concatenate the items of every sub-iterable into one flat list (one level deep)."""
    flat = []
    for sublist in list_of_lists:
        flat.extend(sublist)
    return flat

def average_embedding(words, embeddings_model):
    """Return the mean vector of those ``words`` that the model knows.

    Words absent from ``embeddings_model.wv.key_to_index`` are ignored. If no
    word is present, a zero vector of length ``embeddings_model.vector_size``
    is returned instead.
    """
    vocabulary_hits = []
    for word in words:
        if word not in embeddings_model.wv.key_to_index:
            continue
        vocabulary_hits.append(embeddings_model.wv[word])

    if not vocabulary_hits:
        return np.zeros(embeddings_model.vector_size)
    return np.mean(vocabulary_hits, axis=0)

# SC-WEAT-style effect size of a target word set against male/female attribute sets
def sc_weat_effect_size_target_concepts(target_words, male_words, female_words, embeddings_model, average_score = True):
    """Score how strongly a set of target words leans male vs. female.

    For every target word found in ``embeddings_model.wv.key_to_index`` the
    per-word score is::

        cos(word, mean(male vectors)) - cos(word, mean(female vectors))

    The effect size is the mean of these scores divided by their standard
    deviation (``np.std``). Target words are used as given; the WordNet
    concept expansion via ``get_concept_words`` is not applied.

    Args:
        target_words: Iterable of target words to score.
        male_words: Words forming the male attribute set.
        female_words: Words forming the female attribute set.
        embeddings_model: Model exposing ``wv`` (with ``key_to_index`` and
            item lookup) and ``vector_size``.
        average_score: Accepted for backward compatibility; not used.

    Returns:
        Tuple ``(effect_size, word_scores)`` where ``word_scores`` is a list
        of ``{'target_word': ..., 'scweat_score': ...}`` dicts, one per
        in-vocabulary target word. ``effect_size`` is ``0`` when no target
        word is in the vocabulary or when the scores have no (finite, non-zero)
        spread, since the ratio is undefined in those cases.
    """
    # Mean attribute vectors are computed once and reused for every target word.
    male_avg_embedding = average_embedding(male_words, embeddings_model)
    female_avg_embedding = average_embedding(female_words, embeddings_model)

    word_scores = []
    diffs = []
    for target_word in target_words:
        if target_word not in embeddings_model.wv.key_to_index:
            continue
        target_embedding = embeddings_model.wv[target_word]
        # Parentheses make the two-line subtraction a single valid expression.
        diff = (
            cosine_similarity(target_embedding, male_avg_embedding)
            - cosine_similarity(target_embedding, female_avg_embedding)
        )
        diffs.append(diff)
        word_scores.append({
            'target_word': target_word,
            'scweat_score': diff
        })

    if not diffs:
        return (0, word_scores)

    spread = np.std(diffs)
    if spread == 0 or not np.isfinite(spread):
        # A single scored word (or identical / NaN scores) gives no usable
        # standard deviation; dividing would yield inf/NaN plus a warning.
        effect_size = 0
    else:
        effect_size = np.mean(diffs) / spread
    return (effect_size, word_scores)

# A positive effect size means the target words are, on average, closer to the male attribute words than to the
# female ones, which may indicate a male-gendered bias for that target set; a negative value indicates the opposite.


def print_sc_weat_results(topic, score):
    """Print the topic name and its SC-WEAT effect, followed by a blank line."""
    print(f"Topic: {topic}", f"  SCWEAT effect: {score}", "", sep="\n")


def scweat(male_attributes, female_attributes, target_set, embeddings_model):
  """Compute the SC-WEAT effect size and per-word scores for one target set.

  Attribute words missing from the model vocabulary are dropped before the
  scoring function is called.

  Returns:
    Dict with keys "effect_size" and "word_scores", as produced by
    sc_weat_effect_size_target_concepts.
  """
  def in_vocabulary(words):
    # Keep only the words the embedding model actually contains.
    return [word for word in words if word in embeddings_model.wv.key_to_index]

  effect_size, word_scweat_scores = sc_weat_effect_size_target_concepts(
      target_set,
      in_vocabulary(male_attributes),
      in_vocabulary(female_attributes),
      embeddings_model,
  )

  return {"effect_size": effect_size, "word_scores": word_scweat_scores}

  # for topic_label in aggregated_lyrics['topic'].unique():
  #     # Logic to obtain top words for each topic
  #     top_words = aggregated_lyrics[(aggregated_lyrics['topic'] == topic_label)]['top_words'].tolist()
  #     top_words = flatten(top_words)
  #     top_words_cleared = [item[0] for item in top_words]
  #     topic_words_mapping[topic_label] = top_words_cleared
  #     target_set=top_words_cleared
  #     topic = get_topic_label(topic_label)

  #     if target_set:
  #         effect_size, word_scweat_scores = sc_weat_effect_size_target_concepts(target_set, male_words, female_words, word_embeddings_model)
  #         print_sc_weat_results(topic,  effect_size)
  #         sc_weat_results.append({
  #             "topic_number": str(topic_label),
  #             "topic_label": topic,
  #             "effect_size": effect_size,
  #             "word_scores": word_scweat_scores
  #         })
  #     else:
  #         print(f"Topic {topic} - Not enough data for SC-WEAT analysis")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Root folder of the saved embedding models (same value as assigned in the
# data-loading cell earlier in the notebook).
embeddings_path = '/content/drive/MyDrive/Praktikum - NLP Applications/WEAT/embeddings/'

def find_scweat_score(embedding_model):
  """Run scweat on one embedding model for every name in the global target_sets.

  Each target set is looked up by name in the global `wordsets` table and
  scored against the global male/female attribute sets. The name of each
  target set is printed as it is processed.

  Returns:
    List of dicts with keys "target_set" (the set name) and
    "target_scweat_results" (the dict returned by scweat).
  """
  per_set_results = []

  for set_name in target_sets:
    print("Target set:", set_name)

    # The 'words' cell stores the list as a string literal; parse it.
    row_mask = wordsets['set_name'] == set_name
    words = ast.literal_eval(wordsets.loc[row_mask, 'words'].iloc[0])

    outcome = scweat(
        male_attributes=male_attribute_set,
        female_attributes=female_attribute_set,
        target_set=words,
        embeddings_model=embedding_model,
    )
    per_set_results.append({"target_set": set_name, "target_scweat_results": outcome})

  return per_set_results

    # print("\n")

def scweat_for_genre(embeddings_path, genre = None):
  """Score every saved embedding model in a folder with find_scweat_score.

  Args:
    embeddings_path: Folder containing the models (with or without a
      trailing slash).
    genre: Optional sub-folder name; when given, models are read from
      `embeddings_path/genre` instead of `embeddings_path`.

  Returns:
    List of dicts with keys "topic" (file name up to the first '.') and
    "topic_scweat_results" (the list returned by find_scweat_score), one per
    directory entry whose name contains 'model'. Entries are processed in
    sorted name order so repeated runs produce the same ordering.
  """
  results = []

  # os.path.join keeps the paths valid whether or not embeddings_path ends
  # with a separator (plain string concatenation did not).
  model_dir = embeddings_path if genre is None else os.path.join(embeddings_path, genre)

  for model_name in sorted(os.listdir(model_dir)):

    # Skip anything that is not a saved model (e.g. the genre sub-folders).
    if 'model' not in model_name:
      continue

    print("Topic:", model_name)

    embeddings_model = Word2Vec.load(os.path.join(model_dir, model_name))
    scweat_results = find_scweat_score(embeddings_model)

    results.append({
        "topic": model_name.split(".")[0],
        "topic_scweat_results": scweat_results
    })

    print("\n\n")

  return results

In [None]:
# Score the models in the root embeddings folder (genre None) and in each
# genre sub-folder, collecting one entry per genre.
final_results = []
genres = [None, 'country', 'pop', 'rap', 'rb', 'rock']

for genre in genres:
  genre_label = "--" if genre is None else genre
  print("Genre:", genre_label)

  genre_results = scweat_for_genre(embeddings_path, genre)
  final_results.append(dict(genre=genre, scweat_results=genre_results))

Genre: --
Topic: nigga_niggas_bitch.model
Target set: Pleasant
Target set: Unpleasant
Target set: Appearance words
Target set: Intelligence words
Target set: Strenght words
Target set: Weakness words



Topic: body_girl_baby.model
Target set: Pleasant
Target set: Unpleasant
Target set: Appearance words
Target set: Intelligence words
Target set: Strenght words
Target set: Weakness words



Topic: tears_heart_wish.model
Target set: Pleasant
Target set: Unpleasant
Target set: Appearance words
Target set: Intelligence words
Target set: Strenght words
Target set: Weakness words



Topic: jesus_praise_lord.model
Target set: Pleasant
Target set: Unpleasant
Target set: Appearance words
Target set: Intelligence words
Target set: Strenght words
Target set: Weakness words



Topic: heartache_bah_bah bah.model
Target set: Pleasant
Target set: Unpleasant
Target set: Appearance words
Target set: Intelligence words
Target set: Strenght words
Target set: Weakness words



Topic: ayy ayy_change_long se

  effect_size = np.mean(diffs) / np.std(diffs) if diffs else 0
  effect_size = np.mean(diffs) / np.std(diffs) if diffs else 0
  effect_size = np.mean(diffs) / np.std(diffs) if diffs else 0


Target set: Strenght words
Target set: Weakness words



Topic: shes_shell_borderlines.model
Target set: Pleasant
Target set: Unpleasant
Target set: Appearance words
Target set: Intelligence words
Target set: Strenght words
Target set: Weakness words



Topic: country_hank_thats right.model
Target set: Pleasant
Target set: Unpleasant
Target set: Appearance words
Target set: Intelligence words
Target set: Strenght words
Target set: Weakness words



Topic: beautiful beautiful_na_beautiful.model
Target set: Pleasant
Target set: Unpleasant
Target set: Appearance words
Target set: Intelligence words
Target set: Strenght words
Target set: Weakness words



Topic: country_mud_redneck.model
Target set: Pleasant
Target set: Unpleasant
Target set: Appearance words
Target set: Intelligence words
Target set: Strenght words
Target set: Weakness words



Topic: jesus_praise_lord.model
Target set: Pleasant
Target set: Unpleasant
Target set: Appearance words
Target set: Intelligence words
Target set:

  effect_size = np.mean(diffs) / np.std(diffs) if diffs else 0
  effect_size = np.mean(diffs) / np.std(diffs) if diffs else 0


Target set: Pleasant
Target set: Unpleasant
Target set: Appearance words
Target set: Intelligence words
Target set: Strenght words
Target set: Weakness words



Topic: tears_heart_wish.model
Target set: Pleasant
Target set: Unpleasant
Target set: Appearance words
Target set: Intelligence words
Target set: Strenght words
Target set: Weakness words



Topic: body_girl_baby.model
Target set: Pleasant
Target set: Unpleasant
Target set: Appearance words
Target set: Intelligence words
Target set: Strenght words
Target set: Weakness words



Topic: heartache_bah_bah bah.model
Target set: Pleasant
Target set: Unpleasant
Target set: Appearance words
Target set: Intelligence words
Target set: Strenght words
Target set: Weakness words



Topic: ayy ayy_change_long sentiment.model
Target set: Pleasant
Target set: Unpleasant
Target set: Appearance words
Target set: Intelligence words
Target set: Strenght words
Target set: Weakness words



Topic: dance_funky_dance dance.model
Target set: Pleasant
T

  effect_size = np.mean(diffs) / np.std(diffs) if diffs else 0
  effect_size = np.mean(diffs) / np.std(diffs) if diffs else 0
  effect_size = np.mean(diffs) / np.std(diffs) if diffs else 0


Target set: Pleasant
Target set: Unpleasant
Target set: Appearance words
Target set: Intelligence words
Target set: Strenght words
Target set: Weakness words



Topic: dance_funky_dance dance.model
Target set: Pleasant
Target set: Unpleasant
Target set: Appearance words
Target set: Intelligence words
Target set: Strenght words
Target set: Weakness words



Topic: heartache_bah_bah bah.model
Target set: Pleasant
Target set: Unpleasant
Target set: Appearance words
Target set: Intelligence words
Target set: Strenght words
Target set: Weakness words



Topic: queen black_baby love_love baby.model
Target set: Pleasant
Target set: Unpleasant
Target set: Appearance words
Target set: Intelligence words
Target set: Strenght words
Target set: Weakness words



Topic: ra_viral_body body.model


  effect_size = np.mean(diffs) / np.std(diffs) if diffs else 0
  effect_size = np.mean(diffs) / np.std(diffs) if diffs else 0
  effect_size = np.mean(diffs) / np.std(diffs) if diffs else 0


Target set: Pleasant
Target set: Unpleasant
Target set: Appearance words
Target set: Intelligence words
Target set: Strenght words
Target set: Weakness words



Topic: night change_tonight need_wake love.model
Target set: Pleasant
Target set: Unpleasant
Target set: Appearance words
Target set: Intelligence words
Target set: Strenght words
Target set: Weakness words



Topic: comea_stereotype_help wait.model
Target set: Pleasant
Target set: Unpleasant
Target set: Appearance words
Target set: Intelligence words
Target set: Strenght words
Target set: Weakness words



Topic: chicka_chicka boom_boom chicka.model
Target set: Pleasant
Target set: Unpleasant
Target set: Appearance words
Target set: Intelligence words
Target set: Strenght words
Target set: Weakness words



Genre: rock
Topic: ayy ayy_change_long sentiment.model
Target set: Pleasant
Target set: Unpleasant
Target set: Appearance words
Target set: Intelligence words
Target set: Strenght words
Target set: Weakness words



Topic: 

  effect_size = np.mean(diffs) / np.std(diffs) if diffs else 0
  effect_size = np.mean(diffs) / np.std(diffs) if diffs else 0
  effect_size = np.mean(diffs) / np.std(diffs) if diffs else 0


Target set: Strenght words
Target set: Weakness words



Topic: heartache_bah_bah bah.model
Target set: Pleasant
Target set: Unpleasant
Target set: Appearance words
Target set: Intelligence words
Target set: Strenght words
Target set: Weakness words



Topic: black gold_gold black_gold.model
Target set: Pleasant
Target set: Unpleasant
Target set: Appearance words
Target set: Intelligence words
Target set: Strenght words
Target set: Weakness words



Topic: beautiful beautiful_na_beautiful.model
Target set: Pleasant
Target set: Unpleasant
Target set: Appearance words
Target set: Intelligence words
Target set: Strenght words
Target set: Weakness words



Topic: river_youth youth_mighty river.model
Target set: Pleasant
Target set: Unpleasant
Target set: Appearance words
Target set: Intelligence words
Target set: Strenght words
Target set: Weakness words





  effect_size = np.mean(diffs) / np.std(diffs) if diffs else 0
  effect_size = np.mean(diffs) / np.std(diffs) if diffs else 0


In [None]:
# Display the full nested results (genre -> topic -> target set -> word scores).
final_results

[{'genre': None,
  'scweat_results': [{'topic': 'nigga_niggas_bitch',
    'topic_scweat_results': [{'target_set': 'Pleasant',
      'target_scweat_results': {'effect_size': -0.032685347,
       'word_scores': [{'target_word': 'friend', 'scweat_score': -0.060518026},
        {'target_word': 'joy', 'scweat_score': -0.04297009},
        {'target_word': 'wonderful', 'scweat_score': -0.067364216},
        {'target_word': 'vacation', 'scweat_score': -0.039269432},
        {'target_word': 'love', 'scweat_score': -0.09398846},
        {'target_word': 'honest', 'scweat_score': 0.049852274},
        {'target_word': 'honor', 'scweat_score': 0.037947595},
        {'target_word': 'pleasure', 'scweat_score': 0.0220429},
        {'target_word': 'loyal', 'scweat_score': 0.008782214},
        {'target_word': 'family', 'scweat_score': -0.01732105},
        {'target_word': 'peace', 'scweat_score': 0.055117056},
        {'target_word': 'heaven', 'scweat_score': 0.05906351},
        {'target_word': 'cheer'

In [None]:
# Flatten the nested genre -> topic -> target-set results into one row per
# (genre, topic, target set) combination and build a DataFrame from the rows.
rows = [
    {
        'genre': genre_entry['genre'],
        'topic': topic_entry['topic'],
        'target_set': set_entry['target_set'],
        'effect_size': set_entry['target_scweat_results']['effect_size'],
        'word_scores': set_entry['target_scweat_results']['word_scores'],
    }
    for genre_entry in final_results
    for topic_entry in genre_entry['scweat_results']
    for set_entry in topic_entry['topic_scweat_results']
]

results_df = pd.DataFrame(rows)


In [None]:
# Persist the flattened results to the shared drive. No index= argument is
# passed, so pandas' default index handling applies to the written file.
weat_results_path = '/content/drive/MyDrive/Praktikum - NLP Applications/WEAT/weat_results.csv'
results_df.to_csv(weat_results_path)