This notebook trains Word2Vec embedding models on the lyrics of the 10 most frequent topics overall and of the 10 most frequent topics within each of the 5 genres. This yields 60 embedding models in total (10 overall + 5 × 10 per genre), which are then used for the SC-WEAT analysis.

In [None]:
from google.colab import drive
import os

  # gdrive_path='/content/gdrive/MyDrive/Bertopic/shared_work/'

  # # This will mount your google drive under 'MyDrive'
# drive.mount('/content/gdrive', force_remount=True)
# # In order to access the files in this notebook we have to navigate to the correct folder
# os.chdir(gdrive_path)
# dataset_path = ''
# # Check manually if all files are present
# print(sorted(os.listdir()))

# To run from the common drive:
# `dataset_path` is an absolute path below the mount point passed to drive.mount() below.
dataset_path = '/content/drive/MyDrive/Praktikum - NLP Applications/Models/bertopic_concatenated_chunks_stratified'
# Mount Google Drive at /content/drive (force_remount=True re-mounts it if it is already mounted)
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import os
import re
import time

from gensim.models import Word2Vec
from tqdm import tqdm

# Register tqdm with pandas (adds the progress_apply / progress_map variants of apply / map)
tqdm.pandas()

In [None]:
# Set all the paths

# Root of the shared project folder on the mounted drive
folder = '/content/drive/MyDrive/Praktikum - NLP Applications/'
# Folder the trained Word2Vec models are written to
embeddings_path = folder + 'WEAT/embeddings/'

# Input CSV; the later cells use its genre, cleaned_lyrics, topic and topic_label columns
dataset_path = '/content/drive/MyDrive/Praktikum - NLP Applications/Models/bertopic_concatenated_chunks_stratified'
csv_file_path = '/results/topics_for_bias_analysis.csv'
df = pd.read_csv(f"{dataset_path}{csv_file_path}")

Training word vectors for top 10 topics overall

In [None]:
# Drop rows assigned to topic -1 (presumably BERTopic's outlier topic — confirm)
# and rows of the 'misc' genre, using a single combined mask.
keep_rows = (df['topic'] != -1) & (df['genre'] != 'misc')
data = df[keep_rows]

In [None]:
# Keep only the columns used below: genre and topic for subsetting, the lyrics as
# training text, and the topic label for naming the saved models.
data = data[['genre','cleaned_lyrics', 'topic', 'topic_label']]
data

Unnamed: 0,genre,cleaned_lyrics,topic,topic_label
0,rap,"\nKilla Cam, Killa Cam, Cam\nKilla Cam, Killa ...",7,mi_dem_yuh
1,rap,"\n\n\nUgh, Killa!\nBaby!\nKanye, this that 197...",0,nigga_niggas_bitch
2,rap,"\nKilla, Dipset\nMan I spit that pimp talk, yo...",434,bicyclette_ma bicyclette_bicyclette ma
3,rap,\nAy yo you wonder who I are\nI guzzle up at t...,434,bicyclette_ma bicyclette_bicyclette ma
4,rap,"\nNow Lord you know, just how hard I try\nTo l...",398,dog status_movin rank_status im
...,...,...,...,...
537548,pop,"\n\nYou, Lord, are forgiving and good\nAboundi...",1,jesus_praise_lord
537549,pop,"\n\n\nI'm done\nTake it deep, join the fun\nJu...",3,dance_funky_dance dance
537550,rap,\nUh (Hit-Boy)\nAh\n\nThey wanna take me out l...,66,ra_viral_body body
537551,rock,"\nLook out, stupid jerk\nI'm going to break in...",98,military time_military_time military


In [None]:
# Map each topic id to the array of distinct labels found for it; the label string
# itself is therefore accessed as topic_label_dict[topic][0].
topic_label_dict = data.groupby('topic')['topic_label'].unique().to_dict()
# topic_label_dict[1]

In [None]:
# Example lookup: the first label stored for topic 1
topic_label_dict[1][0]

'jesus_praise_lord'

In [None]:
# The 10 most frequent topics over all genres, as {topic id: number of rows}
top10 = data['topic'].value_counts().nlargest(10).to_dict()

In [None]:
# All remaining lyrics as a plain Python list (used below to try out the preprocessing)
lyrics = data['cleaned_lyrics'].tolist()

In [None]:
# Preprocess the lyrics in order to pass to Word2Vec

def flatten(list_of_lists):
    """Concatenate the items of every sub-iterable into one flat list (one level deep)."""
    flat = []
    for sublist in list_of_lists:
        flat.extend(sublist)
    return flat

def generate_sentences(lyrics_list):
  """Turn raw lyrics into the list-of-token-lists format passed to Word2Vec.

  Every non-blank line of every entry becomes one "sentence", i.e. a list of
  whitespace-separated tokens. Blank lines and empty tokens are dropped.

  Args:
    lyrics_list: iterable of lyrics strings whose lines are newline-separated.
      Entries that are not strings (e.g. NaN for a missing value) are skipped.

  Returns:
    A flat list with the sentences of all entries in input order; every
    sentence is a non-empty list of str tokens.

  NOTE(review): tokens keep their case and punctuation, so e.g. 'Cam,' and
  'Cam' are different tokens — consider normalizing before training.
  """
  sentences = []
  for lyrics in lyrics_list:
    # Missing lyrics are not strings and have nothing to tokenize
    if not isinstance(lyrics, str):
      continue

    for line in lyrics.split('\n'):
      # str.split() without arguments splits on any run of whitespace and never
      # yields '' tokens, so repeated spaces do not create empty "words".
      tokens = line.split()

      # Filter blank lines here instead of with list.remove(''), which would
      # only drop the first blank line and leave [''] sentences behind.
      if tokens:
        sentences.append(tokens)

  return sentences

# Try the preprocessing on the first two entries only
test_sen = generate_sentences(lyrics[:2])

In [None]:
# Show the first 15 tokenized lines
test_sen[:15]

[['Killa', 'Cam,', 'Killa', 'Cam,', 'Cam'],
 ['Killa', 'Cam,', 'Killa', 'Cam'],
 ['Killa', 'Cam,', 'Cam'],
 ['Killa', 'Cam,', 'Killa', 'Cam,', 'Cam'],
 ['Killa', 'Killa', 'Killa', 'Cam'],
 ['Killa', 'Cam,', 'Cam,', 'Killa', '(Killa!)'],
 ['Killa', 'Cam,', 'Killa', 'Cam,', 'Cam', '(Bases', 'loaded)'],
 ['Killa', 'Cam,', 'Killa', 'Cam', '(Uh-huh)'],
 ['Killa', 'Cam,', 'Cam', '(Santana', 'on', 'second,', 'Jim', 'on', 'third)'],
 ['Killa', 'Cam,', 'Killa', 'Cam,', 'Cam', "(I'm", 'at', 'bat)'],
 ['Killa', 'Killa', 'Killa', 'Cam'],
 ['Killa',
  'Cam,',
  'Cam,',
  'Killa',
  "(I'm",
  "'bout",
  'to',
  'hit',
  'this',
  'shit',
  'out',
  'the',
  'world)'],
 ['Killa', 'Cam', '(Ugh,', 'Heatmakerz),', 'Killa', 'Cam,', 'Cam'],
 ['Killa', 'Cam,', 'Killa', 'Cam'],
 ['Killa', 'Cam,', 'Cam', '(Hahahaha)']]

In [None]:
# train word embeddings

# define hyperparameters
CONTEXT_WINDOW = 5
EPOCHS = 15

# Make sure the output folder exists before any training starts; otherwise a
# missing folder is only noticed when the first finished model is saved.
os.makedirs(embeddings_path, exist_ok=True)

for topic in top10:
  # All lyrics assigned to this topic, regardless of genre
  subset = data[data['topic'] == topic]['cleaned_lyrics'].tolist()
  sentences = generate_sentences(subset)

  # topic_label_dict maps a topic id to an array of labels; the first one names the model
  topic_label = topic_label_dict[topic][0]
  print("Training word embeddings for:", topic_label)

  # Parameters not passed here (e.g. vector size, worker count) stay at gensim's defaults
  model = Word2Vec(
      sentences=sentences,
      window=CONTEXT_WINDOW,
      epochs=EPOCHS,
      # workers=4,
  )

  model_path = embeddings_path + topic_label + ".model"
  print("Completed training, saving model to path:", model_path)

  model.save(model_path)

Training word embeddings for: nigga_niggas_bitch
Completed training, saving model to path: /content/drive/MyDrive/Praktikum - NLP Applications/WEAT/embeddings/nigga_niggas_bitch.model
Training word embeddings for: body_girl_baby
Completed training, saving model to path: /content/drive/MyDrive/Praktikum - NLP Applications/WEAT/embeddings/body_girl_baby.model
Training word embeddings for: tears_heart_wish
Completed training, saving model to path: /content/drive/MyDrive/Praktikum - NLP Applications/WEAT/embeddings/tears_heart_wish.model
Training word embeddings for: jesus_praise_lord
Completed training, saving model to path: /content/drive/MyDrive/Praktikum - NLP Applications/WEAT/embeddings/jesus_praise_lord.model
Training word embeddings for: heartache_bah_bah bah
Completed training, saving model to path: /content/drive/MyDrive/Praktikum - NLP Applications/WEAT/embeddings/heartache_bah_bah bah.model
Training word embeddings for: ayy ayy_change_long sentiment
Completed training, saving m

In [None]:
def train_embeddings_genre(genre):
  """Train and save one Word2Vec model for each of the 10 most frequent topics of a genre.

  The rows of the global `data` frame belonging to `genre` are selected, their
  10 most frequent topics are determined, and one model is trained per topic
  on the lyrics of that topic within the genre. Each model is saved to
  embeddings_path + genre + '/' + <first topic label> + '.model'.

  Relies on the globals `data`, `embeddings_path`, `topic_label_dict`,
  `CONTEXT_WINDOW`, `EPOCHS` and on `generate_sentences`.

  Args:
    genre: value of the 'genre' column to train the models for.

  Returns:
    None. The models are written to disk and progress is printed.
  """
  genre_subset = data[data['genre'] == genre]
  top10_genre = genre_subset['topic'].value_counts().nlargest(10).to_dict()
  embeddings_path_genre = embeddings_path + genre + '/'

  # Create the per-genre output folder up front; otherwise a missing folder is
  # only noticed when the first finished model is saved.
  os.makedirs(embeddings_path_genre, exist_ok=True)

  print("Genre:", genre)

  for topic in top10_genre:
    subset = genre_subset[genre_subset['topic'] == topic]['cleaned_lyrics'].tolist()
    sentences = generate_sentences(subset)

    # topic_label_dict maps a topic id to an array of labels; the first one names the model
    topic_label = topic_label_dict[topic][0]
    print("Training word embeddings for:", topic_label)

    model = Word2Vec(
        sentences=sentences,
        window=CONTEXT_WINDOW,
        epochs=EPOCHS,
        workers=4,
    )

    model_path = embeddings_path_genre + topic_label + ".model"
    print("Completed training, saving model to path:", model_path)

    model.save(model_path)
    print("Saved model! \n\n\n")

  print("------------------------------------------------------")

In [None]:
# Train the per-genre models for every genre present in the filtered data
genres = data['genre'].unique()

for genre in genres:
  train_embeddings_genre(genre)

Genre: rap
Training word embeddings for: nigga_niggas_bitch
Completed training, saving model to path: /content/drive/MyDrive/Praktikum - NLP Applications/WEAT/embeddings/rap/nigga_niggas_bitch.model
Saved model! 



Training word embeddings for: military time_military_time military
Completed training, saving model to path: /content/drive/MyDrive/Praktikum - NLP Applications/WEAT/embeddings/rap/military time_military_time military.model
Saved model! 



Training word embeddings for: em youre_worth em_ahahah
Completed training, saving model to path: /content/drive/MyDrive/Praktikum - NLP Applications/WEAT/embeddings/rap/em youre_worth em_ahahah.model
Saved model! 



Training word embeddings for: tut_tut tut_aes
Completed training, saving model to path: /content/drive/MyDrive/Praktikum - NLP Applications/WEAT/embeddings/rap/tut_tut tut_aes.model
Saved model! 



Training word embeddings for: ra_viral_body body
Completed training, saving model to path: /content/drive/MyDrive/Praktikum - N

In [None]:
# The genres the per-genre loop iterated over
genres

array(['rap', 'pop', 'rock', 'rb', 'country'], dtype=object)

In [None]:
# On a top-to-bottom run, this global `model` is the one left over from the last
# iteration of the overall top-10 training loop; the models trained inside
# train_embeddings_genre are local to that function.
model

<gensim.models.word2vec.Word2Vec at 0x7e31ca16d570>

In [None]:
# Load one of the saved overall (not per-genre) models back from disk for inspection
model = Word2Vec.load(embeddings_path + "jesus_praise_lord.model")

In [None]:
# Embedding vector learned for the token 'lord'
model.wv['lord']

array([-0.27864978,  0.7694193 , -0.5296154 , -1.1845462 , -1.0364057 ,
       -0.1879887 ,  1.2720504 , -1.300427  ,  1.6320828 ,  0.28715804,
       -0.92939734,  1.4577465 ,  0.15327337,  0.00814943, -0.3788575 ,
       -0.10454856,  3.0149782 ,  0.441192  ,  2.9818842 ,  0.6624756 ,
        1.098098  , -0.5982991 , -0.78277785,  1.3887686 ,  2.0384119 ,
        0.14081876, -0.9331827 ,  1.0206437 ,  0.64292425, -0.6626616 ,
        0.565181  ,  0.45031783,  0.13208607,  0.24756254, -0.641565  ,
       -0.75680465, -0.97717214, -0.24054453,  1.883056  , -0.5814508 ,
        1.372016  ,  0.01347984,  0.6830271 ,  0.9436994 ,  0.33444133,
        0.54128355,  0.7672659 ,  0.30818304,  2.5252652 ,  1.5387765 ,
       -0.11501274, -0.48820847, -1.1302735 , -1.6314297 , -0.25779662,
        2.2845857 , -0.7111418 , -0.9763985 , -1.110613  ,  0.365783  ,
       -0.4004064 , -0.20003852,  1.1734723 , -1.1676503 ,  0.2828832 ,
        2.1369872 ,  0.29559654, -1.3704813 , -1.9076835 ,  1.13

In [None]:
# Dimensionality of the embeddings (vector_size is not passed when the models are
# trained, so this is gensim's default)
model.wv.vector_size

100

In [None]:
# Nearest neighbours of 'friend' in the embedding space. Tokens are whitespace-split
# without stripping punctuation, so variants such as 'friend,' are separate entries.
model.wv.most_similar('friend')

[('friend,', 0.6158629059791565),
 ('lover', 0.5654369592666626),
 ('brother', 0.5462185740470886),
 ('wife', 0.5325285792350769),
 ('man', 0.5284438133239746),
 ('lifeline', 0.5077045559883118),
 ('mother', 0.4977668523788452),
 ('liar', 0.4885673224925995),
 ('child', 0.48657697439193726),
 ('dream', 0.4807463586330414)]