In [10]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /home/manil/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/manil/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/manil/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [11]:
df = pd.read_csv("clean_corpus_eng_only.csv")

In [12]:
print(len(df))
df.head()

15280


Unnamed: 0,rapper,song,year,lyrics
0,Cordae,RNP,2019.0,"Uh, okay, put your fucking hands up, this the ..."
1,Cordae,Kung Fu,2018.0,I'ma just get it and get it again\nCome ups I ...
2,Cordae,Bad Idea,2019.0,I know myself all too well to be a stranger of...
3,Cordae,Have Mercy,2019.0,There's no complainin' on this side\nMy niggas...
4,Cordae,Old Niggas,2018.0,"Uh, old niggas and new niggas, now what's the ..."


In [13]:
df["lyrics"] = df["lyrics"].apply(lambda text: str(text).replace('|', ' \n'))

In [14]:
df["lyrics_splitted"] = df["lyrics"].apply(lambda text: str(text).replace('\n', ' '))

In [15]:
df["lyrics_splitted_lines"] = df["lyrics"].apply(lambda text: str(text).splitlines())

In [16]:
df['lyrics_tokenized'] = df["lyrics_splitted"].apply(lambda text: ' '.join(word_tokenize(text)))

In [17]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/manil/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [18]:
lemmatizer = WordNetLemmatizer()
df["lyrics_lemmatized"] = df["lyrics_tokenized"].apply(lambda text: [lemmatizer.lemmatize(word) for word in text.split(' ')])

In [19]:
df["formula_num_sent"] = df["lyrics_splitted_lines"].apply(lambda text: len(text))

In [20]:
df_slurs = pd.read_csv("slurs.csv", sep=";")

In [21]:
df_slurs.head()

Unnamed: 0,text,canonical_form_1,canonical_form_2,canonical_form_3,category_1,category_2,category_3,severity_rating,severity_description
0,69,69,,,sexual anatomy / sexual acts,,,1.0,Mild
1,@55,ass,,,sexual anatomy / sexual acts,,,1.0,Mild
2,rat baztad,bastard,,,animal references,,,1.8,Strong
3,ape shit,ape,shit,,bodily fluids / excrement,,,1.6,Strong
4,ape shite,ape,shit,,bodily fluids / excrement,animal references,,1.6,Strong


In [22]:
slurs = dict(zip(df_slurs.text, df_slurs.severity_rating))

In [23]:
def get_matched_words(df, slurs: dict) -> list:
    df["slurs"] = np.nan
    df["num_slurs"] = np.nan
    df["formula_song"] = np.nan
    for ind, (_, row) in tqdm(enumerate(df.iterrows())):
        match_words = []
        new_set = set(row["lyrics_lemmatized"])
        for word in new_set:
            if word in slurs:
                match_words.append(word)
        row["slurs"] = match_words
        row["num_slurs"] = len(row["slurs"])
        
        formula_song = 0.0
        for word in match_words:
            formula_song += float(slurs[word])
        row["formula_song"] = formula_song / len(row["lyrics_tokenized"])
        
        df.at[ind] = row

In [24]:
get_matched_words(df, slurs)

15280it [01:16, 198.62it/s]


In [25]:
df.head()

Unnamed: 0,rapper,song,year,lyrics,lyrics_splitted,lyrics_splitted_lines,lyrics_tokenized,lyrics_lemmatized,formula_num_sent,slurs,num_slurs,formula_song
0,Cordae,RNP,2019.0,"Uh, okay, put your fucking hands up, this the ...","Uh, okay, put your fucking hands up, this the ...","[Uh, okay, put your fucking hands up, this the...","Uh , okay , put your fucking hands up , this t...","[Uh, ,, okay, ,, put, your, fucking, hand, up,...",29,"[fucking, motherfuckin, Fuck, nigga, bitch]",5.0,0.006993
1,Cordae,Kung Fu,2018.0,I'ma just get it and get it again\nCome ups I ...,I'ma just get it and get it again Come ups I s...,"[I'ma just get it and get it again, Come ups I...",I'ma just get it and get it again Come ups I s...,"[I'ma, just, get, it, and, get, it, again, Com...",36,"[fucking, nigga, bitch, fucked, goddamn]",5.0,0.005994
2,Cordae,Bad Idea,2019.0,I know myself all too well to be a stranger of...,I know myself all too well to be a stranger of...,[I know myself all too well to be a stranger o...,I know myself all too well to be a stranger of...,"[I, know, myself, all, too, well, to, be, a, s...",44,"[shit, nigga]",2.0,0.001688
3,Cordae,Have Mercy,2019.0,There's no complainin' on this side\nMy niggas...,"There's no complainin' on this side My niggas,...","[There's no complainin' on this side, My nigga...",There 's no complainin ' on this side My nigga...,"[There, 's, no, complainin, ', on, this, side,...",32,"[Fuck, shit, nigga, bitch]",4.0,0.005132
4,Cordae,Old Niggas,2018.0,"Uh, old niggas and new niggas, now what's the ...","Uh, old niggas and new niggas, now what's the ...","[Uh, old niggas and new niggas, now what's the...","Uh , old niggas and new niggas , now what 's t...","[Uh, ,, old, nigga, and, new, nigga, ,, now, w...",63,"[bullshit, fuckin, suck, nigga]",4.0,0.002284


In [26]:
df["lyrics"][0]

"Uh, okay, put your fucking hands up, this the fucking anthem\nSmiling 'cause I'm young, rich, black, and I'm handsome\nNot to mention wealthy, ass on her healthy\nYoung millionaire, what the fuck can you tell me? Smell me?\nNigga, that's Chanel cologne\nI'm in Europe with the tourists with no cellular phone Like ooh, sound like rich nigga problems\nI hit a bad bitch with a fistful of condoms\nAnd the randomness of risky ménages\nLike get the head right, she can get what she wanted\nThe spits, then flaunt it, my drip like a faucet\nShe told me she was prego, I ain't even take the motherfuckin' dick out my pocket, yeah The opposite\nShe want me to fly her, so I copped a jet\nMust be thinkin' I'm a one way ticket on a runway\nDrippin' in my feng shui, sippin' on a sundae I bought a Moncler coat for the times we were broke\nI'ma wear it in the summer on LeBron James boat\nFront row? Duh, bro, we don't sit on nosebleeds\nAin't your pockets obese? They won't fit in those seats\nAyy, we like

In [27]:
df.to_csv("slurs_annotated_data.csv")