In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /Users/Anna/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/Anna/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/Anna/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
df = pd.read_csv("clean_corpus_eng_only.csv")

In [3]:
print(len(df))
df.head()

15387


Unnamed: 0,rapper,song,year,lyrics
0,Cordae,RNP,2019.0,"Uh, okay, put your fucking hands up, this the ..."
1,Cordae,Kung Fu,2018.0,I'ma just get it and get it again\nCome ups I ...
2,Cordae,Bad Idea,2019.0,I know myself all too well to be a stranger of...
3,Cordae,Have Mercy,2019.0,There's no complainin' on this side\nMy niggas...
4,Cordae,Old Niggas,2018.0,"Uh, old niggas and new niggas, now what's the ..."


In [4]:
df["lyrics"] = df["lyrics"].apply(lambda text: str(text).replace('|', ' \n'))

In [5]:
df["lyrics_splitted"] = df["lyrics"].apply(lambda text: str(text).replace('\n', ' '))

In [6]:
df["lyrics_splitted_lines"] = df["lyrics"].apply(lambda text: str(text).splitlines())

In [7]:
df['lyrics_tokenized'] = df["lyrics_splitted"].apply(lambda text: ' '.join(word_tokenize(text)))

In [8]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/Anna/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
lemmatizer = WordNetLemmatizer()
df["lyrics_lemmatized"] = df["lyrics_tokenized"].apply(lambda text: [lemmatizer.lemmatize(word) for word in text.split(' ')])

In [10]:
df["formula_num_sent"] = df["lyrics_splitted_lines"].apply(lambda text: len(text))

In [11]:
df_slurs = pd.read_csv("slurs.csv", sep=";")

In [12]:
df_slurs.head()

Unnamed: 0,text,canonical_form_1,canonical_form_2,canonical_form_3,category_1,category_2,category_3,severity_rating,severity_description
0,69,69,,,sexual anatomy / sexual acts,,,1.0,Mild
1,@55,ass,,,sexual anatomy / sexual acts,,,1.0,Mild
2,rat baztad,bastard,,,animal references,,,1.8,Strong
3,ape shit,ape,shit,,bodily fluids / excrement,,,1.6,Strong
4,ape shite,ape,shit,,bodily fluids / excrement,animal references,,1.6,Strong


In [13]:
slurs = dict(zip(df_slurs.text, df_slurs.severity_rating))

In [14]:
def get_matched_words(df, slurs: dict) -> list:
    df["slurs"] = np.nan
    df["num_slurs"] = np.nan
    df["formula_song"] = np.nan
    for ind, (_, row) in tqdm(enumerate(df.iterrows())):
        match_words = []
        new_set = set(row["lyrics_lemmatized"])
        for word in new_set:
            if word in slurs:
                match_words.append(word)
        row["slurs"] = match_words
        row["num_slurs"] = len(row["slurs"])
        
        formula_song = 0.0
        for word in match_words:
            formula_song += float(slurs[word])
        row["formula_song"] = formula_song / len(row["lyrics_tokenized"])
        
        df.at[ind] = row

In [22]:
slurs

{'69': 1.0,
 '@55': 1.0,
 'rat baztad': 1.8,
 'ape shit': 1.6,
 'ape shite': 1.6,
 'blow a load': 1.6,
 '0ral seks': 1.0,
 'bonk juice': 1.6,
 '0rg@sm': 1.0,
 '0rgasms': 1.0,
 'choad nectar': 2.0,
 '4r5e': 1.4,
 '4r5ed': 1.4,
 '4r5es': 1.4,
 '4skin': 1.0,
 '5h17': 1.0,
 '5h1t': 1.0,
 'cock droplets': 2.4,
 'a_s_s': 1.0,
 'abbie': 1.2,
 'cumball': 2.4,
 'aboe': 1.0,
 'anal': 1.0,
 'cumdumpster': 2.8,
 'cumming': 1.6,
 'cumslut': 2.8,
 'cumz': 2.0,
 'darkshit': 3.0,
 'dipsh1tty': 1.6,
 'dipshidiot': 1.6,
 'dipshite': 1.6,
 'anus': 1.0,
 'dipshits': 1.6,
 'dipshitter': 1.6,
 'dipshitty': 1.8,
 'apeshit': 1.4,
 'apeshite': 1.4,
 'ar5e': 1.0,
 'ar5ehole': 1.4,
 'arse': 1.0,
 'dumbshit': 1.6,
 'arse hole': 1.2,
 'dump a load': 1.6,
 'erectoplasm': 1.8,
 'fagshit': 3.0,
 'gay shit': 1.8,
 'gobshite': 1.8,
 'jizz': 1.6,
 'jizz-jockey': 2.2,
 'jizzbags': 2.4,
 'arsehole': 1.4,
 'jizzed': 2.0,
 'arsewipe': 1.4,
 'jizzes': 2.2,
 'ass': 1.0,
 'jizzfucker': 2.6,
 'jizzing': 1.6,
 'jizzstain': 2.8,


In [15]:
get_matched_words(df, slurs)

15387it [00:27, 555.38it/s]


In [16]:
category_map = []
category = dict(category_map)
for ind, (_, row) in tqdm(enumerate(df_slurs.iterrows())):
    key = row["text"]
    value = row["category_1"]
    if key not in category:
        category[key] = value

1598it [00:00, 10389.78it/s]


In [23]:
category

{'69': 'sexual anatomy / sexual acts',
 '@55': 'sexual anatomy / sexual acts',
 'rat baztad': 'animal references',
 'ape shit': 'bodily fluids / excrement',
 'ape shite': 'bodily fluids / excrement',
 'blow a load': 'bodily fluids / excrement',
 '0ral seks': 'sexual anatomy / sexual acts',
 'bonk juice': 'bodily fluids / excrement',
 '0rg@sm': 'sexual anatomy / sexual acts',
 '0rgasms': 'sexual anatomy / sexual acts',
 'choad nectar': 'bodily fluids / excrement',
 '4r5e': 'sexual anatomy / sexual acts',
 '4r5ed': 'sexual anatomy / sexual acts',
 '4r5es': 'sexual anatomy / sexual acts',
 '4skin': 'sexual anatomy / sexual acts',
 '5h17': 'bodily fluids / excrement',
 '5h1t': 'bodily fluids / excrement',
 'cock droplets': 'bodily fluids / excrement',
 'a_s_s': 'sexual anatomy / sexual acts',
 'abbie': 'racial / ethnic slurs',
 'cumball': 'bodily fluids / excrement',
 'aboe': 'racial / ethnic slurs',
 'anal': 'sexual anatomy / sexual acts',
 'cumdumpster': 'bodily fluids / excrement',
 'cu

In [17]:
def get_matched_categories(df, category: dict) -> list:
    df["category"] = np.nan
    for ind, (_, row) in tqdm(enumerate(df.iterrows())):
        match_words = []
        new_set = set(row["slurs"])
        for word in new_set:
            if word in category:
                match_words.append(category[word])
        row["category"] = match_words
        df.at[ind] = row

In [18]:
get_matched_categories(df, category)

15387it [00:22, 672.96it/s]


In [27]:
merge_category = { 
     'racial / ethnic slurs': 'racial / ethnic / religious offense',
     'religious offense': 'racial / ethnic / religious offense',
     'other / general insult': 'mental disability/ general insults',
     'mental disability': 'mental disability/ general insults', 
     'animal references': 'delete',
     'physical attributes': 'delete',
     'physical disability': 'delete',
     'political': 'delete'
}

df_slurs["merged_category"] = np.nan
        
for ind, (_, row) in tqdm(enumerate(df_slurs.iterrows())):
    category = row["category_1"]
    if category not in merge_category:
        row["merged_category"] = category
    else:
        mapped = merge_category[category]
        if mapped != "delete":
            row["merged_category"] = mapped
    df_slurs.at[ind] = row

1598it [00:01, 1036.24it/s]


In [32]:
df["merged_category"] = np.nan
        
for ind, (_, row) in tqdm(enumerate(df.iterrows())):
    row["merged_category"] = []
    for category in row["category"]:
        if category not in merge_category:
            row["merged_category"].append(category)
        else:
            mapped = merge_category[category]
            if mapped != "delete":
                row["merged_category"].append(mapped)
    df.at[ind] = row

15387it [00:25, 609.92it/s]


In [35]:
import statistics
from statistics import mode

df["popular_category"] = np.nan
        
for ind, (_, row) in tqdm(enumerate(df.iterrows())):
    if len(row["merged_category"]) > 0:
        row["popular_category"] = mode(row["merged_category"])
    df.at[ind] = row

15387it [00:25, 606.95it/s]


In [36]:
df.head()

Unnamed: 0,rapper,song,year,lyrics,lyrics_splitted,lyrics_splitted_lines,lyrics_tokenized,lyrics_lemmatized,formula_num_sent,slurs,num_slurs,formula_song,category,merged_category,popular_category
0,Cordae,RNP,2019.0,"Uh, okay, put your fucking hands up, this the ...","Uh, okay, put your fucking hands up, this the ...","[Uh, okay, put your fucking hands up, this the...","Uh , okay , put your fucking hands up , this t...","[Uh, ,, okay, ,, put, your, fucking, hand, up,...",30,"[motherfuckin, fucking, Fuck, nigga, bitch]",5.0,0.006905,"[sexual anatomy / sexual acts, sexual anatomy ...","[sexual anatomy / sexual acts, sexual anatomy ...",sexual anatomy / sexual acts
1,Cordae,Kung Fu,2018.0,I'ma just get it and get it again\nCome ups I ...,I'ma just get it and get it again Come ups I s...,"[I'ma just get it and get it again, Come ups I...",I'ma just get it and get it again Come ups I s...,"[I'ma, just, get, it, and, get, it, again, Com...",37,"[fucked, goddamn, fucking, nigga, bitch]",5.0,0.005893,"[sexual anatomy / sexual acts, sexual anatomy ...","[sexual anatomy / sexual acts, sexual anatomy ...",sexual anatomy / sexual acts
2,Cordae,Bad Idea,2019.0,I know myself all too well to be a stranger of...,I know myself all too well to be a stranger of...,[I know myself all too well to be a stranger o...,I know myself all too well to be a stranger of...,"[I, know, myself, all, too, well, to, be, a, s...",44,"[nigga, shit]",2.0,0.001683,"[bodily fluids / excrement, racial / ethnic sl...","[bodily fluids / excrement, racial / ethnic / ...",bodily fluids / excrement
3,Cordae,Have Mercy,2019.0,There's no complainin' on this side\nMy niggas...,"There's no complainin' on this side My niggas,...","[There's no complainin' on this side, My nigga...",There 's no complainin ' on this side My nigga...,"[There, 's, no, complainin, ', on, this, side,...",33,"[Fuck, nigga, shit, bitch]",4.0,0.005062,"[sexual orientation / gender, sexual anatomy /...","[sexual orientation / gender, sexual anatomy /...",sexual orientation / gender
4,Cordae,Old Niggas,2018.0,"Uh, old niggas and new niggas, now what's the ...","Uh, old niggas and new niggas, now what's the ...","[Uh, old niggas and new niggas, now what's the...","Uh , old niggas and new niggas , now what 's t...","[Uh, ,, old, nigga, and, new, nigga, ,, now, w...",63,"[fuckin, suck, nigga, bullshit]",4.0,0.002265,"[bodily fluids / excrement, sexual anatomy / s...","[bodily fluids / excrement, sexual anatomy / s...",sexual anatomy / sexual acts


In [37]:
df.to_csv('out.csv')

In [21]:
df["lyrics"][0]

"Uh, okay, put your fucking hands up, this the fucking anthem\nSmiling 'cause I'm young, rich, black, and I'm handsome\nNot to mention wealthy, ass on her healthy\nYoung millionaire, what the fuck can you tell me? Smell me?\nNigga, that's Chanel cologne\nI'm in Europe with the tourists with no cellular phone Like ooh, sound like rich nigga problems\nI hit a bad bitch with a fistful of condoms\nAnd the randomness of risky ménages\nLike get the head right, she can get what she wanted\nThe spits, then flaunt it, my drip like a faucet\nShe told me she was prego, I ain't even take the motherfuckin' dick out my pocket, yeah\nYou might also like The opposite\nShe want me to fly her, so I copped a jet\nMust be thinkin' I'm a one way ticket on a runway\nDrippin' in my feng shui, sippin' on a sundae I bought a Moncler coat for the times we were broke\nI'ma wear it in the summer on LeBron James boat\nFront row? Duh, bro, we don't sit on nosebleeds\nAin't your pockets obese? They won't fit in thos