In [163]:
import pandas as pd
import time
import re 
import numpy as np
import json
from nltk.probability import FreqDist

from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas()

import spacy
nlp = spacy.load('en_core_web_md')

song_files = ['2019-3-1','2019-3-15','2019-4-1','2019-4-15','2019-5-1']

In [28]:
def entity_frequency(df):
    pbar = tqdm_notebook(total=len(df))
    entities_list = []

    for i, row in df.iterrows():
        doc = nlp(row['lyrics'])
        entities = [e.text for e in doc.ents 
                    if e.label_ not in ['LOC', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL', 'DATE', 'TIME']]
        entities_list.extend(entities)
        pbar.update(1)

    pbar.close()

    return FreqDist(entities_list)

In [48]:
def word_frequency(df):
    pbar = tqdm_notebook(total=len(df))
    tokens_list = []

    for i, row in df.iterrows():
        tokens = nlp(row['lyrics'].lower())
        tokens = [t.lemma_ for t in tokens if not t.is_stop and not t.is_punct]
        tokens_list.extend(tokens)
        pbar.update(1)

    pbar.close()

    return FreqDist(tokens_list)

In [137]:
song_info = pd.read_csv('../data/lyrics/song_info.txt')
song_info = song_info[song_info['lang'] == 'en']
reg = "[\(\[].*?[\)\]]"
song_info['lyrics'] = song_info['lyrics'].replace(reg, '', regex=True)
song_info['lyrics'] = song_info['lyrics'].str.replace('\r', ' ').str.replace('\n', ' ').str.replace('\t', ' ')
song_info['lyrics'] = song_info['lyrics'].str.replace(r"\s\s+", ' ')


In [4]:
songs_3_17 = songs.merge(right=song_info[['song_id', 'lyrics']], how='inner', on='song_id')
songs_3_17['country'] = songs_3_17.location.str[-2:]
songs_3_17.head()

Unnamed: 0,location,playlist_id,artist,title,song_id,lyrics,country
0,São Paulo BR,6QOjOvLDWKVNCW9H1J9vCY,Leschea,Fulton St.,3xIHePHJbOEtnzlwAOoCtz,He be rollin' in his jeep Cruisin' down on Ful...,BR
1,Osasco BR,1rfYB95mbIDLnWOJCVr1AP,Leschea,Fulton St.,3xIHePHJbOEtnzlwAOoCtz,He be rollin' in his jeep Cruisin' down on Ful...,BR
2,Los Angeles California US,53JqQphsAHvDGGVZErKmW5,Suga Free,Why U Bullshittin'?,3zb1zBmkrPF6VW8RhlSYA1,"So pay attention, babe Pay attention Our Fathe...",US
3,Las Vegas Nevada US,16olvYpf0K5nWJ98Oxa4TG,Suga Free,Why U Bullshittin'?,3zb1zBmkrPF6VW8RhlSYA1,"So pay attention, babe Pay attention Our Fathe...",US
4,Corona California US,1lgTx3EbMGfQZ38U3EUjCk,Suga Free,Why U Bullshittin'?,3zb1zBmkrPF6VW8RhlSYA1,"So pay attention, babe Pay attention Our Fathe...",US


In [7]:
vader_scores = pd.read_csv('../data/vader_scores.txt')
songs_3_17 = songs_3_17.merge(right=vader_scores, how='inner', on='song_id')
songs_3_17.head()

Unnamed: 0,location,playlist_id,artist,title,song_id,lyrics,country,vader_score
0,São Paulo BR,6QOjOvLDWKVNCW9H1J9vCY,Leschea,Fulton St.,3xIHePHJbOEtnzlwAOoCtz,He be rollin' in his jeep Cruisin' down on Ful...,BR,0.220677
1,Osasco BR,1rfYB95mbIDLnWOJCVr1AP,Leschea,Fulton St.,3xIHePHJbOEtnzlwAOoCtz,He be rollin' in his jeep Cruisin' down on Ful...,BR,0.220677
2,Los Angeles California US,53JqQphsAHvDGGVZErKmW5,Suga Free,Why U Bullshittin'?,3zb1zBmkrPF6VW8RhlSYA1,"So pay attention, babe Pay attention Our Fathe...",US,-0.01566
3,Las Vegas Nevada US,16olvYpf0K5nWJ98Oxa4TG,Suga Free,Why U Bullshittin'?,3zb1zBmkrPF6VW8RhlSYA1,"So pay attention, babe Pay attention Our Fathe...",US,-0.01566
4,Corona California US,1lgTx3EbMGfQZ38U3EUjCk,Suga Free,Why U Bullshittin'?,3zb1zBmkrPF6VW8RhlSYA1,"So pay attention, babe Pay attention Our Fathe...",US,-0.01566


In [88]:
i = np.random.randint(0, len(song_info))
doc = nlp(song_info.iloc[i]['lyrics'])
ents = [(e.text) for e in doc.ents if e.label_ not in ['LOC', 'MONEY', 'QUANTITY', 'PERSON', 'ORDINAL', 'CARDINAL', 'DATE', 'TIME']]
print(ents)

['Somewhere', 'Drank', 'Drank', 'Drank', 'Drank', 'Drank']


In [43]:
fdist = entity_frequency(df)
most_common = fdist.most_common(30)
print(most_common)

HBox(children=(IntProgress(value=0, max=446), HTML(value='')))

[('Bitch', 18), ('Baby', 17), ('Keep', 13), ('Ooh', 12), ('Neighborhood', 10), ('Fuck', 9), ('Cause', 9), ('Sam', 8), ('Nellis', 8), ('Never', 6), ('Sittin', 6), ('Niggas', 6), ('Girl', 6), ('Down', 6), ('Angel', 5), ('Young', 5), ('Damn', 5), ('Candy', 5), ('Nigga', 5), ('Boulder Highway', 5), ('Pour', 5), ('Catch', 4), ('Makin', 4), ('Toot', 4), ('Step', 4), ('Yuh', 4), ('Lil Rob', 4), ('Swear', 4), ('El Monte', 4), ('Imma', 4)]


In [48]:
songs_file = '../data/songs/songs_2019-5-2.txt'
songs = pd.read_csv(songs_file)

songs = songs.merge(right=song_info[['song_id', 'lyrics']], how='inner', on='song_id')
songs['country'] = songs.location.str[-2:]

In [49]:
df = songs[songs['location'] == 'Las Vegas Nevada US'].copy()
# df = songs_3_17[songs_3_17['country'] == 'FR'].copy()
df['song_id'].nunique()

65

In [50]:
fdist = entity_frequency(df)
most_common = fdist.most_common(30)
print(most_common)

HBox(children=(IntProgress(value=0, max=65), HTML(value='')))

[('Keep', 25), ('Baby', 21), ('’m', 18), ('Ooh', 15), ('Pistol', 15), ('Duke', 15), ('Niggas', 13), ('Nigga', 10), ('Suavecito', 10), ('Fuck', 9), ('Girl', 9), ('Bitch', 8), ('Sam', 8), ('Nellis', 8), ('Cauz', 8), ('Ridin', 7), ('Damn', 7), ('Down', 7), ('Ac', 6), ('Chuckin', 6), ('Sittin', 6), ('Cause', 6), ('Earl Duke', 6), ('Angel', 5), ('Never', 5), ('Candy', 5), ('Boulder Highway', 5), ('Ese', 5), ('Young', 5), ('Matter', 4)]


### Most Common Words:  Murfressboro Tennessee US vs. NYC

In [121]:
location = 'Murfreesboro Tennessee US'
songs_date = song_files[0]
songs_file_path = '../data/songs/songs_'+songs_date+'.txt'
songs = pd.read_csv(songs_file_path)

songs = songs.merge(right=song_info[['song_id', 'lyrics']], how='inner', on='song_id')
df = songs[songs['location'] == location].copy()
fdist = word_frequency(df)
most_common = fdist.most_common(20)
print('Location:', location, ', Date:', songs_date)
print(most_common)

HBox(children=(IntProgress(value=0, max=88), HTML(value='')))

Location: Murfreesboro Tennessee US , Date: 2019-3-1
[('like', 186), ('know', 160), ('love', 141), ('go', 139), ('get', 136), ('oh', 119), ('yeah', 102), ('want', 93), ('come', 88), ('life', 85), ('old', 79), ('time', 78), ('be', 77), ('night', 74), ('let', 73), ('long', 72), ('hope', 70), ('to', 68), ('bitch', 66), ('right', 65)]


In [122]:
location = 'New York New York US'
songs_date = song_files[0]
songs_file_path = '../data/songs/songs_'+songs_date+'.txt'
songs = pd.read_csv(songs_file_path)

songs = songs.merge(right=song_info[['song_id', 'lyrics']], how='inner', on='song_id')
df = songs[songs['location'] == location].copy()
fdist = word_frequency(df)
most_common = fdist.most_common(20)
print('Location:', location, ', Date:', songs_date)
print(most_common)

HBox(children=(IntProgress(value=0, max=39), HTML(value='')))

Location: New York New York US , Date: 2019-3-1
[('oh', 133), ('know', 90), ('get', 72), ('love', 66), ('yeah', 62), ('think', 59), ('like', 57), ('to', 56), ('go', 55), ('wanna', 54), ('time', 54), ('baby', 53), ('da', 52), ('tell', 43), ('world', 40), ('be', 37), ('look', 36), ('believe', 36), ('good', 36), ('new', 36)]


### Entities: Murfressboro Tennessee vs. NYC

In [139]:
location = 'Murfreesboro Tennessee US'
songs_date = song_files[0]
songs_file_path = '../data/songs/songs_'+songs_date+'.txt'
songs = pd.read_csv(songs_file_path)

songs = songs.merge(right=song_info[['song_id', 'lyrics']], how='inner', on='song_id')
df = songs[songs['location'] == location].copy()
fdist = entity_frequency(df)
most_common = fdist.most_common(20)
print('Location:', location, ', Date:', songs_date)
print(most_common)

HBox(children=(IntProgress(value=0, max=88), HTML(value='')))

Location: Murfreesboro Tennessee US , Date: 2019-3-1
[('Jesus', 29), ('Morocco', 13), ('Bubba', 12), ('Tennessee', 11), ('Hank', 10), ('Baby', 7), ('David Ashley Parker', 7), ('Son', 7), ('Rocky Top', 6), ('Bae', 6), ('Said', 6), ('Love', 6), ('Louisiana', 6), ('Bitch', 4), ('USA', 4), ('Free', 4), ('Powder Springs', 4), ('Kick', 4), ('Daniel', 4), ('Moses', 4)]


In [143]:
location = 'New York New York US'
songs_date = song_files[0]
songs_file_path = '../data/songs/songs_'+songs_date+'.txt'
songs = pd.read_csv(songs_file_path)

songs = songs.merge(right=song_info[['song_id', 'lyrics']], how='inner', on='song_id')
df = songs[songs['location'] == location].copy()
fdist = entity_frequency(df)
most_common = fdist.most_common(20)
print('Location:', location, ', Date:', songs_date)
print(most_common)

HBox(children=(IntProgress(value=0, max=39), HTML(value='')))

Location: New York New York US , Date: 2019-3-1
[('Cupid', 13), ('Time', 5), ('Broadway', 4), ('Comin', 4), ('Eating', 4), ('Circle', 3), ('Bentley', 3), ('Walkin', 3), ('Baby', 3), ("Nothin'", 3), ('Lalala', 2), ('Lambo', 2), ('Harlem', 2), ('Ooo', 2), ('Friendly', 2), ('Darlin', 2), ('Vegas', 1), ('PAMELA', 1), ('Divinest Pamela', 1), ('Lying by the sycamore', 1)]
