In [214]:
import pandas as pd
import os
import datetime as dt

# Popular Music Artist Data
In this section of the notebook, we collect data on popular music artists from various sources, then clean and transform the data:
- The transformation process includes narrowing the data set to US-only artists and dropping columns with data we're not interested in
- Create a new dataframe for artists and popularity measures; this data set will be merged with other collected data sets containing similar data
- Create a new dataframe for artists and music genres; this dataframe will be used for further analysis of popular music genres

## Data - Music Artist Popularity
source - https://www.kaggle.com/pieca111/music-artists-popularity

In [291]:
# Read the artists CSV file into a pandas dataframe.
csv_file = "artists.csv"

# low_memory=False parses the file in one pass so column dtypes are inferred consistently.
artists_df = pd.read_csv(csv_file, low_memory=False)

# Drop the columns we are not interested in.
artists_df = artists_df.drop(
    columns=["mbid", "country_lastfm", "artist_lastfm", "tags_lastfm", "ambiguous_artist"]
)

# New dataframe with US-only artists; dropna() discards rows with any missing value.
us_df = artists_df.loc[artists_df["country_mb"] == "United States"].dropna()

# Build pop_us_artists directly from us_df so this cell runs on a fresh kernel.
# country_mb holds a single value after the US filter, so it is dropped here;
# the remaining columns get friendlier names and exact duplicate rows are removed.
pop_us_artists = (
    us_df
    .drop(columns=["country_mb"])
    .rename(
        columns={
            "artist_mb": "Artist",
            "tags_mb": "Genre Tags",
            "listeners_lastfm": "Listeners_lfm",
            "scrobbles_lastfm": "Scrobbles_lfm",
        }
    )
    .drop_duplicates()
)


In [219]:
# Popularity view of the artists: everything in pop_us_artists except the genre tags.
artists = pop_us_artists.drop("Genre Tags", axis="columns")
artists.head()

Unnamed: 0,Artist,Listeners_lfm,Scrobbles_lfm
2,Red Hot Chili Peppers,4620835.0,293784041.0
3,Rihanna,4558193.0,199248986.0
4,Eminem,4517997.0,199507511.0
6,Kanye West,4390502.0,238603850.0
7,Nirvana,4272894.0,222303859.0


In [220]:
# Genre view of the artists: pop_us_artists without the two popularity measures.
popularity_columns = ["Listeners_lfm", "Scrobbles_lfm"]
genre_tags = pop_us_artists.drop(popularity_columns, axis="columns")
genre_tags.head()

Unnamed: 0,Artist,Genre Tags
2,Red Hot Chili Peppers,rock; alternative rock; 80s; 90s; rap; metal; ...
3,Rihanna,pop; dance; hip hop; reggae; contemporary r b;...
4,Eminem,turkish; rap; american; hip-hop; hip hop; hiph...
6,Kanye West,synthpop; pop; american; hip-hop; hip hop; ele...
7,Nirvana,rock; alternative rock; 90s; punk; american; e...


In [221]:
# This code splits the "Genre Tags" string into a list
# pop_us_artists["Genre Tags"] = pop_us_artists["Genre Tags"].str.split(";", n = -1)

In [230]:
# This code creates a dataframe that merges US and Canadian artists, in case we must have Justin Bieber in our data set :)

# canada_df = artists_df.loc[artists_df['country_mb'] == "Canada"]
# n_amer = pd.merge(us_df, canada_df, how='outer')

## Data - Billboard-Weekly-Songs-With-Song-And-Artist-Spotify-Popularity
- source https://www.kaggle.com/miteshsingh/hollywood-music-dataset#Hollywood-Music-WCBS-Ranking.csv

In [247]:
# Read the Billboard weekly CSV file into a new pandas dataframe.
bb_as_file = "bb_artists_songs.csv"
bb_as_df = pd.read_csv(bb_as_file)

# Count distinct artists once and reuse the value in the summary line.
bb_as_artist_count = bb_as_df["Artist"].nunique()
print(
    "There are {} rows and {} unique artists in the dataset.".format(
        len(bb_as_df), bb_as_artist_count
    )
)

There are 300600 rows and 9103 unique artists in the dataset.


In [248]:
# Create a new dataframe for the most recent years of data (chart year 2002 onward).
bb_5 = bb_as_df.loc[bb_as_df["year"] >= 2002]

# Count distinct artists once and reuse the value in the summary line.
bb_5_artist_count = bb_5["Artist"].nunique()
print(
    "There are {} rows and {} unique artists in the dataset.".format(
        len(bb_5), bb_5_artist_count
    )
)

There are 76800 rows and 2864 unique artists in the dataset.


In [266]:
# Preview the first five rows of the recent-years weekly chart data.
bb_5.head(n=5)

Unnamed: 0,date,Rank,Title,Artist,Weeks on chart,year,Spotify_Popularity,Artist_Popularity
223800,2002-01-01,1,How You Remind Me,Nickelback,18.0,2002,77,78
223801,2002-01-01,2,U Got It Bad,Usher,16.0,2002,69,82
223802,2002-01-01,3,Family Affair,Mary J. Blige,24.0,2002,69,72
223803,2002-01-01,4,Get The Party Started,P!nk,11.0,2002,62,83
223804,2002-01-01,5,Always On Time,Ja Rule Featuring Ashanti,8.0,2002,-1,-1


In [256]:
# Build the artist-popularity table from the weekly chart rows:
# strip the chart/song columns, order from most to least popular,
# collapse exact duplicate rows and renumber the index from zero.
bb_artists_pop = (
    bb_5
    .drop(
        columns=[
            "date",
            "Rank",
            "Weeks on chart",
            "year",
            "Title",
            "Spotify_Popularity",
        ]
    )
    .sort_values(by="Artist_Popularity", ascending=False)
    .drop_duplicates()
    .reset_index(drop=True)
)
bb_artists_pop.head()

Unnamed: 0,Artist,Artist_Popularity
0,Drake,100
1,XXXTENTACION,98
2,Ariana Grande,97
3,Post Malone,96
4,Travis Scott,96


In [267]:
# Outer-join the Billboard artist popularity with the listener/scrobble measures
# on the artist name, keeping artists that appear in either dataframe.
artist_merge = bb_artists_pop.merge(artists, on="Artist", how="outer")
artist_merge.head()

Unnamed: 0,Artist,Artist_Popularity,Listeners_lfm,Scrobbles_lfm
0,Drake,100.0,,
1,XXXTENTACION,98.0,,
2,Ariana Grande,97.0,1098133.0,106673207.0
3,Post Malone,96.0,,
4,Travis Scott,96.0,,


In [275]:
# Create a new dataframe with artists and song titles.
bb_artists_songs = (
    bb_5
    .drop(
        columns=[
            "date",
            "Rank",
            "Weeks on chart",
            "year",
            "Spotify_Popularity",
            "Artist_Popularity",
        ]
    )
    .sort_values("Artist")
    # De-duplicate on the (Title, Artist) pair: a title alone is not unique, and
    # keying on it would discard every other artist's song with the same name.
    .drop_duplicates(subset=["Title", "Artist"], keep="first")
    .reset_index(drop=True)
)
bb_artists_songs.head()

Unnamed: 0,Title,Artist
0,Canadian Idiot,"""Weird Al"" Yankovic"
1,White & Nerdy,"""Weird Al"" Yankovic"
2,Word Crimes,"""Weird Al"" Yankovic"
3,Gone,'N Sync
4,Girlfriend,'N Sync Featuring Nelly


## Data - Billboard-Yearly-Chart-With-Spotify-Popularity-Of-Song-And-Artist

In [260]:
# Load the Billboard yearly chart CSV, then order its rows by chart year.
bb_yearly = "billboard-yearly.csv"
bb_yearly_raw = pd.read_csv(bb_yearly)
bb_yearly_df = bb_yearly_raw.sort_values(by="year")
bb_yearly_df.tail()

Unnamed: 0,year,Rank,Artist,Title,Spotify_Popularity,Artist_Popularity
5227,2012,28,Justin Bieber,Boyfriend,67,91
5226,2012,27,"Kanye West, Big Sean, Pusha T, 2 Chainz",Mercy,-1,-1
5224,2012,25,Jason Mraz,I Won’t Give Up,-1,81
5235,2012,36,Maroon 5 feat. Christina Aguilera,Moves Like Jagger,-1,-1
5299,2012,100,Linkin Park,Burn It Down,71,86


In [280]:
# Restrict the yearly chart to entries from 2002 onward.
is_recent_year = bb_yearly_df["year"] >= 2002
bb_recent_yearly = bb_yearly_df.loc[is_recent_year]

# Keep only the artist columns, collapse exact duplicate rows
# and renumber the index from zero.
bbry_artists = (
    bb_recent_yearly
    .drop(columns=["year", "Rank", "Title", "Spotify_Popularity"])
    .drop_duplicates()
    .reset_index(drop=True)
)
bbry_artists.head()

Unnamed: 0,Artist,Artist_Popularity
0,Brandy,67
1,City High feat. Eve,-1
2,No Doubt feat. Lady Saw,-1
3,OutKast feat. Killer Mike,-1
4,Angie Martinez feat. Lil’ Mo and Sacario,-1


In [289]:
# Add the yearly-chart artists to the combined artist table.
# Both inputs carry an Artist_Popularity column, so the outer merge produces
# Artist_Popularity_x (from artist_merge) and Artist_Popularity_y (from bbry_artists).
artist_merge2 = (
    pd.merge(artist_merge, bbry_artists, how="outer", on="Artist")
    # Fill gaps in _x from _y before discarding _y, so artists that only appear
    # in the yearly chart keep their popularity instead of becoming missing.
    .assign(
        Artist_Popularity_x=lambda merged: merged["Artist_Popularity_x"].fillna(
            merged["Artist_Popularity_y"]
        )
    )
    .drop(columns=["Artist_Popularity_y"])
    # Any value still missing after the merge is marked with -1.
    .fillna(value=-1)
)

# Show a bounded preview rather than rendering the whole dataframe.
artist_merge2.head(10)

Unnamed: 0,Artist,Artist_Popularity_x,Listeners_lfm,Scrobbles_lfm
0,Drake,100.0,-1.0,-1.0
1,XXXTENTACION,98.0,-1.0,-1.0
2,Ariana Grande,97.0,1098133.0,106673207.0
3,Post Malone,96.0,-1.0,-1.0
4,Travis Scott,96.0,-1.0,-1.0
5,Khalid,96.0,-1.0,-1.0
6,Nicki Minaj,95.0,2051684.0,69694864.0
7,Eminem,95.0,4517997.0,199507511.0
8,Cardi B,95.0,-1.0,-1.0
9,Imagine Dragons,94.0,1661600.0,81715914.0


12113

In [None]:
# This code is meant to load data from pandas dataframes into MongoDB
# df.to_dict()
# import pymongo

# myclient = pymongo.MongoClient("mongodb://localhost:27017/")
# mydb = myclient["x"] # x stands for database name
# mycol = mydb["y"] # y stands for collection/table name

# x = mycol.insert_many()