In [31]:
import datetime as dt
import re
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine, inspect

from config import pw

### Extract CSVs into DataFrames

In [32]:
# Load the 'artists' CSV file into a pandas DataFrame and preview the first rows
artists_file = "Resources/artistDf.csv"
artists_df = pd.read_csv(filepath_or_buffer=artists_file)
artists_df.head(5)

Unnamed: 0,X,Artist,Followers,Genres,NumAlbums,YearFirstAlbum,Gender,Group.Solo
0,0,Ed Sheeran,52698756,"pop,uk pop",8,2011,M,Solo
1,1,Justin Bieber,30711450,"canadian pop,dance pop,pop,post-teen pop",10,2009,M,Solo
2,2,Jonas Brothers,3069527,"boy band,dance pop,pop,post-teen pop",10,2006,M,Group
3,3,Drake,41420478,"canadian hip hop,canadian pop,hip hop,pop rap,...",11,2010,M,Solo
4,4,Chris Brown,9676862,"dance pop,pop,pop rap,r&b,rap",6,2005,M,Solo


In [33]:
# Load the 'billboard' CSV file into a pandas DataFrame and preview the first rows
billboard_file = "Resources/billboardHot100_1999-2019.csv"
billboard_df = pd.read_csv(filepath_or_buffer=billboard_file)
billboard_df.head(5)

Unnamed: 0.1,Unnamed: 0,Artists,Name,Weekly.rank,Peak.position,Weeks.on.chart,Week,Date,Genre,Writing.Credits,Lyrics,Features
0,1,Lil Nas X,Old Town Road,1,1.0,7.0,7/6/2019,5-Apr-19,"Country,Atlanta,Alternative Country,Hip-Hop,Tr...","Jozzy, Atticus ross, Trent reznor, Billy ray c...","Old Town Road Remix \nOh, oh-oh\nOh\nYeah, I'm...",Billy Ray Cyrus
1,2,"Shawn Mendes, Camila Cabello",Senorita,2,,,7/6/2019,21-Jun-19,Pop,"Cashmere cat, Jack patterson, Charli xcx, Benn...",Senorita \nI love it when you call me senorita...,
2,3,Billie Eilish,Bad Guy,3,2.0,13.0,7/6/2019,29-Mar-19,"Hip-Hop,Dark Pop,House,Trap,Memes,Alternative ...","Billie eilish, Finneas","bad guy \nWhite shirt now red, my bloody nose\...",
3,4,Khalid,Talk,4,3.0,20.0,7/6/2019,7-Feb-19,"Synth-Pop,Pop","Howard lawrence, Guy lawrence, Khalid",Talk \nCan we just talk? Can we just talk?\nTa...,
4,5,"Ed Sheeran, Justin Bieber",I Don't Care,5,2.0,7.0,7/6/2019,10-May-19,"Canada,UK,Dance,Dance-Pop,Pop","Ed sheeran, Justin bieber, Shellback, Max mart...",I Don't Care \nI'm at a party I don't wanna be...,


In [34]:
# Load the 'grammy songs' CSV file into a pandas DataFrame and preview the first rows
grammy_songs_file = "Resources/grammySongs_1999-2019.csv"
grammy_songs_df = pd.read_csv(filepath_or_buffer=grammy_songs_file)
grammy_songs_df.head(5)

Unnamed: 0.1,Unnamed: 0,X,GrammyAward,GrammyYear,Genre,Name,Artist
0,1,0,Record Of The Year,2018,General,this is America,Childish Gambino
1,2,1,Song Of The Year,2018,General,this is America,Childish Gambino
2,3,2,Best Pop Solo Performance,2018,Pop,Joanne (where Do you Think You're Goin'?),Lady Gaga
3,4,3,Best Pop Duo/Group Performance,2018,Pop,Shallow,Lady Gaga & Bradley Cooper
4,5,4,Best Dance Recording,2018,Dance/Electronic Music,Electricity,Silk City & Dua Lipa Featuring Diplo & Mark Ro...


### Transform artists DataFrame

In [35]:
# Columns of the raw artists frame that are kept for the transformed table
artists_cols = ['Artist', 'Genres', 'NumAlbums', 'Gender', 'Group.Solo']

# Build the transformed frame in one chain: select -> rename -> lower-case
# -> de-duplicate on artist -> index by artist.
artists_transformed = (
    artists_df[artists_cols]
    .rename(columns={'Artist': 'artist',
                     'Genres': 'genre',
                     'NumAlbums': 'num_albums',
                     'Gender': 'gender',
                     'Group.Solo': 'group_solo'})
    # Every column is cast to str so that .str.lower() applies uniformly
    .astype(str)
    .apply(lambda column: column.str.lower())
    # Keep the first row seen for each artist name
    .drop_duplicates(subset='artist')
    .set_index('artist')
)

artists_transformed.head()

Unnamed: 0_level_0,genre,num_albums,gender,group_solo
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ed sheeran,"pop,uk pop",8,m,solo
justin bieber,"canadian pop,dance pop,pop,post-teen pop",10,m,solo
jonas brothers,"boy band,dance pop,pop,post-teen pop",10,m,group
drake,"canadian hip hop,canadian pop,hip hop,pop rap,...",11,m,solo
chris brown,"dance pop,pop,pop rap,r&b,rap",6,m,solo


### Transform billboard DataFrame

In [6]:
# Create a filtered dataframe from specific columns
billboard_cols = ['Artists', 'Name', 'Peak.position', 'Weeks.on.chart', 'Week', 'Genre']
billboard_transformed = billboard_df[billboard_cols].copy()

# Reduce the chart week (month/day/year) to its four-digit year
billboard_transformed['Week'] = (
    pd.to_datetime(billboard_transformed['Week'], format='%m/%d/%Y').dt.strftime('%Y')
)

# Rename the column headers
billboard_transformed = billboard_transformed.rename(columns={'Artists': 'artist',
                                                              'Name': 'name_of_song',
                                                              'Peak.position': 'peak_position',
                                                              'Weeks.on.chart': 'weeks_on_chart',
                                                              'Week': 'year',
                                                              'Genre': 'genre'})

# Transform all strings to lower case (every column is cast to str first)
billboard_transformed = billboard_transformed.astype(str).apply(lambda column: column.str.lower())

# Split "artist a, artist b" credits into one list of names per row.
artist_lists = billboard_transformed['artist'].str.split(', ')
has_many_artists = artist_lists.str.len() > 1

# Each original row stays in place, credited to the first listed artist.
# Assigning the whole column avoids chained assignment
# (`df['artist'][i] = ...`), which pandas does not guarantee to write through.
billboard_transformed['artist'] = artist_lists.str[0]

# Songs with several artists additionally get one row per credited artist.
# Building them with explode() and a single concat() replaces the row-by-row
# `.loc[len(df)] = ...` appends, which re-allocated the frame on every append.
extra_artist_rows = (
    billboard_transformed[has_many_artists]
    .assign(artist=artist_lists[has_many_artists])
    .explode('artist')
)

# Extra rows go after the original rows and the index is renumbered, so
# `keep='first'` below prefers an original row over an appended one.
billboard_transformed = pd.concat([billboard_transformed, extra_artist_rows],
                                  ignore_index=True)

# Keep only first row when there are artist/song duplicates
billboard_transformed = billboard_transformed.drop_duplicates(['artist', 'name_of_song'],
                                                              keep='first')
billboard_transformed

Unnamed: 0,artist,name_of_song,peak_position,weeks_on_chart,year,genre
0,lil nas x,old town road,1.0,7.0,2019,"country,atlanta,alternative country,hip-hop,tr..."
1,shawn mendes,senorita,,,2019,pop
2,billie eilish,bad guy,2.0,13.0,2019,"hip-hop,dark pop,house,trap,memes,alternative ..."
3,khalid,talk,3.0,20.0,2019,"synth-pop,pop"
4,ed sheeran,i don't care,2.0,7.0,2019,"canada,uk,dance,dance-pop,pop"
...,...,...,...,...,...,...
106762,case,the best man i can be,77.0,12.0,2000,"soul,ballad,soundtrack,r&;b"
106804,gloria estefan,music of my heart,2.0,20.0,2000,pop
106832,nokio,what ya want,29.0,20.0,1999,rap
106842,dunn,missing you,75.0,3.0,1999,country


### Transform grammy DataFrame

In [46]:
# Create a filtered dataframe from specific columns
grammy_songs_cols = ['Artist', 'GrammyAward', 'GrammyYear', 'Name', 'Genre']

grammy_songs_transformed = (
    grammy_songs_df[grammy_songs_cols]
    # Rename the column headers
    .rename(columns={'Artist': 'artist',
                     'GrammyAward': 'grammy_award',
                     'GrammyYear': 'grammy_year',
                     'Name': 'name_of_song',
                     'Genre': 'genre'})
    # Drop incomplete rows BEFORE casting to str: after the cast a missing
    # value is the literal string 'nan' and dropna() no longer finds it.
    .dropna()
    # Transform all strings to lower case
    .astype(str)
    .apply(lambda column: column.str.lower())
    .reset_index(drop=True)
)

# Role labels that are removed from the credit string. The data is already
# lower-cased at this point, so the patterns must be lower case as well.
ROLE_LABELS = re.compile(r'\b(?:songwriters?|artists?|soloists?)\b')

# Separators between credited names. The word separators are anchored with
# \b so that names merely ending in "and"/"with" (e.g. "band ") are not cut.
ARTIST_SEPARATORS = re.compile(r'[&;,] |\b(?:and|with|featuring) ')


def split_artists(credit):
    """Return the cleaned, non-empty artist names found in one credit string.

    Role labels, parentheses and non-breaking spaces are removed, the string
    is split on the artist separators, and each name is stripped of
    surrounding whitespace. Empty fragments are discarded, so the result may
    be an empty list.
    """
    cleaned = ROLE_LABELS.sub('', credit)
    cleaned = cleaned.replace('(', '').replace(')', '').replace(u'\xa0', u'')
    names = (name.strip() for name in ARTIST_SEPARATORS.split(cleaned))
    # Filtering builds a new list instead of popping from the list being
    # iterated, which skipped elements and could raise IndexError.
    return [name for name in names if name]


artist_lists = grammy_songs_transformed['artist'].map(split_artists)
has_many_artists = artist_lists.map(len) > 1

# Each original row stays in place, credited to its first cleaned name.
# If cleaning leaves nothing, the original credit string is kept.
grammy_songs_transformed['artist'] = [
    names[0] if names else original
    for names, original in zip(artist_lists, grammy_songs_transformed['artist'])
]

# Awards with several artists additionally get one row per credited artist.
# The rows are built in one step instead of appending to the frame while
# iterating over it.
extra_artist_rows = (
    grammy_songs_transformed[has_many_artists]
    .assign(artist=artist_lists[has_many_artists])
    .explode('artist')
)

# Extra rows go after the original rows and the index is renumbered, so
# `keep='first'` below prefers an original row over an appended one.
grammy_songs_transformed = pd.concat([grammy_songs_transformed, extra_artist_rows],
                                     ignore_index=True)
grammy_songs_transformed = grammy_songs_transformed.drop_duplicates(
    ['grammy_award', 'name_of_song', 'artist'], keep='first')
grammy_songs_transformed

Unnamed: 0,artist,grammy_award,grammy_year,name_of_song,genre
0,childish gambino,record of the year,2018,this is america,general
1,childish gambino,song of the year,2018,this is america,general
2,lady gaga,best pop solo performance,2018,joanne (where do you think you're goin'?),pop
3,lady gaga,best pop duo/group performance,2018,shallow,pop
4,silk city,best dance recording,2018,electricity,dance/electronic music
...,...,...,...,...,...
779,rob thomas,record of the year,1999,smooth,general
781,rob thomas,song of the year,1999,smooth,general
783,rob thomas,best pop collaboration with vocals,1999,smooth,pop
785,everlast,best rock performance by a duo or group with v...,1999,put your lights on,rock


### Create database connection

In [47]:
# URL-encode the password: characters such as '@', ':' or '/' would otherwise
# be parsed as part of the URL structure and break the connection string.
connection_string = f'postgres:{quote_plus(pw)}@localhost:5432/artists_db'
engine = create_engine(f'postgresql://{connection_string}')

In [48]:
# Confirm tables. Engine.table_names() is deprecated (removed in
# SQLAlchemy 2.0); the inspector returns the same list of table names.
inspect(engine).get_table_names()

['artists', 'billboard', 'grammy_songs']

### Load DataFrames into database

In [126]:
# Append the artists rows to the 'artists' table; the 'artist' index is
# written as a column because index=True.
artists_transformed.to_sql(
    name='artists',
    con=engine,
    index=True,
    if_exists='append',
)

In [131]:
# Append the billboard rows to the 'billboard' table; the positional index
# is not written.
billboard_transformed.to_sql(
    name='billboard',
    con=engine,
    index=False,
    if_exists='append',
)

In [132]:
# Append the grammy rows to the 'grammy_songs' table. The frame built in the
# transform section is `grammy_songs_transformed`; `grammy_songs_clean` is
# never defined in this notebook and fails on a fresh kernel.
grammy_songs_transformed.to_sql(name='grammy_songs', con=engine, if_exists='append', index=False)