In [1]:
import pandas as pd
from sqlalchemy import create_engine
import datetime as dt
import numpy as np
# Module used to split string with different delimiters
import re

### Extract CSVs into DataFrames

In [2]:
# Reading csv files into python
Artist_File_Path = 'Resources/artistDf.csv'
Billboard_File_Path = 'Resources/billboardHot100_1999-2019_sample.csv'
Grammy_File_Path = 'Resources/grammySongs_1999-2019.csv'

# Making them into dataframe
Adf = pd.read_csv(Artist_File_Path)
Bdf = pd.read_csv(Billboard_File_Path)
Gdf = pd.read_csv(Grammy_File_Path)

In [3]:
def login():
    pw=input("Please enter your password for PGAdmin: ")
    return pw

### Transform DataFrames

In [4]:
# Eliminate extra columns:
#Artist Table
Adf=Adf[['Artist','Genres','NumAlbums','Gender']].copy() 

#Billboard Table
Bdf = Bdf[['Artists', 'Name', 'Peak.position', 'Weeks.on.chart','Week','Genre']].copy()

#Grammy Table
Gdf=Gdf[['Artist','GrammyAward','GrammyYear','Name', 'Genre']].copy()

In [5]:
#changed all strings to lower case
Gdf = Gdf.astype(str).apply(lambda x: x.str.lower())
Adf = Adf.astype(str).apply(lambda x: x.str.lower())
Bdf = Bdf.astype(str).apply(lambda x: x.str.lower())

In [6]:
# Rename the column headers
Adf = Adf.rename(columns={'Artist':'artist', 
                          'Genres': 'genre',
                          'NumAlbums': 'num_albums',
                          'Gender': 'gender',
                          'Group.Solo': 'group_solo'})

# Clean the data by dropping duplicates and setting the index
Adf.drop_duplicates('artist', inplace=True)
Adf.set_index('artist', inplace=True)
Adf

Unnamed: 0_level_0,genre,num_albums,gender
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ed sheeran,"pop,uk pop",8,m
justin bieber,"canadian pop,dance pop,pop,post-teen pop",10,m
jonas brothers,"boy band,dance pop,pop,post-teen pop",10,m
drake,"canadian hip hop,canadian pop,hip hop,pop rap,...",11,m
chris brown,"dance pop,pop,pop rap,r&b,rap",6,m
...,...,...,...
dwight yoakam,"country,country rock,nashville sound,outlaw co...",25,m
jordan knight,,4,m
lila mccann,"contemporary country,country,country dawn",4,f
everlast,rap rock,11,m


In [7]:
#Transform Week column into year
Bdf['Week'] = pd.to_datetime(Bdf['Week'], format = '%Y-%m-%d').map(lambda x: x.strftime('%Y'))

# Rename the column headers
Bdf = Bdf.rename(columns={'Artists':'artist',
                          'Name': 'name_of_song',
                          'Peak.position': 'peak_position',
                          'Weeks.on.chart': 'weeks_on_chart',
                          'Week': 'year',
                          'Genre': 'genre'})


#Split artists at (,) and stack them in separate rows   
Bdf = Bdf.drop('artist', axis=1).join(Bdf['artist'].str.split(', ', expand=True).stack().reset_index(level=1, drop=True).rename('artist'))

#Keep only first row when there are artist/song duplicate
Bdf.drop_duplicates(['artist', 'name_of_song'], keep='first', inplace=True)
Bdf['artist'].replace('', np.nan, inplace=True)
Bdf = Bdf.dropna()
Bdf.reset_index(drop=True, inplace=True)
Bdf

Unnamed: 0,name_of_song,peak_position,weeks_on_chart,year,genre,artist
0,old town road,1.0,7.0,2019,"country,atlanta,alternative country,hip-hop,tr...","lil nas,"
1,senorita,,,2019,pop,shawn mendes
2,senorita,,,2019,pop,camila cabello
3,bad guy,2.0,13.0,2019,"hip-hop,dark pop,house,trap,memes,alternative ...",billie eilish
4,talk,3.0,20.0,2019,"synth-pop,pop",khalid
...,...,...,...,...,...,...
2520,kiss me kiss me,,,2014,"australia,punk rock,pop-rock,pop,rock",5 seconds of summer
2521,afire love,,,2014,"rock,uk,pop",ed sheeran
2522,demons,6.0,61.0,2014,"adult alternative,adult contemporary,alternati...",imagine dragons
2523,beat of the music,44.0,20.0,2014,country,brett eldredge


In [8]:
# Rename the column headers
Gdf = Gdf.rename(columns={'Artist':'artist',
                          'GrammyAward': 'grammy_award',
                          'GrammyYear': 'grammy_year',
                          'Name': 'name_of_song',
                          'Genre': 'genre'})


Gdf = Gdf.reset_index()

del Gdf['index']

#Replacing certain words with an empty space
Gdf['artist'] = Gdf['artist'].str.replace('songwriters','')
Gdf['artist'] = Gdf['artist'].str.replace('songwriter','')
Gdf['artist'] = Gdf['artist'].str.replace('artists','')
Gdf['artist'] = Gdf['artist'].str.replace('artist','')
Gdf['artist'] = Gdf['artist'].str.replace('soloists','')
Gdf['artist'] = Gdf['artist'].str.replace('soloist','')
Gdf['artist'] = Gdf['artist'].str.replace('(','')
Gdf['artist'] = Gdf['artist'].str.replace(')','')
Gdf['artist'] = Gdf['artist'].str.replace(u'\xa0', u'')

#Split artists at (,) and stack them in separate rows   
Gdf = Gdf.drop('artist', axis=1).join(Gdf ['artist'].str.split(', ', expand=True).stack().reset_index(level=1, drop=True).rename('artist'))
Gdf = Gdf.drop('artist', axis=1).join(Gdf ['artist'].str.split('& |; |, |and |with |featuring',expand=True).stack().reset_index(level=1, drop=True).rename('artist'))
Gdf['artist'].replace('', np.nan, inplace=True)
Gdf.reset_index(drop=True, inplace=True)
Gdf = Gdf.drop_duplicates(['grammy_award','name_of_song','artist'], keep='first')
Gdf

Unnamed: 0,grammy_award,grammy_year,name_of_song,genre,artist
0,record of the year,2018,this is america,general,childish gambino
1,song of the year,2018,this is america,general,childish gambino
2,best pop solo performance,2018,joanne (where do you think you're goin'?),pop,lady gaga
3,best pop duo/group performance,2018,shallow,pop,lady gaga
4,best pop duo/group performance,2018,shallow,pop,bradley cooper
...,...,...,...,...,...
1736,best rap performance by a duo or group,1999,you got me,rap,the roots
1737,best rap performance by a duo or group,1999,you got me,rap,erykah badu
1738,best female country vocal performance,1999,man! i feel like a woman!,country,shania twain
1739,best male country vocal performance,1999,choices,country,george jones


### Merging tables

In [9]:
B_Gdf=pd.merge(Bdf,Gdf,on='name_of_song')
B_Gdf=B_Gdf[['name_of_song', 'peak_position', 'weeks_on_chart', 'year', 'genre_y', 'artist_x', 'grammy_award', 'grammy_year']]
B_Gdf=B_Gdf.drop_duplicates()
B_Gdf=B_Gdf.groupby(['name_of_song','artist_x', 'peak_position', 'year', 'weeks_on_chart'])['grammy_award'].apply(', '.join).reset_index()
B_Gdf.rename(columns={'artist_x':'artist','genre_y':'genre'})

Unnamed: 0,name_of_song,artist,peak_position,year,weeks_on_chart,grammy_award
0,24k magic,bruno mars,4.0,2017,41.0,record of the year
1,again,fetty wap,33.0,2016,26.0,best male rock vocal performance
2,ain't it fun,paramore,10.0,2014,24.0,best rock song
3,alright,kendrick lamar,81.0,2015,14.0,"best rap performance, best rap song"
4,bad romance,lady gaga,2.0,2017,35.0,best female pop vocal performance
...,...,...,...,...,...,...
64,trouble,iggy azalea,67.0,2015,5.0,best female rock vocal performance
65,vertigo,khalid,,2018,,best rock performance by a duo or group with v...
66,woman,kesha,,2017,,best hard rock performance
67,work it,"missy ""misdemeanor"" elliott",2.0,2015,26.0,best female rap solo performance


### Load into database

In [None]:
import psycopg2
pw=login()
connection = None
try:
    # In PostgreSQL, default username is 'postgres' and password is 'postgres'.
    # And also there is a default database exist named as 'postgres'.
    # Default host is 'localhost' or '127.0.0.1'
    # And default port is '54322'.
    connection= psycopg2.connect(
      database='postgres', user='postgres', password=pw, host='127.0.0.1', port= '5432'
    )
    print('Database connected.')

except:
    print('Database not connected.')
#Check is the database exists or not
if connection is not None:
    connection.autocommit = True

    cur = connection.cursor()
    cur.execute("SELECT datname FROM pg_database;")

    list_database = cur.fetchall()
    if ('artist_db',) in list_database:
        print(" Database artist_db already exist")
    else:
        print(" Database artist_db does not exist, creating one right now.......")
        sql = '''CREATE database artist_db'''
        cur.execute(sql)
        print("Database created successfully........")
    connection.close()
    print('Done')


In [None]:
#Create connection
rds_connection_string = f"postgres:{pw}@localhost:5432/artist_db"
engine = create_engine(f'postgresql://{rds_connection_string}')

In [None]:
#Append Dataframs as tables to artist_db
Adf.to_sql(name='artists', con=engine, if_exists='append', index=True)
Bdf.to_sql(name='billboard', con=engine, if_exists='append', index=False)
Gdf.to_sql(name='grammy_songs', con=engine, if_exists='append', index=False)
B_Gdf.to_sql(name='grammy_songs_joined_billboard', con=engine, if_exists='append', index=False)