In [14]:
import pandas as pd
from sqlalchemy import create_engine
import datetime as dt
import numpy as np
# Module used to split string with different delimiters
import re


In [15]:
#reading csv files into python
Artist_File_Path='Resources/artistDf.csv'
Billboard_File_Path='Resources/billboardHot100_1999-2019_sample.csv'
Grammy_File_Path='Resources/grammySongs_1999-2019.csv'

Adf=pd.read_csv(Artist_File_Path)
Bdf = pd.read_csv(Billboard_File_Path)
Gdf=pd.read_csv(Grammy_File_Path)


In [16]:
def login():
    pw=input("Please enter your password for PGAdmin: ")
    return pw

In [17]:
#eliminate extra columns
#Artist Table
Adf=Adf[['Artist','Followers','Genres','NumAlbums','Gender']].copy()
#Billboard Table
Bdf = Bdf[['Artists', 'Name', 'Peak.position', 'Weeks.on.chart','Week','Genre']].copy()
#Grammy Table
Gdf=Gdf[['Artist','GrammyAward','GrammyYear','Name', 'Genre']].copy()

In [18]:
Gdf

Unnamed: 0,Artist,GrammyAward,GrammyYear,Name,Genre
0,Childish Gambino,Record Of The Year,2018,this is America,General
1,Childish Gambino,Song Of The Year,2018,this is America,General
2,Lady Gaga,Best Pop Solo Performance,2018,Joanne (where Do you Think You're Goin'?),Pop
3,Lady Gaga & Bradley Cooper,Best Pop Duo/Group Performance,2018,Shallow,Pop
4,Silk City & Dua Lipa Featuring Diplo & Mark Ro...,Best Dance Recording,2018,Electricity,Dance/Electronic Music
...,...,...,...,...,...
423,Eminem,Best Rap Solo Performance,1999,My Name is,Rap
424,The roots featuring Erykah Badu,Best Rap Performance By A Duo Or Group,1999,you Got Me,Rap
425,Shania Twain,Best Female Country Vocal Performance,1999,Man! i Feel Like a Woman!,Country
426,George Jones,Best Male Country Vocal Performance,1999,Choices,Country


In [19]:
#get rid of all na columns
Adf=Adf.dropna()
Bdf=Bdf.dropna()
Gdf=Gdf.dropna()
#changed all strings to lower case
Gdf = Gdf.astype(str).apply(lambda x: x.str.lower())
Adf = Adf.astype(str).apply(lambda x: x.str.lower())
Bdf = Bdf.astype(str).apply(lambda x: x.str.lower())

In [20]:
# Rename the column headers
Adf = Adf.rename(columns={'Artist':'artist', 
'Genres': 'genre',
'NumAlbums': 'num_albums',
'Gender': 'gender',
'Group.Solo': 'group_solo'})

# Clean the data by dropping duplicates and setting the index
Adf.drop_duplicates('artist', inplace=True)
Adf.set_index('artist', inplace=True)
Adf.head()

Unnamed: 0_level_0,Followers,genre,num_albums,gender
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ed sheeran,52698756,"pop,uk pop",8,m
justin bieber,30711450,"canadian pop,dance pop,pop,post-teen pop",10,m
jonas brothers,3069527,"boy band,dance pop,pop,post-teen pop",10,m
drake,41420478,"canadian hip hop,canadian pop,hip hop,pop rap,...",11,m
chris brown,9676862,"dance pop,pop,pop rap,r&b,rap",6,m


In [21]:
Bdf['Week'] = pd.to_datetime(Bdf['Week'], format = '%Y-%m-%d').map(lambda x: x.strftime('%Y'))

# Rename the column headers
Bdf = Bdf.rename(columns={'Artists':'artist',
                                                              'Name': 'name_of_song',
                                                              'Peak.position': 'peak_position',
                                                              'Weeks.on.chart': 'weeks_on_chart',
                                                              'Week': 'year',
                                                              'Genre': 'genre'})


#Split artists at (,) and stack them in separate rows   
Bdf = Bdf.drop('artist', axis=1).join(Bdf['artist'].str.split(',', expand=True).stack().reset_index(level=1, drop=True).rename('artist'))

#Keep only first row when there are artist/song duplicate
Bdf.drop_duplicates(['artist', 'name_of_song'], keep='first', inplace=True)
Bdf['artist'].replace('', np.nan, inplace=True)
Bdf = Bdf.dropna()
Bdf.reset_index(drop=True, inplace=True)

In [22]:
# Rename the column headers
Gdf = Gdf.rename(columns={'Artist':'artist',
                                                                    'GrammyAward': 'grammy_award',
                                                                    'GrammyYear': 'grammy_year',
                                                                    'Name': 'name_of_song',
                                                                    'Genre': 'genre'})


Gdf = Gdf.reset_index()

del Gdf['index']


Gdf  = Gdf .drop('artist', axis=1).join(Gdf ['artist'].str.split(',', expand=True).stack().reset_index(level=1, drop=True).rename('artist'))
Gdf  = Gdf .drop('artist', axis=1).join(Gdf ['artist'].str.split('& |; |, |and |Featuring |with |featuring',expand=True).stack().reset_index(level=1, drop=True).rename('artist'))
Gdf['artist'].replace('', np.nan, inplace=True)
Gdf.reset_index(drop=True, inplace=True)
Gdf = Gdf.drop_duplicates(['grammy_award','name_of_song','artist'], keep='first')
Gdf

Unnamed: 0,grammy_award,grammy_year,name_of_song,genre,artist
0,record of the year,2018,this is america,general,childish gambino
1,song of the year,2018,this is america,general,childish gambino
2,best pop solo performance,2018,joanne (where do you think you're goin'?),pop,lady gaga
3,best pop duo/group performance,2018,shallow,pop,lady gaga
4,best pop duo/group performance,2018,shallow,pop,bradley cooper
...,...,...,...,...,...
1725,best rap performance by a duo or group,1999,you got me,rap,the roots
1726,best rap performance by a duo or group,1999,you got me,rap,erykah badu
1727,best female country vocal performance,1999,man! i feel like a woman!,country,shania twain
1728,best male country vocal performance,1999,choices,country,george jones


In [23]:
import psycopg2
pw=login()
connection = None
try:
    # In PostgreSQL, default username is 'postgres' and password is 'postgres'.
    # And also there is a default database exist named as 'postgres'.
    # Default host is 'localhost' or '127.0.0.1'
    # And default port is '54322'.
    connection= psycopg2.connect(
      database='postgres', user='postgres', password=pw, host='127.0.0.1', port= '5432'
    )
    print('Database connected.')

except:
    print('Database not connected.')

if connection is not None:
    connection.autocommit = True

    cur = connection.cursor()
    cur.execute("SELECT datname FROM pg_database;")

    list_database = cur.fetchall()
    if ('artist_db',) in list_database:
        print(" Database artist_db already exist")
    else:
        print(" Database artist_db does not exist, creating one right now.......")
        sql = '''CREATE database artist_db'''
        cur.execute(sql)
        print("Database created successfully........")
    connection.close()
    print('Done')


Database connected.
 Database artist_db already exist
Done


In [24]:

rds_connection_string = f"postgres:{pw}@localhost:5432/artist_db"
engine = create_engine(f'postgresql://{rds_connection_string}')

In [25]:
engine.table_names()

['artists', 'billboard', 'grammy_songs']

In [26]:
Adf.to_sql(name='artists', con=engine, if_exists='append', index=False)
Bdf.to_sql(name='billboard', con=engine, if_exists='append', index=False)
Gdf.to_sql(name='grammy_songs', con=engine, if_exists='append', index=False)
