In [2]:
import pandas as pd
from sqlalchemy import create_engine
import datetime as dt
import numpy as np

In [3]:
#reading csv files into python
Artist_File_Path='Resources/artistDf.csv'
Billboard_File_Path='Resources/billboardHot100_1999-2019_sample.csv'
Grammy_File_Path='Resources/grammySongs_1999-2019.csv'

Adf=pd.read_csv(Artist_File_Path)
Bdf = pd.read_csv(Billboard_File_Path)
Gdf=pd.read_csv(Grammy_File_Path)


In [4]:
def login():
    pw=input("Please enter your password for PGAdmin: ")
    return pw

In [5]:
#eliminate extra columns
Adf=Adf[['Artist','Genres','NumAlbums','Gender']].copy()
Bdf = Bdf[['Artists', 'Name', 'Peak.position', 'Weeks.on.chart','Week','Genre']].copy()
Gdf=Gdf[['Artist','GrammyAward','GrammyYear','Name', 'Genre']].copy()

In [6]:
#get rid of all na columns
Adf=Adf.dropna()
Bdf=Bdf.dropna()
Gdf=Gdf.dropna()
#changed all strings to lower case
Gdf = Gdf.astype(str).apply(lambda x: x.str.lower())
Adf = Adf.astype(str).apply(lambda x: x.str.lower())
Bdf = Bdf.astype(str).apply(lambda x: x.str.lower())

### Artist dataframe 

In [7]:
# Rename the column headers
Adf = Adf.rename(columns={'Artist':'artist', 
'Genres': 'genre',
'NumAlbums': 'num_albums',
'Gender': 'gender',
'Group.Solo': 'group_solo'})

# Clean the data by dropping duplicates and setting the index
Adf.drop_duplicates('artist', inplace=True)
Adf.set_index('artist', inplace=True)
Adf.head()

Unnamed: 0_level_0,genre,num_albums,gender
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ed sheeran,"pop,uk pop",8,m
justin bieber,"canadian pop,dance pop,pop,post-teen pop",10,m
jonas brothers,"boy band,dance pop,pop,post-teen pop",10,m
drake,"canadian hip hop,canadian pop,hip hop,pop rap,...",11,m
chris brown,"dance pop,pop,pop rap,r&b,rap",6,m


In [8]:
#Checking for null values 
Adf.isnull().sum()  

genre         0
num_albums    0
gender        0
dtype: int64

### Billboard dataframe

In [9]:
Bdf = Bdf.rename(columns={'Artists':'artist',
                          'Name': 'name_of_song',
                          'Peak.position': 'peak_position',
                          'Weeks.on.chart': 'weeks_on_chart',
                          'Week': 'year',
                          'Genre': 'genre'})
Bdf

Unnamed: 0,artist,name_of_song,peak_position,weeks_on_chart,year,genre
0,"lil nas,",old town road,1.0,7.0,2019-07-06,"country,atlanta,alternative country,hip-hop,tr..."
2,billie eilish,bad guy,2.0,13.0,2019-07-06,"hip-hop,dark pop,house,trap,memes,alternative ..."
3,khalid,talk,3.0,20.0,2019-07-06,"synth-pop,pop"
4,"ed sheeran, justin bieber",i don't care,2.0,7.0,2019-07-06,"canada,uk,dance,dance-pop,pop"
5,jonas brothers,sucker,1.0,17.0,2019-07-06,"alternative pop,boy band,teen pop,pop-rock,pop"
...,...,...,...,...,...,...
24995,schoolboy q,studio,75.0,3.0,2014-07-04,"west coast,rap"
24996,dustin lynch,where it's at,76.0,7.0,2014-07-04,"rock,country"
24997,craig campbell,keep them kisses comin',72.0,8.0,2014-07-04,pop
24998,lorde,tennis court,71.0,13.0,2014-07-04,"alternative pop,alternative,downtempo,new zeal..."


In [10]:
#Change year column to datetime
Bdf['year']= pd.to_datetime(Bdf['year'])
#Retain only the year from datetime
Bdf['year'] = Bdf['year'].dt.year
Bdf

Unnamed: 0,artist,name_of_song,peak_position,weeks_on_chart,year,genre
0,"lil nas,",old town road,1.0,7.0,2019,"country,atlanta,alternative country,hip-hop,tr..."
2,billie eilish,bad guy,2.0,13.0,2019,"hip-hop,dark pop,house,trap,memes,alternative ..."
3,khalid,talk,3.0,20.0,2019,"synth-pop,pop"
4,"ed sheeran, justin bieber",i don't care,2.0,7.0,2019,"canada,uk,dance,dance-pop,pop"
5,jonas brothers,sucker,1.0,17.0,2019,"alternative pop,boy band,teen pop,pop-rock,pop"
...,...,...,...,...,...,...
24995,schoolboy q,studio,75.0,3.0,2014,"west coast,rap"
24996,dustin lynch,where it's at,76.0,7.0,2014,"rock,country"
24997,craig campbell,keep them kisses comin',72.0,8.0,2014,pop
24998,lorde,tennis court,71.0,13.0,2014,"alternative pop,alternative,downtempo,new zeal..."


In [11]:
# Check if the 'artists_split' list is greater than 1. If it is, it means that more than one artist is listed for a song.
# Modify original Bdf dataframe with one name in artist colum   
Bdf = Bdf.drop('artist', axis=1).join(Bdf['artist'].str.split(',', expand=True).stack().reset_index(level=1, drop=True).rename('artist'))
Bdf

Unnamed: 0,name_of_song,peak_position,weeks_on_chart,year,genre,artist
0,old town road,1.0,7.0,2019,"country,atlanta,alternative country,hip-hop,tr...",lil nas
0,old town road,1.0,7.0,2019,"country,atlanta,alternative country,hip-hop,tr...",
2,bad guy,2.0,13.0,2019,"hip-hop,dark pop,house,trap,memes,alternative ...",billie eilish
3,talk,3.0,20.0,2019,"synth-pop,pop",khalid
4,i don't care,2.0,7.0,2019,"canada,uk,dance,dance-pop,pop",ed sheeran
...,...,...,...,...,...,...
24995,studio,75.0,3.0,2014,"west coast,rap",schoolboy q
24996,where it's at,76.0,7.0,2014,"rock,country",dustin lynch
24997,keep them kisses comin',72.0,8.0,2014,pop,craig campbell
24998,tennis court,71.0,13.0,2014,"alternative pop,alternative,downtempo,new zeal...",lorde


In [12]:
Bdf.head(50)

Unnamed: 0,name_of_song,peak_position,weeks_on_chart,year,genre,artist
0,old town road,1.0,7.0,2019,"country,atlanta,alternative country,hip-hop,tr...",lil nas
0,old town road,1.0,7.0,2019,"country,atlanta,alternative country,hip-hop,tr...",
2,bad guy,2.0,13.0,2019,"hip-hop,dark pop,house,trap,memes,alternative ...",billie eilish
3,talk,3.0,20.0,2019,"synth-pop,pop",khalid
4,i don't care,2.0,7.0,2019,"canada,uk,dance,dance-pop,pop",ed sheeran
4,i don't care,2.0,7.0,2019,"canada,uk,dance,dance-pop,pop",justin bieber
5,sucker,1.0,17.0,2019,"alternative pop,boy band,teen pop,pop-rock,pop",jonas brothers
6,suge,7.0,13.0,2019,"trap,east coast,rap",dababy
7,money in the grave,7.0,2.0,2019,"hip-hop,rap,basketball,nba,canada",drake
8,no guidance,9.0,3.0,2019,"alternative r&;b,hip-hop,rap,pop,dmv,canada,r&;b",chris brown


In [13]:
#To remove null values, use numpy to replace the empty cells to nan
Bdf['artist'].replace('', np.nan, inplace=True)
Bdf

Unnamed: 0,name_of_song,peak_position,weeks_on_chart,year,genre,artist
0,old town road,1.0,7.0,2019,"country,atlanta,alternative country,hip-hop,tr...",lil nas
0,old town road,1.0,7.0,2019,"country,atlanta,alternative country,hip-hop,tr...",
2,bad guy,2.0,13.0,2019,"hip-hop,dark pop,house,trap,memes,alternative ...",billie eilish
3,talk,3.0,20.0,2019,"synth-pop,pop",khalid
4,i don't care,2.0,7.0,2019,"canada,uk,dance,dance-pop,pop",ed sheeran
...,...,...,...,...,...,...
24995,studio,75.0,3.0,2014,"west coast,rap",schoolboy q
24996,where it's at,76.0,7.0,2014,"rock,country",dustin lynch
24997,keep them kisses comin',72.0,8.0,2014,pop,craig campbell
24998,tennis court,71.0,13.0,2014,"alternative pop,alternative,downtempo,new zeal...",lorde


In [14]:
#Checking for null values 
Bdf.isnull().sum()  

name_of_song       0
peak_position      0
weeks_on_chart     0
year               0
genre              0
artist            20
dtype: int64

In [15]:
Bdf1 = Bdf.dropna()
Bdf1

Unnamed: 0,name_of_song,peak_position,weeks_on_chart,year,genre,artist
0,old town road,1.0,7.0,2019,"country,atlanta,alternative country,hip-hop,tr...",lil nas
2,bad guy,2.0,13.0,2019,"hip-hop,dark pop,house,trap,memes,alternative ...",billie eilish
3,talk,3.0,20.0,2019,"synth-pop,pop",khalid
4,i don't care,2.0,7.0,2019,"canada,uk,dance,dance-pop,pop",ed sheeran
4,i don't care,2.0,7.0,2019,"canada,uk,dance,dance-pop,pop",justin bieber
...,...,...,...,...,...,...
24995,studio,75.0,3.0,2014,"west coast,rap",schoolboy q
24996,where it's at,76.0,7.0,2014,"rock,country",dustin lynch
24997,keep them kisses comin',72.0,8.0,2014,pop,craig campbell
24998,tennis court,71.0,13.0,2014,"alternative pop,alternative,downtempo,new zeal...",lorde


In [16]:
#reseting index to remove the repeated index numbers from stacking
Bdf1.reset_index(drop=True, inplace=True)

In [17]:
Bdf1

Unnamed: 0,name_of_song,peak_position,weeks_on_chart,year,genre,artist
0,old town road,1.0,7.0,2019,"country,atlanta,alternative country,hip-hop,tr...",lil nas
1,bad guy,2.0,13.0,2019,"hip-hop,dark pop,house,trap,memes,alternative ...",billie eilish
2,talk,3.0,20.0,2019,"synth-pop,pop",khalid
3,i don't care,2.0,7.0,2019,"canada,uk,dance,dance-pop,pop",ed sheeran
4,i don't care,2.0,7.0,2019,"canada,uk,dance,dance-pop,pop",justin bieber
...,...,...,...,...,...,...
25528,studio,75.0,3.0,2014,"west coast,rap",schoolboy q
25529,where it's at,76.0,7.0,2014,"rock,country",dustin lynch
25530,keep them kisses comin',72.0,8.0,2014,pop,craig campbell
25531,tennis court,71.0,13.0,2014,"alternative pop,alternative,downtempo,new zeal...",lorde


In [18]:
#Keep only first row when there are artist/song duplicate

Bdf1.drop_duplicates(['artist', 'name_of_song'], keep='first', inplace=True)
Bdf1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Bdf1.drop_duplicates(['artist', 'name_of_song'], keep='first', inplace=True)


Unnamed: 0,name_of_song,peak_position,weeks_on_chart,year,genre,artist
0,old town road,1.0,7.0,2019,"country,atlanta,alternative country,hip-hop,tr...",lil nas
1,bad guy,2.0,13.0,2019,"hip-hop,dark pop,house,trap,memes,alternative ...",billie eilish
2,talk,3.0,20.0,2019,"synth-pop,pop",khalid
3,i don't care,2.0,7.0,2019,"canada,uk,dance,dance-pop,pop",ed sheeran
4,i don't care,2.0,7.0,2019,"canada,uk,dance,dance-pop,pop",justin bieber
...,...,...,...,...,...,...
25458,automatic,35.0,19.0,2014,"rock,country",miranda lambert
25459,wild wild love,30.0,18.0,2014,"pop,rap",pitbull
25503,demons,6.0,61.0,2014,"adult alternative,adult contemporary,alternati...",imagine dragons
25518,beat of the music,44.0,20.0,2014,country,brett eldredge


### Grammy dataframe

In [19]:
# Rename the column headers
Gdf = Gdf.rename(columns={'Artist':'artist',
                        'GrammyAward': 'grammy_award',
                        'GrammyYear': 'grammy_year',
                        'Name': 'name_of_song',
                        'Genre': 'genre'})
Gdf

Unnamed: 0,artist,grammy_award,grammy_year,name_of_song,genre
0,childish gambino,record of the year,2018,this is america,general
1,childish gambino,song of the year,2018,this is america,general
2,lady gaga,best pop solo performance,2018,joanne (where do you think you're goin'?),pop
3,lady gaga & bradley cooper,best pop duo/group performance,2018,shallow,pop
4,silk city & dua lipa featuring diplo & mark ro...,best dance recording,2018,electricity,dance/electronic music
...,...,...,...,...,...
423,eminem,best rap solo performance,1999,my name is,rap
424,the roots featuring erykah badu,best rap performance by a duo or group,1999,you got me,rap
425,shania twain,best female country vocal performance,1999,man! i feel like a woman!,country
426,george jones,best male country vocal performance,1999,choices,country


In [20]:
Gdf  = Gdf .drop('artist', axis=1).join(Gdf ['artist'].str.split(',', expand=True).stack().reset_index(level=1, drop=True).rename('artist'))
Gdf

Unnamed: 0,grammy_award,grammy_year,name_of_song,genre,artist
0,record of the year,2018,this is america,general,childish gambino
1,song of the year,2018,this is america,general,childish gambino
2,best pop solo performance,2018,joanne (where do you think you're goin'?),pop,lady gaga
3,best pop duo/group performance,2018,shallow,pop,lady gaga & bradley cooper
4,best dance recording,2018,electricity,dance/electronic music,silk city & dua lipa featuring diplo & mark ro...
...,...,...,...,...,...
423,best rap solo performance,1999,my name is,rap,eminem
424,best rap performance by a duo or group,1999,you got me,rap,the roots featuring erykah badu
425,best female country vocal performance,1999,man! i feel like a woman!,country,shania twain
426,best male country vocal performance,1999,choices,country,george jones


In [21]:
Gdf['artist'] = Gdf['artist'].str.replace('Songwriters','')

Gdf['artist'] = Gdf['artist'].str.replace('Songwriter','')
Gdf['artist'] = Gdf['artist'].str.replace('Artists','')

Gdf['artist'] = Gdf['artist'].str.replace('Artist','')
Gdf['artist'] = Gdf['artist'].str.replace('Soloists','')
Gdf['artist'] = Gdf['artist'].str.replace('Soloist','')
Gdf['artist'] = Gdf['artist'].str.replace('(','')
Gdf['artist'] = Gdf['artist'].str.replace(')','')
Gdf['artist'] = Gdf['artist'].str.replace(u'\xa0', u'')

In [22]:
Gdf.isnull().sum()   

grammy_award    0
grammy_year     0
name_of_song    0
genre           0
artist          0
dtype: int64

In [23]:
#reseting index to remove the repeated index numbers from stacking
Gdf.reset_index(drop=True, inplace=True)

In [24]:
Gdf

Unnamed: 0,grammy_award,grammy_year,name_of_song,genre,artist
0,record of the year,2018,this is america,general,childish gambino
1,song of the year,2018,this is america,general,childish gambino
2,best pop solo performance,2018,joanne (where do you think you're goin'?),pop,lady gaga
3,best pop duo/group performance,2018,shallow,pop,lady gaga & bradley cooper
4,best dance recording,2018,electricity,dance/electronic music,silk city & dua lipa featuring diplo & mark ro...
...,...,...,...,...,...
530,best rap solo performance,1999,my name is,rap,eminem
531,best rap performance by a duo or group,1999,you got me,rap,the rootsfeaturing erykah badu
532,best female country vocal performance,1999,man! i feel like a woman!,country,shania twain
533,best male country vocal performance,1999,choices,country,george jones


In [25]:
import psycopg2
pw=login()
connection = None
try:
    # In PostgreSQL, default username is 'postgres' and password is 'postgres'.
    # And also there is a default database exist named as 'postgres'.
    # Default host is 'localhost' or '127.0.0.1'
    # And default port is '54322'.
    connection= psycopg2.connect(
      database='postgres', user='postgres', password=pw, host='127.0.0.1', port= '5432'
    )
    print('Database connected.')

except:
    print('Database not connected.')

if connection is not None:
    connection.autocommit = True

    cur = connection.cursor()
    cur.execute("SELECT datname FROM pg_database;")

    list_database = cur.fetchall()
    if ('musician_db',) in list_database:
        print(" Database musician_db already exist")
    else:
        print(" Database musician_db does not exist, creating one right now.......")
        sql = '''CREATE database musician_db'''
        cur.execute(sql)
        print("Database created successfully........")
    connection.close()
    print('Done')


Please enter your password for PGAdmin: 
Database not connected.


In [None]:
rds_connection_string = f"postgres:{pw}@localhost:5432/musician_db"
engine = create_engine(f'postgresql://{rds_connection_string}')

In [None]:
engine.table_names()

In [None]:
artist_newdf.to_sql(name='Artists', con=engine, if_exists='append', index=False)
Bill_board_newdf.to_sql(name='Billboard', con=engine, if_exists='append', index=False)
Grammy_df.to_sql(name='Grammy', con=engine, if_exists='append', index=False)