# Spotify Data 

In [2]:
import ast
from datetime import timedelta

import numpy as np
import pandas as pd
from sqlalchemy import create_engine, inspect

from config import key

### Data Source
https://www.kaggle.com/yamaerenay/spotify-dataset-19212020-160k-tracks

This data set on Kaggle has 5 csv files, but only the first data set is used. It has song information on music released from 1921 to 2020 that is available on Spotify. It does not include any sort of ranking or number of plays; it is data for the song itself which includes the characteristics or attributes of the song. From the link, we can see what the creator intended for each of the categories and how to parse their data:
    - id (Id of track generated by Spotify)
    - acousticness (Ranges from 0 to 1)
    - danceability (Ranges from 0 to 1)
    - energy (Ranges from 0 to 1)
    - duration_ms (Integer typically ranging from 200k to 300k)
    - instrumentalness (Ranges from 0 to 1)
    - valence (Ranges from 0 to 1)
    - popularity (Ranges from 0 to 100)
    - tempo (Float typically ranging from 50 to 150)
    - liveness (Ranges from 0 to 1)
    - loudness (Float typically ranging from -60 to 0)
    - speechiness (Ranges from 0 to 1)
    - year (Ranges from 1921 to 2020)
    - mode (0 = Minor, 1 = Major)
    - explicit (0 = No explicit content, 1 = Explicit content)
    - key (All keys on octave encoded as values ranging from 0 to 11, starting on C as 0, C# as 1 and so on…)
    - artists (List of artists mentioned)
    - release_date (Date of release mostly in yyyy-mm-dd format, however precision of date may vary)
    - name (Name of the song)

All of that information was pulled from the Spotify API, and more information can be found here:  
https://developer.spotify.com/documentation/web-api/reference/#endpoint-get-audio-features

In [3]:
# Load the initial (raw) data set from the Kaggle CSV export
spotify_path = "../Resources/Spotify/data.csv"
spotify_data = pd.read_csv(filepath_or_buffer=spotify_path)
spotify_data

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
0,0.991000,['Mamie Smith'],0.598,168333,0.224,0,0cS0A1fUEUd1EW3FcF8AEI,0.000522,5,0.3790,-12.628,0,Keep A Song In Your Soul,12,1920,0.0936,149.976,0.6340,1920
1,0.643000,"[""Screamin' Jay Hawkins""]",0.852,150200,0.517,0,0hbkKFIJm7Z05H8Zl9w30f,0.026400,5,0.0809,-7.261,0,I Put A Spell On You,7,1920-01-05,0.0534,86.889,0.9500,1920
2,0.993000,['Mamie Smith'],0.647,163827,0.186,0,11m7laMUgmOKqI3oYzuhne,0.000018,0,0.5190,-12.098,1,Golfing Papa,4,1920,0.1740,97.600,0.6890,1920
3,0.000173,['Oscar Velazquez'],0.730,422087,0.798,0,19Lc5SfJJ5O1oaxY0fpwfh,0.801000,2,0.1280,-7.311,1,True House Music - Xavier Santos & Carlos Gomi...,17,1920-01-01,0.0425,127.997,0.0422,1920
4,0.295000,['Mixe'],0.704,165224,0.707,1,2hJjbsLCytGsnAHfdsLejp,0.000246,10,0.4020,-6.036,0,Xuniverxe,2,1920-10-01,0.0768,122.076,0.2990,1920
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174384,0.009170,"['DJ Combo', 'Sander-7', 'Tony T']",0.792,147615,0.866,0,46LhBf6TvYjZU2SMvGZAbn,0.000060,6,0.1780,-5.089,0,The One,0,2020-12-25,0.0356,125.972,0.1860,2020
174385,0.795000,['Alessia Cara'],0.429,144720,0.211,0,7tue2Wemjd0FZzRtDrQFZd,0.000000,4,0.1960,-11.665,1,A Little More,0,2021-01-22,0.0360,94.710,0.2280,2021
174386,0.806000,['Roger Fly'],0.671,218147,0.589,0,48Qj61hOdYmUCFJbpQ29Ob,0.920000,4,0.1130,-12.393,0,Together,0,2020-12-09,0.0282,108.058,0.7140,2020
174387,0.920000,['Taylor Swift'],0.462,244000,0.240,1,1gcyHQpBQ1lfXGdhZmWrHP,0.000000,0,0.1130,-12.077,1,champagne problems,69,2021-01-07,0.0377,171.319,0.3200,2021


The code below strips the square brackets and the single and double quotes from the artists column

In [4]:
def _format_artists(raw_artists):
    """Convert one raw `artists` cell into a plain, comma-separated string.

    The CSV stores artists as the text of a Python list, e.g.
    "['DJ Combo', 'Sander-7', 'Tony T']". Stripping brackets and quotes from
    the ends of that text only cleans single-artist rows; multi-artist rows
    keep their inner quotes ("DJ Combo', 'Sander-7', 'Tony T"). Parsing the
    list literal and joining the names handles both cases.

    Parameters
    ----------
    raw_artists : object
        The cell value; non-string values (e.g. NaN) are returned unchanged.

    Returns
    -------
    object
        The artist names joined with ", ", or the edge-stripped text when the
        value is not a parsable list literal.
    """
    if not isinstance(raw_artists, str):
        return raw_artists
    try:
        parsed = ast.literal_eval(raw_artists)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return ", ".join(str(artist) for artist in parsed)
    # Not a list literal (e.g. the cell was already cleaned): fall back to
    # stripping brackets and quotes from the ends only, which is a no-op on clean text
    return raw_artists.strip('[]').strip("'").strip('"')


spotify_data['artists'] = spotify_data['artists'].map(_format_artists)

In [7]:
# Preview the first five rows to confirm the artists column is now clean
spotify_data.head(n=5)

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
0,0.991,Mamie Smith,0.598,168333,0.224,0,0cS0A1fUEUd1EW3FcF8AEI,0.000522,5,0.379,-12.628,0,Keep A Song In Your Soul,12,1920,0.0936,149.976,0.634,1920
1,0.643,Screamin' Jay Hawkins,0.852,150200,0.517,0,0hbkKFIJm7Z05H8Zl9w30f,0.0264,5,0.0809,-7.261,0,I Put A Spell On You,7,1920-01-05,0.0534,86.889,0.95,1920
2,0.993,Mamie Smith,0.647,163827,0.186,0,11m7laMUgmOKqI3oYzuhne,1.8e-05,0,0.519,-12.098,1,Golfing Papa,4,1920,0.174,97.6,0.689,1920
3,0.000173,Oscar Velazquez,0.73,422087,0.798,0,19Lc5SfJJ5O1oaxY0fpwfh,0.801,2,0.128,-7.311,1,True House Music - Xavier Santos & Carlos Gomi...,17,1920-01-01,0.0425,127.997,0.0422,1920
4,0.295,Mixe,0.704,165224,0.707,1,2hJjbsLCytGsnAHfdsLejp,0.000246,10,0.402,-6.036,0,Xuniverxe,2,1920-10-01,0.0768,122.076,0.299,1920


The id was changed to spotify_song_id for clarity later when the other data sets get added. The name was changed to song_title to work with the other data sets

In [8]:
# Old column name -> new column name
column_renames = {"id": "spotify_song_id", "name": "song_title"}
spotify_df = spotify_data.rename(columns=column_renames)
spotify_df.head()

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,spotify_song_id,instrumentalness,key,liveness,loudness,mode,song_title,popularity,release_date,speechiness,tempo,valence,year
0,0.991,Mamie Smith,0.598,168333,0.224,0,0cS0A1fUEUd1EW3FcF8AEI,0.000522,5,0.379,-12.628,0,Keep A Song In Your Soul,12,1920,0.0936,149.976,0.634,1920
1,0.643,Screamin' Jay Hawkins,0.852,150200,0.517,0,0hbkKFIJm7Z05H8Zl9w30f,0.0264,5,0.0809,-7.261,0,I Put A Spell On You,7,1920-01-05,0.0534,86.889,0.95,1920
2,0.993,Mamie Smith,0.647,163827,0.186,0,11m7laMUgmOKqI3oYzuhne,1.8e-05,0,0.519,-12.098,1,Golfing Papa,4,1920,0.174,97.6,0.689,1920
3,0.000173,Oscar Velazquez,0.73,422087,0.798,0,19Lc5SfJJ5O1oaxY0fpwfh,0.801,2,0.128,-7.311,1,True House Music - Xavier Santos & Carlos Gomi...,17,1920-01-01,0.0425,127.997,0.0422,1920
4,0.295,Mixe,0.704,165224,0.707,1,2hJjbsLCytGsnAHfdsLejp,0.000246,10,0.402,-6.036,0,Xuniverxe,2,1920-10-01,0.0768,122.076,0.299,1920


The columns were re-organized to be more readable

In [10]:
# Identifying columns first, followed by the audio-feature columns
column_order = [
    "spotify_song_id",
    "year",
    "artists",
    "song_title",
    "release_date",
    "duration_ms",
    "acousticness",
    "danceability",
    "energy",
    "explicit",
    "instrumentalness",
    "key",
    "liveness",
    "loudness",
    "mode",
    "popularity",
    "speechiness",
    "tempo",
    "valence",
]
spotify_df = spotify_df[column_order]
spotify_df.head()

Unnamed: 0,spotify_song_id,year,artists,song_title,release_date,duration_ms,acousticness,danceability,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,valence
0,0cS0A1fUEUd1EW3FcF8AEI,1920,Mamie Smith,Keep A Song In Your Soul,1920,168333,0.991,0.598,0.224,0,0.000522,5,0.379,-12.628,0,12,0.0936,149.976,0.634
1,0hbkKFIJm7Z05H8Zl9w30f,1920,Screamin' Jay Hawkins,I Put A Spell On You,1920-01-05,150200,0.643,0.852,0.517,0,0.0264,5,0.0809,-7.261,0,7,0.0534,86.889,0.95
2,11m7laMUgmOKqI3oYzuhne,1920,Mamie Smith,Golfing Papa,1920,163827,0.993,0.647,0.186,0,1.8e-05,0,0.519,-12.098,1,4,0.174,97.6,0.689
3,19Lc5SfJJ5O1oaxY0fpwfh,1920,Oscar Velazquez,True House Music - Xavier Santos & Carlos Gomi...,1920-01-01,422087,0.000173,0.73,0.798,0,0.801,2,0.128,-7.311,1,17,0.0425,127.997,0.0422
4,2hJjbsLCytGsnAHfdsLejp,1920,Mixe,Xuniverxe,1920-10-01,165224,0.295,0.704,0.707,1,0.000246,10,0.402,-6.036,0,2,0.0768,122.076,0.299


We decided to focus on only 10 years' worth of data, so the Spotify list was filtered to songs released from 2008 onward. The other data sets cover 2009 to 2019, but the Spotify list was set to include 2008 because the songs on the Billboard chart at the beginning of 2009 would have been released in 2008. Songs from 2020 onward were excluded because the Grammy and Billboard data only go up to 2019.

In [17]:
# Keep songs whose year is at least 2008 and strictly before 2020
release_years = spotify_df["year"]
in_year_window = release_years.ge(2008) & release_years.lt(2020)
spotify_last_10years = spotify_df.loc[in_year_window]
spotify_last_10years

Unnamed: 0,spotify_song_id,year,artists,song_title,release_date,duration_ms,acousticness,danceability,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,valence
9071,6catF1lDhNTjjGa2GxRQNN,2008,Gerry & The Pacemakers,You'll Never Walk Alone - Mono; 2002 Remaster,2008-02-11,160187,0.39400,0.484,0.265,0,0.000000,0,0.1490,-11.101,1,55,0.0322,113.564,0.285
9081,4aSw1QJIMwYSoDEgzgdCJL,2008,Gerry & The Pacemakers,Ferry Cross the Mersey - Mono; 2002 Remaster,2008-02-11,141987,0.25500,0.405,0.365,0,0.000005,6,0.1630,-10.226,0,39,0.0289,104.536,0.588
9085,0ZMMtH875IR2TfkyC4PolD,2008,Gerry & The Pacemakers,Don't Let the Sun Catch You Crying (Main) - Mono,2008-02-11,157093,0.40600,0.477,0.352,0,0.000000,1,0.1220,-14.165,1,34,0.0300,106.773,0.478
9087,1hx7X9cMXHWJjknb9O6Ava,2018,Frank Sinatra,The September Of My Years - Live At The Sands ...,2018-05-04,187333,0.88700,0.319,0.201,0,0.000000,7,0.9040,-17.796,1,27,0.0623,117.153,0.239
9091,19oquvXf3bc65GSqtPYA5S,2018,Frank Sinatra,It Was A Very Good Year - Live At The Sands Ho...,2018-05-04,236800,0.93800,0.269,0.129,0,0.000005,7,0.6830,-18.168,0,26,0.0576,82.332,0.160
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174116,4ZzfKwqPfy02fpQxuh9Zz3,2019,Workout Music,RITMO (Bay Boys For Life) [Remix],2019-12-26,277493,0.01380,0.734,0.809,0,0.012600,0,0.0635,-5.655,0,14,0.1150,128.015,0.965
174118,3O1lskEk6MF8bEQ76Q7Cmv,2019,Workout Music,Beautiful People (Remix),2019-12-26,287987,0.06540,0.533,0.879,0,0.000000,5,0.0147,-3.736,1,14,0.0492,129.943,0.398
174120,4fBQiIB6X7sE5RWK7mZq8N,2019,Workout Music,Tusa (Remix),2019-12-26,268120,0.00358,0.764,0.828,0,0.005500,2,0.0763,-3.625,1,16,0.0810,127.994,0.931
174124,1g6Zm6BiN3WktHGzu0ZKAh,2013,October,A Good Year - 2020 Remastered,2013,226603,0.98100,0.520,0.198,0,0.909000,1,0.1020,-23.823,1,5,0.0483,123.976,0.272


The duration_ms column was going to be converted to a time value so that it was easier to understand. However, when attempting to import the data into SQL, the format was not accepted: SQL attempted to force it back to ms, and the conversion corrupted the duration, as every value now had an extra four zeroes on the end. This change was scrapped.


In [18]:
# Unused: converts duration_ms from integer milliseconds to a pandas timedelta.
# Left disabled because the converted values did not import into SQL correctly.
# spotify_last_10years["duration_ms"] = pd.to_timedelta(spotify_last_10years["duration_ms"], unit='ms')
#spotify_last_10years.head()

Each song has a unique Spotify song id, and it was presumed that each id was used only once. However, after some data exploration and issues with SQL, it was discovered that there were in fact duplicates. Grouping by spotify_song_id showed that several song ids were listed more than once.

In [19]:
# Count the non-null values per column for every Spotify song id
song_id_key = ['spotify_song_id']
groups = spotify_last_10years.groupby(by=song_id_key)
groups.count()

Unnamed: 0_level_0,year,artists,song_title,release_date,duration_ms,acousticness,danceability,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,valence
spotify_song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0025JMWRhsWx0GXdlzhHMO,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
002sGwDZYna3CKXbYIilHz,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
005lwxGU1tms6HGELIcUv9,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
006bxORtP7mtwDtULXaQqG,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
009ImBOrIUlWgla8U05RAC,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7zvFLwOTZX18ImGiKrfyzw,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
7zwn1eykZtZ5LODrf7c0tS,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
7zxRMhXxJMQCeDDg0rKAVo,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
7zxv7kFipfmvpDiC1eU4Fb,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


A drop duplicates function was used on the spotify_song_id to get rid of all duplicated songs. 

In [20]:
# Flag every repeat of a song id after its first occurrence, then keep only the unflagged rows
is_repeat = spotify_last_10years.duplicated(subset=['spotify_song_id'])
spotify_last_10years = spotify_last_10years.loc[~is_repeat]
spotify_last_10years

Unnamed: 0,spotify_song_id,year,artists,song_title,release_date,duration_ms,acousticness,danceability,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,valence
9071,6catF1lDhNTjjGa2GxRQNN,2008,Gerry & The Pacemakers,You'll Never Walk Alone - Mono; 2002 Remaster,2008-02-11,160187,0.39400,0.484,0.265,0,0.000000,0,0.1490,-11.101,1,55,0.0322,113.564,0.285
9081,4aSw1QJIMwYSoDEgzgdCJL,2008,Gerry & The Pacemakers,Ferry Cross the Mersey - Mono; 2002 Remaster,2008-02-11,141987,0.25500,0.405,0.365,0,0.000005,6,0.1630,-10.226,0,39,0.0289,104.536,0.588
9085,0ZMMtH875IR2TfkyC4PolD,2008,Gerry & The Pacemakers,Don't Let the Sun Catch You Crying (Main) - Mono,2008-02-11,157093,0.40600,0.477,0.352,0,0.000000,1,0.1220,-14.165,1,34,0.0300,106.773,0.478
9087,1hx7X9cMXHWJjknb9O6Ava,2018,Frank Sinatra,The September Of My Years - Live At The Sands ...,2018-05-04,187333,0.88700,0.319,0.201,0,0.000000,7,0.9040,-17.796,1,27,0.0623,117.153,0.239
9091,19oquvXf3bc65GSqtPYA5S,2018,Frank Sinatra,It Was A Very Good Year - Live At The Sands Ho...,2018-05-04,236800,0.93800,0.269,0.129,0,0.000005,7,0.6830,-18.168,0,26,0.0576,82.332,0.160
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174116,4ZzfKwqPfy02fpQxuh9Zz3,2019,Workout Music,RITMO (Bay Boys For Life) [Remix],2019-12-26,277493,0.01380,0.734,0.809,0,0.012600,0,0.0635,-5.655,0,14,0.1150,128.015,0.965
174118,3O1lskEk6MF8bEQ76Q7Cmv,2019,Workout Music,Beautiful People (Remix),2019-12-26,287987,0.06540,0.533,0.879,0,0.000000,5,0.0147,-3.736,1,14,0.0492,129.943,0.398
174120,4fBQiIB6X7sE5RWK7mZq8N,2019,Workout Music,Tusa (Remix),2019-12-26,268120,0.00358,0.764,0.828,0,0.005500,2,0.0763,-3.625,1,16,0.0810,127.994,0.931
174124,1g6Zm6BiN3WktHGzu0ZKAh,2013,October,A Good Year - 2020 Remastered,2013,226603,0.98100,0.520,0.198,0,0.909000,1,0.1020,-23.823,1,5,0.0483,123.976,0.272


A list of the columns was made to assist with putting all of the data into SQL. 

In [21]:
# Column names in their current order
spotify_last_10years.columns.tolist()

['spotify_song_id',
 'year',
 'artists',
 'song_title',
 'release_date',
 'duration_ms',
 'acousticness',
 'danceability',
 'energy',
 'explicit',
 'instrumentalness',
 'key',
 'liveness',
 'loudness',
 'mode',
 'popularity',
 'speechiness',
 'tempo',
 'valence']

In [22]:
# Create the SQLAlchemy engine for the local PostgreSQL database music_db.
# `key` is imported from config and placed ahead of the host — presumably the
# user:password credentials (confirm in config.py).
rds_connection_string = f"{key}@localhost:5432/music_db"
database_url = f'postgresql://{rds_connection_string}'
engine = create_engine(database_url)

In [23]:
# List the tables in music_db to confirm the target table made in SQL exists.
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the Inspector returns the same list of table names on all versions.
inspect(engine).get_table_names()

['spotify']

In [25]:
# Append the cleaned rows to the existing 'spotify' table;
# the DataFrame index is not written as a column
spotify_last_10years.to_sql(
    name='spotify',
    con=engine,
    if_exists='append',
    index=False,
)

In [26]:
# Read the table back from SQL and preview the first rows to verify the load
verification_query = 'select * from spotify'
pd.read_sql_query(verification_query, con=engine).head()

Unnamed: 0,spotify_song_id,year,artists,song_title,release_date,duration_ms,acousticness,danceability,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,valence
0,6catF1lDhNTjjGa2GxRQNN,2008,Gerry & The Pacemakers,You'll Never Walk Alone - Mono; 2002 Remaster,2008-02-11,160187,0.394,0.484,0.265,0,0.0,0.0,0.149,-11.101,1,55,0.0322,113.564,0.285
1,4aSw1QJIMwYSoDEgzgdCJL,2008,Gerry & The Pacemakers,Ferry Cross the Mersey - Mono; 2002 Remaster,2008-02-11,141987,0.255,0.405,0.365,0,5e-06,6.0,0.163,-10.226,0,39,0.0289,104.536,0.588
2,0ZMMtH875IR2TfkyC4PolD,2008,Gerry & The Pacemakers,Don't Let the Sun Catch You Crying (Main) - Mono,2008-02-11,157093,0.406,0.477,0.352,0,0.0,1.0,0.122,-14.165,1,34,0.03,106.773,0.478
3,1hx7X9cMXHWJjknb9O6Ava,2018,Frank Sinatra,The September Of My Years - Live At The Sands ...,2018-05-04,187333,0.887,0.319,0.201,0,0.0,7.0,0.904,-17.796,1,27,0.0623,117.153,0.239
4,19oquvXf3bc65GSqtPYA5S,2018,Frank Sinatra,It Was A Very Good Year - Live At The Sands Ho...,2018-05-04,236800,0.938,0.269,0.129,0,5e-06,7.0,0.683,-18.168,0,26,0.0576,82.332,0.16
