In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)

# ELC

## Loading data

In [4]:
from google.auth import load_credentials_from_file
from google.cloud.bigquery import Client

In [5]:
credentials, project_id = load_credentials_from_file('service_account.json')

In [6]:
# Load data from BigQuery
client = Client(
    project = project_id,
    credentials=credentials
)

In [7]:
query = "SELECT * FROM `da26-python.music_data.tracks`"

In [8]:
load_job = client.query(query)

In [9]:
data = load_job.to_dataframe()



In [10]:
def load_data(table):
    query = f"SELECT * FROM `da26-python.music_data.{table}`"
    load_job = client.query(query)
    data = load_job.to_dataframe()
    return data

In [11]:
artists = load_data('artists')

In [12]:
audio_features = load_data('audio_features')

In [13]:
chart_positions = load_data('chart_positions')

In [14]:
mapping = load_data('tracks_artists_mapping')

In [15]:
tracks = load_data('tracks')

## Joining data

Joining "tracks", "artists" 

In [17]:
data = tracks.merge(mapping, on = 'track_id' )

In [18]:
data = data.merge(artists, on = 'artist_id')

In [19]:
data.rename(columns = {'name_x':'track_name', 'name_y':'artist'}, inplace = True)

In [20]:
data.head()

Unnamed: 0,track_id,track_name,duration_ms,release_date,album_type,explicit,artist_id,artist,popularity,followers
0,5CMVGP24paZIukljDj0iWc,Jingle Bells (with The Ken Lane Singers),156666,1945,compilation,False,0TPlgv7eySSCChc2vKRIml,Ken Lane Singers,28,151
1,5CMVGP24paZIukljDj0iWc,Jingle Bells (with The Ken Lane Singers),156666,1945,compilation,False,1Mxqyy3pSjf8kZZL4QVxS0,Frank Sinatra,79,7480976
2,25leEEaz1gIpp7o21Fqyjo,Here Comes Santa Claus (Right Down Santa Claus...,150266,1947,album,False,5ixB75BQR3ADoWQkcHQJTs,Gene Autry,48,71266
3,1dtIaSlyrLI04sqYa8nLyN,Rudolph the Red-Nosed Reindeer,186733,1947,album,False,5ixB75BQR3ADoWQkcHQJTs,Gene Autry,48,71266
4,4PS1e8f2LvuTFgUs1Cn3ON,The Christmas Song (Merry Christmas To You),192160,1962,album,False,7v4imS0moSyGdXyLgVTIV7,Nat King Cole,71,2312641


In [22]:
data = data.merge(chart_positions, on = 'track_id')

In [23]:
data.head()

Unnamed: 0,track_id,track_name,duration_ms,release_date,album_type,explicit,artist_id,artist,popularity,followers,chart_week,list_position
0,5CMVGP24paZIukljDj0iWc,Jingle Bells (with The Ken Lane Singers),156666,1945,compilation,False,0TPlgv7eySSCChc2vKRIml,Ken Lane Singers,28,151,2023-12-30,16
1,5CMVGP24paZIukljDj0iWc,Jingle Bells (with The Ken Lane Singers),156666,1945,compilation,False,0TPlgv7eySSCChc2vKRIml,Ken Lane Singers,28,151,2024-01-06,16
2,5CMVGP24paZIukljDj0iWc,Jingle Bells (with The Ken Lane Singers),156666,1945,compilation,False,1Mxqyy3pSjf8kZZL4QVxS0,Frank Sinatra,79,7480976,2023-12-30,16
3,5CMVGP24paZIukljDj0iWc,Jingle Bells (with The Ken Lane Singers),156666,1945,compilation,False,1Mxqyy3pSjf8kZZL4QVxS0,Frank Sinatra,79,7480976,2024-01-06,16
4,25leEEaz1gIpp7o21Fqyjo,Here Comes Santa Claus (Right Down Santa Claus...,150266,1947,album,False,5ixB75BQR3ADoWQkcHQJTs,Gene Autry,48,71266,2024-01-06,21


In [24]:
data = data[['track_name', 'artist', 'duration_ms', 
             'release_date', 'popularity', 'followers',
             'chart_week', 'list_position', 'track_id', 'artist_id']]

In [25]:
data['duration_ms'] = round(data['duration_ms']/1000, 2)

In [26]:
data['duration_ms'] = round(data['duration_ms']/60,1)

In [27]:
data = data.rename(columns = {'duration_ms':'duration_min'})

In [28]:
data.head()

Unnamed: 0,track_name,artist,duration_min,release_date,popularity,followers,chart_week,list_position,track_id,artist_id
0,Jingle Bells (with The Ken Lane Singers),Ken Lane Singers,2.6,1945,28,151,2023-12-30,16,5CMVGP24paZIukljDj0iWc,0TPlgv7eySSCChc2vKRIml
1,Jingle Bells (with The Ken Lane Singers),Ken Lane Singers,2.6,1945,28,151,2024-01-06,16,5CMVGP24paZIukljDj0iWc,0TPlgv7eySSCChc2vKRIml
2,Jingle Bells (with The Ken Lane Singers),Frank Sinatra,2.6,1945,79,7480976,2023-12-30,16,5CMVGP24paZIukljDj0iWc,1Mxqyy3pSjf8kZZL4QVxS0
3,Jingle Bells (with The Ken Lane Singers),Frank Sinatra,2.6,1945,79,7480976,2024-01-06,16,5CMVGP24paZIukljDj0iWc,1Mxqyy3pSjf8kZZL4QVxS0
4,Here Comes Santa Claus (Right Down Santa Claus...,Gene Autry,2.5,1947,48,71266,2024-01-06,21,25leEEaz1gIpp7o21Fqyjo,5ixB75BQR3ADoWQkcHQJTs


In [29]:
#data = data.merge(audio_features, on = 'track_id')

## Cleaning & joining

### Cleaning audio_features

Cleaned seperately for better visiblility of columns

- Got rid of null values
- Rounded and changed datatype of tempo-column from float to int
- Dropped redundant time_signature column

In [32]:
audio_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10783 entries, 0 to 10782
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track_id          10776 non-null  object 
 1   danceability      10776 non-null  float64
 2   energy            10776 non-null  float64
 3   key               10776 non-null  float64
 4   loudness          10776 non-null  float64
 5   mode              10776 non-null  float64
 6   speechiness       10776 non-null  float64
 7   acousticness      10776 non-null  float64
 8   instrumentalness  10776 non-null  float64
 9   liveness          10776 non-null  float64
 10  valence           10776 non-null  float64
 11  tempo             10776 non-null  float64
 12  time_signature    10776 non-null  float64
dtypes: float64(12), object(1)
memory usage: 1.1+ MB


In [33]:
# Getting rid of null values since they are not going to be of use for us.
audio_features.dropna(inplace=True)

In [34]:
# rounding and changing datatype of tempo to int
audio_features['tempo'] = round(audio_features['tempo']).astype('int')

In [35]:
# dropping redundant columns
audio_features.drop(columns = 'time_signature', inplace=True)

### Cleaning the rest of the data

In [37]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188366 entries, 0 to 188365
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   track_name     188366 non-null  object 
 1   artist         188366 non-null  object 
 2   duration_min   188366 non-null  Float64
 3   release_date   188366 non-null  object 
 4   popularity     188366 non-null  Int64  
 5   followers      188366 non-null  Int64  
 6   chart_week     188366 non-null  dbdate 
 7   list_position  188366 non-null  Int64  
 8   track_id       188366 non-null  object 
 9   artist_id      188366 non-null  object 
dtypes: Float64(1), Int64(3), dbdate(1), object(5)
memory usage: 15.1+ MB


In [38]:
data['release_date'] = data['release_date'].str[:4]

In [39]:
data['release_date'] = pd.to_datetime(data['release_date'], format = '%Y')

In [40]:
data['release_date'] = data['release_date'].dt.year

In [41]:
data = data.rename(columns = {'release_date': 'release_year'})

In [42]:
# mask for filtering out songs released in 2000-2009
# mask = data[(data['release_year'] >= 2000) & (data['release_year'] <= 2009)]

In [43]:
data = data[data['release_year']>=2000]

In [44]:
data['chart_week'] = pd.to_datetime(data['chart_week'], format='%Y-%m-%d')

In [45]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 183612 entries, 848 to 188365
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   track_name     183612 non-null  object        
 1   artist         183612 non-null  object        
 2   duration_min   183612 non-null  Float64       
 3   release_year   183612 non-null  int32         
 4   popularity     183612 non-null  Int64         
 5   followers      183612 non-null  Int64         
 6   chart_week     183612 non-null  datetime64[ns]
 7   list_position  183612 non-null  Int64         
 8   track_id       183612 non-null  object        
 9   artist_id      183612 non-null  object        
dtypes: Float64(1), Int64(3), datetime64[ns](1), int32(1), object(4)
memory usage: 15.4+ MB


In [46]:
data.head()

Unnamed: 0,track_name,artist,duration_min,release_year,popularity,followers,chart_week,list_position,track_id,artist_id
848,He Wasn't Man Enough,Toni Braxton,4.4,2000,65,4234164,2000-05-06,2,7f1Dmr246cJ9uQYdbplTbh,3X458ddYA2YcVWuVIGGOYe
849,He Wasn't Man Enough,Toni Braxton,4.4,2000,65,4234164,2000-05-13,2,7f1Dmr246cJ9uQYdbplTbh,3X458ddYA2YcVWuVIGGOYe
850,He Wasn't Man Enough,Toni Braxton,4.4,2000,65,4234164,2000-04-29,3,7f1Dmr246cJ9uQYdbplTbh,3X458ddYA2YcVWuVIGGOYe
851,He Wasn't Man Enough,Toni Braxton,4.4,2000,65,4234164,2000-05-20,4,7f1Dmr246cJ9uQYdbplTbh,3X458ddYA2YcVWuVIGGOYe
852,He Wasn't Man Enough,Toni Braxton,4.4,2000,65,4234164,2000-05-27,4,7f1Dmr246cJ9uQYdbplTbh,3X458ddYA2YcVWuVIGGOYe


In [47]:
data = data.merge(audio_features, on = 'track_id')

In [49]:
data = data.drop_duplicates()

In [50]:
data = data.reset_index(drop=True)

In [52]:
final_data = data[(data['release_year'] >= 2000) & (data['release_year'] <= 2009)]

In [68]:
final_data["Peaceful_lounge_music"] = (
    (final_data["danceability"].between(0.2, 0.6)) &
    (final_data["tempo"] < 110) &
    (final_data["energy"].between(0.1, 0.4)) &
    (final_data["speechiness"] < 0.3)
).astype(int)
lounge_and_peaceful_music = final_data[final_data["Peaceful_lounge_music"] == 1]
lounge_and_peaceful_music_unique = lounge_and_peaceful_music.drop_duplicates(subset="track_name", keep='first')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data["Peaceful_lounge_music"] = (


In [70]:
lounge_and_peaceful_music_unique.sort_values('artist')

Unnamed: 0,track_name,artist,duration_min,release_year,popularity,followers,chart_week,list_position,track_id,artist_id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,Peaceful_lounge_music
25067,Miss You,Aaliyah,4.1,2005,69,4558858,2003-04-05,3,6qzgPGToY6vUtEAYfZLl14,0urTpYCsixqZwgNTkPJOJ4,0.588,0.353,10.0,-9.465,1.0,0.0372,0.588,0.0,0.129,0.516,109,1
29859,Like Red On a Rose,Alan Jackson,3.6,2006,74,2878853,2006-10-14,80,1ayFArNqsYgGT8gWWSscTD,4mxWe1mtYIYfP040G38yvS,0.598,0.315,11.0,-9.83,1.0,0.0282,0.634,0.191,0.0825,0.172,97,1
43410,Empire State of Mind (Part II) Broken Down,Alicia Keys,3.6,2009,81,12131689,2010-01-02,55,5sra5UY6sD658OabHL3QtI,3DiDSECUqqY1AuBP8qtaIa,0.484,0.368,6.0,-7.784,1.0,0.0341,0.74,3.8e-05,0.118,0.142,93,1
22458,Goodbye Time,Blake Shelton,3.4,2004,73,7252097,2005-07-23,73,1M9qgq0SaZ5OuAeU0GKXif,1UTPBmNbXNTittyMJrNkvw,0.51,0.348,2.0,-7.849,1.0,0.0261,0.295,2.5e-05,0.242,0.159,77,1
10156,I Wish You'd Stay,Brad Paisley,6.3,2001,68,2822412,2003-02-22,57,1Jhm8RRMQSTYX9ZaOMeUk8,13YmWQJFwgZrd4bf5IjMY4,0.472,0.29,7.0,-10.238,1.0,0.0257,0.286,7.2e-05,0.287,0.0906,94,1
25656,When I Get Where I'm Going (feat. Dolly Parton),Brad Paisley,4.1,2005,68,2822412,2006-02-25,39,3VLCtStwYsAL4LKZgeUvy3,13YmWQJFwgZrd4bf5IjMY4,0.519,0.379,0.0,-9.906,1.0,0.0279,0.619,2e-06,0.108,0.208,86,1
32916,I'll Walk,Bucky Covington,3.5,2007,35,145631,2008-11-15,70,3WsAMhyOHxYPrL1pyMAVZm,5sVu3ObJTpiln7yRQkMuje,0.509,0.389,7.0,-7.433,1.0,0.0272,0.8,0.0,0.113,0.302,77,1
4211,Temporary Home,Carrie Underwood,4.5,2009,72,5998552,2010-04-03,41,0d0tJF80562KcMndcBcSfM,4xFUf1FHVy696Q1JQZMTRj,0.416,0.354,0.0,-6.357,1.0,0.0263,0.731,1e-06,0.0925,0.181,81,1
3825,Stealing Cinderella,Chuck Wicks,4.0,2008,33,105933,2008-03-22,56,6aOdhGqD0xuFM0OauOedNl,696fbyLHSMBSYjDrDU5yiK,0.486,0.394,8.0,-8.728,1.0,0.0305,0.454,1.6e-05,0.122,0.32,69,1
38569,I'm Alive (with Dave Matthews),Dave Matthews,3.3,2008,56,409494,2009-11-28,32,12Nri9FR6o4Gpmrll3O2rW,13vQloYd6mP7V1mVwKJwS2,0.564,0.244,3.0,-11.149,1.0,0.0329,0.758,0.000607,0.106,0.346,78,1


In [76]:
round(lounge_and_peaceful_music_unique['duration_min'].sum()/60,2)

2.29