# Hyper Cruises - ELC

In [2]:
import pandas as pd
from google.auth import load_credentials_from_file
from google.cloud.bigquery import Client

# Notebook display option: never truncate columns when rendering wide DataFrames
pd.set_option('display.max_columns', None)

## Loading data from BigQuery

In [4]:
# Read the service-account key file (path is relative to the working directory)
# and unpack the loader's two return values into credentials and project_id.
# NOTE(review): make sure the key file is never committed with the notebook.
credentials, project_id = load_credentials_from_file('service_account.json')

In [5]:
# Create the BigQuery client that every query in this notebook goes through
client = Client(project=project_id, credentials=credentials)

In [6]:
# One-off query text: every column of the tracks table
query = 'SELECT * FROM `da26-python.music_data.tracks`'

In [7]:
# Submit the query through the client; the returned job is consumed in the next cell
load_job = client.query(query)

In [8]:
# Materialise the job's result as a DataFrame.
# NOTE(review): `data` is reassigned from tracks.merge(...) in the joining
# section and the same table is fetched again via load_data('tracks'), so this
# looks like a leftover exploratory load — confirm it is still needed.
data = load_job.to_dataframe()



In [9]:
def load_data(table):
    """Load one table of the ``da26-python.music_data`` dataset into a DataFrame.

    Parameters
    ----------
    table : str
        Name of the table inside the dataset, e.g. ``'tracks'``.

    Returns
    -------
    The result of ``client.query(...).to_dataframe()`` for a
    ``SELECT *`` over the requested table.

    Raises
    ------
    ValueError
        If ``table`` is empty or contains a backtick. The name is interpolated
        into a backtick-quoted identifier, so either case would yield broken
        or unintentionally altered SQL; failing here gives a clear message
        before any request is sent.
    """
    table_name = str(table)
    if not table_name or '`' in table_name:
        raise ValueError(f"Invalid table name: {table!r}")

    query = f"SELECT * FROM `da26-python.music_data.{table_name}`"
    # Relies on the notebook-level BigQuery `client` created in an earlier cell.
    return client.query(query).to_dataframe()

In [10]:
# Artist table; merged into the combined frame on artist_id further down
artists = load_data('artists')

In [11]:
# Audio features table; cleaned on its own, then merged on track_id
audio_features = load_data('audio_features')

In [12]:
# Chart entries table; merged on track_id further down
chart_positions = load_data('chart_positions')

In [13]:
# Track/artist mapping table; merged with tracks on track_id
# (presumably it supplies the artist_id used for the artists join — confirm schema)
mapping = load_data('tracks_artists_mapping')

In [14]:
# Track table; the starting point of the merge chain below
tracks = load_data('tracks')

## Joining and cleaning

### Joining the tracks, artists, chart_positions and mapping tables

The result is joined with audio_features later, once that table has been cleaned.

In [17]:
# Attach the track/artist mapping rows to each track (inner join on track_id)
data = tracks.merge(mapping, how='inner', on='track_id')

In [18]:
# Bring in the artist columns for every matched artist_id (inner join)
data = data.merge(artists, how='inner', on='artist_id')

In [19]:
# name_x / name_y are the suffixed `name` columns produced by the merge above
# (presumably the track name and the artist name respectively — the new labels
# assume so). Reassigning instead of inplace=True matches the other rename
# cells in this notebook and keeps the statement chainable.
data = data.rename(columns={'name_x': 'track_name', 'name_y': 'artist'})

In [20]:
# Add the chart entries for each track (inner join on track_id)
data = data.merge(chart_positions, how='inner', on='track_id')

In [21]:
# Keep only the columns used downstream. The explicit .copy() makes `data` an
# independent frame, so the column assignments in the cleaning cells below do
# not write into a slice of the merged frame (SettingWithCopyWarning).
data = data[['track_name', 'artist', 'duration_ms',
             'release_date', 'popularity', 'followers',
             'chart_week', 'list_position', 'track_id', 'artist_id']].copy()

### Cleaning audio_features

Cleaned separately for better visibility of columns

- Got rid of null values
- Rounded the tempo column and changed its datatype from float to int
- Dropped redundant time_signature column

In [23]:
# Inspect dtypes and non-null counts before cleaning
audio_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10783 entries, 0 to 10782
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track_id          10776 non-null  object 
 1   danceability      10776 non-null  float64
 2   energy            10776 non-null  float64
 3   key               10776 non-null  float64
 4   loudness          10776 non-null  float64
 5   mode              10776 non-null  float64
 6   speechiness       10776 non-null  float64
 7   acousticness      10776 non-null  float64
 8   instrumentalness  10776 non-null  float64
 9   liveness          10776 non-null  float64
 10  valence           10776 non-null  float64
 11  tempo             10776 non-null  float64
 12  time_signature    10776 non-null  float64
dtypes: float64(12), object(1)
memory usage: 1.1+ MB


In [24]:
# Drop every row that has a missing value in any column
audio_features = audio_features.dropna(axis=0, how='any')

In [25]:
# Round tempo to the nearest whole number and store it as an integer column.
# .assign builds a new frame instead of writing into the result of dropna(),
# which avoids pandas' SettingWithCopyWarning on this assignment.
audio_features = audio_features.assign(
    tempo=audio_features['tempo'].round().astype('int')
)

In [26]:
# Drop the time_signature column (considered redundant for this analysis).
# Reassigning (no inplace=True) together with errors='ignore' keeps this cell
# safe to re-run: a second run finds the column already gone and leaves the
# frame unchanged instead of raising KeyError.
audio_features = audio_features.drop(columns='time_signature', errors='ignore')

### Cleaning previously joined together data

#### Track duration format from milliseconds to minutes

In [29]:
# Convert the track duration from milliseconds to minutes, rounded to one decimal.
# The column keeps its old name here; it is renamed in the next cell.
_duration_minutes = (data['duration_ms'] / 1000) / 60
data['duration_ms'] = round(_duration_minutes, 1)

In [30]:
# The values are minutes after the conversion above, so rename the column to match
data = data.rename(columns = {'duration_ms':'duration_min'})

#### release_date to release_year

In [32]:
# Keep only the first four characters of each release_date string
# (presumably the year of a date that starts with YYYY — confirm against the table)
data['release_date'] = data['release_date'].str[:4]

In [33]:
# Parse the four-character string as a datetime using the year-only format '%Y'
data['release_date'] = pd.to_datetime(data['release_date'], format = '%Y')

In [34]:
# Reduce the datetime to its year component
data['release_date'] = data['release_date'].dt.year

In [35]:
# The column now holds a year only, so rename it accordingly
data = data.rename(columns = {'release_date': 'release_year'})

In [36]:
# Parse chart_week values written as YYYY-MM-DD into datetimes
data['chart_week'] = pd.to_datetime(data['chart_week'], format='%Y-%m-%d')

### Joining together data with audio_features

Removing duplicates and resetting index. Filtering for songs released in 2000-2009

In [38]:
# Add the cleaned audio features to every row (inner join on track_id)
data = data.merge(audio_features, how='inner', on='track_id')

In [39]:
# Remove duplicated rows (no subset given, so whole rows are compared)
data = data.drop_duplicates()

In [40]:
# Renumber the rows after de-duplication; drop=True discards the old index
data = data.reset_index(drop=True)

In [41]:
# Keep only rows whose release year falls in 2000-2009 (both ends included)
_released_2000s = (data['release_year'] >= 2000) & (data['release_year'] <= 2009)
final_data = data[_released_2000s]

In [42]:
# Display the filtered frame (the rendered output below is truncated by pandas)
final_data

Unnamed: 0,track_name,artist,duration_min,release_year,popularity,followers,chart_week,list_position,track_id,artist_id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
733,He Wasn't Man Enough,Toni Braxton,4.4,2000,65,4234164,2000-05-06,2,7f1Dmr246cJ9uQYdbplTbh,3X458ddYA2YcVWuVIGGOYe,0.739,0.947,11.0,-1.916,0.0,0.0411,0.00916,0.000031,0.326,0.766,88
734,He Wasn't Man Enough,Toni Braxton,4.4,2000,65,4234164,2000-05-13,2,7f1Dmr246cJ9uQYdbplTbh,3X458ddYA2YcVWuVIGGOYe,0.739,0.947,11.0,-1.916,0.0,0.0411,0.00916,0.000031,0.326,0.766,88
735,He Wasn't Man Enough,Toni Braxton,4.4,2000,65,4234164,2000-04-29,3,7f1Dmr246cJ9uQYdbplTbh,3X458ddYA2YcVWuVIGGOYe,0.739,0.947,11.0,-1.916,0.0,0.0411,0.00916,0.000031,0.326,0.766,88
736,He Wasn't Man Enough,Toni Braxton,4.4,2000,65,4234164,2000-05-20,4,7f1Dmr246cJ9uQYdbplTbh,3X458ddYA2YcVWuVIGGOYe,0.739,0.947,11.0,-1.916,0.0,0.0411,0.00916,0.000031,0.326,0.766,88
737,He Wasn't Man Enough,Toni Braxton,4.4,2000,65,4234164,2000-05-27,4,7f1Dmr246cJ9uQYdbplTbh,3X458ddYA2YcVWuVIGGOYe,0.739,0.947,11.0,-1.916,0.0,0.0411,0.00916,0.000031,0.326,0.766,88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127924,Lemonade,Gucci Mane,4.1,2009,77,5661054,2010-05-01,69,6rUcS9i07F6okIe8wujs5J,13y7CgLHjMVRMDqxdx0Xdo,0.741,0.658,7.0,-7.758,1.0,0.0853,0.64300,0.000003,0.307,0.746,142
127925,Lemonade,Gucci Mane,4.1,2009,77,5661054,2010-03-06,74,6rUcS9i07F6okIe8wujs5J,13y7CgLHjMVRMDqxdx0Xdo,0.741,0.658,7.0,-7.758,1.0,0.0853,0.64300,0.000003,0.307,0.746,142
127926,Lemonade,Gucci Mane,4.1,2009,77,5661054,2010-02-27,81,6rUcS9i07F6okIe8wujs5J,13y7CgLHjMVRMDqxdx0Xdo,0.741,0.658,7.0,-7.758,1.0,0.0853,0.64300,0.000003,0.307,0.746,142
127927,Lemonade,Gucci Mane,4.1,2009,77,5661054,2010-05-08,86,6rUcS9i07F6okIe8wujs5J,13y7CgLHjMVRMDqxdx0Xdo,0.741,0.658,7.0,-7.758,1.0,0.0853,0.64300,0.000003,0.307,0.746,142
