# Import Packages

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Read and Describe Data

In [2]:
# Dataset downloaded from Kaggle:
# https://www.kaggle.com/datasets/joebeachcapital/30000-spotify-songs/data

# Load the raw CSV that sits next to this notebook into a DataFrame
df = pd.read_csv(filepath_or_buffer='./spotify_songs.csv')

In [3]:
# Summarise the loaded frame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32833 entries, 0 to 32832
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   track_id                  32833 non-null  object 
 1   track_name                32828 non-null  object 
 2   track_artist              32828 non-null  object 
 3   track_popularity          32833 non-null  int64  
 4   track_album_id            32833 non-null  object 
 5   track_album_name          32828 non-null  object 
 6   track_album_release_date  32833 non-null  object 
 7   playlist_name             32833 non-null  object 
 8   playlist_id               32833 non-null  object 
 9   playlist_genre            32833 non-null  object 
 10  playlist_subgenre         32833 non-null  object 
 11  danceability              32833 non-null  float64
 12  energy                    32833 non-null  float64
 13  key                       32833 non-null  int64  
 14  loudne

| Field                    | Type      | Description                                                                                                                                                                                                                                                                                                                                                     |
|--------------------------|-----------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| track_id                 | character | Song unique ID                                                                                                                                                                                                                                                                                                                                                  |
| track_name               | character | Song name                                                                                                                                                                                                                                                                                                                                                       |
| track_artist             | character | Song artist                                                                                                                                                                                                                                                                                                                                                     |
| track_popularity         | double    | Song popularity (0-100) where higher is better                                                                                                                                                                                                                                                                                                                  |
| track_album_id           | character | Album unique ID                                                                                                                                                                                                                                                                                                                                                 |
| track_album_name         | character | Song album name                                                                                                                                                                                                                                                                                                                                                 |
| track_album_release_date | character | Date when album was released                                                                                                                                                                                                                                                                                                                                    |
| playlist_name            | character | Name of playlist                                                                                                                                                                                                                                                                                                                                                |
| playlist_id              | character | Playlist ID                                                                                                                                                                                                                                                                                                                                                     |
| playlist_genre           | character | Playlist genre                                                                                                                                                                                                                                                                                                                                                  |
| playlist_subgenre        | character | Playlist subgenre                                                                                                                                                                                                                                                                                                                                               |
| danceability             | double    | Describes how suitable a track is for dancing based on a combination of musical elements including tempo, rhythm stability, beat strength, and overall regularity. A value of 0.0 is least danceable and 1.0 is most danceable.                                                                                                                                 |
| energy                   | double    | Measure from 0.0 to 1.0 and represents a perceptual measure of intensity and activity. Perceptual features contributing to this attribute include dynamic range, perceived loudness, timbre, onset rate, and general entropy.                                                                                                                                   |
| key                      | double    | The estimated overall key of the track. Integers map to pitches using standard Pitch Class notation . E.g. 0 = C, 1 = C♯/D♭, 2 = D, and so on. If no key was detected, the value is -1.                                                                                                                                                                         |
| loudness                 | double    | The overall loudness of a track in decibels (dB). Loudness values are averaged across the entire track and are useful for comparing relative loudness of tracks. Loudness is the quality of a sound that is the primary psychological correlate of physical strength (amplitude). Values typically range between -60 and 0 dB.                                    |
| mode                     | double    | Mode indicates the modality (major or minor) of a track, the type of scale from which its melodic content is derived. Major is represented by 1 and minor is 0.                                                                                                                                                                                                 |
| speechiness              | double    | Detects presence of spoken words. The more exclusively speech-like the recording (talk show, etc.), the closer to 1.0 the value. Values above 0.66 are probably made entirely of spoken words. Values between 0.33 and 0.66 may contain music and speech (sections or layered). Values below 0.33 most likely represent music and other non-speech-like tracks. |
| acousticness             | double    | A confidence measure from 0.0 to 1.0 of whether the track is acoustic. 1.0 represents high confidence the track is acoustic.                                                                                                                                                                                                                                    |
| instrumentalness         | double    | Predicts whether a track contains vocals. "Ooh" and "aah" treated as instrumental. Rap or spoken word are clearly "vocal". The closer the instrumentalness value is to 1.0, the greater likelihood the track contains no vocal content. Values above 0.5 are intended to represent instrumental tracks, but confidence is higher as the value approaches 1.0.   |
| liveness                 | double    | Detects the presence of an audience in the recording. Higher liveness values represent an increased probability that the track was performed live. A value above 0.8 provides strong likelihood that the track is live.                                                                                                                                         |
| valence                  | double    | A measure from 0.0 to 1.0 describing the musical positiveness conveyed by a track. Tracks with high valence sound more positive (e.g. happy, cheerful, euphoric), while tracks with low valence sound more negative (e.g. sad, depressed, angry).                                                                                                               |
| tempo                    | double    | The overall estimated tempo of a track in beats per minute (BPM). In musical terminology, tempo is the speed or pace of a given piece and derives directly from the average beat duration.                                                                                                                                                                      |
| duration_ms              | double    | Duration of song in milliseconds                                                                                                                                                                                                                                                                                                                                |


# Feature Engineering and Selection

In [4]:
# Derive the album release year as a feature.
# Month and day are likely too granular to have an impact on popularity.

# format='mixed' lets pandas infer the date format of each value individually
release_dates = pd.to_datetime(df['track_album_release_date'], format='mixed')

# Keep the parsed datetimes in the original column and add the year next to it
df['track_album_release_date'] = release_dates
df['track_album_release_year'] = release_dates.dt.year

In [5]:
# Feature columns, grouped by kind

categorical_features = [
    'track_name',
    'track_artist',
    'track_album_name',
]
numerical_features = [
    'track_album_release_year',
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'duration_ms',
]
# Categorical columns first, then numerical ones
all_features = [*categorical_features, *numerical_features]

# Target (y) is the track popularity; inputs (X) are the selected feature columns

y = df['track_popularity']
X = df[all_features]

# Summary statistics of the numeric inputs, one row per feature
X.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
track_album_release_year,32833.0,2011.13727,11.417451,1957.0,2008.0,2016.0,2019.0,2020.0
danceability,32833.0,0.65485,0.145085,0.0,0.563,0.672,0.761,0.983
energy,32833.0,0.698619,0.18091,0.000175,0.581,0.721,0.84,1.0
key,32833.0,5.374471,3.611657,0.0,2.0,6.0,9.0,11.0
loudness,32833.0,-6.719499,2.988436,-46.448,-8.171,-6.166,-4.645,1.275
mode,32833.0,0.565711,0.495671,0.0,0.0,1.0,1.0,1.0
speechiness,32833.0,0.107068,0.101314,0.0,0.041,0.0625,0.132,0.918
acousticness,32833.0,0.175334,0.219633,0.0,0.0151,0.0804,0.255,0.994
instrumentalness,32833.0,0.084747,0.22423,0.0,0.0,1.6e-05,0.00483,0.994
liveness,32833.0,0.190176,0.154317,0.0,0.0927,0.127,0.248,0.996


In [6]:
# Cast the numeric columns that are not yet float64 so that every numeric
# feature shares one dtype before feature scaling
X = X.astype({
    column: 'float64'
    for column in ('track_album_release_year', 'key', 'mode', 'duration_ms')
})

# Confirm the resulting dtypes
X.dtypes

track_name                   object
track_artist                 object
track_album_name             object
track_album_release_year    float64
danceability                float64
energy                      float64
key                         float64
loudness                    float64
mode                        float64
speechiness                 float64
acousticness                float64
instrumentalness            float64
liveness                    float64
valence                     float64
tempo                       float64
duration_ms                 float64
dtype: object

# Feature Scaling

In [7]:
scaler = MinMaxScaler()

# TODO: decide whether track_album_release_year should be scaled as well;
# it is currently left on its original scale.

# Columns that are min-max scaled in this cell
scaled_columns = ['key', 'loudness', 'tempo', 'duration_ms']

# Fit the scaler on the training rows only. Fitting on the full dataset would
# leak the min/max of the held-out rows into the training features.
# The row partition is derived with the same test_size and random_state as the
# train/test split cell below, so both calls select the same rows; keep the
# two in sync if either value changes.
train_rows, _ = train_test_split(
    X.index.to_numpy(), test_size=0.20, random_state=42
)
scaler.fit(X.loc[train_rows, scaled_columns])

# Apply the train-fitted scaling to every row. Held-out rows may therefore
# fall slightly outside [0, 1], which is expected.
X.loc[:, scaled_columns] = scaler.transform(X[scaled_columns])

X.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
track_album_release_year,32833.0,2011.13727,11.417451,1957.0,2008.0,2016.0,2019.0,2020.0
danceability,32833.0,0.65485,0.145085,0.0,0.563,0.672,0.761,0.983
energy,32833.0,0.698619,0.18091,0.000175,0.581,0.721,0.84,1.0
key,32833.0,0.488588,0.328332,0.0,0.181818,0.545455,0.818182,1.0
loudness,32833.0,0.832481,0.06262,0.0,0.802066,0.844079,0.875951,1.0
mode,32833.0,0.565711,0.495671,0.0,0.0,1.0,1.0,1.0
speechiness,32833.0,0.107068,0.101314,0.0,0.041,0.0625,0.132,0.918
acousticness,32833.0,0.175334,0.219633,0.0,0.0151,0.0804,0.255,0.994
instrumentalness,32833.0,0.084747,0.22423,0.0,0.0,1.6e-05,0.00483,0.994
liveness,32833.0,0.190176,0.154317,0.0,0.0927,0.127,0.248,0.996


# Train/Test Split

In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.20,
)