In [1]:
import numpy as np
import pandas as pd
import warnings

pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

In [2]:
# Read the raw track table from disk and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="datasets/data.csv")
data.head(n=5)

Unnamed: 0,energy,tempo,danceability,genre,loudness,liveness,valence,artist,time_signature,speechiness,track_popularity,album,name,instrumentalness,mode,key,duration_ms,acousticness,id,popularity,release_year
0,0.592,157.969,0.521,pop,-7.777,0.122,0.535,"Lady Gaga, Bruno Mars",3.0,0.0304,100,Die With A Smile,Die With A Smile,0.0,0.0,6.0,251668.0,0.308,2plbrEY59IikOBgBGLjaoe,1,2024
1,0.507,104.978,0.747,pop,-10.171,0.117,0.438,Billie Eilish,4.0,0.0358,97,HIT ME HARD AND SOFT,BIRDS OF A FEATHER,0.0608,1.0,2.0,210373.0,0.2,6dOtVTDdiauQNBQEDOtlAB,1,2024
2,0.808,108.548,0.554,pop,-4.169,0.159,0.372,Gracie Abrams,4.0,0.0368,93,The Secret of Us (Deluxe),That’s So True,0.0,1.0,1.0,166300.0,0.214,7ne4VBA60CxGM75vw0EYad,1,2024
3,0.91,112.966,0.67,pop,-4.07,0.304,0.786,Sabrina Carpenter,4.0,0.0634,81,Short n' Sweet,Taste,0.0,0.0,0.0,157280.0,0.0939,1d7Ptw3qYcfpdLNL5REhtJ,1,2024
4,0.783,149.027,0.777,pop,-4.477,0.355,0.939,"ROSÉ, Bruno Mars",4.0,0.26,98,APT.,APT.,0.0,0.0,0.0,169917.0,0.0283,5vNRhkKd0yEAg8suGBpjeY,1,2024


In [3]:
# Number of rows that exactly repeat an earlier row (all columns compared).
is_repeat = data.duplicated(keep="first")
is_repeat.sum()

0

Song-specific columns are not needed for predicting whether a song will be popular, so they are dropped.

In [4]:
# Columns removed before modelling; every other column is carried over unchanged.
dropped_columns = ['id', 'artist', 'album', 'name', 'track_popularity', 'genre']
new_data = data.drop(labels=dropped_columns, axis="columns")
new_data.head(n=5)

Unnamed: 0,energy,tempo,danceability,loudness,liveness,valence,time_signature,speechiness,instrumentalness,mode,key,duration_ms,acousticness,popularity,release_year
0,0.592,157.969,0.521,-7.777,0.122,0.535,3.0,0.0304,0.0,0.0,6.0,251668.0,0.308,1,2024
1,0.507,104.978,0.747,-10.171,0.117,0.438,4.0,0.0358,0.0608,1.0,2.0,210373.0,0.2,1,2024
2,0.808,108.548,0.554,-4.169,0.159,0.372,4.0,0.0368,0.0,1.0,1.0,166300.0,0.214,1,2024
3,0.91,112.966,0.67,-4.07,0.304,0.786,4.0,0.0634,0.0,0.0,0.0,157280.0,0.0939,1,2024
4,0.783,149.027,0.777,-4.477,0.355,0.939,4.0,0.26,0.0,0.0,0.0,169917.0,0.0283,1,2024


Let's see how many songs in the data were released in or after 2000.

In [5]:
# (rows, columns) of the subset released in 2000 or later.
released_2000_onwards = new_data['release_year'].ge(2000)
new_data.loc[released_2000_onwards].shape

(4311, 15)

In [6]:
# (rows, columns) of the subset released before 2000.
released_before_2000 = new_data['release_year'].lt(2000)
new_data.loc[released_before_2000].shape

(407, 15)

The majority of the songs were released in or after 2000 (4311 vs. 407), so let's keep only those recent songs.

In [7]:
# Keep only songs released in 2000 or later.
# `.copy()` makes the result an independent frame, so a column added to it in a
# later cell is not written through a slice of the unfiltered data (the pattern
# pandas reports as SettingWithCopyWarning).
new_data = new_data[new_data['release_year'] >= 2000].copy()

Compute the age of each song in years, measured from 2025.

In [8]:
# Song age in years, counted from the fixed reference year below.
REFERENCE_YEAR = 2025

# `assign` builds a new frame instead of writing a column into `new_data`, which
# at this point may be a filtered slice; this avoids pandas' SettingWithCopyWarning.
# `age` is appended last and `release_year` is then removed, so the column order
# matches the previous two-step version.
new_data = (
    new_data
    .assign(age=REFERENCE_YEAR - new_data['release_year'])
    .drop(columns=['release_year'])
)

In [9]:
# First five rows after the release-year filter and the age column were applied.
new_data.head(n=5)

Unnamed: 0,energy,tempo,danceability,loudness,liveness,valence,time_signature,speechiness,instrumentalness,mode,key,duration_ms,acousticness,popularity,age
0,0.592,157.969,0.521,-7.777,0.122,0.535,3.0,0.0304,0.0,0.0,6.0,251668.0,0.308,1,1
1,0.507,104.978,0.747,-10.171,0.117,0.438,4.0,0.0358,0.0608,1.0,2.0,210373.0,0.2,1,1
2,0.808,108.548,0.554,-4.169,0.159,0.372,4.0,0.0368,0.0,1.0,1.0,166300.0,0.214,1,1
3,0.91,112.966,0.67,-4.07,0.304,0.786,4.0,0.0634,0.0,0.0,0.0,157280.0,0.0939,1,1
4,0.783,149.027,0.777,-4.477,0.355,0.939,4.0,0.26,0.0,0.0,0.0,169917.0,0.0283,1,1


In [10]:
# Last five rows of the same frame.
new_data.tail(n=5)

Unnamed: 0,energy,tempo,danceability,loudness,liveness,valence,time_signature,speechiness,instrumentalness,mode,key,duration_ms,acousticness,popularity,age
4713,0.424,75.466,0.352,-8.009,0.242,0.605,1.0,0.0634,0.0,1.0,10.0,661293.0,0.965,0,12
4714,0.349,147.193,0.471,-16.633,0.286,0.358,4.0,0.0656,0.945,1.0,8.0,394400.0,0.975,0,23
4715,0.595,174.308,0.436,-11.494,0.0939,0.87,4.0,0.0671,0.868,1.0,3.0,473440.0,0.816,0,15
4716,0.591,174.76,0.489,-10.843,0.077,0.689,1.0,0.0813,0.927,0.0,8.0,298987.0,0.948,0,18
4717,0.816,170.938,0.27,-3.815,0.317,0.591,4.0,0.0598,0.113,1.0,8.0,645701.0,0.853,0,9


In [11]:
# Continuous columns that get standardised; the order here is the column order
# they take in the scaled frame.
# NOTE(review): `mode` is not listed here and is not one-hot encoded later, so
# it does not reach the saved dataset — confirm that is intentional.
num_features = [
    "energy",
    "tempo",
    "danceability",
    "loudness",
    "liveness",
    "valence",
    "instrumentalness",
    "duration_ms",
    "acousticness",
    "age",
    "speechiness",
]

In [12]:
# Start the model-ready frame with only the target column. Selecting with a
# list keeps it a DataFrame and carries over the row index of `new_data`;
# `.copy()` keeps it independent of `new_data`.
scaled_data = new_data[['popularity']].copy()

Scale the numerical features.

In [13]:
from sklearn.preprocessing import StandardScaler

# Standardise every numeric feature with a single fit.
# StandardScaler works column by column, so fitting on the whole block yields the
# same values as fitting each column separately. Unlike the per-column loop, the
# fitted `scaler` afterwards holds the statistics of all features (not just the
# last one), so it can be reused to transform new data the same way.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(new_data[num_features])

# Wrap the array with the original row index so the assignment aligns row-for-row
# with `scaled_data`, whose index has gaps after the release-year filter.
scaled_data[num_features] = pd.DataFrame(
    scaled_values, columns=num_features, index=new_data.index
)

One-hot encode the categorical features (`key` and `time_signature`).

In [14]:
# One indicator column per distinct value, emitted directly as integers (0/1).
# Column names are "<prefix>_<value>", e.g. "key_0.0" or "timesign_4.0".
keydum = pd.get_dummies(new_data["key"], prefix="key", dtype=int)
tsdum = pd.get_dummies(new_data["time_signature"], prefix="timesign", dtype=int)

In [15]:
# Append the indicator columns to the right of the target and scaled features.
# NOTE(review): re-running this cell alone appends the indicator columns again.
feature_blocks = [scaled_data, keydum, tsdum]
scaled_data = pd.concat(objs=feature_blocks, axis="columns")

In [16]:
# Last ten rows of the assembled frame.
scaled_data.iloc[-10:]

Unnamed: 0,popularity,energy,tempo,danceability,loudness,liveness,valence,instrumentalness,duration_ms,acousticness,age,speechiness,key_0.0,key_1.0,key_2.0,key_3.0,key_4.0,key_5.0,key_6.0,key_7.0,key_8.0,key_9.0,key_10.0,key_11.0,timesign_1.0,timesign_3.0,timesign_4.0,timesign_5.0
4707,0,0.862016,-0.286532,0.371216,0.600703,0.1357,1.394452,-0.593973,-0.192665,-0.288598,0.094275,-0.377161,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0
4708,0,0.337105,1.269754,-0.175344,0.447917,0.387458,-0.207327,-0.593791,-0.550702,-1.028975,0.094275,-0.650549,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
4709,0,-0.187806,1.059305,-0.753743,-0.2503,-0.670738,0.760738,-0.594,7.497226,1.675168,0.264537,0.2819,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4710,0,1.120434,0.280345,0.318152,0.733489,0.614852,1.52275,-0.594,7.182277,1.629428,0.264537,-0.011016,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0
4711,0,0.308841,-0.912072,-1.040289,0.103026,-0.985029,1.126193,-0.593994,1.908394,0.98602,0.264537,-0.350798,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0
4713,0,-0.627924,-1.491224,-1.454189,0.180711,0.606731,0.519694,-0.594,6.054747,1.864226,1.115852,-0.397665,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0
4714,0,-0.930757,1.002972,-0.822726,-0.992598,0.964065,-0.440596,2.020496,2.556133,1.894719,2.988743,-0.376184,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
4715,0,0.062536,1.945855,-1.00845,-0.293428,-0.596022,1.549964,1.807463,3.592243,1.409876,1.62664,-0.361539,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0
4716,0,0.046385,1.961572,-0.727211,-0.204859,-0.733271,0.84627,1.970696,1.305395,1.812387,2.137429,-0.222892,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0
4717,0,0.954885,1.828668,-1.889314,0.751312,1.215823,0.465265,-0.281367,5.850357,1.522701,0.605063,-0.432815,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0


Save the processed data for the prediction step.

In [17]:
# Write the processed frame to disk; the row index is not written to the file.
scaled_data.to_csv(path_or_buf="datasets/data_prediction.csv", index=False)