In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline

# Load Data and pre-processing

In [4]:
# Read the raw chart export. The file is ';'-separated and the 'Date' column
# is parsed as day-first dates.
spotify = pd.read_csv(
    './data/spotify_data/Spotify_Dataset_V3.csv',
    sep=';',
    parse_dates=['Date'],
    dayfirst=True,
)
spotify.head()

Unnamed: 0,Rank,Title,Artists,Date,Danceability,Energy,Loudness,Speechiness,Acousticness,Instrumentalness,Valence,# of Artist,Artist (Ind.),# of Nationality,Nationality,Continent,Points (Total),Points (Ind for each Artist/Nat),id,Song URL
0,1,Ella Baila Sola,"Eslabon Armado, Peso Pluma",2023-05-29,0.668,0.758,-5176.0,0.033,0.483,0.0,0.834,Artist 1,Eslabon Armado,Nationality 1,Mexico,Latin-America,200,100.0,3qQbCzHBycnDpGskqOWY0E,https://open.spotify.com/track/3qQbCzHBycnDpGs...
1,1,Ella Baila Sola,"Eslabon Armado, Peso Pluma",2023-05-29,0.668,0.758,-5176.0,0.033,0.483,0.0,0.834,Artist 2,Peso Pluma,Nationality 2,Mexico,Latin-America,200,100.0,3qQbCzHBycnDpGskqOWY0E,https://open.spotify.com/track/3qQbCzHBycnDpGs...
2,2,WHERE SHE GOES,Bad Bunny,2023-05-29,0.652,0.8,-4019.0,0.061,0.143,0.629,0.234,Artist 1,Bad Bunny,Nationality 1,Puerto Rico,Latin-America,199,199.0,7ro0hRteUMfnOioTFI5TG1,https://open.spotify.com/track/7ro0hRteUMfnOio...
3,3,La Bebe - Remix,"Yng Lvcas, Peso Pluma",2023-05-29,0.812,0.479,-5678.0,0.333,0.213,0.0,0.559,Artist 1,Yng Lvcas,Nationality 1,Mexico,Latin-America,198,99.0,2UW7JaomAMuX9pZrjVpHAU,https://open.spotify.com/track/2UW7JaomAMuX9pZ...
4,3,La Bebe - Remix,"Yng Lvcas, Peso Pluma",2023-05-29,0.812,0.479,-5678.0,0.333,0.213,0.0,0.559,Artist 2,Peso Pluma,Nationality 2,Mexico,Latin-America,198,99.0,2UW7JaomAMuX9pZrjVpHAU,https://open.spotify.com/track/2UW7JaomAMuX9pZ...


In [5]:
# Quick sanity check: (rows, columns) on the first line, column labels after it.
print(spotify.shape, spotify.columns, sep='\n')

(651936, 20)
Index(['Rank', 'Title', 'Artists', 'Date', 'Danceability', 'Energy',
       'Loudness', 'Speechiness', 'Acousticness', 'Instrumentalness',
       'Valence', '# of Artist', 'Artist (Ind.)', '# of Nationality',
       'Nationality', 'Continent', 'Points (Total)',
       'Points (Ind for each Artist/Nat)', 'id', 'Song URL'],
      dtype='object')


## Data Cleaning

In [6]:
# Some track ids appear under more than one title. Compare every distinct
# (Title, id) pair against the first title recorded for each id: titles that
# only ever show up as a non-first spelling are the "dirty" ones.
title_id_pairs = spotify[['Title', 'id']].drop_duplicates()
first_title_per_id = title_id_pairs.drop_duplicates(subset=['id'])
dirty_data = np.setdiff1d(
    title_id_pairs['Title'].to_numpy(),
    first_title_per_id['Title'].to_numpy(),
)
print(len(dirty_data))

# For each such title, look up the id of the first row carrying it and
# relabel every row of that id with this title, so an id maps to one title.
# The loop mutates `spotify` as it goes, so iteration order matters.
for title in dirty_data:
    track_id = spotify.loc[spotify['Title'] == title, 'id'].iloc[0]
    spotify.loc[spotify['id'] == track_id, 'Title'] = title

39


# Explore the relationship among features associated with tracks and the points

In [11]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler,  OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [12]:
# Audio features plus the regression target.
selected_columns = [
    'Danceability', 'Energy', 'Loudness', 'Speechiness',
    'Acousticness', 'Instrumentalness', 'Valence', 'Points (Total)',
]

# Keep only chart rows dated within calendar year 2017.
df = spotify.loc[
    (spotify['Date'] >= '2017-01-01') & (spotify['Date'] < '2018-01-01'),
    selected_columns,
]
df.head()

Unnamed: 0,Danceability,Energy,Loudness,Speechiness,Acousticness,Instrumentalness,Valence,Points (Total)
555508,0.62,0.574,-7788.0,0.048,0.569,0.0,0.357,167
555509,0.62,0.574,-7788.0,0.048,0.569,0.0,0.357,167
555510,0.62,0.574,-7788.0,0.048,0.569,0.0,0.357,167
555511,0.818,0.803,-4282.0,0.08,0.034,0.0,0.632,44
555512,0.548,0.65,-5827.0,0.059,0.219,0.0,0.557,74


In [15]:
# Target is the chart score; every other selected column is a predictor.
y = df['Points (Total)']
X = df.drop(columns='Points (Total)')

# 70/30 hold-out split with a fixed seed for reproducibility.
# NOTE(review): the frame contains repeated identical rows (see the head()
# output above), so a purely random split may place copies of the same row
# in both train and validation -- confirm this is acceptable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Standardise every numeric column.
preprocessor = make_column_transformer(
    (StandardScaler(), make_column_selector(dtype_include=np.number)),
)

# Fit the scaler on the training rows only, then apply it to validation rows.
# X_val is deliberately rebound to its transformed version; later cells
# predict on X_val.
X_train_std = preprocessor.fit_transform(X_train)
X_val = preprocessor.transform(X_val)

## Baseline -- every model should do better than this

In [14]:
# Naive baseline: predict the training-set median of the target for every
# validation row (the median is the constant that minimises absolute error
# on the training data).
# np.full is used instead of np.full_like(y_val, ...): full_like would cast
# the median to y_val's dtype, truncating a fractional median if the target
# is integer-typed.
median_array = np.full(len(y_val), np.median(y_train))

print("MAE: {}".format(mean_absolute_error(y_val, median_array)))

MAE: 49.98019288603132


In [16]:
def cal_mae(y_test, y_pred):
    """Print the mean absolute error of predictions against true values.

    Parameters
    ----------
    y_test : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_test``.

    Returns
    -------
    None
        The score is printed as ``MAE: <value>``; nothing is returned.
    """
    mae = mean_absolute_error(y_test, y_pred)
    print("MAE: {}".format(mae))

## DecisionTreeRegressor

In [18]:
from sklearn.tree import DecisionTreeRegressor

# Unconstrained regression tree trained on the standardised audio features;
# the fixed seed keeps tie-breaking between equally good splits reproducible.
clf = DecisionTreeRegressor(random_state=0)
clf.fit(X_train_std, y_train)

# Score on the held-out rows (X_val was already transformed in the split cell).
preds = clf.predict(X_val)
cal_mae(y_val, preds)

MAE: 30.684709959018967


### Take Date_Interval and Nationality into consideration

#### Pre-processing

In [19]:
# Audio features plus track id, nationality, chart date and the target.
selected_columns = [
    'id', 'Danceability', 'Energy', 'Loudness', 'Speechiness',
    'Acousticness', 'Instrumentalness', 'Valence', 'Nationality', 'Date',
    'Points (Total)',
]

# Rows dated within calendar year 2017. The explicit .copy() makes `df` an
# independent frame, so the columns added below are not written into a slice
# of `spotify`.
df = spotify.loc[
    (spotify['Date'] >= '2017-01-01') & (spotify['Date'] < '2018-01-01'),
    selected_columns,
].copy()

# Time elapsed since each track's earliest 2017 chart date. A single grouped
# transform replaces the previous per-id Python loop, which rescanned the
# whole frame once per track and indexed `df` with a mask built from `spotify`.
df['Date_Interval'] = df['Date'] - df.groupby('id')['Date'].transform('min')

# Same interval expressed as a whole number of days, usable as a numeric feature.
df['Date_Interval_num'] = df['Date_Interval'].dt.days

In [20]:
# Target is the chart score. 'id' is an identifier and 'Date_Interval' is the
# timedelta form of 'Date_Interval_num', so both are left out of the predictors.
# NOTE(review): 'Date' stays in X but presumably matches neither selector
# below (it is parsed as a datetime), so the column transformer would drop
# it -- confirm that is intended.
y = df['Points (Total)']
X = df.drop(columns=['id', 'Points (Total)', 'Date_Interval'])

# 70/30 hold-out split with a fixed seed for reproducibility.
# NOTE(review): the source data repeats a track/day row once per artist, so a
# purely random split may place near-identical rows in both train and
# validation and flatter the validation score -- consider a grouped split.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=0
)

preprocessor = make_column_transformer(
    # Standardise every numeric column.
    (StandardScaler(), make_column_selector(dtype_include=np.number)),
    # One-hot encode string columns. handle_unknown='ignore' encodes a
    # category seen only in validation as all zeros instead of raising
    # during transform.
    (OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
     make_column_selector(dtype_include=object)),
)

# Fit on the training rows only, then apply to the validation rows.
# X_val is deliberately rebound to its transformed version; the model cell
# predicts on X_val.
X_train_std = preprocessor.fit_transform(X_train)
X_val = preprocessor.transform(X_val)

#### Models

In [21]:
# Same unconstrained tree as before, now trained on the matrix that also
# carries the one-hot nationality columns and the days-since-first-seen feature.
clf = DecisionTreeRegressor(random_state=0)
clf.fit(X_train_std, y_train)

# Score on the held-out rows (X_val was already transformed in the split cell).
preds = clf.predict(X_val)
cal_mae(y_val, preds)

MAE: 5.5795153652044664
