In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline

# Load Data and pre-processing

In [2]:
spotify = pd.read_csv('./data/spotify_data/Spotify_Dataset_V3.csv', sep=';', parse_dates=['Date'], dayfirst=True)
spotify.head()

Unnamed: 0,Rank,Title,Artists,Date,Danceability,Energy,Loudness,Speechiness,Acousticness,Instrumentalness,Valence,# of Artist,Artist (Ind.),# of Nationality,Nationality,Continent,Points (Total),Points (Ind for each Artist/Nat),id,Song URL
0,1,Ella Baila Sola,"Eslabon Armado, Peso Pluma",2023-05-29,0.668,0.758,-5176.0,0.033,0.483,0.0,0.834,Artist 1,Eslabon Armado,Nationality 1,Mexico,Latin-America,200,100.0,3qQbCzHBycnDpGskqOWY0E,https://open.spotify.com/track/3qQbCzHBycnDpGs...
1,1,Ella Baila Sola,"Eslabon Armado, Peso Pluma",2023-05-29,0.668,0.758,-5176.0,0.033,0.483,0.0,0.834,Artist 2,Peso Pluma,Nationality 2,Mexico,Latin-America,200,100.0,3qQbCzHBycnDpGskqOWY0E,https://open.spotify.com/track/3qQbCzHBycnDpGs...
2,2,WHERE SHE GOES,Bad Bunny,2023-05-29,0.652,0.8,-4019.0,0.061,0.143,0.629,0.234,Artist 1,Bad Bunny,Nationality 1,Puerto Rico,Latin-America,199,199.0,7ro0hRteUMfnOioTFI5TG1,https://open.spotify.com/track/7ro0hRteUMfnOio...
3,3,La Bebe - Remix,"Yng Lvcas, Peso Pluma",2023-05-29,0.812,0.479,-5678.0,0.333,0.213,0.0,0.559,Artist 1,Yng Lvcas,Nationality 1,Mexico,Latin-America,198,99.0,2UW7JaomAMuX9pZrjVpHAU,https://open.spotify.com/track/2UW7JaomAMuX9pZ...
4,3,La Bebe - Remix,"Yng Lvcas, Peso Pluma",2023-05-29,0.812,0.479,-5678.0,0.333,0.213,0.0,0.559,Artist 2,Peso Pluma,Nationality 2,Mexico,Latin-America,198,99.0,2UW7JaomAMuX9pZrjVpHAU,https://open.spotify.com/track/2UW7JaomAMuX9pZ...


In [3]:
print(spotify.shape)
print(spotify.columns)

(651936, 20)
Index(['Rank', 'Title', 'Artists', 'Date', 'Danceability', 'Energy',
       'Loudness', 'Speechiness', 'Acousticness', 'Instrumentalness',
       'Valence', '# of Artist', 'Artist (Ind.)', '# of Nationality',
       'Nationality', 'Continent', 'Points (Total)',
       'Points (Ind for each Artist/Nat)', 'id', 'Song URL'],
      dtype='object')


## Data Cleaning

In [4]:
df1 = spotify[['Title', 'id']].drop_duplicates()
df2 =spotify[['Title', 'id']].drop_duplicates(subset=['id'])
Title1 = df1['Title'].values
Title2 = df2['Title'].values
dirty_data = np.setdiff1d(Title1, Title2, assume_unique=False)
print(len(dirty_data))

for title in dirty_data:
    index = spotify[spotify['Title']==title].iloc[0]['id']
    spotify.loc[spotify['id']==index, 'Title'] = title

39


# Explore the relationship among features associated with tracks and the points

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler,  OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [6]:
selected_columns = ['Artists','Danceability', 'Energy', 'Loudness', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Valence', 'Points (Total)']
df = spotify[(spotify['Date']>='2017-01-01') & (spotify['Date']<'2018-01-01')][selected_columns]
df.head()

Unnamed: 0,Artists,Danceability,Energy,Loudness,Speechiness,Acousticness,Instrumentalness,Valence,Points (Total)
555508,"Logic, Alessia Cara, Khalid",0.62,0.574,-7788.0,0.048,0.569,0.0,0.357,167
555509,"Logic, Alessia Cara, Khalid",0.62,0.574,-7788.0,0.048,0.569,0.0,0.357,167
555510,"Logic, Alessia Cara, Khalid",0.62,0.574,-7788.0,0.048,0.569,0.0,0.357,167
555511,Bruno Mars,0.818,0.803,-4282.0,0.08,0.034,0.0,0.632,44
555512,David Guetta,0.548,0.65,-5827.0,0.059,0.219,0.0,0.557,74


In [7]:
X = df.drop('Points (Total)', axis=1)
y = df['Points (Total)']

X_train,X_val,y_train,y_val=train_test_split(X,y,test_size=0.3,random_state=0)

preprocessor = make_column_transformer(
    (StandardScaler(),
    make_column_selector(dtype_include=np.number)),
)

X_train_std = preprocessor.fit_transform(X_train)
X_val =  preprocessor.transform(X_val)

## Baseline -- should better than this

In [8]:
# median_array = np.full_like(y_val, np.median(y_train))

# print("MAE: {}".format(mean_absolute_error(y_val, median_array)))

# MAE: 49.84513809671955

In [9]:
def cal_mae(y_test, y_pred):
    print("MAE: {}".format(mean_absolute_error(y_test, y_pred)))

## DecisionTreeRegression

In [10]:
from sklearn.tree import DecisionTreeRegressor, export_graphviz

clf = DecisionTreeRegressor(random_state=0).fit(X_train_std, y_train)
preds = clf.predict(X_val)

cal_mae(y_val, preds)

MAE: 30.684709959018967


### Take Date_Interval and Nationality into consider

#### Pre-processing

In [11]:
selected_columns = ['id', 'Danceability', 'Energy', 'Loudness', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Valence', 'Nationality', 'Date', 'Points (Total)']
df = spotify[(spotify['Date']>='2017-01-01') & (spotify['Date']<'2018-01-01')][selected_columns]
df.head()

for idx in df['id'].unique():
    df.loc[spotify['id']==idx, 'Date_Interval'] = df[df['id']==idx]['Date'] - df[df['id']==idx]['Date'].min()
    
df['Date_Interval_num'] = df['Date_Interval'].dt.days

In [12]:
df = df.set_index(["Date"]).to_period("D").sort_index()
df.index.name = "Date"
display(df)

Unnamed: 0_level_0,id,Danceability,Energy,Loudness,Speechiness,Acousticness,Instrumentalness,Valence,Nationality,Points (Total),Date_Interval,Date_Interval_num
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2017-01-01,2KpCpk6HjXXLb7nnXoXA5O,0.757,0.882,-6125.00,0.248,0.076,0.0,0.684,United States,1,0 days,0
2017-01-01,3AsAuGTaDQzavZZThyYlop,0.803,0.569,-7392.00,0.074,0.062,0.0,0.809,Sweden,139,0 days,0
2017-01-01,5hYTyyh2odQKphUbMqc5gN,0.314,0.555,-9601.00,0.370,0.157,0.0,0.159,Canada,140,0 days,0
2017-01-01,11KJSRSgaDxqydKYiD2Jew,0.804,0.648,-7805.00,0.117,0.057,0.0,0.392,Barbados,141,0 days,0
2017-01-01,11KJSRSgaDxqydKYiD2Jew,0.804,0.648,-7805.00,0.117,0.057,0.0,0.392,Canada,141,0 days,0
...,...,...,...,...,...,...,...,...,...,...,...,...
2017-12-31,5knuzwU65gJK7IF5yJsuaW,0.720,0.763,-4068.00,0.052,0.406,0.0,0.742,United Kingdom,100,364 days,364
2017-12-31,7wGoVu4Dady5GV0Sv4UIsx,0.577,0.522,-6594.00,0.098,0.130,0.0,0.119,United States,200,107 days,107
2017-12-31,7wGoVu4Dady5GV0Sv4UIsx,0.577,0.522,-6594.00,0.098,0.130,0.0,0.119,United Kingdom,200,107 days,107
2017-12-31,7sO5G9EABYOXQKNPNiE9NR,0.880,0.428,-8.28,0.206,0.149,0.0,0.333,United States,145,61 days,61


### Take trend of audio features into account

In [13]:
data_dict = {}
acoustic_features = ['Danceability', 'Energy', 'Loudness', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Valence']
for col in acoustic_features:
    daily_list = []
    for date in spotify['Date'].sort_values().unique():
        spotify_date = spotify[spotify["Date"]==date]
        daily_feature = (spotify_date[col]*spotify_date['Points (Ind for each Artist/Nat)'] / 200).sum()
        daily_list.append(daily_feature)
    data_dict[col] = daily_list

In [14]:
df_features_daily = pd.DataFrame(data=data_dict, index=spotify['Date'].sort_values().unique())
df_features_daily.index.name="Date"
df_features_daily.columns = [f"{col}_weighted_sum" for col in df_features_daily.columns]
display(df_features_daily)

Unnamed: 0_level_0,Danceability_weighted_sum,Energy_weighted_sum,Loudness_weighted_sum,Speechiness_weighted_sum,Acousticness_weighted_sum,Instrumentalness_weighted_sum,Valence_weighted_sum
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-01-01,69.377595,68.324210,-528449.376735,12.657880,18.185580,0.849185,52.479965
2017-01-02,68.676675,67.603040,-531479.766500,12.490530,18.928425,0.828535,50.730020
2017-01-03,68.670750,67.410175,-532916.107700,12.573005,18.957670,0.859855,50.528910
2017-01-04,68.591655,67.417870,-532262.555650,12.581075,18.929845,0.886650,50.433440
2017-01-05,68.640970,67.429635,-531019.541800,12.573120,19.031535,0.897255,50.396855
...,...,...,...,...,...,...,...
2023-05-25,66.963870,64.260535,-548329.186300,8.728370,24.888685,1.664180,50.532920
2023-05-26,67.359475,64.694715,-555473.855749,8.689365,24.746910,1.551420,49.940885
2023-05-27,67.465020,65.408515,-544934.840150,8.500770,25.305115,1.610320,51.208135
2023-05-28,66.720785,64.764210,-553769.450299,8.032425,25.973500,1.773575,50.066990


In [15]:
from statsmodels.tsa.deterministic import DeterministicProcess

dp = DeterministicProcess(
    index=df_features_daily.index,
    order=1,
)

X = dp.in_sample()
X = X.to_period("D")

In [16]:
df_features_daily = df_features_daily.reset_index().set_index("Date").to_period("D").sort_index()
df_features_daily.index

PeriodIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04',
             '2017-01-05', '2017-01-06', '2017-01-07', '2017-01-08',
             '2017-01-09', '2017-01-10',
             ...
             '2023-05-20', '2023-05-21', '2023-05-22', '2023-05-23',
             '2023-05-24', '2023-05-25', '2023-05-26', '2023-05-27',
             '2023-05-28', '2023-05-29'],
            dtype='period[D]', name='Date', length=2336)

In [17]:
def make_lags(ts, lags):
    return pd.concat(
        [ts.shift(i).add_suffix(f'_lag_{i}') for i in range(1, lags + 1)],
        axis=1
    )

df_features_daily_lag = make_lags(df_features_daily, lags=9).dropna()
display(df_features_daily_lag)

Unnamed: 0_level_0,Danceability_weighted_sum_lag_1,Energy_weighted_sum_lag_1,Loudness_weighted_sum_lag_1,Speechiness_weighted_sum_lag_1,Acousticness_weighted_sum_lag_1,Instrumentalness_weighted_sum_lag_1,Valence_weighted_sum_lag_1,Danceability_weighted_sum_lag_2,Energy_weighted_sum_lag_2,Loudness_weighted_sum_lag_2,...,Acousticness_weighted_sum_lag_8,Instrumentalness_weighted_sum_lag_8,Valence_weighted_sum_lag_8,Danceability_weighted_sum_lag_9,Energy_weighted_sum_lag_9,Loudness_weighted_sum_lag_9,Speechiness_weighted_sum_lag_9,Acousticness_weighted_sum_lag_9,Instrumentalness_weighted_sum_lag_9,Valence_weighted_sum_lag_9
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-10,68.497610,66.692175,-537314.334100,12.333435,20.175175,0.852660,50.471165,68.477675,66.666070,-536472.345150,...,18.928425,0.828535,50.730020,69.377595,68.324210,-528449.376735,12.657880,18.185580,0.849185,52.479965
2017-01-11,68.433955,66.901160,-533185.160650,12.371390,19.757435,0.901400,50.305375,68.497610,66.692175,-537314.334100,...,18.957670,0.859855,50.528910,68.676675,67.603040,-531479.766500,12.490530,18.928425,0.828535,50.730020
2017-01-12,68.482365,67.004385,-531668.934600,12.438665,19.634800,0.899350,50.291045,68.433955,66.901160,-533185.160650,...,18.929845,0.886650,50.433440,68.670750,67.410175,-532916.107700,12.573005,18.957670,0.859855,50.528910
2017-01-13,68.505835,67.097530,-530672.603700,12.454370,19.566690,0.917675,50.240675,68.482365,67.004385,-531668.934600,...,19.031535,0.897255,50.396855,68.591655,67.417870,-532262.555650,12.581075,18.929845,0.886650,50.433440
2017-01-14,68.269850,67.364715,-538083.712750,12.071570,19.577945,1.475970,49.633500,68.505835,67.097530,-530672.603700,...,19.938170,0.840505,50.311710,68.640970,67.429635,-531019.541800,12.573120,19.031535,0.897255,50.396855
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-05-25,66.718410,64.287120,-549433.693500,8.738500,25.245575,1.661005,49.808990,66.669490,64.308370,-549484.754651,...,24.468045,1.041665,50.417515,66.487785,64.217265,-544597.837550,8.885965,24.924235,1.226250,50.100440
2023-05-26,66.963870,64.260535,-548329.186300,8.728370,24.888685,1.664180,50.532920,66.718410,64.287120,-549433.693500,...,24.511735,0.983735,50.467845,66.615400,64.474580,-540035.935151,8.892255,24.468045,1.041665,50.417515
2023-05-27,67.359475,64.694715,-555473.855749,8.689365,24.746910,1.551420,49.940885,66.963870,64.260535,-548329.186300,...,24.844560,1.664575,50.337280,66.704105,64.529280,-536108.517249,8.932130,24.511735,0.983735,50.467845
2023-05-28,67.465020,65.408515,-544934.840150,8.500770,25.305115,1.610320,51.208135,67.359475,64.694715,-555473.855749,...,24.893265,1.662340,51.439680,66.968285,64.473410,-537896.755799,8.999235,24.844560,1.664575,50.337280


In [18]:
df_features_daily_lag = pd.merge(df_features_daily_lag, X, on=["Date"], how='left')
df_features_daily_lag

Unnamed: 0_level_0,Danceability_weighted_sum_lag_1,Energy_weighted_sum_lag_1,Loudness_weighted_sum_lag_1,Speechiness_weighted_sum_lag_1,Acousticness_weighted_sum_lag_1,Instrumentalness_weighted_sum_lag_1,Valence_weighted_sum_lag_1,Danceability_weighted_sum_lag_2,Energy_weighted_sum_lag_2,Loudness_weighted_sum_lag_2,...,Instrumentalness_weighted_sum_lag_8,Valence_weighted_sum_lag_8,Danceability_weighted_sum_lag_9,Energy_weighted_sum_lag_9,Loudness_weighted_sum_lag_9,Speechiness_weighted_sum_lag_9,Acousticness_weighted_sum_lag_9,Instrumentalness_weighted_sum_lag_9,Valence_weighted_sum_lag_9,trend
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-10,68.497610,66.692175,-537314.334100,12.333435,20.175175,0.852660,50.471165,68.477675,66.666070,-536472.345150,...,0.828535,50.730020,69.377595,68.324210,-528449.376735,12.657880,18.185580,0.849185,52.479965,10.0
2017-01-11,68.433955,66.901160,-533185.160650,12.371390,19.757435,0.901400,50.305375,68.497610,66.692175,-537314.334100,...,0.859855,50.528910,68.676675,67.603040,-531479.766500,12.490530,18.928425,0.828535,50.730020,11.0
2017-01-12,68.482365,67.004385,-531668.934600,12.438665,19.634800,0.899350,50.291045,68.433955,66.901160,-533185.160650,...,0.886650,50.433440,68.670750,67.410175,-532916.107700,12.573005,18.957670,0.859855,50.528910,12.0
2017-01-13,68.505835,67.097530,-530672.603700,12.454370,19.566690,0.917675,50.240675,68.482365,67.004385,-531668.934600,...,0.897255,50.396855,68.591655,67.417870,-532262.555650,12.581075,18.929845,0.886650,50.433440,13.0
2017-01-14,68.269850,67.364715,-538083.712750,12.071570,19.577945,1.475970,49.633500,68.505835,67.097530,-530672.603700,...,0.840505,50.311710,68.640970,67.429635,-531019.541800,12.573120,19.031535,0.897255,50.396855,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-05-25,66.718410,64.287120,-549433.693500,8.738500,25.245575,1.661005,49.808990,66.669490,64.308370,-549484.754651,...,1.041665,50.417515,66.487785,64.217265,-544597.837550,8.885965,24.924235,1.226250,50.100440,2332.0
2023-05-26,66.963870,64.260535,-548329.186300,8.728370,24.888685,1.664180,50.532920,66.718410,64.287120,-549433.693500,...,0.983735,50.467845,66.615400,64.474580,-540035.935151,8.892255,24.468045,1.041665,50.417515,2333.0
2023-05-27,67.359475,64.694715,-555473.855749,8.689365,24.746910,1.551420,49.940885,66.963870,64.260535,-548329.186300,...,1.664575,50.337280,66.704105,64.529280,-536108.517249,8.932130,24.511735,0.983735,50.467845,2334.0
2023-05-28,67.465020,65.408515,-544934.840150,8.500770,25.305115,1.610320,51.208135,67.359475,64.694715,-555473.855749,...,1.662340,51.439680,66.968285,64.473410,-537896.755799,8.999235,24.844560,1.664575,50.337280,2335.0


In [19]:
df_merged = pd.merge(df, df_features_daily_lag[df_features_daily_lag.index.year==2017], on=['Date'], how='right')
display(df_merged)

Unnamed: 0_level_0,id,Danceability,Energy,Loudness,Speechiness,Acousticness,Instrumentalness,Valence,Nationality,Points (Total),...,Instrumentalness_weighted_sum_lag_8,Valence_weighted_sum_lag_8,Danceability_weighted_sum_lag_9,Energy_weighted_sum_lag_9,Loudness_weighted_sum_lag_9,Speechiness_weighted_sum_lag_9,Acousticness_weighted_sum_lag_9,Instrumentalness_weighted_sum_lag_9,Valence_weighted_sum_lag_9,trend
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-10,7vRriwrloYVaoAe3a9wJHe,0.492,0.275,-13.40,0.300,0.687,0.0,0.180,United States,130,...,0.828535,50.73002,69.377595,68.32421,-528449.376735,12.65788,18.185580,0.849185,52.479965,10.0
2017-01-10,3B7udSGy2PfgoCniMSb523,0.581,0.055,-20514.00,0.040,0.559,0.0,0.234,United Kingdom,58,...,0.828535,50.73002,69.377595,68.32421,-528449.376735,12.65788,18.185580,0.849185,52.479965,10.0
2017-01-10,17Fd6Yb7mSbinKG8LoWfFl,0.663,0.713,-6647.00,0.112,0.035,0.0,0.690,United States,149,...,0.828535,50.73002,69.377595,68.32421,-528449.376735,12.65788,18.185580,0.849185,52.479965,10.0
2017-01-10,4Q4jmPHwu0wrJvqrld0FQ6,0.486,0.713,-3949.00,0.052,0.085,0.0,0.297,Sweden,173,...,0.828535,50.73002,69.377595,68.32421,-528449.376735,12.65788,18.185580,0.849185,52.479965,10.0
2017-01-10,6AGON2BGdPmPMJGiiNuuwl,0.744,0.430,-10604.00,0.309,0.455,0.0,0.266,United States,23,...,0.828535,50.73002,69.377595,68.32421,-528449.376735,12.65788,18.185580,0.849185,52.479965,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-12-31,5knuzwU65gJK7IF5yJsuaW,0.720,0.763,-4068.00,0.052,0.406,0.0,0.742,United Kingdom,100,...,0.508650,55.29211,69.317020,63.60904,-609037.971415,9.29312,24.105335,0.459550,52.222660,361.0
2017-12-31,7wGoVu4Dady5GV0Sv4UIsx,0.577,0.522,-6594.00,0.098,0.130,0.0,0.119,United States,200,...,0.508650,55.29211,69.317020,63.60904,-609037.971415,9.29312,24.105335,0.459550,52.222660,361.0
2017-12-31,7wGoVu4Dady5GV0Sv4UIsx,0.577,0.522,-6594.00,0.098,0.130,0.0,0.119,United Kingdom,200,...,0.508650,55.29211,69.317020,63.60904,-609037.971415,9.29312,24.105335,0.459550,52.222660,361.0
2017-12-31,7sO5G9EABYOXQKNPNiE9NR,0.880,0.428,-8.28,0.206,0.149,0.0,0.333,United States,145,...,0.508650,55.29211,69.317020,63.60904,-609037.971415,9.29312,24.105335,0.459550,52.222660,361.0


In [20]:
X = df.drop(['id','Points (Total)', 'Date_Interval'], axis=1)
y = df['Points (Total)']

X_train,X_val,y_train,y_val=train_test_split(X,y,test_size=0.3,random_state=0)

preprocessor = make_column_transformer(
    (StandardScaler(),
    make_column_selector(dtype_include=np.number)),
    (OneHotEncoder(sparse_output=False),
    make_column_selector(dtype_include=object))
)

X_train_std = preprocessor.fit_transform(X_train)
X_val_std =  preprocessor.transform(X_val)

In [21]:
X = df_merged.drop(['id','Points (Total)', 'Date_Interval'], axis=1)
y = df_merged['Points (Total)']

X_train,X_val,y_train,y_val=train_test_split(X,y,test_size=0.3,random_state=0)

preprocessor = make_column_transformer(
    (StandardScaler(),
    make_column_selector(dtype_include=np.number)),
    (OneHotEncoder(sparse_output=False),
    make_column_selector(dtype_include=object))
)

X_train_std = preprocessor.fit_transform(X_train)
X_val_std =  preprocessor.transform(X_val)

#### Models

In [22]:
# !pip install graphviz

In [23]:
clf = DecisionTreeRegressor(random_state=0).fit(X_train_std, y_train)
preds = clf.predict(X_val_std)

fits = clf.predict(X_train_std)
cal_mae(y_train, fits)
preds = clf.predict(X_val_std)
cal_mae(y_val, preds)

MAE: 0.0
MAE: 6.640605072977186


In [24]:
clf = DecisionTreeRegressor(random_state=0).fit(X_train_std, y_train)
preds = clf.predict(X_val_std)

fits = clf.predict(X_train_std)
cal_mae(y_train, fits)
preds = clf.predict(X_val_std)
cal_mae(y_val, preds)

MAE: 0.0
MAE: 6.640605072977186


In [25]:
clf.get_depth()

38

By default, the decision tree would overfit the data when the max_depth is not set

In [26]:
clf2 = DecisionTreeRegressor(random_state=0, max_depth=20).fit(X_train_std, y_train)

fits = clf2.predict(X_train_std)
cal_mae(y_train, fits)
preds = clf2.predict(X_val_std)
cal_mae(y_val, preds)

MAE: 4.233739093366722
MAE: 9.347437061671886


In [27]:
clf2 = DecisionTreeRegressor(random_state=0, max_depth=20).fit(X_train_std, y_train)

fits = clf2.predict(X_train_std)
cal_mae(y_train, fits)
preds = clf2.predict(X_val_std)
cal_mae(y_val, preds)

MAE: 4.233739093366722
MAE: 9.347437061671886


## try some advanced model next

In [28]:
### SVR tooo long to run

In [29]:
# from sklearn.svm import SVR

# svr_regressor = SVR(kernel='rbf', gamma='auto')
# svr_regressor.fit(X_train_std, y_train)

In [30]:
# fits = svr_regressor.predict(X_train_std)
# cal_mae(y_train,fits)
# preds = svr_regressor.predict(X_val_std)
# cal_mae(y_val, preds)

### XGBoost

In [31]:
from xgboost import XGBRegressor

# create an xgboost regression model
model = XGBRegressor(
    n_estimators=1000, 
    max_depth=10, 
    eta=0.1, 
    subsample=0.7, 
    colsample_bytree=0.8,
)
model.fit(X_train_std, y_train)

In [32]:
fits = model.predict(X_train_std)
cal_mae(y_train,fits)
preds = model.predict(X_val_std)
cal_mae(y_val, preds)

MAE: 0.5133298835093281
MAE: 5.010698487576078


In [33]:
fits = model.predict(X_train_std)
cal_mae(y_train,fits)
preds = model.predict(X_val_std)
cal_mae(y_val, preds)

MAE: 0.5133298835093281
MAE: 5.010698487576078


# How famous the Artists are

In [34]:
df_artists = pd.DataFrame(spotify["Artists"].value_counts())
df_artists.describe()

Unnamed: 0,count
count,2928.0
mean,222.655738
std,573.348364
min,1.0
25%,6.0
50%,44.0
75%,218.25
max,12289.0


In [35]:
df_artists["isPop"] = df_artists["count"]>=218
df_artists

Unnamed: 0_level_0,count,isPop
Artists,Unnamed: 1_level_1,Unnamed: 2_level_1
Ed Sheeran,12289,True
Post Malone,7724,True
XXXTENTACION,6924,True
Billie Eilish,6756,True
Bad Bunny,5756,True
...,...,...
The Beatles,1,False
Ozzy Osbourne,1,False
Jesy Nelson,1,False
Green Day,1,False


In [36]:
df_merged_2 = pd.merge(df_merged, df_artists, on=['Artists'], how='left')
df_merged_2

KeyError: 'Artists'