In [17]:
#Import relevant libraries. 
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Read data 
cleaned_df = pd.read_excel('SpotifyDataCleaned.xlsx')

# Print column names to inspect if everything looks fine

print(cleaned_df.columns)

cleaned_df


Index(['artists_num', 'collab', 'release_date', 'album_num_tracks', 'source',
       'peak_rank', 'weeks_on_chart', 'streams', 'danceability', 'energy',
       'key', 'loudness', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'duration'],
      dtype='object')


Unnamed: 0,artists_num,collab,release_date,album_num_tracks,source,peak_rank,weeks_on_chart,streams,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration
0,1,0,2021-04-22,15,Som Livre,4,36.50000,37158272,0.752,0.620,5,-5.536,0.0509,0.30900,0.000000,0.0750,0.962,117.399,164459
1,2,1,2017-03-23,1,RCA Records Label,46,9.00000,215055522,0.748,0.627,7,-6.029,0.0639,0.13100,0.000000,0.0852,0.524,120.963,188491
2,1,0,2022-02-14,13,SM Entertainment,61,2.00000,48580,0.585,0.683,8,-3.827,0.0523,0.73700,0.000000,0.1140,0.493,128.018,157987
3,1,0,2018-06-15,1,Def Jam Recordings,112,1.50000,9944865,0.353,0.755,1,-6.276,0.7330,0.08220,0.000000,0.3900,0.437,191.153,193680
4,1,0,2004,12,Island Records,44,129.30303,429376201,0.352,0.911,1,-5.230,0.0747,0.00121,0.000000,0.0995,0.236,148.033,222973
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26252,1,0,2020-12-18,1,3Beat,4,27.00000,6885448,0.812,0.634,8,-4.960,0.1630,0.05640,0.000000,0.2760,0.463,124.074,174194
26253,3,1,2022-02-04,1,Lais Records,7,10.00000,421278,0.792,0.469,8,-7.666,0.1070,0.33300,0.000000,0.0725,0.353,114.017,144737
26254,1,0,2018-10-05,7,U OK?,40,8.50000,11300755,0.415,0.504,9,-10.003,0.0318,0.02200,0.000004,0.3630,0.401,165.860,155714
26255,2,1,2017-02-24,11,XO Records,171,1.50000,13137028,0.744,0.715,0,-6.103,0.3510,0.10100,0.000000,0.0919,0.340,154.962,179773


In the code below, we initiate the training of DecisionTreeRegressor (DT) and split the data into training and test and then test the model with the test set 

In [15]:
#Import relevant libraries
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score


# Selecting relevant columns for the model
# 'streams' is the target variable and excluding highly categorical columns for simplicity
relevant_columns = ['artists_num', 'collab', 'album_num_tracks', 'peak_rank', 'weeks_on_chart', 'streams',
                    'danceability', 'energy', 'key', 'loudness', 'speechiness', 'acousticness', 'instrumentalness',
                    'liveness', 'valence', 'tempo', 'duration']
data_for_model = cleaned_df[relevant_columns]


# Splitting the data into training and testing sets
X = data_for_model.drop('streams', axis=1)
y = data_for_model['streams']
# Test size is 30% and training are 70%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initializing and training the Decision Tree Regressor and adjusting model complexity
# Models complexity have been selected based on several tries, and what performed the "best".
dt_regressor = DecisionTreeRegressor(max_depth=50, min_samples_split=150, min_samples_leaf=50, random_state=0)

#Fitting the decision tree regressor
dt_regressor.fit(X_train, y_train)

# Predicting the test set results
y_pred = dt_regressor.predict(X_test)

#Calculating MSE, RMSE and R-squared 
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  
r2 = r2_score(y_test, y_pred)

# Returning the evaluation metrics
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("R-squared:", r2)

Mean Squared Error: 2.8275835213905212e+16
Root Mean Squared Error: 168154200.70252547
R-squared: 0.10161313165271346


The same approach but for the random forrest regression model. We are training the model on the training set and afterwards predicting and evaluating the model

In [16]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# Initializing and training the Random Forest Regressor and adjusting model complexity
# Models complexity have been selected based on several tries, and what performed the "best".
rf_regressor = RandomForestRegressor(max_depth=15, min_samples_split=75, min_samples_leaf=25, n_estimators=100, random_state=0)

#Fitting the decision tree regressor
rf_regressor.fit(X_train, y_train)

# Predicting the test set results
y_pred = rf_regressor.predict(X_test)

#Calculating MSE, RMSE and R-squared 
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  
r2 = r2_score(y_test, y_pred)

# Returning the evaluation metrics
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("R-squared:", r2)

Mean Squared Error: 2.6547973113527316e+16
Root Mean Squared Error: 162935487.58182582
R-squared: 0.1565111960087786


The code below are training the Linear Regression model, and evaluating the model 

In [5]:
#Importing necessary libraries and loading the dataset
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Initialize the Linear Regression model
linear_model = LinearRegression()

# Training the model
linear_model.fit(X_train, y_train)

# Predicting the test set results
y_pred_linear = linear_model.predict(X_test)

# Evaluating the model
mse_linear = mean_squared_error(y_test, y_pred_linear)
rmse_linear = np.sqrt(mse_linear)
r2_linear = r2_score(y_test, y_pred_linear)

# Returning the evaluation metrics
print(f'MSE (linear): {mse_linear}, RMSE (linear): {rmse_linear}, R-squared (linear): {r2_linear}')

MSE (linear): 2.921563735466976e+16, RMSE (linear): 170925824.13043898, R-squared (linear): 0.07175350431649485


Now lets try to improve Linear Regression by implementing L1 / Lasso

In [6]:
#Import relevant libraries 
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LassoCV

# Start by finding the best alpha for the L1 regression to optimize the model
# Setting up a range of alpha values to test
alphas = np.logspace(-4, 0.01, 10)

# Initialize and train the LassoCV model to find the best alpha
lasso_cv_model = LassoCV(alphas=alphas, cv=5, random_state=42)

#Fitting the model
lasso_cv_model.fit(X_train, y_train)

# Optimal alpha found by LassoCV
optimal_alpha = lasso_cv_model.alpha_

# Initialize the Lasso regression model (L1 regularization)
# Setting the alpha to the optimal alpha found above
lasso_model = Lasso(alpha=optimal_alpha, random_state=30)

# Training the Lasso regression model
lasso_model.fit(X_train, y_train)

# Predicting the test set results
y_pred_lasso = lasso_model.predict(X_test)

# Evaluating the model
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
rmse_lasso = np.sqrt(mse_lasso)
r2_lasso = r2_score(y_test, y_pred_lasso)

# Returning the evaluation metrics
print(f'MSE (Lasso): {mse_lasso}, RMSE (Lasso): {rmse_lasso}, R-squared (Lasso): {r2_lasso}')

MSE (Lasso): 2.9215637367254496e+16, RMSE (Lasso): 170925824.16725242, R-squared (Lasso): 0.07175350391664959


Now lets try to use another model for testing the results. In the section below, the gradient tree boosting model is trained and tested. 

In [7]:
#Here we create a gradient tree boosting
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Initialize and train the Gradient Boosting Regressor
gbr_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gbr_model.fit(X_train, y_train)

# Predicting and evaluating the model
y_pred_gbr = gbr_model.predict(X_test)
mse_gbr = mean_squared_error(y_test, y_pred_gbr)
rmse_gbr = np.sqrt(mse_gbr)
r2_gbr = r2_score(y_test, y_pred_gbr)

# Returning the evaluation metrics
print(f'MSE (gbr): {mse_gbr}, RMSE (gbr): {rmse_gbr}, R-squared (gbr): {r2_gbr}')

MSE (gbr): 2.710798762486656e+16, RMSE (gbr): 164645035.22689822, R-squared (gbr): 0.13871827568422945


In the next section, XGBoost is tested to see the difference between XGBoost and Gradient. 

In [8]:
#Implementing a XGBoost model
import xgboost as xgb

# Initialize and train the XGBoost Regressor
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
xgb_model.fit(X_train, y_train)

# Predicting and evaluating the model
y_pred_xgb = xgb_model.predict(X_test)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mse_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

# Returning the evaluation metrics
print(f'MSE (xgb): {mse_xgb}, RMSE (xgb): {rmse_xgb}, R-squared (xgb): {r2_xgb}')

MSE (xgb): 2.68897495185413e+16, RMSE (xgb): 163980942.54681334, R-squared (xgb): 0.14565219107213545
