In [None]:
#Importing relevant libraries for the further programming. 
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

#Read Dataset
cleaned_df = pd.read_excel('SpotifyDataEngineer.xlsx')


# Print column names to inspect
cleaned_df


Strangely, a duplicated column was created and therefore had to be removed and was named "Unnamed"

In [50]:
#Remove columns "unnamed"

unnamed_columns = [col for col in cleaned_df.columns if 'Unnamed' in col]
cleaned_df.drop(columns=unnamed_columns, inplace=True)

In [51]:
#Save the changes in the newest dataset 

cleaned_df.to_excel('SpotifyDataEngineer.xlsx')



In [None]:
#Once again check if everything look normal, then after continue modelling.
print(cleaned_df.columns)

In the code below, we initiate the training of DecisionTreeRegressor (DT), split the data into training and test and then test the model with the test set 

In [53]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score


# Selecting relevant columns for the model
# 'streams' is the target variable and excluding highly categorical columns for simplicity
relevant_columns = ['artists_num', 'album_num_tracks', 'peak_rank', 'weeks_on_chart','streams', 'danceability', 'energy', 'key', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo','duration', 'release_year', 'release_month', 'release_day',
'release_dayofweek', 'loudness_log', 'energy_danceability_interaction',]
data_for_model = cleaned_df[relevant_columns]


# Splitting the data into training and testing sets
X = data_for_model.drop('streams', axis=1)
y = data_for_model['streams']

# Test size is 30% and training are 70%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initializing and training the Decision Tree Regressor and adjusting model complexity
# Models complexity have been selected based on several tries, and what performed the "best".
dt_regressor = DecisionTreeRegressor(max_depth=15, min_samples_split=75, min_samples_leaf=25, random_state=0)

#Fitting the modl
dt_regressor.fit(X_train, y_train)

# Predicting and evaluating the model
y_pred = dt_regressor.predict(X_test)

#Calculating MSE, RMSE and R-squared 
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse) 
r2 = r2_score(y_test, y_pred)

# Returning the evaluation metrics
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("R-squared:", r2)

Mean Squared Error: 2.6751889624311544e+16
Root Mean Squared Error: 163560048.98602697
R-squared: 0.15003230991604688


Lets look at Random Forrest Regressor

We wanna do the same approach as above but for the random forrest regression model. We are training the model on the training set, and evaluating the performance

In [43]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# Initializing and training the Random Forrest Regressor and adjusting model complexity
# Models complexity have been selected based on several tries, and what performed the "best".
rf_regressor = RandomForestRegressor(max_depth=10, min_samples_split=60, min_samples_leaf=15, random_state=0)

#Fitting the model
rf_regressor.fit(X_train, y_train)

# Predicting and evaluating the model
y_pred = rf_regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse) 
r2 = r2_score(y_test, y_pred)

# Returning the evaluation metrics
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("R-squared:", r2)


Mean Squared Error: 2.326696626405144e+16
Root Mean Squared Error: 152535131.24540013
R-squared: 0.2607561615855013


Now we wanna try further machine learning modelling, where the focus is on linear regression

In [54]:
#Importing necessary libraries and loading the dataset
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Initialize the Linear Regression model
linear_model = LinearRegression()

# Training the model
linear_model.fit(X_train, y_train)

# Predicting the test set results
y_pred_linear = linear_model.predict(X_test)

# Evaluating the model
mse_linear = mean_squared_error(y_test, y_pred_linear)
rmse_linear = np.sqrt(mse_linear)
r2_linear = r2_score(y_test, y_pred_linear)

# Returning the evaluation metrics
print(f'MSE (linear): {mse_linear}, RMSE (linear): {rmse_linear}, R-squared (linear): {r2_linear}')

(2.913998291284818e+16, 170704372.8580149, 0.07415721605658965)

Once again, we wanna try to optimize the linear regression model by using L1

In [55]:
#Importing relevant libraries 
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LassoCV

# Start by finding the best alpha for the L1 regression to optimize the model
# Setting up a range of alpha values to test
alphas = np.logspace(-4, 0.01, 10)

# Initialize and train the LassoCV model to find the best alpha
lasso_cv_model = LassoCV(alphas=alphas, cv=5, random_state=42)
lasso_cv_model.fit(X_train, y_train)

# Optimal alpha found by LassoCV
optimal_alpha = lasso_cv_model.alpha_

# Initialize the Lasso regression model (L1 regularization)
# Setting alpha to a small value like 0.01 for a start. This can be tuned further.
lasso_model = Lasso(alpha=optimal_alpha, random_state=30)

# Training the Lasso regression model
lasso_model.fit(X_train, y_train)

# Predicting the test set results
y_pred_lasso = lasso_model.predict(X_test)

# Evaluating the model
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
rmse_lasso = np.sqrt(mse_lasso)
r2_lasso = r2_score(y_test, y_pred_lasso)

print(f'MSE (Lasso): {mse_lasso}, RMSE (Lasso): {rmse_lasso}, R-squared (Lasso): {r2_lasso}')

MSE (Lasso): 2.913998307765096e+16, RMSE (Lasso): 170704373.3407289, R-squared (Lasso): 0.0741572108204348


Now lets try to use another model for testing the results. In the section below, the gradient tree boosting model is trained and tested. 

In [56]:
#Here we create a gradient tree boosting and important relevant libraries
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score


# Initialize and train the Gradient Boosting Regressor
gbr_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gbr_model.fit(X_train, y_train)

# Predicting and evaluating the model
y_pred_gbr = gbr_model.predict(X_test)
mse_gbr = mean_squared_error(y_test, y_pred_gbr)
rmse_gbr = np.sqrt(mse_gbr)
r2_gbr = r2_score(y_test, y_pred_gbr)

# Returning the evaluation metrics
print(f'MSE (gbr): {mse_gbr}, RMSE (gbr): {rmse_gbr}, R-squared (gbr): {r2_gbr}')

MSE (gbr): 2.3596686779074388e+16, RMSE (gbr): 153612130.9632621, R-squared (gbr): 0.2502801993838384


Final in iteration 2, lets have a look at XGBoostRegressor to compare the difference performance between Gradient Booster 

In [59]:
#Implementing a XGBoost model
import xgboost as xgb

# Initialize and train the XGBoost Regressor
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
xgb_model.fit(X_train, y_train)

# Predicting and evaluating the model
y_pred_xgb = xgb_model.predict(X_test)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mse_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

# Returning the evaluation metrics
print(f'MSE (xgb): {mse_xgb}, RMSE (xgb): {rmse_xgb}, R-squared (xgb): {r2_xgb}')

MSE (xgb): 2.3496765207927756e+16, RMSE (xgb): 153286546.07605898, R-squared (xgb): 0.2534549323918528
