In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split

# Print column names to inspect if everything looks fine 
cleaned_df = pd.read_excel('SpotifyDataCleaned.xlsx')

print(cleaned_df.columns)


Index(['artist_names', 'artists_num', 'artist_genre', 'collab', 'release_date',
       'album_num_tracks', 'source', 'peak_rank', 'weeks_on_chart', 'streams',
       'danceability', 'energy', 'key', 'loudness', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'duration'],
      dtype='object')


In the code below, we initiate the training of DecisionTreeRegressor (DT) and split the data into training and test and then test the model with the test set 

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score


# Selecting relevant columns for the model
# Assuming 'streams' is the target variable and excluding highly categorical columns for simplicity
relevant_columns = ['artists_num', 'collab', 'album_num_tracks', 'peak_rank', 'weeks_on_chart', 'streams',
                    'danceability', 'energy', 'key', 'loudness', 'speechiness', 'acousticness', 'instrumentalness',
                    'liveness', 'valence', 'tempo', 'duration']
data_for_model = cleaned_df[relevant_columns]


# Splitting the data into training and testing sets
X = data_for_model.drop('streams', axis=1)
y = data_for_model['streams']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initializing and training the Decision Tree Regressor and adjusting model complexity
dt_regressor = DecisionTreeRegressor(max_depth=50, min_samples_split=150, min_samples_leaf=50, random_state=0)
dt_regressor.fit(X_train, y_train)

# Predicting and evaluating the model
y_pred = dt_regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  
r2 = r2_score(y_test, y_pred)


print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("R-squared:", r2)


Mean Squared Error: 2.8275835213905212e+16
Root Mean Squared Error: 168154200.70252547
R-squared: 0.10161313165271346


We wanna do the same approach but for the random forrest regression model. We are training the model on the training set and afterwards predicting and evaluating the model

In [27]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# Initializing and training the Random Forest Regressor
rf_regressor = RandomForestRegressor(max_depth=15, min_samples_split=75, min_samples_leaf=25, n_estimators=100, random_state=0)
rf_regressor.fit(X_train, y_train)

# Predicting and evaluating the model
y_pred = rf_regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  
r2 = r2_score(y_test, y_pred)


print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("R-squared:", r2)






Mean Squared Error: 2.6547973113527316e+16
Root Mean Squared Error: 162935487.58182582
R-squared: 0.1565111960087786


The code below are training the Linear Regression model, and evaluating the model 

In [28]:
#Importing necessary libraries and loading the dataset
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Initialize the Linear Regression model
linear_model = LinearRegression()

# Training the model
linear_model.fit(X_train, y_train)

# Predicting the test set results
y_pred_linear = linear_model.predict(X_test)

# Evaluating the model
mse_linear = mean_squared_error(y_test, y_pred_linear)
rmse_linear = np.sqrt(mse_linear)
r2_linear = r2_score(y_test, y_pred_linear)

# Returning the evaluation metrics
print(f'MSE (linear): {mse_linear}, RMSE (linear): {rmse_linear}, R-squared (linear): {r2_linear}')

MSE (linear): 2.921563735466976e+16, RMSE (linear): 170925824.13043898, R-squared (linear): 0.07175350431649485


Now lets try to improve Linear Regression by implementing L1 / Lasso

In [29]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LassoCV

# Start by finding the best alpha for the L1 regression to optimize the model
# Setting up a range of alpha values to test
alphas = np.logspace(-4, 0.01, 10)

# Initialize and train the LassoCV model to find the best alpha
lasso_cv_model = LassoCV(alphas=alphas, cv=5, random_state=42)
lasso_cv_model.fit(X_train, y_train)

# Optimal alpha found by LassoCV
optimal_alpha = lasso_cv_model.alpha_

# Initialize the Lasso regression model (L1 regularization)
# Setting the alpha to the optimal alpha found above
lasso_model = Lasso(alpha=optimal_alpha, random_state=30)

# Training the Lasso regression model
lasso_model.fit(X_train, y_train)

# Predicting the test set results
y_pred_lasso = lasso_model.predict(X_test)

# Evaluating the model
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
rmse_lasso = np.sqrt(mse_lasso)
r2_lasso = r2_score(y_test, y_pred_lasso)

print(f'MSE (Lasso): {mse_lasso}, RMSE (Lasso): {rmse_lasso}, R-squared (Lasso): {r2_lasso}')

MSE (Lasso): 2.9215637367254496e+16, RMSE (Lasso): 170925824.16725242, R-squared (Lasso): 0.07175350391664959


In [34]:
#Here we create a gradient tree boosting
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Initialize and train the Gradient Boosting Regressor
gbr_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gbr_model.fit(X_train, y_train)

# Predicting and evaluating the model
y_pred_gbr = gbr_model.predict(X_test)
mse_gbr = mean_squared_error(y_test, y_pred_gbr)
rmse_gbr = np.sqrt(mse_gbr)
r2_gbr = r2_score(y_test, y_pred_gbr)

print(f'MSE (gbr): {mse_gbr}, RMSE (gbr): {rmse_gbr}, R-squared (gbr): {r2_gbr}')

MSE (gbr): 2.710798762486656e+16, RMSE (gbr): 164645035.22689822, R-squared (gbr): 0.13871827568422945


In [38]:
#Implementing a XGBoost model
import xgboost as xgb

# Initialize and train the XGBoost Regressor
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
xgb_model.fit(X_train, y_train)

# Predicting and evaluating the model
y_pred_xgb = xgb_model.predict(X_test)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mse_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

print(f'MSE (xgb): {mse_xgb}, RMSE (xgb): {rmse_xgb}, R-squared (xgb): {r2_xgb}')

MSE (xgb): 2.68897495185413e+16, RMSE (xgb): 163980942.54681334, R-squared (xgb): 0.14565219107213545
