In [None]:
#calculating a Dummy classfier - baseline for accuracy
from sklearn.model_selection import train_test_split
import numpy as np

import pandas as pd
from sklearn.model_selection import train_test_split

# Print column names to inspect
cleaned_df = pd.read_excel('SpotifyDataIt3.xlsx')

print(cleaned_df.columns)



In the code below, we initiate the training of DecisionTreeRegressor (DT) and split the data into training and test

In [73]:

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score


# Selecting relevant columns for the model
# Assuming 'streams' is the target variable and excluding highly categorical columns for simplicity
relevant_columns = ['artists_num', 'album_num_tracks', 'peak_rank', 'weeks_on_chart','streams', 'danceability', 'energy', 'key', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo','duration', 'release_year', 'release_month', 'release_day',
'release_dayofweek', 'loudness_log', 'energy_danceability_interaction',]
data_for_model = cleaned_df[relevant_columns]


# Splitting the data into training and testing sets
X = data_for_model.drop('streams', axis=1)
y = data_for_model['streams']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initializing and training the Decision Tree Regressor
dt_regressor = DecisionTreeRegressor(max_depth=15, min_samples_split=75, min_samples_leaf=25, random_state=0)
dt_regressor.fit(X_train, y_train)

# Predicting and evaluating the model
y_pred = dt_regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)



print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("R-squared:", r2)



Mean Squared Error: 3080239127759216.5
Root Mean Squared Error: 55499902.0517984
R-squared: 0.22605412631272004


To optimize the model within a practical time frame, we can try a few adjustments:

Let's start with a reduced and more focused hyperparameter space for the Decision Tree Regressor

this method is also calling pre-pruning 

In [75]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

# Define the parameter grid for Random Forest
param_grid = {
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Initialize a Random Forest Regressor
dt_regressor = DecisionTreeRegressor(random_state=0)

# Create the GridSearchCV object
grid_search_dt = GridSearchCV(estimator=dt_regressor, param_grid=param_grid, cv=3, n_jobs=-1, scoring='neg_mean_squared_error')

# Fit GridSearchCV
grid_search_dt.fit(X_train, y_train)

# Retrieve the best Decision Tree
best_dt_model = grid_search_dt.best_estimator_

# Make predictions using the best model on the test data
y_pred_test = best_dt_model.predict(X_test)

# Calculate MSE, RMSE R-squared 
mse = mean_squared_error(y_test, y_pred_test)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)



print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("Test R-squared:", r2)


Mean Squared Error: 3623844136784960.0
Root Mean Squared Error: 60198373.20713044
Test R-squared: 0.22605412631272004


Keep the tuning - reducing the set of parameters for tuning 

In [83]:
from sklearn.model_selection import GridSearchCV

# Define a reduced set of parameters for tuning
param_grid_reduced = {
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Create a GridSearchCV object with reduced parameter grid
grid_search_reduced = GridSearchCV(estimator=dt_regressor, param_grid=param_grid_reduced, cv=3, n_jobs=-1, scoring='neg_mean_squared_error')

# Perform the grid search
grid_search_reduced.fit(X_train, y_train)

# Best parameters and best MSE
best_params = grid_search_reduced.best_params_
best_mse = -grid_search_reduced.best_score_


# Best parameters, MSE and r-squared on the test data 
best_params_reduced = grid_search_reduced.best_params_
best_mse_reduced = -grid_search_reduced.best_score_
best_rmse_reduced = np.sqrt(best_mse_reduced)
best_r2 = r2_score(y_test, y_test_pred)


print('best params:', best_params_reduced)
print('best_MSE_reduced:', best_mse_reduced)
print('best_RMSE_reduced:', best_rmse_reduced)
print('R-Squared:', best_r2)

best params: {'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 2}
best_MSE_reduced: 3107075183893057.5
best_RMSE_reduced: 55741144.443696685
R-Squared: 0.3463084235192101


RANDOM FORREST SECTION

now im looking at Random Forrest Regressor 

In [84]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score


# Selecting relevant columns for the model
# Assuming 'streams' is the target variable and excluding highly categorical columns for simplicity
relevant_columns = ['artists_num', 'album_num_tracks', 'peak_rank', 'weeks_on_chart','streams', 'danceability', 'energy', 'key', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo','duration', 'release_year', 'release_month', 'release_day',
'release_dayofweek', 'loudness_log', 'energy_danceability_interaction',]
data_for_model = cleaned_df[relevant_columns]


# Initializing and training the Random Forrest Regressor
rf_regressor = RandomForestRegressor(max_depth=15, min_samples_split=75, min_samples_leaf=25, random_state=0)
rf_regressor.fit(X_train, y_train)

# Predicting and evaluating the model
y_pred = rf_regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)



print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("R-squared:", r2)



Mean Squared Error: 2725889735206679.5
Root Mean Squared Error: 52210053.96670913
R-squared: 0.3150885287843025


To optimize the model within a practical time frame, we can try a few adjustments:

Let's start with a reduced and more focused hyperparameter space for the Random Forest Regressor

this method is also calling pre-pruning

In [89]:

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

# Define the parameter grid for Random Forest
param_grid = {
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
    # Add other parameters if needed
}

# Initialize a Random Forest Regressor
rf_regressor = RandomForestRegressor(random_state=0)

# Create the GridSearchCV object
grid_search_rf = GridSearchCV(estimator=rf_regressor, param_grid=param_grid, cv=3, n_jobs=-1, scoring='neg_mean_squared_error')

# Fit GridSearchCV
grid_search_rf.fit(X_train, y_train)

# Retrieve the best Random Forest model
best_rf_model = grid_search_rf.best_estimator_

# Make predictions using the best model on the train data
y_pred_test = best_rf_model.predict(X_test)

# Calculate MSE and R-squared for the train data
test_mse = mean_squared_error(y_test, y_pred_test)
rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test, y_pred_test)

print("Best Random Forest Model Parameters:", grid_search_rf.best_params_)
print("Root Mean Squared Error:", rmse)
print("test MSE:", test_mse)
print("test R-squared:", test_r2)


Best Random Forest Model Parameters: {'max_depth': 30, 'min_samples_leaf': 2, 'min_samples_split': 5}
Root Mean Squared Error: 51237551.40769241
test MSE: 2625286674255922.5
test R-squared: 0.3403662168707433


Keep the tuning - reducing the set of parameters for tuning 

In [90]:

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score

param_grid = {
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 2]
    # Add other parameters if needed
}


# Create a GridSearchCV object with the same parameter grid and model
grid_search_reduced = GridSearchCV(estimator=rf_regressor, param_grid=param_grid, cv=3, n_jobs=-1, scoring='neg_mean_squared_error')

# Perform the grid search
grid_search_reduced.fit(X_train, y_train)

# Retrieve the best model and its parameters
best_rf_model_reduced = grid_search_reduced.best_estimator_
best_params_reduced = grid_search_reduced.best_params_

# Make predictions on the test data
y_pred_test_reduced = best_rf_model_reduced.predict(X_test)

# Calculate R-squared for the test data
test_r2_reduced = r2_score(y_test, y_pred_test_reduced)
test_mse_reduced = mean_squared_error(y_test, y_pred_test_reduced)
rmse = np.sqrt(test_mse_reduced)

print("Best Parameters:", best_params_reduced)
print("Root Mean Squared Error:", rmse)
print("test MSE:", test_mse_reduced)
print("Test R-squared:", test_r2_reduced)


Best Parameters: {'max_depth': 15, 'min_samples_leaf': 2, 'min_samples_split': 2}
Root Mean Squared Error: 51201941.90839681
test MSE: 2621638855190841.5
Test R-squared: 0.34128277379904626


Lets have a look at how linear regression perform:


In [94]:
#Importing necessary libraries and loading the dataset
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Initialize the Linear Regression model
linear_model = LinearRegression()

# Training the model
linear_model.fit(X_train, y_train)

# Predicting the test set results
y_pred_linear = linear_model.predict(X_test)

# Evaluating the model
mse_linear = mean_squared_error(y_test, y_pred_linear)
rmse_linear = np.sqrt(mse_linear)
r2_linear = r2_score(y_test, y_pred_linear)

# Returning the evaluation metrics

print("Root Mean Squared Error:", rmse_linear)
print("test MSE:", mse_linear )
print("Test R-squared:", r2_linear)

Root Mean Squared Error: 60418432.648560435
test MSE: 3650387003708633.5
Test R-squared: 0.08279784727712935


First of we cross validate the linear regression model to evalute the models performance

In [95]:
from sklearn.model_selection import cross_val_score

# Initialize the Linear Regression model
linear_model = LinearRegression()

# Perform cross-validation
cv_scores = cross_val_score(linear_model, X, y, cv=5, scoring='neg_mean_squared_error')

# Convert MSE scores to RMSE scores
rmse_scores = np.sqrt(-cv_scores)

# Output the cross-validation RMSE scores
rmse_scores

array([61466224.18909626, 62199008.53080785, 59645209.04620065,
       62158663.4592537 , 62029685.81215119])

In [96]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LassoCV

# Start by finding the best alpha for the L1 regression to optimize the model
# Setting up a range of alpha values to test
alphas = np.logspace(-4, 0.01, 10)

# Initialize and train the LassoCV model to find the best alpha
lasso_cv_model = LassoCV(alphas=alphas, cv=5, random_state=42)
lasso_cv_model.fit(X_train, y_train)

# Optimal alpha found by LassoCV
optimal_alpha = lasso_cv_model.alpha_

# Initialize the Lasso regression model (L1 regularization)
# Setting the alpha to the optimal alpha found above
lasso_model = Lasso(alpha=optimal_alpha, random_state=30)

# Training the Lasso regression model
lasso_model.fit(X_train, y_train)

# Predicting the test set results
y_pred_lasso = lasso_model.predict(X_test)

# Evaluating the model
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
rmse_lasso = np.sqrt(mse_lasso)
r2_lasso = r2_score(y_test, y_pred_lasso)

print(f'MSE (Lasso): {mse_lasso}, RMSE (Lasso): {rmse_lasso}, R-squared (Lasso): {r2_lasso}')

MSE (Lasso): 3650387071617491.0, RMSE (Lasso): 60418433.21054834, R-squared (Lasso): 0.08279783021423981


To try further explore the model and make it better we implement a GridSearch for the Lasso regression model below:

In [97]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso

# Assuming X_train, y_train, X_test, y_test are already defined
# Define the parameter grid for GridSearch
param_grid = {
    'alpha': [0.1, 1, 10, 100],
    'max_iter': [1000, 5000, 10000],
    'tol': [0.0001, 0.001, 0.01]
}

# Create GridSearchCV object
grid_search = GridSearchCV(estimator=Lasso(random_state=42), param_grid=param_grid, 
                           scoring='neg_mean_squared_error', cv=5)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Best parameters
best_parameters = grid_search.best_params_

# Output the best parameters
best_parameters

{'alpha': 100, 'max_iter': 1000, 'tol': 0.0001}

Here we use the "best parameters" to see the change in MSE, RMSE and R-squared

In [101]:
# Initialize the Lasso regression model with the best parameters
best_lasso_model = Lasso(**best_parameters, random_state=42)

# Training the Lasso regression model with the best parameters
best_lasso_model.fit(X_train, y_train)

# Predicting the test set results using the best model
y_pred_best_lasso = best_lasso_model.predict(X_test)

# Evaluating the best model
mse_best_lasso = mean_squared_error(y_test, y_pred_best_lasso)
rmse_best_lasso = np.sqrt(mse_best_lasso)
r2_best_lasso = r2_score(y_test, y_pred_best_lasso)

print(f'MSE (Lasso): {mse_best_lasso}, RMSE (Lasso): {rmse_best_lasso}, R-squared (Lasso): {r2_best_lasso}')

MSE (Lasso): 3650393662497365.0, RMSE (Lasso): 60418487.75414165, R-squared (Lasso): 0.08279617417908447


Now we try to experiment with gradiant booster regressor 

In [102]:
#Here we create a gradient tree boosting regressor model
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Initialize and train the Gradient Boosting Regressor
gbr_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gbr_model.fit(X_train, y_train)

# Predicting and evaluating the model
y_pred_gbr = gbr_model.predict(X_test)
mse_gbr = mean_squared_error(y_test, y_pred_gbr)
rmse_gbr = np.sqrt(mse_gbr)
r2_gbr = r2_score(y_test, y_pred_gbr)

mse_gbr, rmse_gbr, r2_gbr
print(f'MSE (gbr): {mse_gbr}, RMSE (gbr): {rmse_gbr}, R-squared (gbr): {r2_gbr}')

MSE (gbr): 2871076330363802.5, RMSE (gbr): 53582425.57372522, R-squared (gbr): 0.27860870966119944


Now we implement random search for Gradient Booster Regressor

In [103]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from scipy.stats import uniform, randint
import numpy as np

# Assuming X_train, y_train, X_test, y_test are already defined

# Define the parameter distributions for RandomizedSearchCV
param_distributions = {
    'n_estimators': randint(50, 300),
    'learning_rate': uniform(0.01, 0.2),
    'max_depth': randint(2, 6),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 5),
    'subsample': uniform(0.8, 0.2)
}

# Create RandomizedSearchCV object for GradientBoostingRegressor
random_search_gbr = RandomizedSearchCV(estimator=GradientBoostingRegressor(random_state=42), 
                                       param_distributions=param_distributions, 
                                       n_iter=10, 
                                       scoring='neg_mean_squared_error', 
                                       cv=5, 
                                       random_state=42,
                                       verbose=1)

# Fit RandomizedSearchCV
random_search_gbr.fit(X_train, y_train)

# Best parameters
best_parameters_gbr = random_search_gbr.best_params_

# Using the best parameters to train and evaluate the model
best_gbr_model = GradientBoostingRegressor(**best_parameters_gbr, random_state=42)
best_gbr_model.fit(X_train, y_train)

# Predicting and evaluating the best model
y_pred_best_gbr = best_gbr_model.predict(X_test)
mse_best_gbr = mean_squared_error(y_test, y_pred_best_gbr)
rmse_best_gbr = np.sqrt(mse_best_gbr)
r2_best_gbr = r2_score(y_test, y_pred_best_gbr)

# Print the evaluation metrics for the best model
print(f'MSE (Best GBR): {mse_best_gbr}, RMSE (Best GBR): {rmse_best_gbr}, R-squared (Best GBR): {r2_best_gbr}')
print(f'Best Parameters: {best_parameters_gbr}')

Fitting 5 folds for each of 10 candidates, totalling 50 fits
MSE (Best GBR): 2590181037793240.0, RMSE (Best GBR): 50893821.21430106, R-squared (Best GBR): 0.3491869159647206
Best Parameters: {'learning_rate': 0.13022300234864176, 'max_depth': 5, 'min_samples_leaf': 3, 'min_samples_split': 7, 'n_estimators': 102, 'subsample': 0.9939819704323989}


Now we wanna try to do the same, but using XGBoost Regressor 


In [104]:
#Implementing a XGBoost model
import xgboost as xgb

# Initialize and train the XGBoost Regressor
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
xgb_model.fit(X_train, y_train)

# Predicting and evaluating the model
y_pred_xgb = xgb_model.predict(X_test)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mse_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

mse_xgb, rmse_xgb, r2_xgb

(2836900423627351.0, 53262561.18163443, 0.287195803357863)

Here we use gridsearch to further investigate and tune the XGBoost model

In [106]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Define a range of hyperparameters for GridSearch
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2]
}

# Create GridSearchCV object for XGBRegressor
grid_search_xgb = GridSearchCV(estimator=XGBRegressor(random_state=42), 
                               param_grid=param_grid, 
                               scoring='neg_mean_squared_error', 
                               cv=5, 
                               verbose=1)

# Fit GridSearchCV
grid_search_xgb.fit(X_train, y_train)

# Best parameters
best_parameters_xgb = grid_search_xgb.best_params_

# Output the best parameters
best_parameters_xgb

# Using the best parameters to train and evaluate the model
best_xgb_model = XGBRegressor(**best_parameters_xgb, random_state=42)
best_xgb_model.fit(X_train, y_train)

# Predicting and evaluating the best model
y_pred_best_xgb = best_xgb_model.predict(X_test)
mse_best_xgb = mean_squared_error(y_test, y_pred_best_xgb)
rmse_best_xgb = np.sqrt(mse_best_xgb)
r2_best_xgb = r2_score(y_test, y_pred_best_xgb)

# Print the evaluation metrics for the best model
print('Best Mean Square Error:', mse_best_xgb) 
print('Best Root Mean Square Error:', rmse_best_xgb) 
print('Best R-squared:', r2_best_xgb)



Fitting 5 folds for each of 729 candidates, totalling 3645 fits
Best Mean Square Error: 2534497509003570.5
Best Root Mean Square Error: 50343793.15271715
Best R-squared: 0.3631780496240292
