In [72]:
#calculating a Dummy classfier - baseline for accuracy
from sklearn.model_selection import train_test_split
import numpy as np

import pandas as pd
from sklearn.model_selection import train_test_split

# Print column names to inspect
cleaned_df = pd.read_excel('SpotifyDataIt3.xlsx')

print(cleaned_df.columns)



Index(['artists_num', 'album_num_tracks', 'peak_rank', 'weeks_on_chart',
       'streams', 'danceability', 'energy', 'key', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'duration', 'release_year', 'release_month', 'release_day',
       'release_dayofweek', 'loudness_log', 'energy_danceability_interaction'],
      dtype='object')


In the code below, we initiate the training of DecisionTreeRegressor (DT) and split the data into training and test

In [73]:

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score


# Selecting relevant columns for the model
# Assuming 'streams' is the target variable and excluding highly categorical columns for simplicity
relevant_columns = ['artists_num', 'album_num_tracks', 'peak_rank', 'weeks_on_chart','streams', 'danceability', 'energy', 'key', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo','duration', 'release_year', 'release_month', 'release_day',
'release_dayofweek', 'loudness_log', 'energy_danceability_interaction',]
data_for_model = cleaned_df[relevant_columns]


# Splitting the data into training and testing sets
X = data_for_model.drop('streams', axis=1)
y = data_for_model['streams']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initializing and training the Decision Tree Regressor
dt_regressor = DecisionTreeRegressor(max_depth=15, min_samples_split=75, min_samples_leaf=25, random_state=0)
dt_regressor.fit(X_train, y_train)

# Predicting and evaluating the model
y_pred = dt_regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)



print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("R-squared:", r2)



Mean Squared Error: 3080239127759216.5
Root Mean Squared Error: 55499902.0517984
R-squared: 0.22605412631272004


To optimize the model within a practical time frame, we can try a few adjustments:

Let's start with a reduced and more focused hyperparameter space for the Decision Tree Regressor

this method is also calling pre-pruning 

In [75]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

# Define the parameter grid for Random Forest
param_grid = {
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Initialize a Random Forest Regressor
dt_regressor = DecisionTreeRegressor(random_state=0)

# Create the GridSearchCV object
grid_search_dt = GridSearchCV(estimator=dt_regressor, param_grid=param_grid, cv=3, n_jobs=-1, scoring='neg_mean_squared_error')

# Fit GridSearchCV
grid_search_dt.fit(X_train, y_train)

# Retrieve the best Decision Tree
best_dt_model = grid_search_dt.best_estimator_

# Make predictions using the best model on the test data
y_pred_test = best_dt_model.predict(X_test)

# Calculate MSE, RMSE R-squared 
mse = mean_squared_error(y_test, y_pred_test)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)



print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("Test R-squared:", r2)


Mean Squared Error: 3623844136784960.0
Root Mean Squared Error: 60198373.20713044
Test R-squared: 0.22605412631272004


Keep the tuning - reducing the set of parameters for tuning 

In [83]:
from sklearn.model_selection import GridSearchCV

# Define a reduced set of parameters for tuning
param_grid_reduced = {
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Create a GridSearchCV object with reduced parameter grid
grid_search_reduced = GridSearchCV(estimator=dt_regressor, param_grid=param_grid_reduced, cv=3, n_jobs=-1, scoring='neg_mean_squared_error')

# Perform the grid search
grid_search_reduced.fit(X_train, y_train)

# Best parameters and best MSE
best_params = grid_search_reduced.best_params_
best_mse = -grid_search_reduced.best_score_


# Best parameters, MSE and r-squared on the test data 
best_params_reduced = grid_search_reduced.best_params_
best_mse_reduced = -grid_search_reduced.best_score_
best_rmse_reduced = np.sqrt(best_mse_reduced)
best_r2 = r2_score(y_test, y_test_pred)


print('best params:', best_params_reduced)
print('best_MSE_reduced:', best_mse_reduced)
print('best_RMSE_reduced:', best_rmse_reduced)
print('R-Squared:', best_r2)

best params: {'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 2}
best_MSE_reduced: 3107075183893057.5
best_RMSE_reduced: 55741144.443696685
R-Squared: 0.3463084235192101


RANDOM FORREST SECTION

now im looking at Random Forrest Regressor 

In [84]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score


# Selecting relevant columns for the model
# Assuming 'streams' is the target variable and excluding highly categorical columns for simplicity
relevant_columns = ['artists_num', 'album_num_tracks', 'peak_rank', 'weeks_on_chart','streams', 'danceability', 'energy', 'key', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo','duration', 'release_year', 'release_month', 'release_day',
'release_dayofweek', 'loudness_log', 'energy_danceability_interaction',]
data_for_model = cleaned_df[relevant_columns]


# Initializing and training the Random Forrest Regressor
rf_regressor = RandomForestRegressor(max_depth=15, min_samples_split=75, min_samples_leaf=25, random_state=0)
rf_regressor.fit(X_train, y_train)

# Predicting and evaluating the model
y_pred = rf_regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)



print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("R-squared:", r2)



Mean Squared Error: 2725889735206679.5
Root Mean Squared Error: 52210053.96670913
R-squared: 0.3150885287843025


To optimize the model within a practical time frame, we can try a few adjustments:

Let's start with a reduced and more focused hyperparameter space for the Random Forest Regressor

this method is also calling pre-pruning

In [89]:

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

# Define the parameter grid for Random Forest
param_grid = {
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
    # Add other parameters if needed
}

# Initialize a Random Forest Regressor
rf_regressor = RandomForestRegressor(random_state=0)

# Create the GridSearchCV object
grid_search_rf = GridSearchCV(estimator=rf_regressor, param_grid=param_grid, cv=3, n_jobs=-1, scoring='neg_mean_squared_error')

# Fit GridSearchCV
grid_search_rf.fit(X_train, y_train)

# Retrieve the best Random Forest model
best_rf_model = grid_search_rf.best_estimator_

# Make predictions using the best model on the train data
y_pred_test = best_rf_model.predict(X_test)

# Calculate MSE and R-squared for the train data
test_mse = mean_squared_error(y_test, y_pred_test)
rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test, y_pred_test)

print("Best Random Forest Model Parameters:", grid_search_rf.best_params_)
print("Root Mean Squared Error:", rmse)
print("test MSE:", test_mse)
print("test R-squared:", test_r2)


Best Random Forest Model Parameters: {'max_depth': 30, 'min_samples_leaf': 2, 'min_samples_split': 5}
Root Mean Squared Error: 51237551.40769241
test MSE: 2625286674255922.5
test R-squared: 0.3403662168707433


Keep the tuning - reducing the set of parameters for tuning 

In [90]:

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score

param_grid = {
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 2]
    # Add other parameters if needed
}


# Create a GridSearchCV object with the same parameter grid and model
grid_search_reduced = GridSearchCV(estimator=rf_regressor, param_grid=param_grid, cv=3, n_jobs=-1, scoring='neg_mean_squared_error')

# Perform the grid search
grid_search_reduced.fit(X_train, y_train)

# Retrieve the best model and its parameters
best_rf_model_reduced = grid_search_reduced.best_estimator_
best_params_reduced = grid_search_reduced.best_params_

# Make predictions on the test data
y_pred_test_reduced = best_rf_model_reduced.predict(X_test)

# Calculate R-squared for the test data
test_r2_reduced = r2_score(y_test, y_pred_test_reduced)
test_mse_reduced = mean_squared_error(y_test, y_pred_test_reduced)
rmse = np.sqrt(test_mse_reduced)

print("Best Parameters:", best_params_reduced)
print("Root Mean Squared Error:", rmse)
print("test MSE:", test_mse_reduced)
print("Test R-squared:", test_r2_reduced)


Best Parameters: {'max_depth': 15, 'min_samples_leaf': 2, 'min_samples_split': 2}
Root Mean Squared Error: 51201941.90839681
test MSE: 2621638855190841.5
Test R-squared: 0.34128277379904626
