In [3]:
# Standard DS imports
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score, precision_recall_curve, auc
from sklearn.model_selection import train_test_split, ParameterGrid, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from matplotlib import pyplot as plt
from sklearn import neighbors
from sklearn.svm import SVC
import xgboost as xgb
from tqdm import tqdm
import seaborn as sns
import pandas as pd
import numpy as np
import datetime
import time
import gc

In [4]:
# Load the raw track table from the CSV that sits next to this notebook.
dataset_path = './dataset.csv'
df = pd.read_csv(dataset_path)
print('read in successfully as a dataframe')

read in successfully as a dataframe


In [5]:

# Encode the genre label as an integer feature.
# The fitted encoder is kept (instead of being thrown away) so the integer codes
# can still be mapped back to genre names via `genre_encoder.inverse_transform`
# or `genre_encoder.classes_` after the original `track_genre` column is dropped.
genre_encoder = LabelEncoder()
df['track_genre_encoded'] = genre_encoder.fit_transform(df['track_genre'])

# Remove the CSV index column, the identifier/text columns, and the raw genre label
# that has just been replaced by its encoded version.
# NOTE: this rebinds `df`, so re-running this cell without first re-running the
# load cell raises a KeyError on the already-dropped columns.
columns_to_drop = ['Unnamed: 0', 'track_id', 'artists', 'album_name', 'track_name', 'track_genre']
df = df.drop(columns=columns_to_drop)

In [6]:
# Columns used as model inputs: the encoded genre plus the remaining track attributes.
feature_columns = ['track_genre_encoded', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness',
                   'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature', 'explicit']

# Tracks whose popularity is strictly greater than this value are labelled 1, all others 0.
# Defined once so the train and test targets can never be binarised with different cut-offs.
POPULARITY_THRESHOLD = 40

# 80/20 split with a fixed seed. The target is kept as a one-column DataFrame here;
# the model cells flatten it with `.values.ravel()` before fitting.
xtrain, xtest, ytrain, ytest = train_test_split(
    df[feature_columns], df[['popularity']], test_size=0.2, random_state=42
)

# Turn the popularity score into a binary classification target.
ytrain = (ytrain > POPULARITY_THRESHOLD).astype(int)
ytest = (ytest > POPULARITY_THRESHOLD).astype(int)

In [7]:
# Exhaustive hyper-parameter search for the XGBoost popularity classifier.
# The grid has 4 * 3 * 2 * 4 * 4 * 4 = 1536 combinations; with cv=5 that is 7680 fits,
# so this cell is expensive to re-run.
param_grid = {
    'n_estimators': [100, 500, 750, 1000],  # Number of gradient boosted trees. Equivalent to number of boosting rounds.
    'max_depth': [13, 15, 17],  # Maximum tree depth for base learners.
    'learning_rate': [0.1, 0.2],  # Boosting learning rate (xgb's "eta")
    'subsample': [0.3, 0.5, 0.7, 0.9],  # Subsample ratio of the training instance.
    'colsample_bytree': [0.4, 0.6, 0.8, 1.0],  # Subsample ratio of columns when constructing each tree.
    # 'min_child_weight': [0.5, 1.0, 3.0, 5.0, 7.0, 10.0], # Minimum sum of instance weight(hessian) needed in a child
    'gamma': [0, 0.25, 0.5, 1.0]  # Minimum loss reduction required to make a further partition
    # more parameters can be added here as per requirement
}

# The target was binarised to 0/1 in the split cell, so the matching evaluation metric
# is the binary 'logloss'; 'mlogloss' is the multiclass variant. No eval_set is passed
# to fit(), so this setting does not influence the cross-validation scores below.
# use_label_encoder and eval_metric are set explicitly to avoid warnings.
xgbmodel = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# perf_counter() is a monotonic clock, so the measured interval cannot be skewed by
# system clock adjustments during the long-running search.
start_time = time.perf_counter()
grid_search = GridSearchCV(estimator=xgbmodel, param_grid=param_grid, scoring='f1_macro', n_jobs=-1, cv=5)
# ytrain is a one-column DataFrame; flatten it to the 1-D label array fit() expects.
grid_search.fit(xtrain, ytrain.values.ravel())
end_time = time.perf_counter()

# One row per parameter combination, best (rank 1) first.
results_df = pd.DataFrame(grid_search.cv_results_)
sorted_results_df = results_df.sort_values(by='rank_test_score')

important_columns = ['rank_test_score', 'mean_test_score', 'std_test_score', 'params']
display(sorted_results_df[important_columns])

print("Time elapsed: ", end_time-start_time, " seconds.")



Unnamed: 0,rank_test_score,mean_test_score,std_test_score,params
39,1,0.823990,0.001959,"{'colsample_bytree': 0.4, 'gamma': 0, 'learnin..."
43,2,0.823567,0.001773,"{'colsample_bytree': 0.4, 'gamma': 0, 'learnin..."
47,3,0.823349,0.002077,"{'colsample_bytree': 0.4, 'gamma': 0, 'learnin..."
31,4,0.822687,0.001675,"{'colsample_bytree': 0.4, 'gamma': 0, 'learnin..."
27,5,0.822512,0.001797,"{'colsample_bytree': 0.4, 'gamma': 0, 'learnin..."
...,...,...,...,...
912,1532,0.774391,0.003335,"{'colsample_bytree': 0.8, 'gamma': 0.25, 'lear..."
1104,1533,0.773612,0.001213,"{'colsample_bytree': 0.8, 'gamma': 1.0, 'learn..."
1296,1534,0.772802,0.002021,"{'colsample_bytree': 1.0, 'gamma': 0.25, 'lear..."
1392,1535,0.771272,0.001852,"{'colsample_bytree': 1.0, 'gamma': 0.5, 'learn..."


Time elapsed:  5654.710032939911  seconds.


In [8]:
# Persist the ranked grid-search table (without the DataFrame index).
# `sorted_results_df` is reassigned by the later RandomForest search cell, so this cell
# must run right after the XGBoost search for the file name to match its contents.
xgb_results_path = 'Xgb-params.csv'
sorted_results_df.to_csv(xgb_results_path, index=False)

In [17]:


# Hyper-parameter search for the RandomForest popularity classifier.
# 4 * 3 * 2 * 2 * 2 * 1 = 96 parameter combinations are evaluated.
param_grid = dict(
    n_estimators=[10, 50, 100, 500],   # how many trees make up the forest
    max_depth=[None, 10, 20],          # depth cap per tree (None = no cap)
    min_samples_split=[2, 5],          # samples needed before an internal node may split
    min_samples_leaf=[1, 2],           # samples that must remain in every leaf
    bootstrap=[True, False],           # draw bootstrap samples for each tree or not
    max_features=['sqrt'],             # features considered at each split
)

# Base estimator with a fixed seed so repeated runs build the same forests.
rf = RandomForestClassifier(random_state=42)

# Score every combination by macro-averaged F1 on all available cores;
# no `cv` argument is given, so scikit-learn's default splitting is used.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='f1_macro',
    n_jobs=-1,
)

# ytrain is a one-column DataFrame; flatten it to a 1-D label array for fit().
train_labels = ytrain.to_numpy().ravel()
grid_search.fit(xtrain, train_labels)

# Tabulate the cross-validation results, best-ranked combination first.
results_df = pd.DataFrame(grid_search.cv_results_)
sorted_results_df = results_df.sort_values(by='rank_test_score')

# Show only the ranking, the score statistics and the parameter set.
important_columns = ['rank_test_score', 'mean_test_score', 'std_test_score', 'params']
display(sorted_results_df.loc[:, important_columns])



Unnamed: 0,rank_test_score,mean_test_score,std_test_score,params
51,1,0.793151,0.000987,"{'bootstrap': False, 'max_depth': None, 'max_f..."
55,2,0.789523,0.000917,"{'bootstrap': False, 'max_depth': None, 'max_f..."
59,3,0.788947,0.00166,"{'bootstrap': False, 'max_depth': None, 'max_f..."
63,4,0.788851,0.001271,"{'bootstrap': False, 'max_depth': None, 'max_f..."
50,5,0.786865,0.001605,"{'bootstrap': False, 'max_depth': None, 'max_f..."


Unnamed: 0,rank_test_score,mean_test_score,std_test_score,params
51,1,0.793151,0.000987,"{'bootstrap': False, 'max_depth': None, 'max_f..."
55,2,0.789523,0.000917,"{'bootstrap': False, 'max_depth': None, 'max_f..."
59,3,0.788947,0.001660,"{'bootstrap': False, 'max_depth': None, 'max_f..."
63,4,0.788851,0.001271,"{'bootstrap': False, 'max_depth': None, 'max_f..."
50,5,0.786865,0.001605,"{'bootstrap': False, 'max_depth': None, 'max_f..."
...,...,...,...,...
23,92,0.636253,0.004326,"{'bootstrap': True, 'max_depth': 10, 'max_feat..."
18,93,0.636171,0.004473,"{'bootstrap': True, 'max_depth': 10, 'max_feat..."
26,94,0.636100,0.004570,"{'bootstrap': True, 'max_depth': 10, 'max_feat..."
17,95,0.635890,0.003973,"{'bootstrap': True, 'max_depth': 10, 'max_feat..."
