In [1]:
import pandas as pd
top50_df = pd.read_csv("top_50_2019_2021.csv", encoding='latin-1')
top50_df.head()

Unnamed: 0,musicid,trackname,artistname,bpm,energy,danceability,loudness,liveness,valence,acounticness,speechiness,popularity,songyear
0,1,drivers license,Olivia Rodrigo,144,0.431,0.561,-8.81,0.106,0.137,0.768,0.0578,92,2021
1,1,SeÑorita,Shawn Mendes,117,55.0,76.0,-6.0,8.0,75.0,4.0,3.0,79,2019
2,2,MONTERO (Call Me By Your Name),Lil Nas X,179,0.503,0.593,-6.725,0.405,0.71,0.293,0.22,90,2021
3,2,China,Anuel AA,105,81.0,79.0,-4.0,8.0,61.0,8.0,9.0,92,2019
4,3,boyfriend (with Social House),Ariana Grande,190,80.0,40.0,-4.0,16.0,70.0,12.0,46.0,85,2019


In [2]:
import numpy as np
from pathlib import Path
from collections import Counter

In [3]:
#Remove uneccessary columns
top50_df.drop(['musicid', 'trackname', 'artistname'],  inplace=True, axis=1)
top50_df.head()

Unnamed: 0,bpm,energy,danceability,loudness,liveness,valence,acounticness,speechiness,popularity,songyear
0,144,0.431,0.561,-8.81,0.106,0.137,0.768,0.0578,92,2021
1,117,55.0,76.0,-6.0,8.0,75.0,4.0,3.0,79,2019
2,179,0.503,0.593,-6.725,0.405,0.71,0.293,0.22,90,2021
3,105,81.0,79.0,-4.0,8.0,61.0,8.0,9.0,92,2019
4,190,80.0,40.0,-4.0,16.0,70.0,12.0,46.0,85,2019


In [4]:
#Seperating Popular songs to >92 (Y) and <=91 (N (less popular))    
top50_df['Pop Y/N'] = pd.Series(np.where(top50_df.popularity.values <= 91, 0, 1), top50_df.index)
top50_df.head()

Unnamed: 0,bpm,energy,danceability,loudness,liveness,valence,acounticness,speechiness,popularity,songyear,Pop Y/N
0,144,0.431,0.561,-8.81,0.106,0.137,0.768,0.0578,92,2021,1
1,117,55.0,76.0,-6.0,8.0,75.0,4.0,3.0,79,2019,0
2,179,0.503,0.593,-6.725,0.405,0.71,0.293,0.22,90,2021,0
3,105,81.0,79.0,-4.0,8.0,61.0,8.0,9.0,92,2019,1
4,190,80.0,40.0,-4.0,16.0,70.0,12.0,46.0,85,2019,0


In [5]:
#Drop 'Popularity' Column
top50_df.drop("popularity", axis=1, inplace=True)
top50_df.head()

Unnamed: 0,bpm,energy,danceability,loudness,liveness,valence,acounticness,speechiness,songyear,Pop Y/N
0,144,0.431,0.561,-8.81,0.106,0.137,0.768,0.0578,2021,1
1,117,55.0,76.0,-6.0,8.0,75.0,4.0,3.0,2019,0
2,179,0.503,0.593,-6.725,0.405,0.71,0.293,0.22,2021,0
3,105,81.0,79.0,-4.0,8.0,61.0,8.0,9.0,2019,1
4,190,80.0,40.0,-4.0,16.0,70.0,12.0,46.0,2019,0


In [6]:
X = top50_df.drop("Pop Y/N", axis=1)

y = top50_df["Pop Y/N"]

In [7]:
#balance of 'Pop Y/N' values
y.value_counts()

0    82
1    18
Name: Pop Y/N, dtype: int64

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Naive Random Oversampling

In [10]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state = 1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

#View the Count
Counter(y_resampled)


Counter({0: 61, 1: 61})

In [11]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver="lbfgs", random_state =1)
model.fit(X_resampled, y_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [12]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)

from sklearn.metrics import balanced_accuracy_score
print(balanced_accuracy_score(y_test, y_pred))

0.4583333333333333


In [13]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[14,  7],
       [ 3,  1]], dtype=int64)

In [14]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.82      0.67      0.25      0.74      0.41      0.17        21
          1       0.12      0.25      0.67      0.17      0.41      0.16         4

avg / total       0.71      0.60      0.32      0.65      0.41      0.17        25



# SMOTE Oversampling

In [15]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({0: 61, 1: 61})

In [16]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [17]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.43452380952380953

In [18]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[13,  8],
       [ 3,  1]], dtype=int64)

In [19]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.81      0.62      0.25      0.70      0.39      0.16        21
          1       0.11      0.25      0.62      0.15      0.39      0.15         4

avg / total       0.70      0.56      0.31      0.61      0.39      0.16        25



# Undersampling

In [20]:
# Resample the data using the ClusterCentroids resampler
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 14, 1: 14})

In [21]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [22]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.41666666666666663

In [23]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[ 7, 14],
       [ 2,  2]], dtype=int64)

In [24]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.78      0.33      0.50      0.47      0.41      0.16        21
          1       0.12      0.50      0.33      0.20      0.41      0.17         4

avg / total       0.67      0.36      0.47      0.42      0.41      0.16        25



# Combination (Over and Under) Sampling

In [26]:
# Resample the training data with SMOTEENN
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.combine import SMOTEENN
smote_enn = SMOTEENN(random_state=1)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)
Counter(y_resampled)

Counter({0: 27, 1: 37})

In [27]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [28]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.3630952380952381

In [29]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[10, 11],
       [ 3,  1]], dtype=int64)

In [30]:
 #Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.77      0.48      0.25      0.59      0.35      0.12        21
          1       0.08      0.25      0.48      0.12      0.35      0.12         4

avg / total       0.66      0.44      0.29      0.51      0.35      0.12        25



# Balanced Random Forest Classifier

In [31]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

rf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1) 
rf_model.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [32]:
# Calculated the balanced accuracy score
y_pred = rf_model.predict(X_test)
print(balanced_accuracy_score(y_test, y_pred))

0.4107142857142857


In [33]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[12,  9],
       [ 3,  1]], dtype=int64)

In [34]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.80      0.57      0.25      0.67      0.38      0.15        21
          1       0.10      0.25      0.57      0.14      0.38      0.14         4

avg / total       0.69      0.52      0.30      0.58      0.38      0.15        25



In [35]:
# List the features sorted in descending order by feature importance
feature_names = X.columns
sorted(zip(rf_model.feature_importances_, feature_names), reverse=True)

[(0.2416630055831243, 'bpm'),
 (0.15128804662826834, 'valence'),
 (0.14811395894857954, 'danceability'),
 (0.10789869892730555, 'liveness'),
 (0.0960119081294367, 'acounticness'),
 (0.09257231977073532, 'loudness'),
 (0.08199070329475448, 'energy'),
 (0.07781205316224014, 'speechiness'),
 (0.0026493055555555562, 'songyear')]

# Easy Ensemble AdaBoost Classifier

In [36]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
eec = EasyEnsembleClassifier(n_estimators = 100, random_state = 1)
eec.fit(X_train, y_train)

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [37]:
# Calculated the balanced accuracy score
y_pred = eec.predict(X_test)
print(balanced_accuracy_score(y_test, y_pred))

0.43452380952380953


In [38]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[13,  8],
       [ 3,  1]], dtype=int64)

In [39]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.81      0.62      0.25      0.70      0.39      0.16        21
          1       0.11      0.25      0.62      0.15      0.39      0.15         4

avg / total       0.70      0.56      0.31      0.61      0.39      0.16        25

