In [1]:
import numpy as np
import pandas as pd
from numpy import array
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split,cross_val_score,cross_validate,GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error, multilabel_confusion_matrix, classification_report

In [2]:
df = pd.read_csv("SpotifyWithDate.csv")
# Normalize data
X, y = df.drop(columns=['popularity', 'track_id']), df['popularity']
X = (X-X.min())/(X.max()-X.min())
y = ((y-1)//20)+1

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state=5)

rf_cla = RandomForestClassifier(n_estimators=1500,random_state=1)

In [4]:
rf_cla.fit(X_train, y_train)

RandomForestClassifier(n_estimators=1500, random_state=1)

In [5]:
# Make prediction
y_pred = rf_cla.predict(X_test)

# Accuracy
print("\nAccuracy: ")
print(accuracy_score(y_test, y_pred))

# MSE
print('\nMSE: ')
print(mean_squared_error(y_test,y_pred))

# The map of label and their precision and recall
print("\nPrecision and recall:" )
print(classification_report(y_test, y_pred))


print("Confusion matrix: ")
matrix = confusion_matrix(y_test, y_pred)
multi_matrix = multilabel_confusion_matrix(y_test, y_pred)
print(multi_matrix)


Accuracy: 
0.4934199837530463

MSE: 
0.7262388302193339

Precision and recall:
              precision    recall  f1-score   support

           1       0.63      0.37      0.47     10255
           2       0.48      0.68      0.57     17745
           3       0.43      0.41      0.42     12089
           4       0.75      0.12      0.21      2840
           5       0.81      0.46      0.58       156

    accuracy                           0.49     43085
   macro avg       0.62      0.41      0.45     43085
weighted avg       0.52      0.49      0.48     43085

Confusion matrix: 
[[[30580  2250]
  [ 6435  3820]]

 [[12346 12994]
  [ 5625 12120]]

 [[24549  6447]
  [ 7190  4899]]

 [[40127   118]
  [ 2491   349]]

 [[42912    17]
  [   85    71]]]


In [6]:
def getdistribution(data):
    distribution = dict()
    for value in data:
        if value in distribution:
            distribution[value] += 1
        else:
            distribution[value] = 1
    return distribution

dist = getdistribution(y)
print(f'Popularity distrubtion {dist}') # print frequency

Popularity distrubtion {1: 34209, 2: 59120, 3: 40131, 4: 9622, 5: 534}


# Random Forest With OverSampling

In [10]:
# Another way to find distribution
print(y.value_counts())

2    59120
3    40131
1    34209
4     9622
5      534
Name: popularity, dtype: int64


In [12]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler()
X_os, y_os = ros.fit_resample(X, y)
print(y_os.value_counts())

1    59120
2    59120
3    59120
4    59120
5    59120
Name: popularity, dtype: int64


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X_os, y_os, test_size = 0.30, random_state=5)

rf_cla = RandomForestClassifier(n_estimators=1500,random_state=1)
rf_cla.fit(X_train, y_train)

RandomForestClassifier(n_estimators=1500, random_state=1)

In [15]:
# Make prediction
y_pred = rf_cla.predict(X_test)

# Accuracy
print("\nAccuracy: ")
print(accuracy_score(y_test, y_pred))

# MSE
print('\nMSE: ')
print(mean_squared_error(y_test,y_pred))

# The map of label and their precision and recall
print("\nPrecision and recall:" )
print(classification_report(y_test, y_pred))


print("Confusion matrix: ")
matrix = confusion_matrix(y_test, y_pred)
multi_matrix = multilabel_confusion_matrix(y_test, y_pred)
print(multi_matrix)


Accuracy: 
0.8020184934596302

MSE: 
0.28344609833107803

Precision and recall:
              precision    recall  f1-score   support

           1       0.80      0.77      0.78     17676
           2       0.59      0.52      0.55     17676
           3       0.65      0.73      0.69     17645
           4       0.96      0.99      0.98     17756
           5       1.00      1.00      1.00     17927

    accuracy                           0.80     88680
   macro avg       0.80      0.80      0.80     88680
weighted avg       0.80      0.80      0.80     88680

Confusion matrix: 
[[[67546  3458]
  [ 4103 13573]]

 [[64584  6420]
  [ 8569  9107]]

 [[64010  7025]
  [ 4713 12932]]

 [[70281   643]
  [  172 17584]]

 [[70742    11]
  [    0 17927]]]
