In [2]:
import numpy as np
import pandas as pd
from numpy import array
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split,cross_val_score,cross_validate,GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error, multilabel_confusion_matrix, classification_report

In [3]:
df1 = pd.read_csv('SpotifyFeatures.csv',encoding='latin-1')

# Drop columns
dropped_columns = ['genre', 'artist_name', 'track_name','track_id', 'mode', 'duration_ms', 'tempo', 'energy', 'acousticness']
df1.drop(columns=dropped_columns, inplace=True)

df2 = pd.read_csv('spotify_top200_new.csv',encoding='latin-1')

# Drop columns
dropped_columns = ['Index','Highest Charting Position','Number of Times Charted','Week of Highest Charting','Song Name','Streams', 'Artist', 'Artist Followers','Song ID', 'Genre', 'Release Date', 'duration','tempo', 'Chord', 'energy','acousticness']
df2.drop(columns=dropped_columns, inplace=True)

df = pd.concat([df1,df2])

# Normalize data
X, y = df.drop(columns='popularity'), df['popularity']
X = (X-X.min())/(X.max()-X.min())

y = ((y-1)//20)+1

print(y)
print(X)

0       1
1       1
2       1
3       1
4       1
       ..
1495    2
1496    2
1497    2
1498    2
1499    2
Name: popularity, Length: 171184, dtype: int64
      danceability  instrumentalness  liveness  loudness  speechiness  \
0         0.541895          0.029930  0.095251  0.808082     0.024661   
1         0.503272          0.621622  0.369907  0.745076     0.302498   
2         0.268319          0.283283  0.509254  0.730983     0.014712   
3         0.477524          0.004755  0.113427  0.778349     0.004551   
4         0.423882          0.376376  0.097271  0.701589     0.016617   
...            ...               ...       ...       ...          ...   
1495      0.590173          0.000000  0.066978  0.677070     0.019263   
1496      0.488252          0.000000  0.088890  0.699952     0.029213   
1497      0.717841          0.000124  0.103329  0.810608     0.079911   
1498      0.586954          0.000040  0.093232  0.814594     0.010267   
1499      0.875550          0.387387  0.

In [4]:
def getdistribution(data):
    distribution = dict()
    for value in data:
        if value in distribution:
            distribution[value] += 1
        else:
            distribution[value] = 1
    return distribution

dist = getdistribution(y)
print(dist) # print frequency

df1['popularity'] = ((df1['popularity']-1)//20)+1
dist = getdistribution( df1['popularity'])
print(dist) # print frequency

df2['popularity'] = ((df2['popularity']-1)//20)+1
dist = getdistribution( df2['popularity'])
print(dist) # print frequency

{1: 25716, 2: 69033, 3: 61759, 4: 13849, 5: 827}
{1: 25716, 2: 69028, 3: 61563, 4: 12884, 5: 493}
{5: 334, 4: 965, 3: 196, 2: 5}


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state=5)

rf_cla = RandomForestClassifier(n_estimators=1500,random_state=1)
rf_cla.fit(X_train, y_train)

RandomForestClassifier(n_estimators=1500, random_state=1)

In [6]:
# Make prediction
y_pred = rf_cla.predict(X_test)

# Accuracy
print("\nAccuracy: ")
print(accuracy_score(y_test, y_pred))

# MSE
print('\nMSE: ')
print(mean_squared_error(y_test,y_pred))

# The map of label and their precision and recall
print("\nPrecision and recall:" )
print(classification_report(y_test, y_pred))


print("Confusion matrix: ")
matrix = confusion_matrix(y_test, y_pred)
multi_matrix = multilabel_confusion_matrix(y_test, y_pred)
print(multi_matrix)


Accuracy: 
0.5107290287405561

MSE: 
0.6795700599735182

Precision and recall:
              precision    recall  f1-score   support

           1       0.60      0.35      0.44      7716
           2       0.51      0.59      0.55     20602
           3       0.49      0.61      0.55     18655
           4       0.33      0.03      0.05      4146
           5       0.66      0.11      0.18       237

    accuracy                           0.51     51356
   macro avg       0.52      0.34      0.35     51356
weighted avg       0.50      0.51      0.49     51356

Confusion matrix: 
[[[41821  1819]
  [ 5018  2698]]

 [[19267 11487]
  [ 8546 12056]]

 [[21136 11565]
  [ 7323 11332]]

 [[46967   243]
  [ 4028   118]]

 [[51106    13]
  [  212    25]]]


In [9]:
import joblib
# save the model to disk
filename = 'random_forest_model500.joblib'
joblib.dump(rf_cla, filename)

['random_forest_model500.joblib']