In [1]:
from collections import Counter

import joblib
import numpy as np
import pandas as pd
from numpy import array
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split,cross_val_score,cross_validate,GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error, multilabel_confusion_matrix, classification_report

In [6]:
# Load the track feature table (the file is read with latin-1 encoding)
df = pd.read_csv('SpotifyFeatures_new.csv',encoding='latin-1')


# Drop the text/identifier columns and the numeric features left out of the model.
# Rebinding `df` gives the same end state as an in-place drop without mutating
# the frame in place.
dropped_columns = ['genre', 'artist_name', 'track_name','track_id', 'mode', 'duration_ms', 'tempo', 'energy', 'acousticness']
df = df.drop(columns=dropped_columns)

# Separate the predictors from the target column
X, y = df.drop(columns='popularity'), df['popularity']

# Min-max scale every feature column to [0, 1].
# A constant column has max == min; dividing by that zero range would fill the
# column with NaN, so a zero range is replaced by 1 and such a column scales to 0.
# NOTE(review): min/max are computed over the whole table, before the
# train/test split performed in a later cell, so held-out rows influence the
# scaling. Consider fitting the scaling on the training split only.
feature_min = X.min()
feature_range = (X.max() - feature_min).replace(0, 1)
X = (X - feature_min) / feature_range

# Bucket popularity into width-20 classes: 1-20 -> 1, 21-40 -> 2, ..., 81-100 -> 5.
# NOTE(review): a popularity of exactly 0 maps to class 0 with this formula;
# confirm the data contains no zeros or that a sixth class is intended.
y = ((y-1)//20)+1

print(y)
print(X)

0         1
1         1
2         1
3         1
4         1
         ..
169679    5
169680    5
169681    5
169682    5
169683    5
Name: popularity, Length: 169684, dtype: int64
        danceability  instrumentalness  liveness  loudness  speechiness  \
0           0.541895          0.029930  0.095251  0.808082     0.024661   
1           0.503272          0.621622  0.369907  0.745076     0.302498   
2           0.268319          0.283283  0.509254  0.730983     0.014712   
3           0.477524          0.004755  0.113427  0.778349     0.004551   
4           0.423882          0.376376  0.097271  0.701589     0.016617   
...              ...               ...       ...       ...          ...   
169679      0.754318          0.000000  0.061222  0.834202     0.025826   
169680      0.729643          0.000002  0.048196  0.886194     0.039268   
169681      0.717841          0.000000  0.097271  0.839256     0.073561   
169682      0.832636          0.000002  0.092222  0.801730     0.164903

In [10]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=5,
)

def getdistribution(data):
    """Count how many times each distinct value occurs in ``data``.

    Args:
        data: Any iterable of hashable values (for example a list or a pandas
            Series of class labels).

    Returns:
        dict: Maps each distinct value to its number of occurrences, with keys
        in order of first appearance. An empty iterable gives an empty dict.
    """
    # iter() makes Counter tally the elements one by one even when `data` is a
    # mapping (Counter would otherwise read a mapping's values as counts).
    # Converting back to a plain dict keeps the return type unchanged.
    return dict(Counter(iter(data)))
# Class frequencies in the training split before any resampling
dist = getdistribution(y_train)
print(dist) # print frequency

# Randomly duplicate minority-class rows of the training split so that every
# class ends up with as many rows as the largest one; the test split is not resampled.
sampler = RandomOverSampler(random_state=42)
X_samp, y_samp = sampler.fit_resample(X_train, y_train)

print(getdistribution(y_samp)) # print frequency after over sampling


# Network with three hidden layers (16, 12 and 8 units), logistic activations,
# trained with SGD on the oversampled training data.
mlp = MLPClassifier(solver = 'sgd',activation="logistic", hidden_layer_sizes=(16, 12, 8), learning_rate_init=0.2, max_iter=400, random_state=1)
# fit() returns the fitted estimator, so nn_model refers to the same object as mlp
nn_model = mlp.fit(X_samp, y_samp)


{2: 55273, 1: 20567, 3: 49241, 4: 10270, 5: 396}
{2: 55273, 1: 55273, 3: 55273, 4: 55273, 5: 55273}




In [8]:
# Save the fitted network to disk so it can be reloaded without retraining
# (joblib is imported in the notebook's import cell).
# NOTE(review): only the estimator is saved; the min-max scaling applied to the
# features earlier in the notebook is not persisted, so new data must be
# scaled the same way before calling predict on a reloaded model.
filename = 'finalized_model.joblib'
joblib.dump(nn_model, filename)

['finalized_model.joblib']

In [9]:
# Predict a popularity class for every held-out row
y_pred = nn_model.predict(X_test)

# Share of rows whose class was predicted exactly
print("\nAccuracy: ", accuracy_score(y_test, y_pred), sep="\n")

# Mean squared difference between predicted and true class numbers
print('\nMSE: ', mean_squared_error(y_test,y_pred), sep="\n")

# Per-class precision, recall, F1 and support
print("\nPrecision and recall:", classification_report(y_test, y_pred), sep="\n")


# `matrix` holds the single multi-class confusion matrix (computed, not printed);
# what gets printed is one 2x2 one-vs-rest matrix per class.
matrix = confusion_matrix(y_test, y_pred)
multi_matrix = multilabel_confusion_matrix(y_test, y_pred)
print("Confusion matrix: ", multi_matrix, sep="\n")


Accuracy: 
0.3278133011167752

MSE: 
1.8834015970769367

Precision and recall:
              precision    recall  f1-score   support

           1       0.45      0.48      0.46      5149
           2       0.52      0.49      0.51     13755
           3       0.48      0.06      0.11     12322
           4       0.12      0.40      0.19      2614
           5       0.01      0.51      0.02        97

    accuracy                           0.33     33937
   macro avg       0.32      0.39      0.26     33937
weighted avg       0.46      0.33      0.33     33937

Confusion matrix: 
[[[25722  3066]
  [ 2674  2475]]

 [[13974  6208]
  [ 6980  6775]]

 [[20787   828]
  [11549   773]]

 [[23723  7600]
  [ 1561  1053]]

 [[28730  5110]
  [   48    49]]]
