In [180]:
import pandas as pd
import numpy as np

from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report

In [181]:
dataset = pd.read_csv('data/dataset_5818_artscore.csv')

In [182]:
to_encode=['year', 'key', 'mode']
to_bin=['duration_ms']
num_cols = ['time_signature', 'acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'valence', 'tempo', 'artist_score']

In [183]:
transformer = ColumnTransformer([('scaler', StandardScaler(), num_cols),
                                 ("discretizer", KBinsDiscretizer(), to_bin),
                                 ("encoder", OneHotEncoder(drop='first'), to_encode)],
                                n_jobs=-1, verbose=1)

In [184]:
data_new = dataset.drop(['name', 'album', 'artist', 'popularity', 'release_date'], axis=1)

In [185]:
X = data_new.drop('target', axis=1)
y = data_new.target

In [186]:
round(len(data_new)/len(data_new[data_new.target==1]))

12

## Creation of 12 clusters of non-hits songs

In [187]:
X_no_hit = X[y==0]

In [188]:
clustering = KMeans(n_clusters=12, max_iter=1000)

In [189]:
X_no_hit_tr = transformer.fit_transform(X_no_hit).toarray()

In [190]:
clustering.fit(X_no_hit_tr)

KMeans(max_iter=1000, n_clusters=12)

In [191]:
X_clustered = clustering.predict(X_no_hit_tr)

In [192]:
X_clustered

array([7, 9, 7, ..., 2, 0, 1], dtype=int32)

In [193]:
clusters_no_hits = [np.where(X_clustered==i)[0] for i in range(12)]

In [194]:
for cluster in clusters_no_hits:
    print(len(cluster))

16706
6452
19806
27917
5143
2259
12198
24660
25345
35655
13303
2014


In [195]:
X[y==1]

Unnamed: 0,duration_ms,key,mode,time_signature,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,valence,tempo,year,artist_score
191458,153933,0,1,4,0.64800,0.474,0.338,0.000000,0.1300,-11.528,0.0299,0.810,154.596,1958,0.0
191459,281067,1,1,4,0.35300,0.569,0.495,0.000000,0.0839,-8.964,0.0300,0.302,128.228,1995,6.0
191460,250107,4,1,4,0.97700,0.450,0.311,0.007130,0.1670,-8.175,0.0314,0.367,125.648,1997,45.0
191461,248693,1,1,4,0.02600,0.463,0.524,0.000000,0.1020,-8.207,0.0316,0.308,144.055,2006,0.0
191462,213173,5,0,4,0.04520,0.697,0.709,0.000000,0.1380,-1.917,0.0455,0.787,134.910,2009,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
208821,334040,5,1,4,0.00612,0.424,0.906,0.001860,0.9880,-10.985,0.0610,0.676,168.447,1988,7.0
208822,221227,5,1,4,0.05090,0.676,0.948,0.000008,0.3840,-2.557,0.0628,0.731,136.027,2004,0.0
208823,230547,4,1,4,0.17600,0.633,0.824,0.000000,0.1150,-6.015,0.0753,0.777,95.893,2000,1.0
208824,147987,1,1,4,0.59200,0.661,0.778,0.000000,0.0897,-7.352,0.0318,0.968,169.910,2005,0.0


In [196]:
X_no_hit.iloc[clusters_no_hits[0],:]

Unnamed: 0,duration_ms,key,mode,time_signature,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,valence,tempo,year,artist_score
14,295467,9,1,4,0.978,0.2580,0.00795,0.924,0.1200,-33.708,0.0398,0.0460,80.806,2017,1.0
20,462880,0,1,4,0.963,0.1310,0.03780,0.738,0.0825,-25.268,0.0481,0.0792,64.613,1992,0.0
24,88960,5,0,5,0.744,0.2190,0.42200,0.884,0.2640,-12.547,0.0406,0.1630,124.337,1982,0.0
34,449976,5,0,3,0.961,0.2280,0.06010,0.829,0.1100,-28.513,0.0424,0.0502,77.082,1973,0.0
46,104133,9,0,3,0.970,0.1700,0.00805,0.302,0.0930,-39.333,0.0463,0.0539,126.364,1998,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191370,80240,0,1,4,0.951,0.0864,0.19900,0.960,0.3510,-13.474,0.0368,0.0304,166.757,1995,0.0
191373,110373,4,1,4,0.969,0.4540,0.23000,0.757,0.0868,-18.032,0.0308,0.2630,112.370,1959,0.0
191390,150720,2,1,4,0.973,0.2090,0.23600,0.983,0.1170,-13.703,0.0429,0.1500,136.512,2009,0.0
191393,349093,5,1,4,0.738,0.0758,0.28600,0.598,0.1160,-15.973,0.0411,0.0357,80.944,1984,0.0


In [197]:
X_cluster_0 = pd.concat((X_no_hit.iloc[clusters_no_hits[0],:], X[y==1]), axis=0, ignore_index=True)

In [198]:
X_cluster_0

Unnamed: 0,duration_ms,key,mode,time_signature,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,valence,tempo,year,artist_score
0,295467,9,1,4,0.97800,0.258,0.00795,0.924000,0.1200,-33.708,0.0398,0.0460,80.806,2017,1.0
1,462880,0,1,4,0.96300,0.131,0.03780,0.738000,0.0825,-25.268,0.0481,0.0792,64.613,1992,0.0
2,88960,5,0,5,0.74400,0.219,0.42200,0.884000,0.2640,-12.547,0.0406,0.1630,124.337,1982,0.0
3,449976,5,0,3,0.96100,0.228,0.06010,0.829000,0.1100,-28.513,0.0424,0.0502,77.082,1973,0.0
4,104133,9,0,3,0.97000,0.170,0.00805,0.302000,0.0930,-39.333,0.0463,0.0539,126.364,1998,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34069,334040,5,1,4,0.00612,0.424,0.90600,0.001860,0.9880,-10.985,0.0610,0.6760,168.447,1988,7.0
34070,221227,5,1,4,0.05090,0.676,0.94800,0.000008,0.3840,-2.557,0.0628,0.7310,136.027,2004,0.0
34071,230547,4,1,4,0.17600,0.633,0.82400,0.000000,0.1150,-6.015,0.0753,0.7770,95.893,2000,1.0
34072,147987,1,1,4,0.59200,0.661,0.77800,0.000000,0.0897,-7.352,0.0318,0.9680,169.910,2005,0.0


In [199]:
y_cluster_0 = np.concatenate((np.zeros((len(X_cluster_0)-len(X[y==1]),)), np.ones((len(X[y==1]),))), axis=0)

In [200]:
X_train_0, X_test_0, y_train_0, y_test_0 = train_test_split(X_cluster_0, y_cluster_0, test_size=.2, stratify=y_cluster_0, random_state=42)

In [201]:
pipe_rf = Pipeline([('transformer', transformer),
                   ('estimator', RandomForestClassifier())])

In [202]:
pipe_rf.fit(X_train_0, y_train_0)

Pipeline(steps=[('transformer',
                 ColumnTransformer(n_jobs=-1,
                                   transformers=[('scaler', StandardScaler(),
                                                  ['time_signature',
                                                   'acousticness',
                                                   'danceability', 'energy',
                                                   'instrumentalness',
                                                   'liveness', 'loudness',
                                                   'speechiness', 'valence',
                                                   'tempo', 'artist_score']),
                                                 ('discretizer',
                                                  KBinsDiscretizer(),
                                                  ['duration_ms']),
                                                 ('encoder',
                                                  OneHotEncoder(dr

In [203]:
y_pred = pipe_rf.predict(X_test_0)

In [204]:
y_train_pred = pipe_rf.predict(X_train_0)

In [205]:
print(classification_report(y_test_0, y_pred))

              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99      3341
         1.0       1.00      0.98      0.99      3474

    accuracy                           0.99      6815
   macro avg       0.99      0.99      0.99      6815
weighted avg       0.99      0.99      0.99      6815



In [206]:
print(classification_report(y_train_0, y_train_pred))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     13365
         1.0       1.00      1.00      1.00     13894

    accuracy                           1.00     27259
   macro avg       1.00      1.00      1.00     27259
weighted avg       1.00      1.00      1.00     27259



In [207]:
models = {}
i=0
for cluster in clusters_no_hits:
    X_cluster = pd.concat((X_no_hit.iloc[cluster,:], X[y==1]), axis=0, ignore_index=True)
    y_cluster = np.concatenate((np.zeros((len(X_cluster)-len(X[y==1]),)), np.ones((len(X[y==1]),))), axis=0)
    
    X_train, X_test, y_train, y_test = train_test_split(X_cluster, y_cluster, test_size=.2, stratify=y_cluster, random_state=42)
    
    pipe_rf = Pipeline([('transformer', transformer),
                   ('estimator', RandomForestClassifier())])
    
    pipe_rf.fit(X_train, y_train)
    
    models[f'model_cluster_{i}'] = pipe_rf
    
    i += 1

In [208]:
models

{'model_cluster_0': Pipeline(steps=[('transformer',
                  ColumnTransformer(n_jobs=-1,
                                    transformers=[('scaler', StandardScaler(),
                                                   ['time_signature',
                                                    'acousticness',
                                                    'danceability', 'energy',
                                                    'instrumentalness',
                                                    'liveness', 'loudness',
                                                    'speechiness', 'valence',
                                                    'tempo', 'artist_score']),
                                                  ('discretizer',
                                                   KBinsDiscretizer(),
                                                   ['duration_ms']),
                                                  ('encoder',
                                 

In [209]:
X

Unnamed: 0,duration_ms,key,mode,time_signature,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,valence,tempo,year,artist_score
0,187480,2,1,4,0.080300,0.750,0.783,0.000000,0.0786,-5.480,0.0720,0.274,157.057,2008,4.0
1,278800,8,1,4,0.094400,0.629,0.647,0.004810,0.2470,-9.260,0.0343,0.652,135.866,1977,0.0
2,357785,8,1,3,0.000315,0.420,0.731,0.025200,0.2210,-4.027,0.1030,0.293,126.692,2013,0.0
3,288640,11,1,4,0.954000,0.186,0.127,0.092700,0.1090,-18.247,0.0383,0.071,70.883,1995,2.0
4,204000,6,0,4,0.014100,0.810,0.618,0.035800,0.0970,-8.194,0.0651,0.776,120.021,2003,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
208821,334040,5,1,4,0.006120,0.424,0.906,0.001860,0.9880,-10.985,0.0610,0.676,168.447,1988,7.0
208822,221227,5,1,4,0.050900,0.676,0.948,0.000008,0.3840,-2.557,0.0628,0.731,136.027,2004,0.0
208823,230547,4,1,4,0.176000,0.633,0.824,0.000000,0.1150,-6.015,0.0753,0.777,95.893,2000,1.0
208824,147987,1,1,4,0.592000,0.661,0.778,0.000000,0.0897,-7.352,0.0318,0.968,169.910,2005,0.0


In [210]:
models['model_cluster_0'].predict(X)

array([1., 1., 1., ..., 1., 1., 1.])

In [218]:
from sklearn.metrics import f1_score

In [232]:
def make_predictions(X, models=models, voting='hard'):
    scores = []
    if voting == 'hard':
        preds = np.zeros((len(X),))
        for name, model in models.items():
            preds += f1_score(y, model.predict(X))*model.predict(X)
            scores.append(f1_score(y, model.predict(X)))
        preds[preds <= np.mean(scores)*8] = 0
        preds[preds > np.mean(scores)*8] = 1
    return preds, scores

In [233]:
models['model_cluster_1'].predict_proba(X)

array([[0.01, 0.99],
       [0.05, 0.95],
       [0.04, 0.96],
       ...,
       [0.01, 0.99],
       [0.  , 1.  ],
       [0.01, 0.99]])

In [234]:
for cluster in clusters_no_hits:
    print(len(cluster))

16706
6452
19806
27917
5143
2259
12198
24660
25345
35655
13303
2014


In [235]:
y

0         0.0
1         0.0
2         0.0
3         0.0
4         0.0
         ... 
208821    1.0
208822    1.0
208823    1.0
208824    1.0
208825    1.0
Name: target, Length: 208826, dtype: float64

In [236]:
y_pred, scores = make_predictions(X)

In [237]:
scores

[0.1919280269898788,
 0.15193935298201094,
 0.13487240068887973,
 0.14513106429965553,
 0.15419141383967347,
 0.16351169845110597,
 0.1549136698427193,
 0.13030083233391027,
 0.16358658453114305,
 0.09798164830660878,
 0.16399150117805453,
 0.15454663137267158]

In [238]:
y_pred

array([1., 0., 1., ..., 1., 1., 1.])

In [239]:
print(classification_report(y, y_pred))

              precision    recall  f1-score   support

         0.0       0.94      0.34      0.50    191458
         1.0       0.09      0.74      0.16     17368

    accuracy                           0.38    208826
   macro avg       0.51      0.54      0.33    208826
weighted avg       0.87      0.38      0.47    208826

