### Obtaining Data

In [232]:
import pandas as pd
import numpy as np

In [233]:
# Load the per-track feature table and the descriptive metadata table.
final = pd.read_csv(r'../../assets/final.csv')
metadata = pd.read_csv(r'../../assets/metadata.csv')

# Discard the first column of the feature table
# (presumably a row index written out with the CSV -- TODO confirm).
final = final.drop(columns=final.columns[0])


In [234]:
# final.info()

In [235]:
# final.head(3)

In [236]:
# metadata.info()

In [237]:
# metadata.head()

### Model Selection - HDBSCAN Clustering

In [238]:
from sklearn.utils import shuffle
import hdbscan

In [239]:
final = shuffle(final, random_state=100)

In [240]:
num_enlisted = 2000  # how many songs are enlisted in the user's playlist

# X is the audience's playlist: the first `num_enlisted` rows of the shuffled frame.
# iloc is integer position-based, so a positional slice selects the same rows as
# listing every position explicitly, without building a Python list of indices.
X = final.iloc[:num_enlisted]

# Y is the music reservoir that recommendations are drawn from: every remaining row.
Y = final.iloc[num_enlisted:]

In [241]:
# Shuffle each partition again, using a fixed seed so the resulting order is repeatable.
# NOTE(review): the cell reassigns X and Y from themselves, so re-running it alone
# shuffles the already-shuffled frames again.
X = shuffle(X, random_state=100)
Y = shuffle(Y, random_state=100)

# Echo the playlist partition as the cell output.
X

Unnamed: 0,track_id,acousticness,danceability,energy,instrumentalness,liveness,speechiness,tempo,valence,artist_discovery,artist_familiarity,artist_hotttnesss,song_currency,song_hotttnesss
3049,12291,0.008797,0.779822,0.384926,5.665748e-01,0.289584,0.070378,94.945,0.551748,0.449009,0.520527,0.555683,0.000000,0.000000
11180,57808,0.023627,0.339632,0.880221,2.400000e-09,0.321886,0.045908,143.630,0.405866,0.324605,0.282899,0.339108,0.000025,0.038550
5396,21234,0.078061,0.679032,0.839800,2.232774e-01,0.136008,0.046528,125.020,0.909851,0.135619,0.059477,0.141679,0.000000,0.000000
11591,69258,0.390283,0.448094,0.847303,1.910427e-01,0.108648,0.047836,143.663,0.871847,0.460173,0.485004,0.540969,0.000411,0.194578
12249,95910,0.137264,0.709408,0.935570,1.880000e-08,0.083439,0.259614,136.306,0.121195,0.365677,0.255264,0.382015,0.000231,0.097529
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7320,31753,0.993398,0.572985,0.473439,8.171597e-01,0.100901,0.038754,84.044,0.355889,0.165592,0.047453,0.172991,0.000000,0.000000
12792,115385,0.095215,0.863822,0.804424,8.498397e-01,0.131887,0.040032,112.455,0.903950,0.366985,0.325918,0.383382,0.000000,0.000000
12865,117517,0.004280,0.558904,0.746316,7.734163e-02,0.082892,0.032707,117.995,0.295729,0.434409,0.457857,0.491494,0.000000,0.000000
8158,35546,0.242228,0.777979,0.229853,8.089146e-01,0.054251,0.034881,95.999,0.838002,0.355983,0.305406,0.371888,0.000000,0.000000


In [242]:
# X.head()

In [243]:
# Clusterer configuration, one keyword per line.
# prediction_data=True asks the fitted model to keep the extra data that
# hdbscan.approximate_predict relies on when labelling unseen points later.
hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=5,
    min_samples=None,
    metric='euclidean',
    alpha=1.0,
    algorithm='best',
    leaf_size=40,
    approx_min_span_tree=True,
    gen_min_span_tree=False,
    prediction_data=True,
)

In [244]:
def fit(df, algo, flag=0):
    """Train ``algo`` on ``df`` and attach the resulting cluster labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table containing a ``track_id`` column; that column becomes
        the index of the returned frame. The caller's frame is not modified.
    algo : estimator
        Object exposing ``fit`` / ``partial_fit`` and a ``labels_`` attribute
        after training.
    flag : truthy or falsy, default 0
        Truthy selects ``algo.fit``; falsy selects ``algo.partial_fit``.
        NOTE(review): the HDBSCAN estimator used in this notebook does not
        appear to provide ``partial_fit``, so a truthy flag is needed for it
        -- confirm.

    Returns
    -------
    tuple
        ``(labelled_frame, algo)`` where ``labelled_frame`` is indexed by
        ``track_id`` and carries an extra ``label`` column.
    """
    features = df.set_index('track_id')
    # Pick the training routine up front, then run it on the indexed features.
    train = algo.fit if flag else algo.partial_fit
    train(features)
    features['label'] = algo.labels_
    return features, algo

In [245]:
def predict(t, Y):
    """Select the training rows that share the dominant predicted cluster of ``Y``.

    Parameters
    ----------
    t : sequence
        ``t[0]`` is the labelled frame (it must have a ``label`` column) and
        ``t[1]`` is the fitted model handed to ``hdbscan.approximate_predict``.
    Y : pandas.DataFrame
        Points to assign to clusters; must contain a ``track_id`` column,
        which is moved into the index before prediction.

    Returns
    -------
    pandas.DataFrame
        The rows of ``t[0]`` whose ``label`` equals the most frequent label
        predicted for ``Y``. When several labels tie, the first entry of
        ``Series.mode()`` is used.

    NOTE(review): HDBSCAN labels noise as -1; if that is the most frequent
    prediction, the rows labelled -1 are what gets returned -- confirm this is
    intended.
    """
    candidates = Y.set_index('track_id')
    # approximate_predict returns a sequence whose first item holds the labels.
    predicted_labels = hdbscan.approximate_predict(t[1], candidates)[0]
    dominant_label = pd.Series(predicted_labels).mode().loc[0]
    labelled = t[0]
    in_dominant_cluster = labelled['label'] == dominant_label
    return labelled[in_dominant_cluster]

In [246]:
# Y.head()

In [247]:
# t[0] is the training dataframe re-indexed by track_id, with a 'label' column holding each row's cluster
# t[1] is the HDBSCAN model after training

t = fit(X, hdbscan_model, 1)

In [248]:
# show the number of clusters found by the model (labels start at 0, so max label + 1)
print(t[1].labels_.max() + 1)

2


In [249]:
recommendations = predict(t, Y)

In [250]:
recommendations.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1985 entries, 12291 to 4216
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   acousticness        1985 non-null   float64
 1   danceability        1985 non-null   float64
 2   energy              1985 non-null   float64
 3   instrumentalness    1985 non-null   float64
 4   liveness            1985 non-null   float64
 5   speechiness         1985 non-null   float64
 6   tempo               1985 non-null   float64
 7   valence             1985 non-null   float64
 8   artist_discovery    1985 non-null   float64
 9   artist_familiarity  1985 non-null   float64
 10  artist_hotttnesss   1985 non-null   float64
 11  song_currency       1985 non-null   float64
 12  song_hotttnesss     1985 non-null   float64
 13  label               1985 non-null   int64  
dtypes: float64(13), int64(1)
memory usage: 232.6 KB


In [251]:
recommendations.head()

Unnamed: 0_level_0,acousticness,danceability,energy,instrumentalness,liveness,speechiness,tempo,valence,artist_discovery,artist_familiarity,artist_hotttnesss,song_currency,song_hotttnesss,label
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
12291,0.008797,0.779822,0.384926,0.5665748,0.289584,0.070378,94.945,0.551748,0.449009,0.520527,0.555683,0.0,0.0,1
57808,0.023627,0.339632,0.880221,2.4e-09,0.321886,0.045908,143.63,0.405866,0.324605,0.282899,0.339108,2.5e-05,0.03855,1
21234,0.078061,0.679032,0.8398,0.2232774,0.136008,0.046528,125.02,0.909851,0.135619,0.059477,0.141679,0.0,0.0,1
69258,0.390283,0.448094,0.847303,0.1910427,0.108648,0.047836,143.663,0.871847,0.460173,0.485004,0.540969,0.000411,0.194578,1
95910,0.137264,0.709408,0.93557,1.88e-08,0.083439,0.259614,136.306,0.121195,0.365677,0.255264,0.382015,0.000231,0.097529,1


In [252]:
metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13129 entries, 0 to 13128
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   track_id     13129 non-null  int64 
 1   album_title  13129 non-null  object
 2   artist_name  13129 non-null  object
 3   genre        13129 non-null  object
 4   track_title  13128 non-null  object
dtypes: int64(1), object(4)
memory usage: 513.0+ KB


In [253]:
metadata = metadata.set_index('track_id')

In [254]:
#metadata = metadata.set_index('track_id')

In [255]:
Y.head()

Unnamed: 0,track_id,acousticness,danceability,energy,instrumentalness,liveness,speechiness,tempo,valence,artist_discovery,artist_familiarity,artist_hotttnesss,song_currency,song_hotttnesss
3979,15866,0.12662,0.444495,0.687881,0.460171,0.092639,0.374912,189.879,0.35337,0.442439,0.357309,0.462207,0.0,0.003068
10634,49464,0.007757,0.647147,0.836007,0.880194,0.192954,0.029688,99.95,0.387107,0.323548,0.387238,0.338004,0.0,0.0
354,885,0.0699,0.385746,0.707775,0.937884,0.203171,0.105063,123.453,0.039475,0.405436,0.36921,0.423551,0.0,0.0
8874,39685,0.000707,0.849537,0.540725,0.934993,0.098529,0.088324,121.976,0.684539,0.444844,0.328891,0.46472,0.0,0.0
6630,28348,0.276979,0.452824,0.84264,0.135532,0.09739,0.416882,177.627,0.969982,0.262087,0.148808,0.273797,0.0,0.0


In [256]:
Y.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11129 entries, 3979 to 5644
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   track_id            11129 non-null  int64  
 1   acousticness        11129 non-null  float64
 2   danceability        11129 non-null  float64
 3   energy              11129 non-null  float64
 4   instrumentalness    11129 non-null  float64
 5   liveness            11129 non-null  float64
 6   speechiness         11129 non-null  float64
 7   tempo               11129 non-null  float64
 8   valence             11129 non-null  float64
 9   artist_discovery    11129 non-null  float64
 10  artist_familiarity  11129 non-null  float64
 11  artist_hotttnesss   11129 non-null  float64
 12  song_currency       11129 non-null  float64
 13  song_hotttnesss     11129 non-null  float64
dtypes: float64(13), int64(1)
memory usage: 1.3 MB


In [257]:
#Y = Y.reset_index(level=0)

In [258]:
#recommendations = recommendations.reset_index(level=0)

In [259]:
def recommend(recommendations, meta, Y):
    """Build genre-based, artist-based and cluster-based recommendation tables.

    Parameters
    ----------
    recommendations : pandas.DataFrame
        Frame whose first index level, once reset, provides a ``track_id``
        column (e.g. a frame indexed by ``track_id``).
    meta : pandas.DataFrame
        Metadata indexed by track id, with ``genre`` and ``artist_name``
        columns.
    Y : pandas.DataFrame
        Tracks whose most common genre and artist drive the first two
        results. ``track_id`` may be a column or the first index level.

    Returns
    -------
    tuple of three pandas.DataFrame
        1. every row of ``meta`` whose ``genre`` equals the most common genre
           among the tracks in ``Y``;
        2. every row of ``meta`` whose ``artist_name`` equals the most common
           artist among the tracks in ``Y``;
        3. the ``meta`` rows for the track ids in ``recommendations``.
        When ``Y`` has no tracks (or a column has no mode), the corresponding
        result is an empty frame with ``meta``'s columns instead of raising
        ``IndexError``.

    Raises
    ------
    KeyError
        If a track id from ``Y`` or ``recommendations`` is absent from ``meta``.
    """
    recommendations = recommendations.reset_index(level=0)
    Y = Y.reset_index(level=0)

    track_ids = Y['track_id'].tolist()
    # Look the tracks up once and reuse the result for both modes.
    listened = meta.loc[track_ids] if track_ids else meta.iloc[0:0]

    def _rows_sharing_mode(column):
        """Return the rows of ``meta`` whose ``column`` equals its mode in ``listened``."""
        modes = listened[column].mode()
        if modes.empty:
            return meta.iloc[0:0]
        # On ties, mode() is sorted; keep the first value like the original code.
        return meta[meta[column] == modes.iloc[0]]

    return (
        _rows_sharing_mode('genre'),
        _rows_sharing_mode('artist_name'),
        meta.loc[recommendations['track_id']],
    )

In [260]:
output = recommend(recommendations, metadata, Y)

In [261]:
genre_recommend, artist_name_recommend, mixed_recommend = output[0], output[1], output[2]

In [262]:
genre_recommend.shape

(3892, 4)

In [263]:
artist_name_recommend.shape

(94, 4)

In [264]:
mixed_recommend.shape

(1985, 4)

In [265]:
# Genre wise recommendations
genre_recommend.head()

Unnamed: 0_level_0,album_title,artist_name,genre,track_title
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
153,Arc and Sender,Arc and Sender,Rock,Hundred-Year Flood
154,Arc and Sender,Arc and Sender,Rock,Squares And Circles
155,unreleased demo,Arc and Sender,Rock,Maps of the Stars Homes
169,Boss of Goth,Argumentix,Rock,Boss of Goth
170,Nightmarcher,Argumentix,Rock,Industry Standard Massacre


In [266]:
# Artist wise recommendations
artist_name_recommend.head()

Unnamed: 0_level_0,album_title,artist_name,genre,track_title
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10381,Big Blood & The Bleedin' Hearts,Big Blood,Folk,Baron in the Trees
10382,Big Blood & The Bleedin' Hearts,Big Blood,Folk,New Dish Rag
10383,Big Blood & The Bleedin' Hearts,Big Blood,Folk,Graceless Lady
10384,Big Blood & The Bleedin' Hearts,Big Blood,Folk,Blood Mumble
10385,Big Blood & The Bleedin' Hearts,Big Blood,Folk,Curee


In [267]:
# Mixed Recommendations
mixed_recommend.head()

Unnamed: 0_level_0,album_title,artist_name,genre,track_title
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
12291,Wakka Chikka Wakka Chikka: Porn Music for the ...,Marcel,AvantGarde|International|Blues|,Strange Wedding 60908
57808,Live at WFMU on The Cherry Blossom Clinic with...,Happy Refugees,Rock,Inertia
21234,ccMixter,mykleanthony,Electronic,Youre My Everything (ft. Beckford)
69258,"Live at WFMU on the Cherry Blossom Clinic, Aug...",Sonny & The Sunsets,Rock,Void
95910,Genital Warfare,Suicidal Rap Orgy,HipHop,Murder Rap


In [268]:
recommendations

Unnamed: 0_level_0,acousticness,danceability,energy,instrumentalness,liveness,speechiness,tempo,valence,artist_discovery,artist_familiarity,artist_hotttnesss,song_currency,song_hotttnesss,label
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
12291,0.008797,0.779822,0.384926,5.665748e-01,0.289584,0.070378,94.945,0.551748,0.449009,0.520527,0.555683,0.000000,0.000000,1
57808,0.023627,0.339632,0.880221,2.400000e-09,0.321886,0.045908,143.630,0.405866,0.324605,0.282899,0.339108,0.000025,0.038550,1
21234,0.078061,0.679032,0.839800,2.232774e-01,0.136008,0.046528,125.020,0.909851,0.135619,0.059477,0.141679,0.000000,0.000000,1
69258,0.390283,0.448094,0.847303,1.910427e-01,0.108648,0.047836,143.663,0.871847,0.460173,0.485004,0.540969,0.000411,0.194578,1
95910,0.137264,0.709408,0.935570,1.880000e-08,0.083439,0.259614,136.306,0.121195,0.365677,0.255264,0.382015,0.000231,0.097529,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31753,0.993398,0.572985,0.473439,8.171597e-01,0.100901,0.038754,84.044,0.355889,0.165592,0.047453,0.172991,0.000000,0.000000,1
115385,0.095215,0.863822,0.804424,8.498397e-01,0.131887,0.040032,112.455,0.903950,0.366985,0.325918,0.383382,0.000000,0.000000,1
117517,0.004280,0.558904,0.746316,7.734163e-02,0.082892,0.032707,117.995,0.295729,0.434409,0.457857,0.491494,0.000000,0.000000,1
35546,0.242228,0.777979,0.229853,8.089146e-01,0.054251,0.034881,95.999,0.838002,0.355983,0.305406,0.371888,0.000000,0.000000,1


In [269]:
artist_name_recommend['artist_name'].value_counts()

Big Blood    94
Name: artist_name, dtype: int64

In [270]:
genre_recommend['genre'].value_counts()

Rock    3892
Name: genre, dtype: int64

In [271]:
genre_recommend['artist_name'].value_counts()

Glove Compartment               65
Blah Blah Blah                  62
Mors Ontologica                 50
Les Baudouins Morts             38
Kraus                           35
                                ..
Alone in 1982                    1
Ostrich Tuning                   1
The Dalai Lama Rama Fa Fa Fa     1
The Rusty Bells                  1
Lost Boy                         1
Name: artist_name, Length: 725, dtype: int64

#### Testing

In [272]:
testing = Y.iloc[6:12]['track_id']

In [273]:
testing

818        1834
5500      21963
9011      40422
6964      30000
5772      23318
12585    109073
Name: track_id, dtype: int64

In [274]:
ids = testing.loc[testing.index]

In [275]:
# `testing` already holds the track ids; indexing it by its own index labels
# just reproduces the same Series, so look the ids up in the metadata directly.
songs = metadata.loc[testing]

In [276]:
songs

Unnamed: 0_level_0,album_title,artist_name,genre,track_title
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1834,The Glove Compartment,Glove Compartment,Rock,Mr. Feleppa...Goes to...
21963,You Sick Little Monkey,Psilodump,Electronic,Follow the Leaders
40422,gleanings and gatherings,Mt. Gigantic,AvantGarde|International|,Get Well Cougher
30000,This is the New Yeah!,Bleeptor V. Telescope,Electronic,We will dance to Tokyo waltz
23318,Accident Consultancy Live / Undead,THF Drenching,AvantGarde|International|,Santa Head Orchestration Revolt (Undead)
109073,Workers in Kontrol,Kraus,Rock,Killer


In [277]:
re = predict(t, Y.iloc[6:12])

In [278]:
output = recommend(re, metadata, Y.iloc[6:12])

In [279]:
ge_re, ge_ar, ge_mix = output[0], output[1], output[2]

In [280]:
ge_re.head()

Unnamed: 0_level_0,album_title,artist_name,genre,track_title
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
236,Bersa Discos #8,Banana Clipz,AvantGarde|International|,"Push Am (Left, Right)"
461,blissblood.com,Cantonement Jazz Band,AvantGarde|International|,Bessemer
462,blissblood.com,Cantonement Jazz Band,AvantGarde|International|,Has Been Blues
463,blissblood.com,Cantonement Jazz Band,AvantGarde|International|,I'll Be Blue
464,blissblood.com,Cantonement Jazz Band,AvantGarde|International|,The Way I Feel Today


In [281]:
ge_ar.head(10)

Unnamed: 0_level_0,album_title,artist_name,genre,track_title
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
30000,This is the New Yeah!,Bleeptor V. Telescope,Electronic,We will dance to Tokyo waltz


In [282]:
ge_mix.head(10)

Unnamed: 0_level_0,album_title,artist_name,genre,track_title
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
12291,Wakka Chikka Wakka Chikka: Porn Music for the ...,Marcel,AvantGarde|International|Blues|,Strange Wedding 60908
57808,Live at WFMU on The Cherry Blossom Clinic with...,Happy Refugees,Rock,Inertia
21234,ccMixter,mykleanthony,Electronic,Youre My Everything (ft. Beckford)
69258,"Live at WFMU on the Cherry Blossom Clinic, Aug...",Sonny & The Sunsets,Rock,Void
95910,Genital Warfare,Suicidal Rap Orgy,HipHop,Murder Rap
23177,Exotica,Juanitos,AvantGarde|International|Blues|,Exotica
13128,Simulators can,Gorowski,Electronic,simulators can
50579,Something EP,Broke For Free,AvantGarde|International|,Something Old
30911,The Rain Book,Digi G'Alessio,AvantGarde|International|Blues|Jazz|,Don't Touch My Minestrone
27274,DC03,HCI,Jazz,Elpossible I


In [283]:
ge_re.shape

(1902, 4)

In [284]:
ge_ar.shape

(1, 4)

In [285]:
ge_mix.shape

(1985, 4)