In [1]:
import pandas as pd
import json
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [2]:
df = pd.read_csv('final_data.csv')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147519 entries, 0 to 147518
Data columns (total 10 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   pos          147519 non-null  int64  
 1   artist_name  147519 non-null  object 
 2   track_uri    147519 non-null  object 
 3   artist_uri   147519 non-null  object 
 4   track_name   147518 non-null  object 
 5   album_uri    147519 non-null  object 
 6   duration_ms  147519 non-null  int64  
 7   album_name   147519 non-null  object 
 8   Genre        147519 non-null  object 
 9   releaseYear  124588 non-null  float64
dtypes: float64(1), int64(2), object(7)
memory usage: 11.3+ MB


In [3]:
with open ('vetting_playlist.json', 'r') as file:
    vet = json.load(file)

<p>Dancability<br>
Year<br>
Genre<br>
Duration<br>
Popularity<br>
Liveliness<br>
Marked Explicit?</p>

In [4]:
df.head()

Unnamed: 0,pos,artist_name,track_uri,artist_uri,track_name,album_uri,duration_ms,album_name,Genre,releaseYear
0,0,Missy Elliott,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook,hip hop,2005.0
1,1,Britney Spears,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,spotify:artist:26dSoYclwsYLMAKD3tpOr4,Toxic,spotify:album:0z7pVBGOD7HCIB7S8eLkLI,198800,In The Zone,pop,2003.0
2,4,Shaggy,spotify:track:1lzr43nnXAijIGYnCT8M8H,spotify:artist:5EvFsr3kj42KNv97ZEnqij,It Wasn't Me,spotify:album:6NmFmPX56pcLBOFMhIiKvF,227600,Hot Shot,reggae,2000.0
3,5,Usher,spotify:track:0XUfyU2QviPAs6bxSpXYG4,spotify:artist:23zg3TcAtWQy7J6upgbUnj,Yeah!,spotify:album:0vO0b1AvY49CPQyVisJLj0,250373,Confessions,r&b,2004.0
4,6,Usher,spotify:track:68vgtRHr7iZHpzGpon6Jlo,spotify:artist:23zg3TcAtWQy7J6upgbUnj,My Boo,spotify:album:1RM6MGv6bcl6NrAG8PGoZk,223440,Confessions,r&b,2004.0


In [5]:
genre = df['Genre']
genre.head(10)

0                      hip hop
1                          pop
2                       reggae
3                          r&b
4                          r&b
5    southern hip hop, hip hop
6                          r&b
7                          r&b
8               pop punk, punk
9                          r&b
Name: Genre, dtype: object

In [6]:
duration = df['duration_ms']
duration.head(10)

0    226863
1    198800
2    227600
3    250373
4    223440
5    235213
6    256426
7    229866
8    193042
9    229040
Name: duration_ms, dtype: int64

In [7]:
releaseYear = df['releaseYear']
releaseYear.head(10)

0    2005.0
1    2003.0
2    2000.0
3    2004.0
4    2004.0
5    2003.0
6    2006.0
7    2005.0
8    2011.0
9    2005.0
Name: releaseYear, dtype: float64

In [8]:
min_year = df.loc[df['releaseYear'] != 0.0, 'releaseYear'].min()
min_year

1885.0

In [9]:
min_duration = df.loc[df['duration_ms'] != 0.0, 'duration_ms'].min()
min_duration

151

In [10]:
max_year = df['releaseYear'].max()
max_year

2021.0

In [11]:
max_duration = df['duration_ms'].max()
max_duration

10435467

In [12]:
df_sample = df.sample(frac=1/50, random_state=42)
df_sample.reset_index(drop=True, inplace=True)
df_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2950 entries, 0 to 2949
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   pos          2950 non-null   int64  
 1   artist_name  2950 non-null   object 
 2   track_uri    2950 non-null   object 
 3   artist_uri   2950 non-null   object 
 4   track_name   2950 non-null   object 
 5   album_uri    2950 non-null   object 
 6   duration_ms  2950 non-null   int64  
 7   album_name   2950 non-null   object 
 8   Genre        2950 non-null   object 
 9   releaseYear  2527 non-null   float64
dtypes: float64(1), int64(2), object(7)
memory usage: 230.6+ KB


In [13]:
X = df[['pos', 'duration_ms']]
model = DBSCAN(eps=3, min_samples=5).fit(X)
model.labels_

array([-1, -1, -1, ..., -1, -1, -1], dtype=int64)

In [14]:
X2 = df[['duration_ms']]
model = DBSCAN(eps=3, min_samples=5).fit(X2)
model.labels_

array([    0,     1,     2, ...,    -1,  6149, 10531], dtype=int64)

In [15]:
model = DBSCAN(eps=3, min_samples=5).fit(X)

labels = model.labels_

mask = labels != -1
X_filtered = X[mask]
labels_filtered = labels[mask]

score = silhouette_score(X_filtered, labels_filtered)
score

0.9357721956468853

In [16]:
model2 = DBSCAN(eps=3, min_samples=5).fit(X2)

labels2 = model2.labels_

mask2 = labels2 != -1
X2_filtered = X2[mask]
labels2_filtered = labels2[mask]

score2 = silhouette_score(X2_filtered, labels2_filtered)
score2

0.9986992340676785

In [19]:
df_sampled = df.sample(n=10000, random_state=42)

In [20]:
encoder = OneHotEncoder(sparse=False)
encoded_genres = encoder.fit_transform(df_sampled[['Genre']])

encoded_genres_df = pd.DataFrame(encoded_genres, columns=encoder.categories_[0])

model = DBSCAN(eps=2, min_samples=5)
labels = model.fit_predict(encoded_genres_df)

df['cluster'] = labels

df

ValueError: Length of values (10000) does not match length of index (147519)

In [None]:
mask = labels != -1
if mask.any():
    print(f"Silhouette Score: {silhouette_score(encoded_genres_df[mask], labels[mask])}")
else:
    print("Silhouette Score: Not applicable due to noise points.")