In [1]:
import spotipy
from spotipy.oauth2 import SpotifyOAuth
import pandas as pd


# OAuth scope requested from Spotify for this session.
# NOTE(review): listing a user's *private* playlists normally needs the
# "playlist-read-private" scope as well — confirm whether it should be added.
scope = "user-library-read"

# Credentials are intentionally NOT written into the notebook. When client_id /
# client_secret are omitted, SpotifyOAuth reads them from the environment
# variables SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET; set those before
# starting the kernel. Any secret that was previously committed in this cell
# should be considered compromised and rotated in the Spotify developer dashboard.
sp = spotipy.Spotify(auth_manager=SpotifyOAuth(
    redirect_uri='http://localhost:8888/callback',
    scope=scope))

In [2]:
def get_my_playlists():
    """Return every playlist of the current user as a flat list.

    Walks the paginated response of ``sp.current_user_playlists()``, following
    each page's ``next`` link through ``sp.next`` until no further page exists.
    """
    collected = []
    page = sp.current_user_playlists()
    while page:
        collected += page['items']
        # A falsy 'next' marks the last page; None terminates the loop.
        page = sp.next(page) if page['next'] else None
    return collected

def get_playlists_by_suffix(suffix):
    """Return the current user's playlists whose name ends with ``suffix``.

    The playlists come from ``get_my_playlists()``; their original order is kept.
    """
    matches = []
    for candidate in get_my_playlists():
        if candidate['name'].endswith(suffix):
            matches.append(candidate)
    return matches

def get_playlist_tracks(playlist_id):
    """Return all items of the playlist ``playlist_id`` as one list.

    Starts from ``sp.playlist_tracks(playlist_id)`` and keeps requesting the
    following page via ``sp.next`` for as long as a page reports a ``next`` link.
    """
    items = []
    page = sp.playlist_tracks(playlist_id)
    while True:
        items.extend(page['items'])
        if not page['next']:
            return items
        page = sp.next(page)

def get_audio_features(track_ids):
    """Fetch audio features for ``track_ids`` and return them as one list.

    The ids are sent to ``sp.audio_features`` in consecutive slices of at most
    100 ids, and the per-slice results are concatenated in request order.
    """
    batch_size = 100  # ids sent per sp.audio_features call
    collected = []
    start = 0
    while start < len(track_ids):
        collected += sp.audio_features(track_ids[start:start + batch_size])
        start += batch_size
    return collected

# Keys copied from each audio-features record into the output row, in column order.
_AUDIO_FEATURE_KEYS = (
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'duration_ms', 'time_signature',
)

# Full column layout of the returned DataFrame (metadata first, then audio features).
_TRACK_COLUMNS = ('track_name', 'track_id', 'artist', 'album', 'label') + _AUDIO_FEATURE_KEYS


def create_tracks_dataframe(filtered_playlists, suffix):
    """Build one labelled row per track from the given playlists.

    Args:
        filtered_playlists: iterable of playlist dicts providing ``'name'`` and
            ``'id'``.
        suffix: marker expected at the end of each playlist name. It is removed
            (only when actually present) and the remaining name, stripped of
            surrounding whitespace, becomes the row's ``'label'``.

    Returns:
        pandas.DataFrame with the columns listed in ``_TRACK_COLUMNS``. The
        columns are present even when no row could be built, so downstream
        column selection does not fail on an empty result.

    Playlist items without a track object, tracks without an id, and tracks for
    which no audio features were returned are skipped.
    """
    records = []
    for playlist in filtered_playlists:
        name = playlist['name']
        # Guarded removal: name[:-len(suffix)] would yield '' for an empty
        # suffix and cut real characters if the name lacks the suffix.
        if suffix and name.endswith(suffix):
            name = name[:-len(suffix)]
        playlist_name = name.strip()

        items = get_playlist_tracks(playlist['id'])
        # Filter ONCE and reuse the same list for both the id lookup and the
        # zip below, so each track stays paired with its own features.
        # Entries without an id cannot be looked up and are dropped as well.
        usable_tracks = [
            item['track'] for item in items
            if item.get('track') and item['track'].get('id')
        ]
        audio_features = get_audio_features([track['id'] for track in usable_tracks])

        for track, features in zip(usable_tracks, audio_features):
            if not features:  # no audio features returned for this track
                continue
            row = {
                'track_name': track['name'],
                'track_id': track['id'],
                'artist': track['artists'][0]['name'],
                'album': track['album']['name'],
                'label': playlist_name,
            }
            row.update((key, features[key]) for key in _AUDIO_FEATURE_KEYS)
            records.append(row)
    return pd.DataFrame(records, columns=list(_TRACK_COLUMNS))

# Example usage: build the labelled track table from every playlist whose
# name ends with the marker below.
suffix = "(2)"
# Playlists of the current user whose name ends with `suffix`.
filtered_playlists = get_playlists_by_suffix(suffix)

# One row per track; the playlist name with the suffix removed is the 'label'.
tracks_df = create_tracks_dataframe(filtered_playlists, suffix)


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Train and evaluate a playlist-label classifier.
# Requires `tracks_df` (built by create_tracks_dataframe in an earlier cell).

# Audio-feature columns used as model inputs; 'label' is the prediction target.
FEATURE_COLUMNS = [
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'duration_ms',
    'time_signature',
]
features = tracks_df[FEATURE_COLUMNS]
label = tracks_df['label']

# 80/20 split, stratified on the label so each class appears in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    label,
    test_size=0.2,
    stratify=label,
    random_state=42,
)

# Scaling statistics come from the training part only and are reused for the test part.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Random-forest hyperparameters explored exhaustively by the grid search.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# 5-fold cross-validated search, scored by macro-averaged F1, using all cores.
clf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Score the held-out test part with the best configuration found.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Per-class precision / recall / F1 on the test part.
print(classification_report(y_test, y_pred))


                  precision    recall  f1-score   support

         Bachata       0.60      0.75      0.67         4
   Boogie Woogie       0.30      0.43      0.35         7
       ChaChaCha       0.73      0.73      0.73        11
     Disco Samba       0.88      0.78      0.82         9
        DiscoFox       0.50      0.56      0.53         9
        Foxtrott       0.60      0.38      0.46         8
            Jive       0.25      0.12      0.17         8
Langsamer Walzer       0.90      0.90      0.90        10
       Quickstep       0.67      0.25      0.36         8
           Rumba       0.57      1.00      0.73         8
           Samba       0.67      0.57      0.62         7
         Slowfox       0.50      0.71      0.59         7
           Tango       0.75      0.86      0.80         7
   Wiener Walzer       1.00      0.88      0.93         8

        accuracy                           0.64       111
       macro avg       0.64      0.64      0.62       111
    weighted