In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score

In [3]:
# Load both inputs in one pass: the labelled track catalogue and the playlist
# whose tracks are later excluded from training and then classified.
tracks, playlist = (
    pd.read_csv(csv_path) for csv_path in ("cleaned_data.csv", "playlist.csv")
)

## Model training using all genres

In [5]:
# Keep playlist tracks out of the modelling data: rows whose track_id also
# appears in the playlist are dropped before any fitting happens.
track_data = tracks[~tracks['track_id'].isin(playlist['track_id'])].copy()

# Replace genre names with integer class ids. `encoder` is kept so that
# predictions can be mapped back to names with `inverse_transform`.
encoder = LabelEncoder()
track_data['genre'] = encoder.fit_transform(track_data['genre'])

features = track_data[['popularity', 'duration_ms', 'danceability', 'energy', 'key', 'loudness', 'mode', 
                       'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 
                       'time_signature']]
# Alternative, reduced feature set (disabled):
# features = track_data[['energy', 'key', 'mode', 
#                        'acousticness', 'valence', 'tempo']]
target = track_data['genre']

# Seed the shuffle so the split (and therefore the reported accuracy) is the
# same on every run; the seed matches the one used for the selected-genres split.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=3)

# Fit the scaler on the training rows only, then apply the same statistics to
# the test rows so no information from the test split leaks into scaling.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

knn = KNeighborsClassifier(n_neighbors=50)
knn.fit(X_train_scaled, y_train)

y_pred = knn.predict(X_test_scaled)
print("Accuracy:", accuracy_score(y_test, y_pred))
# print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.25810819127330914


In [56]:
def predict_top_k(knn, X, k=3, n_neighbors=50, train_labels=None, verbose=True):
    """Return, for every row of ``X``, the ``k`` most common labels among its nearest neighbours.

    Parameters
    ----------
    knn : fitted estimator exposing ``kneighbors(X, n_neighbors=..., return_distance=False)``
        The returned neighbour indices are treated as positions into ``train_labels``.
    X : array-like
        Query rows, already in the feature space the estimator was fitted on.
    k : int, default 3
        Maximum number of labels to return per row. Fewer are returned when the
        neighbourhood contains fewer than ``k`` distinct labels.
    n_neighbors : int, default 50
        Size of the neighbourhood that votes.
    train_labels : pandas.Series or sequence, optional
        Labels of the rows ``knn`` was fitted on, in fitting order. When omitted
        the notebook-level ``y_train`` is used, which is only correct if
        ``y_train`` still belongs to the model passed in.
    verbose : bool, default True
        Print the full prediction list (the historical behaviour). Pass ``False``
        to avoid flooding the cell output on large inputs.

    Returns
    -------
    list[list]
        One list per query row, labels ordered from most to least frequent.
    """
    if train_labels is None:
        # Backward-compatible fallback to the global defined by the training cell.
        train_labels = y_train
    if not isinstance(train_labels, pd.Series):
        # Accept plain lists / arrays; positional lookup below needs `.iloc`.
        train_labels = pd.Series(train_labels)

    neighbor_positions = knn.kneighbors(X, n_neighbors=n_neighbors, return_distance=False)

    # value_counts() sorts by frequency, so head(k) is the top-k vote.
    top_k_predictions = [
        train_labels.iloc[row_positions].value_counts().head(k).index.tolist()
        for row_positions in neighbor_positions
    ]

    if verbose:
        print("Top k predictions: ", top_k_predictions)

    return top_k_predictions

In [57]:
# Top-k evaluation of the all-genres model: a test row counts as a hit when
# its true label is among the k labels returned for it.
k_genres = 4
top_k_predictions = predict_top_k(knn, X_test_scaled, k=k_genres)

# True label of each test row, aligned positionally with the predictions.
genre_list = [y_test.iloc[position] for position in range(len(top_k_predictions))]

correct_count = sum(
    true_genre in candidates
    for true_genre, candidates in zip(genre_list, top_k_predictions)
)
accuracy_top_k = correct_count / len(y_test)

print("Genre List: ", genre_list)

print(f"Top {k_genres} Accuracy: {accuracy_top_k}")

Top k predictions:  [[7, 0, 6, 8], [6, 5, 0, 3], [3, 4, 6, 5], [7, 8, 6, 2], [8, 2, 7, 6], [2, 0, 6, 8], [6, 2, 0, 8], [0, 7, 2, 8], [7, 8, 0, 1], [0, 3, 7, 6], [6, 4, 5], [0, 3, 7, 8], [2, 8, 7, 5], [7, 0, 8, 6], [0, 6, 7, 2], [8, 7, 6, 1], [3, 7, 8, 6], [3], [6, 7, 4, 2], [1, 8, 6, 2], [3], [3, 2, 6, 0], [6, 0, 3, 4], [7, 0, 8, 3], [6, 3, 4, 7], [4, 6, 7, 3], [3, 4, 7, 6], [8, 5, 1, 7], [6, 4, 3, 2], [6, 4, 2, 3], [7, 8, 6, 2], [6, 2, 3, 0], [4, 6, 5, 2], [3, 8, 6, 1], [3, 4, 7, 6], [1], [6, 2, 7, 0], [6, 0, 2, 3], [2, 7, 8, 0], [3, 7, 0, 6], [4, 6, 2, 7], [0, 6, 7, 2], [3, 7], [0, 7, 2, 6], [8, 2, 7, 1], [3, 8, 5, 7], [0, 7, 8, 6], [3, 7, 5, 8], [1, 5], [2, 6, 0, 5], [4, 6, 3, 8], [6, 2, 0, 8], [6, 2, 0, 3], [6, 0, 2, 8], [7, 3, 5, 2], [7, 8, 0, 1], [7, 8, 3, 2], [2, 3, 0, 7], [8, 7, 3], [1, 5, 3, 6], [8, 7, 2, 3], [3, 8, 7, 5], [6, 4, 8, 0], [2, 7, 8, 3], [3], [0, 6, 7, 2], [3, 7], [5, 6, 2, 0], [7, 3, 8, 1], [7, 2, 0, 6], [7, 0, 6, 8], [1, 5, 6], [4, 3, 6, 7], [1, 8, 5, 7], [1, 5,

## Model training using a selected subset of genres

In [14]:
# Genres kept for the reduced model; tracks of any other genre are filtered out.
# To list every label available in the data: sorted(tracks['genre'].unique())
selected_genres = ['acoustic', 'ambient', 'club', 'hip-hop', 'metal']

print(len(selected_genres))

5


In [15]:
# Narrow the catalogue to the selected genres, then remove every track that
# also appears in the playlist.
in_selected_genres = tracks['genre'].isin(selected_genres)
filtered_tracks = tracks.loc[in_selected_genres].copy()
in_playlist = filtered_tracks['track_id'].isin(playlist['track_id'])
filtered_track_data = filtered_tracks.loc[~in_playlist].copy()

# Dedicated encoder for the reduced label set; kept for inverse_transform later.
encoder_filter = LabelEncoder().fit(filtered_track_data['genre'])
filtered_track_data['genre'] = encoder_filter.transform(filtered_track_data['genre'])

feature_columns = [
    'popularity', 'duration_ms', 'danceability', 'energy', 'key', 'loudness', 'mode',
    'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'time_signature',
]
features = filtered_track_data[feature_columns]
# Alternative, reduced feature set (disabled):
# features = filtered_track_data[['energy', 'key', 'mode', 
#                        'acousticness', 'valence', 'tempo']]
target = filtered_track_data['genre']

X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=3
)

# Scaling statistics come from the training rows only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

knn_filtered = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)

y_pred = knn_filtered.predict(X_test_scaled)
print("Accuracy:", accuracy_score(y_test, y_pred))
# print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.722972972972973


In [41]:
# Top-k evaluation of the selected-genres model, voting over 10 neighbours:
# a test row counts as a hit when its true label is among the k returned labels.
k_genres = 3
top_k_predictions = predict_top_k(knn_filtered, X_test_scaled, k=k_genres, n_neighbors=10)

# True label of each test row, aligned positionally with the predictions.
genre_list = [y_test.iloc[position] for position in range(len(top_k_predictions))]

correct_count = sum(
    true_genre in candidates
    for true_genre, candidates in zip(genre_list, top_k_predictions)
)
accuracy_top_k = correct_count / len(y_test)

print("Genre List: ", genre_list)

print(f"Top {k_genres} Accuracy: {accuracy_top_k}")

Neighbors:  [[ 3885  7719  1738 ... 21576 18816  8736]
 [ 7767    73 31614 ... 18962 33875  1422]
 [15421 21007  5496 ... 20323 21896 27647]
 ...
 [ 5871 20246 21928 ... 18681 24363  3406]
 [ 5781 31451  5224 ... 14937  9177 15055]
 [  704 27292 27760 ... 36040 22326  5124]]
Top k predictions:  [[62, 20, 29], [41, 14, 17], [31, 66, 4], [36, 5, 42], [15, 47, 3], [65, 34], [47, 13, 14], [47, 5, 29], [50, 28, 66], [4, 34], [0, 61, 6], [15, 8, 52], [31, 30, 19], [43, 24, 53], [1, 56, 46], [59, 62, 37], [9, 61, 52], [15, 17, 61], [68, 19, 13], [15, 20, 28], [30, 27], [37, 30, 35], [27, 30, 15], [15, 50, 13], [68, 8, 34], [59, 41, 38], [49, 29, 28], [19, 62, 20], [35, 37, 8], [46, 63, 2], [50, 41, 32], [20, 50, 31], [13, 6, 19], [42, 36, 53], [37, 31, 43], [47, 14, 30], [60, 7], [19, 13, 15], [47, 62, 14], [59, 43, 21], [43, 39, 0], [32, 10, 17], [60, 3, 36], [37, 35, 29], [67, 51, 57], [8, 45, 50], [10, 17, 42], [40, 3, 21], [53, 5, 11], [26, 66, 38], [37, 49, 26], [0, 46, 49], [67, 46], [2

In [25]:
# Same feature columns, in the same order, as the ones both models were trained on.
playlist_features = playlist[['popularity', 'duration_ms', 'danceability', 'energy', 'key', 'loudness', 'mode', 
                                     'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 
                                     'time_signature']]
# FIXME(review): `scaler` is assigned by both training cells. When the notebook
# runs top to bottom it holds the scaler fitted in the selected-genres cell, so
# the features handed to `knn` below are not scaled with the statistics `knn`
# was trained on. Keep one scaler per model (distinct names) to resolve this.
playlist_features_scaled = scaler.transform(playlist_features)

# Predict genre of playlist with the all-genres model.
# Stored under its own column so the next assignment does not overwrite it.
playlist_genre_predictions = knn.predict(playlist_features_scaled)
predicted_genres = encoder.inverse_transform(playlist_genre_predictions)
playlist['predicted_genre_all_genres'] = predicted_genres
print(playlist[['track_name', 'predicted_genre_all_genres']])

# Predict genre of playlist with selected genres.
# `predicted_genre` keeps holding this model's output, as it did before.
playlist_genre_predictions_filtered = knn_filtered.predict(playlist_features_scaled)
predicted_genres_filtered = encoder_filter.inverse_transform(playlist_genre_predictions_filtered)
playlist['predicted_genre'] = predicted_genres_filtered
print(playlist[['track_name', 'predicted_genre']])


                                         track_name predicted_genre
0                                     Getting Older        alt-rock
1                                     Afternoon Tea          brazil
2                                         Tennessee           blues
3   Apokaliptyczny Młot (Obliteracja Poprzez Życie)        acoustic
4                                Au Bord Du Gouffre        acoustic
..                                              ...             ...
95                                colors of a night          brazil
96                                          roasted        afrobeat
97                                    frozen grapes          brazil
98                              I Wanna Be A Writer          brazil
99                   Open the Gates O Forest Keeper     black-metal

[100 rows x 2 columns]
                                         track_name predicted_genre
0                                     Getting Older          comedy
1                       

## Save the Model using `pickle`

In [1]:
import pickle

In [None]:
# Serialise the fitted selected-genres classifier to an in-memory byte string,
# rebuild it from those bytes, and score the rebuilt copy on the test split.
# Nothing is written to disk here.
saved_model = pickle.dumps(knn_filtered)
knn_loaded = pickle.loads(saved_model)

y_pred = knn_loaded.predict(X_test_scaled)
roundtrip_accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", roundtrip_accuracy)