In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score

In [12]:
tracks = pd.read_csv("./data/cleaned_data.csv")
playlist = pd.read_csv("./data/playlist.csv")

## KNN with All Genres

In [13]:
track_data = tracks[~tracks['track_id'].isin(playlist['track_id'])].copy()

encoder = LabelEncoder()
track_data['genre'] = encoder.fit_transform(track_data['genre'])

features = track_data[['popularity', 'duration_ms', 'danceability', 'energy', 'key', 'loudness', 'mode', 
                       'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 
                       'time_signature']]
target = track_data['genre']

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

knn = KNeighborsClassifier(n_neighbors=50)
knn.fit(X_train_scaled, y_train)

y_pred = knn.predict(X_test_scaled)
print("Accuracy:", accuracy_score(y_test, y_pred))
# print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.536370330566999


In [14]:
def predict_top_k(knn, X, k=3, n_neighbors=50):
    neighbors = knn.kneighbors(X, n_neighbors=n_neighbors, return_distance=False)

    top_k_predictions = []
    for neighbor in neighbors:
        neighbor_labels = y_train.iloc[neighbor]
        top_k = neighbor_labels.value_counts().head(k).index.tolist()
        top_k_predictions.append(top_k)
    
    print("Top k predictions: ", top_k_predictions)

    return top_k_predictions

In [15]:
k_genres = 4
top_k_predictions = predict_top_k(knn, X_test_scaled, k=k_genres)

genre_list = []

correct_count = 0
for i, top_k in enumerate(top_k_predictions):
    genre_list.append(y_test.iloc[i])
    if y_test.iloc[i] in top_k:
        correct_count += 1
accuracy_top_k = correct_count / len(y_test)

print("Genre List: ", genre_list)

print(f"Top {k_genres} Accuracy: {accuracy_top_k}")

Top k predictions:  [[1, 5], [3, 7, 6, 8], [7, 0, 6, 8], [2, 6, 0, 8], [4, 6, 3, 5], [6, 0, 2, 7], [8, 7, 0, 3], [6, 4, 7, 2], [1, 5], [3, 6, 0, 7], [2, 7, 0, 6], [5, 6, 8, 4], [2, 5, 6, 0], [3, 7, 0, 6], [6, 2, 7, 3], [7, 8, 3, 0], [5, 2, 0, 6], [7, 8, 3, 6], [7, 3, 0, 8], [4, 6, 5, 2], [7, 8, 3, 6], [7, 2, 0, 8], [6, 3, 0, 4], [2, 6, 5, 0], [7, 3, 2, 0], [8, 1, 5, 6], [3, 0, 1, 6], [3, 7, 6, 4], [6, 0, 4, 3], [7, 8, 0, 3], [3, 7, 8], [0, 3, 8, 7], [7, 8, 1, 6], [6, 4, 0, 7], [2, 7, 3, 6], [1, 5, 6, 3], [7, 6, 0, 2], [7, 6, 3, 8], [3, 7], [6, 0, 7, 3], [7, 3, 8, 6], [8, 7, 6, 3], [6, 3, 5, 2], [7, 8, 3, 0], [6, 3, 7, 4], [7, 0, 8, 6], [0, 2, 6, 7], [5, 3, 6, 7], [1, 5], [4, 6, 8, 7], [6, 2, 0, 3], [3, 7, 8, 2], [7, 8, 0, 3], [8, 7, 2], [0, 7, 8, 6], [6, 2, 0, 8], [1, 5], [6, 5, 3, 0], [8, 7, 1, 0], [8, 7, 3], [1, 5], [8, 0, 7, 1], [6, 3, 4, 7], [7, 3, 0, 8], [3, 2, 6, 0], [6, 4, 7, 3], [8, 7, 5, 3], [7, 0, 8, 6], [8, 3, 7, 0], [3, 8, 7, 6], [4, 6, 2, 3], [0, 7, 8, 5], [6, 2, 0, 3], [0

## KNN with Selected Genre

In [6]:
selected_genres = [
    'acoustic',
    # 'afrobeat',
    # 'alt-rock',
    # 'alternative',
    'ambient',
    # 'anime',
    # 'black-metal',
    # 'bluegrass',
    # 'blues',
    # 'brazil',
    # 'breakbeat',
    # 'british',
    # 'cantopop',
    # 'chicago-house',
    # 'children',
    # 'chill',
    # 'classical',
    'club',
    # 'comedy',
    # 'country',
    # 'dance',
    # 'dancehall',
    # 'death-metal',
    # 'deep-house',
    # 'detroit-techno',
    # 'disco',
    # 'disney',
    # 'drum-and-bass',
    # 'dub',
    # 'dubstep',
    # 'edm',
    # 'electro',
    # 'electronic',
    # 'emo',
    # 'folk',
    # 'forro',
    # 'french',
    # 'funk',
    # 'garage',
    # 'german',
    # 'gospel',
    # 'goth',
    # 'grindcore',
    # 'groove',
    # 'grunge',
    # 'guitar',
    # 'happy',
    # 'hard-rock',
    # 'hardcore',
    # 'hardstyle',
    # 'heavy-metal',
    'hip-hop',
    # 'honky-tonk',
    # 'house',
    # 'idm',
    # 'indian',
    # 'indie-pop',
    # 'indie',
    # 'industrial',
    # 'iranian',
    # 'j-dance',
    # 'j-idol',
    # 'j-pop',
    # 'j-rock',
    # 'jazz',
    # 'k-pop',
    # 'kids',
    # 'latin',
    # 'latino',
    # 'malay',
    # 'mandopop',
    'metal',
    # 'metalcore',
    # 'minimal-techno',
    # 'mpb',
    # 'new-age',
    # 'opera',
    # 'pagode',
    # 'party',
    # 'piano',
    # 'pop-film',
    # 'pop',
    # 'power-pop',
    # 'progressive-house',
    # 'psych-rock',
    # 'punk-rock',
    # 'punk',
    # 'r-n-b',
    # 'reggae',
    # 'reggaeton',
    # 'rock-n-roll',
    # 'rock',
    # 'rockabilly',
    # 'romance',
    # 'sad',
    # 'salsa',
    # 'samba',
    # 'sertanejo',
    # 'show-tunes',
    # 'singer-songwriter',
    # 'ska',
    # 'sleep',
    # 'soul',
    # 'spanish',
    # 'study',
    # 'swedish',
    # 'synth-pop',
    # 'tango',
    # 'techno',
    # 'trance',
    # 'trip-hop',
    # 'turkish',
    # 'world-music'
]

print(len(selected_genres))

5


In [10]:
filtered_tracks = tracks[tracks['genre'].isin(selected_genres)].copy()
filtered_track_data = filtered_tracks[~filtered_tracks['track_id'].isin(playlist['track_id'])].copy()

encoder_filter = LabelEncoder()
filtered_track_data['genre'] = encoder_filter.fit_transform(filtered_track_data['genre'])

features = filtered_track_data[['popularity', 'duration_ms', 'danceability', 'energy', 'key', 'loudness', 'mode', 
                       'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 
                       'time_signature']]
# features = filtered_track_data[['energy', 'key', 'mode', 
#                        'acousticness', 'valence', 'tempo']]
target = filtered_track_data['genre']

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=3)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

knn_filtered = KNeighborsClassifier(n_neighbors=5)
knn_filtered.fit(X_train_scaled, y_train)

y_pred = knn_filtered.predict(X_test_scaled)
print("Accuracy:", accuracy_score(y_test, y_pred))
# print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.722972972972973


In [11]:
k_genres = 3
top_k_predictions = predict_top_k(knn_filtered, X_test_scaled, k=k_genres, n_neighbors=10)

genre_list = []

correct_count = 0
for i, top_k in enumerate(top_k_predictions):
    genre_list.append(y_test.iloc[i])
    if y_test.iloc[i] in top_k:
        correct_count += 1
accuracy_top_k = correct_count / len(y_test)

print("Genre List: ", genre_list)

print(f"Top {k_genres} Accuracy: {accuracy_top_k}")

NameError: name 'predict_top_k' is not defined

In [25]:
playlist_features = playlist[['popularity', 'duration_ms', 'danceability', 'energy', 'key', 'loudness', 'mode', 
                                     'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 
                                     'time_signature']]
playlist_features_scaled = scaler.transform(playlist_features)

# Predict genre of playlist
playlist_genre_predictions = knn.predict(playlist_features_scaled)
predicted_genres = encoder.inverse_transform(playlist_genre_predictions)
playlist['predicted_genre'] = predicted_genres
print(playlist[['track_name', 'predicted_genre']])

# Predict genre of playlist with selected genres
playlist_genre_predictions_filtered = knn_filtered.predict(playlist_features_scaled)
predicted_genres_filtered = encoder_filter.inverse_transform(playlist_genre_predictions_filtered)
playlist['predicted_genre'] = predicted_genres_filtered
print(playlist[['track_name', 'predicted_genre']])


                                         track_name predicted_genre
0                                     Getting Older        alt-rock
1                                     Afternoon Tea          brazil
2                                         Tennessee           blues
3   Apokaliptyczny Młot (Obliteracja Poprzez Życie)        acoustic
4                                Au Bord Du Gouffre        acoustic
..                                              ...             ...
95                                colors of a night          brazil
96                                          roasted        afrobeat
97                                    frozen grapes          brazil
98                              I Wanna Be A Writer          brazil
99                   Open the Gates O Forest Keeper     black-metal

[100 rows x 2 columns]
                                         track_name predicted_genre
0                                     Getting Older          comedy
1                       

## Save the Model using `pickle`

In [16]:
import pickle

In [18]:
filename = 'knn_model.pkl'
pickle.dump(knn, open(filename, 'wb'))