# Spotify Recommendation

This notebook contains an initial KNN/RF combined implementation of the spotify recommendation system. The Spotify Recommendation NN.ipynb notebook contains the lastest implementation used and should be referenced.

In [1]:
# Import necessary packages
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

In [2]:
# Returns a cleaned version of the given DataFrame (leaving the original unchanged)
def clean_data(df):
    
    # Create a deep copy of the DataFrame to ensure the original is unedited
    clean_df = df.copy()

    # Find duplicate tracks
    print('Finding duplicates...')
    dupe_mask = clean_df['track_id'].duplicated(keep=False)
    dupes = clean_df[dupe_mask].sort_values('track_id')
    print(f' - Total duplicated rows: {len(dupes)}')
    num_duplicated_songs = clean_df['track_id'].duplicated().sum()
    print(f' - Number of songs with duplicates: {num_duplicated_songs}')
    print('Done')

    # Remove duplicate tracks
    num_rows_before = len(clean_df)
    print(f'\nCurrent number of rows in DataFrame: {num_rows_before}')
    print('\nRemoving duplicates...')
    clean_df = clean_df.drop_duplicates(subset='track_id', keep='first')
    print('Done')
    num_rows_after = len(clean_df)
    print(f'\nNew number of rows in DataFrame: {num_rows_after}')
    print(f'Removed {num_rows_before - num_rows_after} duplicates')
    print(f'Number of unique track_ids: {clean_df['track_id'].nunique()}')

    # Drop unnecessary column
    print('\nDropping the unnecessary column "Unnamed: 0"...')
    clean_df = clean_df.drop('Unnamed: 0', axis=1)
    print('Done')

    return clean_df

In [3]:
# Create a DataFrame object from the dataset (in CSV format)
unclean_df = pd.read_csv('dataset.csv')

# Clean the data to remove duplicates and unnecessary column
clean_df = clean_data(unclean_df)
clean_df = clean_df.reset_index(drop=True) # Resets indices to account for dropped rows

# Separate numeric columns
numeric_cols = [
    'popularity', 'duration_ms', 'danceability', 
    'energy', 'loudness', 'speechiness',
    'acousticness', 'instrumentalness',
    'liveness', 'valence', 'tempo'
]

# Scale the numeric data to give each feature equal importance
scaler = StandardScaler()
scaled_cols = scaler.fit_transform(clean_df[numeric_cols])
scaled_df = pd.DataFrame(scaled_cols, columns=numeric_cols)

# Separate categorical columns
categorical_cols = [ 'key', 'time_signature' ]
# Note: 'artists', 'album_name', 'track_name', and 'track_genre' were
#       excluded to avoid an unnecessarily high dimension DataFrame.

# Apply One-Hot Encoding to categorical data to include them as numeric features
encoded_df = pd.get_dummies(
    clean_df[categorical_cols], columns=categorical_cols, drop_first=True, dtype=int
)

# Separate binary columns
binary_cols = [ 'explicit', 'mode' ]
binary_df = clean_df[binary_cols].astype(int) # Casts bool values to 0 or 1

# Combine into one DataFrame
df = pd.concat([scaled_df, encoded_df, binary_df], axis=1)

# Show a random sample of 10 tracks to view the structure of the DataFrame
df.sample(10)

Finding duplicates...
 - Total duplicated rows: 40900
 - Number of songs with duplicates: 24259
Done

Current number of rows in DataFrame: 114000

Removing duplicates...
Done

New number of rows in DataFrame: 89741
Removed 24259 duplicates
Number of unique track_ids: 89741

Dropping the unnecessary column "Unnamed: 0"...
Done


Unnamed: 0,popularity,duration_ms,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,...,key_8,key_9,key_10,key_11,time_signature_1,time_signature_3,time_signature_4,time_signature_5,explicit,mode
45398,-1.515907,0.173764,0.695196,1.069908,0.855126,-0.422348,-0.831136,-0.535481,0.682616,1.721521,...,0,0,0,0,0,0,1,0,0,0
6323,-0.73848,0.383464,-2.757741,0.738657,0.192859,0.004926,-0.970354,2.104661,-0.569415,-0.561045,...,0,0,0,1,0,0,1,0,0,0
75845,-0.203999,0.585683,0.9329,0.621745,0.097866,-0.397629,-0.154555,-0.535481,0.441446,1.694891,...,0,0,0,0,0,0,1,0,0,1
84892,-0.592712,0.399923,1.962952,0.430789,0.697124,-0.026855,-0.878133,0.431027,-0.975812,1.280225,...,0,0,0,1,0,0,1,0,1,0
51739,-0.544123,0.026527,-0.498985,0.649025,0.590066,0.861238,-0.823155,-0.535104,-0.559153,-0.66376,...,0,0,1,0,0,0,1,0,0,0
380,0.864963,-0.086579,-0.108471,-1.209874,-0.528395,-0.492971,0.365664,-0.535394,1.000755,-1.097448,...,0,0,0,0,0,0,1,0,0,1
38735,-0.252588,0.16175,0.706515,-0.960462,-0.767409,4.957411,0.70558,-0.535481,-0.554021,-0.971907,...,0,0,0,0,0,0,1,0,0,1
58124,0.573428,1.061773,0.383917,-0.937079,-0.297234,-0.047159,0.149891,-0.485457,-0.627912,0.435676,...,0,1,0,0,0,0,1,0,0,0
30687,0.038947,-0.536798,0.915921,0.220348,0.189028,1.205528,-0.898528,-0.535481,-0.518102,-1.093643,...,0,0,0,0,0,0,1,0,1,0
77129,0.864963,-0.157187,-0.255621,-0.625314,0.661885,-0.411754,1.344031,-0.535481,-0.220488,0.850342,...,0,0,0,0,0,1,0,0,0,1


In [4]:
# Initialize the KNN model
knn = NearestNeighbors(n_neighbors=10, metric='euclidean')

# Fit the KNN model
knn.fit(df)

In [5]:
# Read reference songs to recommend from
track_ids = []
print('Reading reference songs from "reference_songs.txt" file...')
try:
    with open('reference_songs.txt', 'r') as file:
        for line in file:
            track_ids.append(line.strip())
    print('Done')
except FileNotFoundError:
    print('Error: Could not find the "reference_songs.txt" file. Please create it and read "Usage" in the "README.md" file.')

# Use fitted KNN model to recommend similar songs
for track_id in track_ids:

    # Validate the track exists in the dataset
    if track_id not in clean_df['track_id'].values:
        print(f'Error: The track_id "{track_id}" was not found in the dataset.')
        continue

    # Find track information
    track = clean_df[clean_df['track_id'] == track_id]
    track_name = track['track_name']
    track_artists = track['artists']

    # Reshape for KNN model
    track_vector = df.iloc[track.index[0]].values.reshape(1, -1)

    # Find similar songs using the model
    distances, indices = knn.kneighbors(track_vector)
    neighbor_indices = indices[0][1:] # Skip the 0th index to prevent recommendation of the same track
    similar_tracks = clean_df.iloc[neighbor_indices]
    print(f'Here are some similar songs to "{track_name}" by {track_artists}')
    print('====================================================================================================')
    print(similar_tracks[['track_id', 'track_name', 'artists', 'album_name']])
    print('====================================================================================================')

Reading reference songs from "reference_songs.txt" file...
Done


In [8]:
# Use Random Forest Classifier to predict popularity
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X = df.drop('popularity', axis=1)
y = clean_df['popularity'] >= 70

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)

track_ids = []
print('Reading reference songs from "reference_songs.txt" file...')
try:
    with open('reference_songs.txt', 'r') as file:
        for line in file:
            track_ids.append(line.strip())
    print('Done')
except FileNotFoundError:
    print('Error: Could not find the "reference_songs.txt" file. Please create it and read "Usage" in the "README.md" file.')

# Combine popularity prediction with KNN recommendations
for track_id in track_ids:

    # Validate the track exists in the dataset
    if track_id not in clean_df['track_id'].values:
        continue

    # Find track information
    track = clean_df[clean_df['track_id'] == track_id]
    track_name = track['track_name']
    track_artists = track['artists']

    # Reshape for KNN model
    track_vector = df.iloc[track.index[0]].values.reshape(1, -1)

    # Find similar songs using the model
    distances, indices = knn.kneighbors(track_vector)
    neighbor_indices = indices[0][1:] # Skip the 0th index to prevent recommendation of the same track
    similar_tracks = clean_df.iloc[neighbor_indices]

    # Predict popularity for similar tracks
    similar_tracks_features = df.iloc[neighbor_indices]
    popularity_predictions = rf_classifier.predict(similar_tracks_features)

    print(f'Popularity predictions for songs similar to "{track_name}" by {track_artists}:')
    print('====================================================================================================')
    for i, pred in enumerate(popularity_predictions):
        popularity_label = 'Popular' if pred else 'Not Popular'
        recommended_track = similar_tracks.iloc[i]
        print(f'Track: "{recommended_track["track_name"]}" by {recommended_track["artists"]} - {popularity_label}')
    print('====================================================================================================')


Reading reference songs from "reference_songs.txt" file...
Done
