# Spotify Recommendation

Before running the following code, ensure you have read the "Usage" section of the README.md file explaining how to use this notebook.

In [23]:
# Import necessary packages
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [24]:
# Returns a cleaned version of the given DataFrame (leaving the original unchanged)
def clean_data(df):
    
    # Create a deep copy of the DataFrame to ensure the original is unedited
    clean_df = df.copy()

    # Find duplicate tracks
    print('Finding duplicates...')
    dupe_mask = clean_df['track_id'].duplicated(keep=False)
    dupes = clean_df[dupe_mask].sort_values('track_id')
    print(f' - Total duplicated rows: {len(dupes)}')
    num_duplicated_songs = clean_df['track_id'].duplicated().sum()
    print(f' - Number of songs with duplicates: {num_duplicated_songs}')
    print('Done')

    # Remove duplicate tracks
    num_rows_before = len(clean_df)
    print(f'\nCurrent number of rows in DataFrame: {num_rows_before}')
    print('\nRemoving duplicates...')
    clean_df = clean_df.drop_duplicates(subset='track_id', keep='first')
    print('Done')
    num_rows_after = len(clean_df)
    print(f'\nNew number of rows in DataFrame: {num_rows_after}')
    print(f'Removed {num_rows_before - num_rows_after} duplicates')
    print(f'Number of unique track_ids: {clean_df['track_id'].nunique()}')

    # Drop unnecessary columns
    print('\nDropping the unnecessary columns "track_id" and "Unnamed: 0"...')
    clean_df = clean_df.drop('track_id', axis=1)
    clean_df = clean_df.drop('Unnamed: 0', axis=1)
    print('Done')

    return clean_df

In [25]:
# Create a DataFrame object from the dataset (in CSV format)
unclean_df = pd.read_csv('dataset.csv')

# Clean the data to remove duplicates and unnecessary columns
clean_df = clean_data(unclean_df)
clean_df = clean_df.reset_index(drop=True) # Resets indices to account for dropped rows

# Separate numeric columns
numeric_cols = [
    'duration_ms', 'danceability', 
    'energy', 'loudness', 'speechiness',
    'acousticness', 'instrumentalness',
    'liveness', 'valence', 'tempo'
] # Note: 'popularity' was excluded since it varies with time and is therefore less useful as a feature

# Scale the numeric data to give each feature equal importance
scaler = StandardScaler()
scaled_cols = scaler.fit_transform(clean_df[numeric_cols])
scaled_df = pd.DataFrame(scaled_cols, columns=numeric_cols)

# Separate categorical columns
categorical_cols = [ 'key', 'time_signature' ]
# Note: 'artists', 'album_name', 'track_name', and 'track_genre' were
#       excluded to avoid an unnecessarily high dimension DataFrame.

# Apply One-Hot Encoding to categorical data to include them as numeric features
encoded_df = pd.get_dummies(
    clean_df[categorical_cols], columns=categorical_cols, drop_first=True, dtype=int
)

# Separate binary columns
binary_cols = [ 'explicit', 'mode' ]
binary_df = clean_df[binary_cols].astype(int) # Casts bool values to 0 or 1

# Combine into one DataFrame
df = pd.concat([scaled_df, encoded_df, binary_df], axis=1)

# Show a random sample of 10 tracks to view the structure of the DataFrame
df.sample(10)

Finding duplicates...
 - Total duplicated rows: 40900
 - Number of songs with duplicates: 24259
Done

Current number of rows in DataFrame: 114000

Removing duplicates...
Done

New number of rows in DataFrame: 89741
Removed 24259 duplicates
Number of unique track_ids: 89741

Dropping the unnecessary columns "track_id" and "Unnamed: 0"...
Done


Unnamed: 0,duration_ms,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,...,key_8,key_9,key_10,key_11,time_signature_1,time_signature_3,time_signature_4,time_signature_5,explicit,mode
39783,-0.314916,0.615961,1.120569,0.828122,-0.3632,-0.943397,-0.535481,2.283574,1.478047,-0.199829,...,0,0,0,1,0,0,1,0,0,0
40114,-0.009073,1.063072,-2.067227,-1.969563,-0.266975,1.710549,1.836015,-0.564284,-1.131686,0.328371,...,0,0,0,0,0,0,1,0,0,0
38758,-0.051801,-0.374473,1.408952,0.784456,0.499291,-0.970322,-0.534095,1.052068,-0.85017,0.263791,...,0,0,0,0,0,0,1,0,1,1
70770,-0.262856,-1.461121,1.038731,0.142298,-0.240492,-0.96725,-0.53432,-0.302589,0.272092,0.834757,...,0,1,0,0,0,0,1,0,0,1
71622,-0.076335,-1.302651,-0.800682,0.023557,-0.518572,0.081908,-0.535385,-0.795705,-0.865387,0.200306,...,0,0,0,0,0,0,1,0,0,1
13631,-1.185651,1.119668,-0.937079,-0.340325,-0.161923,0.259255,-0.535481,-0.878318,1.862279,0.259242,...,0,0,0,0,0,0,1,0,0,1
51922,-0.493406,0.746133,0.816599,0.749408,3.068228,-0.09544,-0.535481,-0.564284,0.880776,-0.495173,...,0,1,0,0,0,1,0,0,0,0
28664,-0.160507,-0.153748,-0.438255,0.120465,-0.45148,-0.583145,-0.535481,1.134168,-0.625717,-1.494298,...,0,0,0,0,0,0,1,0,0,1
39556,-0.191911,-0.549921,-0.695461,0.363118,-0.522986,0.667154,-0.535481,-0.543759,-0.671369,0.555217,...,0,0,0,0,0,0,1,0,0,0
35577,-0.848572,1.198903,0.033289,0.322516,2.776905,-0.506294,-0.535458,-0.035763,0.557412,0.615814,...,1,0,0,0,0,0,1,0,0,1
