# Data pre-processing notebook
### Dropping non-songs, encoding categorical variables

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import category_encoders as ce

# Load the raw tracks table. The path is relative, so it only resolves when
# the kernel's working directory contains the "Data" folder.
df = pd.read_csv("Data/tracks.csv")
# df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'Data/tracks.csv'

Data cleanup

In [None]:
!pip install category_encoders



In [None]:
# Inspect the column names of the raw tracks frame before cleaning.
df.columns

Index(['id', 'name', 'popularity', 'duration_ms', 'explicit', 'artists',
       'id_artists', 'release_date', 'danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'time_signature'],
      dtype='object')

In [4]:

def new_clean_data(df):
    """Return a binned, one-hot encoded copy of the tracks frame.

    Works on a copy of ``df``, so the caller's frame is left untouched:
      1. removes the identifier / text columns,
      2. discards rows whose tempo is zero,
      3. replaces each numeric feature by the index of one of 5 bins,
      4. one-hot encodes the binned features plus ``key`` and ``explicit``.

    ARGUMENTS: df, the raw tracks dataframe
    RETURNS: the encoded dataframe produced by the one-hot encoder
    """
    # Features that are cut into 5 bins (key and explicit are not binned).
    binned_features = [
        'popularity', 'duration_ms', 'danceability', 'energy',
        'loudness', 'speechiness', 'acousticness', 'instrumentalness',
        'liveness', 'valence', 'tempo', 'time_signature',
    ]
    # Everything that gets one-hot encoded: the binned features, key, explicit.
    encoded_features = binned_features + ['key', 'explicit']

    # Identifier and free-text columns are not used as features.
    frame = df.copy().drop(
        labels=['id', 'name', 'release_date', 'id_artists', 'artists'],
        axis=1,
    )

    # Keep only the rows with a non-zero tempo.
    frame = frame[frame['tempo'] != 0]

    # Swap every binned feature for its integer bin index.
    for feature in binned_features:
        frame[feature] = pd.cut(frame[feature], bins=5, labels=False)

    encoder = ce.one_hot.OneHotEncoder(cols=encoded_features)
    return encoder.fit_transform(frame)


In [5]:
# Build the cleaned, one-hot encoded frame and preview its first rows.
new_clean_df = new_clean_data(df)
new_clean_df.head()

  elif pd.api.types.is_categorical(cols):


Unnamed: 0,popularity_1,popularity_2,popularity_3,popularity_4,popularity_5,duration_ms_1,duration_ms_2,duration_ms_3,duration_ms_4,duration_ms_5,...,valence_5,tempo_1,tempo_2,tempo_3,tempo_4,tempo_5,time_signature_1,time_signature_2,time_signature_3,time_signature_4
0,1,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
1,1,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
2,1,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
3,1,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
4,1,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1


In [6]:
# NOTE(review): only the last expression of a cell is rendered, so the
# describe() summary below is computed but not displayed.
new_clean_df.describe()
# List the column names of the encoded frame.
new_clean_df.columns

Index(['popularity_1', 'popularity_2', 'popularity_3', 'popularity_4',
       'popularity_5', 'duration_ms_1', 'duration_ms_2', 'duration_ms_3',
       'duration_ms_4', 'duration_ms_5', 'explicit_1', 'explicit_2',
       'danceability_1', 'danceability_2', 'danceability_3', 'danceability_4',
       'danceability_5', 'energy_1', 'energy_2', 'energy_3', 'energy_4',
       'energy_5', 'key_1', 'key_2', 'key_3', 'key_4', 'key_5', 'key_6',
       'key_7', 'key_8', 'key_9', 'key_10', 'key_11', 'key_12', 'loudness_1',
       'loudness_2', 'loudness_3', 'loudness_4', 'loudness_5', 'mode',
       'speechiness_1', 'speechiness_2', 'speechiness_3', 'speechiness_4',
       'speechiness_5', 'acousticness_1', 'acousticness_2', 'acousticness_3',
       'acousticness_4', 'acousticness_5', 'instrumentalness_1',
       'instrumentalness_2', 'instrumentalness_3', 'instrumentalness_4',
       'instrumentalness_5', 'liveness_1', 'liveness_2', 'liveness_3',
       'liveness_4', 'liveness_5', 'valence_1', 'v

In [7]:
# Persist the encoded frame. No `index` argument is passed, so pandas' default
# applies (presumably the row index is written as an extra leading column).
new_clean_df.to_csv('Data/new_songs_cleaned.csv')

In [8]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors


# Load CSV files (USE CORRECT PATHS)

# Song embeddings, without the extra index column stored in the CSV
emb_file = '../../data/embeddings_df_001.csv'
embeddings = pd.read_csv(emb_file).drop(columns='Unnamed: 0')

# Track metadata
track_file = '../../data/tracks.csv'
tracks = pd.read_csv(track_file)

In [47]:
def find_neighbors(song, n_neighbors=10):
    '''
    Find the nearest neighbors of a song
    1. Checks that the song exists in `tracks` (the first match by name is used)
    2. Converts the `embeddings` dataframe into an array
    3. Trains a nearest neighbors model on that array
    4. Finds the `n_neighbors` nearest neighbors of the given song
    ARGUMENTS:
        song: song name in string form
        n_neighbors: number of neighbors to return besides the search song
            (default 10)
    RETURNS: list of indices whose first element is the search song, followed
        by at most `n_neighbors` neighbor indices in the order returned by the
        model; or the string 'ERROR: Not a valid song name' when no track
        has that name
    '''
    # 1. Check if song exists: if yes, use first result
    songs = tracks.index[tracks.name == song]
    if len(songs) < 1:
        return 'ERROR: Not a valid song name'
    song_index = songs[0]

    # 2. Prepare song embeddings data
    # Convert dataframe to numpy array
    # NOTE(review): song_index is an index label of `tracks` used as a row
    # position here -- assumes `tracks` and `embeddings` are row-aligned with
    # a default integer index; confirm.
    encoded_songs = embeddings.to_numpy()

    # 3. Train nearest neighbors model on encodings
    # One extra neighbor is requested because the search song itself is
    # normally among the results and gets filtered out below.
    n = n_neighbors + 1
    nn = NearestNeighbors(n_neighbors=n, algorithm='ball_tree')
    nn.fit(encoded_songs)

    # 4. Get neighbors of song
    test_encoding = encoded_songs[song_index].reshape(1, -1)
    _, n_indices = nn.kneighbors(test_encoding)
    # Remove the search song if present. A new list is built instead of
    # popping while iterating, and the result is truncated so the length is
    # the same whether or not the search song was among the results.
    neighbor_indices = [i for i in n_indices.tolist()[0] if i != song_index]
    neighbor_indices = neighbor_indices[:n_neighbors]

    # FIRST INDEX IS SEARCH SONG!
    return [song_index] + neighbor_indices

In [48]:
# Look up the neighbors of an example song; the first returned index is the
# search song itself.
neighbors = find_neighbors('Twist and Shout')

In [None]:
# Print the name of the search song followed by each returned neighbor.
# Iterating over the list itself avoids hard-coding the result length.
for track_idx in neighbors:
    print(tracks.iloc[track_idx]['name'])

Twist and Shout
Twist and Shout
Wherever I Lay My Hat (That's My Home)
Pars
Tari Topeng
Dos Canciones Populares Catalanas
Pala-Pala (Tottemic Dance)
Catch the Wind (Single Version with Strings)
Pare de Agir Assim
Good Hearted Woman - Live in Texas - September 1974
Mama Mama Mama
