# Unpickling DF

In [1]:
import pickle

In [2]:
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
# The context manager closes the file handle as soon as the frame is loaded.
with open("./data/song_list_v5_hashed.pkl", "rb") as pickle_file:
    df = pickle.load(pickle_file)

In [6]:
# Keep only the first 18 columns. The explicit .copy() makes the result an
# independent frame, so the column assignments made in later cells do not
# trigger pandas' SettingWithCopyWarning.
df = df[df.columns[0:18]].copy()

In [7]:
df.head()

Unnamed: 0,songid,popularity,acousticness,artist,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,track,valence,genres
0,5PS5dpaLogPzYU9hWiWyZb,0,0.0013,Karkkiautomaatti,0.487,157307.0,0.678,0.0551,9.0,0.0846,-7.78,1.0,0.0495,149.94,4.0,Tanssi vaan,0.87,"['finnish indie', 'suomi rock']"
1,41RpZW2lxAdnqDd2nMBzLQ,15,4.5e-05,Hudson Mohawke,0.662,138960.0,0.823,0.952,4.0,0.343,-1.711,0.0,0.0662,177.745,4.0,No One Could Ever,0.621,"['bass music', 'scottish electronic', 'scottis..."
2,3GsS8jzoixpCnp4jDWCEvb,17,0.00276,Kris Kross,0.859,221200.0,0.741,0.0,11.0,0.325,-12.329,0.0,0.271,98.082,4.0,2 Da Beat Ch'yall,0.529,"['hip hop', 'new jack swing', 'rap']"
3,0kq75szR7uDEYrZkT2c4Ry,21,0.348,Jorge Negrete,0.361,173573.0,0.483,3e-06,7.0,0.177,-6.875,1.0,0.0287,94.538,4.0,El hijo del pueblo,0.682,"['bolero', 'mariachi', 'ranchera']"
4,2HyFpkX9J7vv3OZNDaraHZ,1,0.34,Tiger Lou,0.533,318467.0,0.302,0.179,10.0,0.111,-10.308,1.0,0.0307,134.959,3.0,Pilots,0.294,['swedish indie rock']


# Create Genre List

In [125]:
# Clean up the genre list column for string-matching formatting consistency.
# Each raw value is the string form of a list, e.g. "['hip hop', 'rap']".

def _split_genre_string(raw_genres):
    """Turn one stringified genre list into a list of bare genre names.

    Args:
        raw_genres: Text such as "['hip hop', 'rap']". A value that is
            already a list is returned as a shallow copy, so re-running this
            cell after the column has been converted does not fail.

    Returns:
        list[str]: One entry per comma-separated piece, with surrounding
        brackets, spaces and quote characters removed. "[]" yields [""],
        which matches what this cell produced before.
    """
    if isinstance(raw_genres, list):
        return list(raw_genres)

    cleaned = []
    for piece in raw_genres.split(","):
        piece = piece.strip("[] ")
        # Python's list repr wraps a name that contains an apostrophe in
        # double quotes, so both quote styles have to be removed.
        piece = piece.strip("'\"")
        cleaned.append(piece)
    return cleaned


new_genres_list = [_split_genre_string(raw_genres) for raw_genres in df["genres"]]

In [129]:
# Overwrite the raw genre strings with the cleaned per-song lists.
df["genres"] = new_genres_list

In [133]:
# Flatten the per-song genre lists into one long list (duplicates are kept).
all_genres = [genre for genres in df["genres"] for genre in genres]

In [135]:
len(all_genres)

2110517

# Limited Embedding

Create a bitwise (one-hot) embedding for every element of a set, so that any subset of elements can be encoded quickly by adding their vectors.

> Mimic inverted indices in DB

In [136]:
## Get the distinct items, in sorted order

items = sorted(set(all_genres))
items[:10]

['21st century classical',
 '432hz',
 '8-bit',
 'a cappella',
 'aarhus indie',
 'aberdeen indie',
 'abstract',
 'abstract beats',
 'abstract hip hop',
 'abstract idm']

In [137]:
## Create a lookup table for these items

import numpy as np

def create_lookup(item_set):
    """Map every item to its own one-hot vector.

    Args:
        item_set: Sized iterable of hashable items. The position of an item
            during iteration is the position of the 1 in its vector.

    Returns:
        dict: item -> NumPy float array of length ``len(item_set)`` that is 0
        everywhere except for a 1 at the item's position. Each item gets its
        own independent array.
    """
    width = len(item_set)
    embeddings = {}

    for position, name in enumerate(item_set):
        # A fresh all-zero vector per item, with only this item's slot set.
        one_hot = np.zeros(width)
        one_hot[position] = 1
        embeddings[name] = one_hot

    return embeddings

# Build the genre -> one-hot vector table from the sorted list of distinct genres.
lookup_dict = create_lookup(items)

In [138]:
# Show the vectors of the first ten entries in the lookup table.
for vector in list(lookup_dict.values())[:10]:
    print(vector)

[1. 0. 0. ... 0. 0. 0.]
[0. 1. 0. ... 0. 0. 0.]
[0. 0. 1. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]


In [139]:
def compress_list(items, lookup_dict=lookup_dict):
    """Sum the lookup vectors of ``items`` into one vector.

    Args:
        items: Iterable of keys present in ``lookup_dict``.
        lookup_dict: Mapping from item to its vector. Defaults to the
            notebook-level ``lookup_dict`` as it exists when this function is
            defined.

    Returns:
        numpy.ndarray: Element-wise sum of the vectors for ``items``. An item
        that appears more than once is counted each time. For empty ``items``
        a zero vector with the lookup's vector length is returned.

    Raises:
        KeyError: If an item is not a key of ``lookup_dict``.
    """
    vectors = [lookup_dict[item] for item in items]
    if not vectors:
        # np.sum over an empty array collapses to the scalar 0.0, which would
        # give rows without items a different shape from every other row.
        width = len(next(iter(lookup_dict.values()), ()))
        return np.zeros(width)
    return np.sum(np.array(vectors), axis=0)

In [81]:
## Example usage

# Given four rows of data:

# row_a = ['a', 'c']
# row_b = ['a', 'b']
# row_c = ['a', 'c', 'd']
# row_d = ['b', 'c', 'd']

# # Calculate their respective vectors

# vec_a = compress_list(row_a)
# vec_b = compress_list(row_b)
# vec_c = compress_list(row_c)
# vec_d = compress_list(row_d)

# vec_a

In [140]:
# Encode every song's genre list as a single summed vector (one per row).
df['genre_embed'] = df.genres.apply(compress_list)

In [144]:
df['genre_embed'][0]

array([0., 0., 0., ..., 0., 0., 0.])

In [147]:
# List the positions of row 0's embedding that are set to 1.
first_embedding = df['genre_embed'][0]
for position, value in enumerate(first_embedding):
    if value == 1:
        print(position, value)

1226 1.0
3065 1.0


In [164]:
# Repeat of the earlier check: print the positions in row 0's embedding that equal 1.
for index,val in enumerate(df['genre_embed'][0]):
    if val == 1:
        print(index,val)

1226 1.0
3065 1.0


In [159]:
df["genre_embed"][0][1226]

1.0

In [160]:
# NOTE(review): 1226 is a position inside the embedding vector (i.e. in `items`),
# whereas .loc[1226] selects the DataFrame row labelled 1226. This shows that
# row's genres, not the genre behind position 1226; `items[1226]` gives the latter.
df["genres"].loc[1226]

['dancehall', 'lovers rock', 'modern reggae', 'riddim', 'soca']

In [153]:
# NOTE(review): 3065 is a position inside the embedding vector (i.e. in `items`),
# whereas .loc[3065] selects the DataFrame row labelled 3065. This shows that
# row's genres, not the genre behind position 3065; `items[3065]` gives the latter.
df["genres"].loc[3065]

['acoustic blues',
 'blues',
 'country blues',
 'piedmont blues',
 'traditional blues']

In [151]:
df["genres"][0]

['finnish indie', 'suomi rock']

In [162]:
items[3065]

'suomi rock'

# Pickling to Try out on KNN model in another notebook w/ Genre Embeds

In [167]:
# pickle.dump(df, open( "./data/song_list_v6_genre_embeds.pkl", "wb" ) )

# Pickling without Genre Embeds

In [175]:
# Write only the first 18 columns to disk (per the section heading, the copy
# without genre embeddings).
# The context manager flushes and closes the file once the dump completes.
with open("./data/song_list_v6", "wb") as pickle_file:
    pickle.dump(df[df.columns[0:18]], pickle_file)

### Using with Pandas

compress_list can be applied to your genre series via:
> df['genre_embed'] = df.genres.apply(compress_list)

## Compare vectors

Use cosine distance to compare vectors (will be more meaningful if set is pre-ordered by similarity prior to making lookup_dict)

In [26]:
from scipy.spatial.distance import cosine

# Build the toy vectors here so this cell runs on a fresh kernel; they were
# previously defined only in a commented-out example cell.
# A dedicated four-item lookup keeps the example independent of the genre lookup.
example_lookup = create_lookup(['a', 'b', 'c', 'd'])
vec_a = compress_list(['a', 'c'], example_lookup)
vec_b = compress_list(['a', 'b'], example_lookup)
vec_c = compress_list(['a', 'c', 'd'], example_lookup)
vec_d = compress_list(['b', 'c', 'd'], example_lookup)

cosine(vec_a, vec_a), cosine(vec_a, vec_b), cosine(vec_a, vec_c), cosine(vec_b, vec_c), cosine(vec_a, vec_d), cosine(vec_c, vec_d)

(0.0,
 0.5,
 0.18350341907227385,
 0.5917517095361369,
 0.5917517095361369,
 0.33333333333333337)

In [None]:
# Print every genre of the first 1000 songs.
# counter = -1 
for genre_list in df["genres"][0:1000]:
#     counter += 1 
    # Once the cleaning cell has run, the column holds lists instead of raw
    # strings, so only split values that are still strings.
    if isinstance(genre_list, str):
        genre_list = genre_list.split(",")
    for genre in genre_list:
        genre = genre.strip("]").strip("[").strip(" ")
        print(genre)
# #         genre = '"' + genre + '"'
#         for column in df.columns[18:]:
#             if column == genre:
#                 print(column,genre)
# #                 df[column].loc[counter] = 1
#             else:
#                 continue

### Comparing cosine similarity

Now that you have multi-dimensional vectors, you can quickly compute cosine similarity

### Other Options

> k-means or kd-tree (very fast, gives neighbors)
> neural network matching (various types)
> logistic regression (use scores.  very good in combination with clustering.  very fast)