## KNN from song_list_v7_binned.pkl

In [1]:
import ast
from joblib import dump, load
import numpy as np
import pickle
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer, OrdinalEncoder

##### Read in the song DataFrame

In [2]:
# Load the pre-binned song table. The context manager closes the file handle
# as soon as the pickle has been read (the bare open() call leaked it).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file, so
# only load pickles from a trusted source. The absolute path is machine-specific
# and will need adjusting on any other machine.
with open(r'C:\Users\mjhar\Downloads\song_list_v7_binned.pkl', 'rb') as pkl_file:
    song_df = pickle.load(pkl_file)

In [3]:
song_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555160 entries, 0 to 555159
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype   
---  ------            --------------   -----   
 0   songid            555160 non-null  object  
 1   availability      555160 non-null  float64 
 2   year              555160 non-null  int64   
 3   quin_bins         555160 non-null  category
 4   popularity        555160 non-null  object  
 5   acousticness      555160 non-null  float64 
 6   artist            555160 non-null  object  
 7   danceability      555160 non-null  float64 
 8   duration_ms       555160 non-null  float64 
 9   energy            555160 non-null  float64 
 10  instrumentalness  555160 non-null  float64 
 11  key               555160 non-null  float64 
 12  liveness          555160 non-null  float64 
 13  loudness          555160 non-null  float64 
 14  mode              555160 non-null  float64 
 15  speechiness       555160 non-null  float64 
 16  te

##### Relabel the year bins with four-digit years so they sort chronologically for OrdinalEncoder

In [4]:
# Old bin label -> four-digit-year label, listed chronologically.
# Bin labels that are not keys of this mapping are left exactly as they are.
replacements = {
    '>50s': '0-1950',
    '50-60': '1950-1960',
    '60-65': '1960-1965',
    '65-70': '1965-1970',
    '70-75': '1970-1975',
    '75-80': '1975-1980',
    '80-85': '1980-1985',
    '85-90': '1985-1990',
    '90-95': '1990-1995',
    '95-2000': '1995-2000',
    '2020': '2020-2021',
}

# Apply the renames one label at a time to the quin_bins column.
for old_label, new_label in replacements.items():
    song_df['quin_bins'] = song_df['quin_bins'].replace(old_label, new_label)

##### Defining data columns by data type

In [5]:
# Column holding the year-bin label; it is ordinal-encoded further down.
cat_cols = ['quin_bins']

# Columns treated as numeric features and min-max scaled further down.
num_cols = [
    'acousticness',
    'danceability',
    'energy',
    'instrumentalness',
    'liveness',
    'loudness',
    'mode',
    'speechiness',
    'tempo',
    'time_signature',
    'valence',
]

##### Split columns by data type for processing

In [6]:
# Split the frame by column group so each group can go through its own
# preprocessor: the year-bin column to the encoder, the rest to the scaler.
cat_df = song_df[cat_cols]
num_df = song_df[num_cols]

##### Apply processors to data and combine

In [7]:
# Encode each quin_bins label as an integer code. The bin labels were rewritten
# with four-digit years earlier in the notebook so that their sort order is
# usable by the encoder.
encoder = OrdinalEncoder()
enc_arr = encoder.fit_transform(cat_df)

In [8]:
# Rescale every numeric column to MinMaxScaler's default [0, 1] range so that
# columns measured on different scales contribute comparably.
scaler = MinMaxScaler()
scale_arr = scaler.fit_transform(num_df)

In [9]:
# Column layout of the combined feature matrix: the encoded year bin first,
# followed by the scaled numeric features in num_cols order.
# NOTE(review): the ordinal bin code is not rescaled, so it can be much larger
# than the [0, 1] numeric features and may dominate neighbour distances after
# row normalisation. Confirm this weighting is intended.
processed_data = np.concatenate((enc_arr, scale_arr),axis=1)

##### Normalize rows for NN

In [10]:
# Rescale each row (one song) to unit norm. Normalizer operates per sample,
# not per column, so this changes the length of each feature vector only.
normalizer = Normalizer()
model_data = normalizer.fit_transform(processed_data)

##### Instantiate and fit model

In [11]:
# n_neighbors is the default number of neighbours returned per query.
# NOTE(review): 10000 is presumably kept large so the genre filter applied to
# the results still has plenty of candidates to choose from; confirm.
model = NearestNeighbors(
    n_neighbors=10000,
    algorithm='kd_tree',
    n_jobs=-1,  # run neighbour searches on all available cores
)

In [12]:
# Build the KD-tree index over every row of model_data. scikit-learn's fit()
# returns the estimator itself, so `nn` refers to the same object as `model`.
nn = model.fit(model_data)

##### Test output

In [13]:
# Use row 377 as the seed song. kneighbors() expects a 2-D (n_queries, n_features)
# array, so the single feature vector is reshaped into a one-row matrix.
# The result is a (distances, indices) pair; results[1] holds the row indices.
results = nn.kneighbors(model_data[377].reshape(1, -1))

In [14]:
results[1]

array([[   377, 343442, 215261, ...,  39142, 261980,  67983]], dtype=int32)

In [15]:
song_df.iloc[377][['artist', 'track', 'genres']]

artist            Jimmy Nail
track               W.L.T.M.
genres    ['classic uk pop']
Name: 377, dtype: object

In [16]:
song_df.iloc[343442][['artist', 'track', 'genres']]

artist                                           Carey Bell
track                                         Just Like You
genres    ['blues', 'chicago blues', 'electric blues', '...
Name: 343442, dtype: object

##### Filter nearest-neighbor results by genre overlap

In [17]:
string="['cartoon','children's music']"

def _nearest_visible_char(text, start, step):
    """Return the closest non-whitespace character next to ``text[start]``.

    ``step`` is -1 to scan left or +1 to scan right. An empty string is
    returned when the scan runs off either end of ``text``.
    """
    pos = start + step
    while 0 <= pos < len(text):
        if not text[pos].isspace():
            return text[pos]
        pos += step
    return ''


def remove_apos(string):
    """Remove apostrophes that sit inside the items of a list-like string.

    The input looks like ``"['cartoon','children's music']"``: a list literal
    whose items are single-quoted but may themselves contain unescaped
    apostrophes, which makes the text unparseable as a Python literal.

    An apostrophe is kept only when it acts as an item delimiter, i.e. when
    (ignoring whitespace) it directly follows ``[`` / ``,`` / the start of the
    string, or directly precedes ``,`` / ``]`` / the end of the string. Every
    other apostrophe is treated as part of the item text and dropped.

    Deciding per character on the untouched input avoids the index drift of
    deleting from a string while still using positions measured on the
    original, so any number of items (and any number of stray apostrophes per
    item) is handled.

    Parameters
    ----------
    string : str
        Stringified list of single-quoted items.

    Returns
    -------
    str
        ``string`` with the in-item apostrophes removed; all other characters
        are preserved in order.
    """
    kept_chars = []
    for idx, elem in enumerate(string):
        if elem == "'":
            opens_item = _nearest_visible_char(string, idx, -1) in ('', '[', ',')
            closes_item = _nearest_visible_char(string, idx, 1) in ('', ',', ']')
            if not (opens_item or closes_item):
                # Neither opening nor closing an item: a stray apostrophe.
                continue
        kept_chars.append(elem)
    return ''.join(kept_chars)

remove_apos(string)

"['cartoon','childrens music']"

In [18]:
def _parse_genre_list(genre_string):
    """Parse a stringified genre list into a set of genre names.

    The text is first evaluated as a Python literal. If that fails with a
    SyntaxError, it is cleaned with ``remove_apos`` and evaluated once more;
    a failure on the second attempt propagates to the caller.
    """
    try:
        return set(ast.literal_eval(genre_string))
    except SyntaxError:
        return set(ast.literal_eval(remove_apos(genre_string)))


def genre_match(seed_genre_string, nn_arr, song_df):
    """Keep the neighbours that share at least one genre with the seed song.

    Parameters
    ----------
    seed_genre_string : str
        Stringified list of the seed song's genres.
    nn_arr : sequence
        Nearest-neighbour query result; ``nn_arr[1][0]`` must be the iterable
        of positional row indices of the neighbours for the single query.
    song_df : pandas.DataFrame
        Song table with a ``'genres'`` column of stringified genre lists,
        addressed positionally.

    Returns
    -------
    list
        The indices from ``nn_arr[1][0]``, in their original order, whose
        genres overlap the seed's genres. The seed's own row is included when
        it appears among the neighbours.

    Raises
    ------
    SyntaxError
        If a genre string still cannot be parsed after apostrophe clean-up.
    """
    seed_genres = _parse_genre_list(seed_genre_string)
    if not seed_genres:
        # Nothing can overlap an empty genre set; skip parsing the neighbours.
        return []

    # Look the column up once instead of building a full row per neighbour.
    genre_column = song_df['genres']

    genre_matches_idx = []
    for idx in nn_arr[1][0]:
        nn_genres = _parse_genre_list(genre_column.iloc[idx])
        if not seed_genres.isdisjoint(nn_genres):
            genre_matches_idx.append(idx)

    return genre_matches_idx

In [19]:
genre_match(song_df.iloc[377]['genres'], results, song_df)

[377,
 120891,
 107544,
 297416,
 228415,
 76407,
 373865,
 355249,
 247937,
 470351,
 204131,
 85465,
 381167,
 309803,
 79666,
 204008,
 163699,
 127817,
 341804,
 9142,
 68119,
 34104,
 270279,
 263760,
 384389,
 461414,
 9379,
 466404,
 334471,
 316919,
 135223,
 1371,
 251463,
 154400,
 134000,
 218216,
 17401,
 15270,
 264462,
 257621,
 299340,
 262154,
 203873,
 341931,
 168047,
 173441,
 145724,
 335586,
 45277,
 418669,
 157736,
 287878,
 326530,
 299418,
 169433,
 95108,
 433455,
 449542,
 269455,
 446186,
 22312,
 224475,
 227869,
 39208,
 445237,
 73777,
 148942,
 183150,
 141703,
 426948,
 119420,
 32083,
 363708,
 431379,
 33904,
 60250,
 50497,
 252545,
 388363,
 48218,
 142165,
 16029,
 309500,
 402980,
 321264,
 373049,
 15068,
 345643,
 75242,
 340906,
 285161,
 246353,
 441634,
 334833,
 246251,
 139949,
 323644,
 96570,
 450052,
 34304,
 461559,
 361826,
 214005,
 76507,
 163320,
 306030,
 27450,
 228933,
 423313,
 247015,
 430371,
 9104,
 33628,
 343394,
 55784,
 2

In [None]:
#dump(nn, 'nn-bins.joblib')

#### Graveyard

##### unused pipeline attempt

In [None]:
# Unused alternative: the same encoder/scaler preprocessing expressed as a
# ColumnTransformer feeding NearestNeighbors inside one pipeline.
# NOTE(review): unlike the cells actually used above, this variant has no
# Normalizer step, so it would not reproduce the same feature matrix.
category_pipeline = make_pipeline(OrdinalEncoder())
numeric_pipeline = make_pipeline(MinMaxScaler())

processor = make_column_transformer(
    (category_pipeline, cat_cols),
    (numeric_pipeline, num_cols),
)

nn_pipeline = make_pipeline(
    processor,
    NearestNeighbors(n_neighbors=10000, algorithm='kd_tree', n_jobs=-1),
)