In [1]:
#Import Packages

# Data analysis
import pandas as pd
import numpy as np
import requests

# Data cleaning
import re

# Tokenizing words
import spacy
from spacy.tokenizer import Tokenizer
from collections import Counter

# TFIDF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Encoding
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
from sqlalchemy import create_engine
import numpy as np
from sklearn import preprocessing  # for category encoder
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from typing import List, Tuple


## Wrangle

In [3]:
spot = pd.read_csv('https://raw.githubusercontent.com/rowaishanna/sp/master/Spotifyfeatures_reducedsize.csv')
print(spot.shape)
spot.head()

(165331, 18)


Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,R&B,Mary J. Blige,Be Without You - Kendu Mix,2YegxR5As7BeQuVp2U6pek,65,0.083,0.724,246333,0.689,0.0,D,0.304,-5.922,Minor,0.135,146.496,4-Apr,0.693
1,R&B,Rihanna,Desperado,6KFaHC9G178beAp7P0Vi5S,63,0.323,0.685,186467,0.61,0.0,C,0.102,-5.221,Minor,0.0439,94.384,4-Mar,0.323
2,R&B,Yung Bleu,Ice On My Baby (feat. Kevin Gates) - Remix,6muW8cSjJ3rusKJ0vH5olw,62,0.0675,0.762,199520,0.52,4e-06,F,0.114,-5.237,Minor,0.0959,75.047,4-Apr,0.0862
3,R&B,Surfaces,Heaven Falls / Fall on Me,7yHqOZfsXYlicyoMt62yC6,61,0.36,0.563,240597,0.366,0.00243,B,0.0955,-6.896,Minor,0.121,85.352,4-Apr,0.768
4,R&B,Olivia O'Brien,Love Myself,4XzgjxGKqULifVf7mnDIQK,68,0.596,0.653,213947,0.621,0.0,B,0.0811,-5.721,Minor,0.0409,100.006,4-Apr,0.466


In [5]:
spot = spot.sample(frac=.2, axis = 0)
spot.shape

(16533, 18)

In [6]:
# Label encode genre
le= LabelEncoder()
spot['genre'] = le.fit_transform(spot['genre'])
spot['time_signature'] = le.fit_transform(spot['time_signature'])
spot['mode'] = le.fit_transform(spot['mode'])
spot['key'] = le.fit_transform(spot['key'])

# Copy dataframe
spot2=spot.copy()

In [7]:
# Combine text columns for tokenization
col_combine = ['artist_name', 'track_name']

# Lowercase and regex
for each in col_combine:
  spot2[each]= spot2[each].apply(lambda x:x.lower())
  spot2[each]= spot2[each].apply(lambda x: re.sub('[^a-zA-Z 0-9]', ' ', x))

# Combine two columns with text
spot2['combined_text'] = spot2['combined_text'] = spot2['artist_name'] + spot2['track_name'] 

# Remove repetitive columns
spot2= spot2.drop(['artist_name', 'track_name', 'track_id'], axis = 1)


In [8]:
spot2.head()

Unnamed: 0,genre,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,combined_text
96532,3,27,0.921,0.157,243000,0.0279,0.815,7,0.106,-26.778,0,0.0447,80.133,0,0.0746,anton n dvo kserenade f r streicher op 22 ...
82680,14,60,0.0383,0.837,379733,0.692,0.0,10,0.212,-4.251,0,0.337,87.775,0,0.805,eazy eboyz n the hood remix
44099,2,32,0.00123,0.429,408533,0.779,0.227,3,0.242,-8.108,0,0.0607,173.23,0,0.847,white denimny money
148429,18,38,0.745,0.576,206640,0.234,0.343,11,0.12,-15.522,1,0.0293,88.529,0,0.364,erykah baduincense
39634,2,41,0.504,0.404,184627,0.373,0.0,1,0.121,-11.233,0,0.0315,77.19,0,0.297,ty segallorange color queen


In [9]:
spot2_sub = spot2.drop(['combined_text'], axis = 1)
spot2_sub.shape

(16533, 15)

In [12]:
spot2_sub.head()

Unnamed: 0,genre,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
117925,12,54,0.00877,0.691,239773,0.929,0.0,8,0.472,-2.73,1,0.0645,130.899,1,0.574
76811,12,72,0.059,0.709,205820,0.718,0.0,9,0.167,-3.315,1,0.0312,104.971,1,0.24
54098,9,74,0.204,0.748,208733,0.705,0.0,7,0.246,-4.547,1,0.485,189.891,1,0.483
129593,16,27,0.119,0.869,280800,0.522,0.000149,5,0.0637,-14.378,0,0.238,83.987,1,0.672
114579,11,39,0.976,0.659,182360,0.113,0.88,1,0.179,-16.756,1,0.0569,95.063,1,0.738


In [10]:
scaler = StandardScaler()
scaler.fit(spot2_sub)
scaled_df = pd.DataFrame(scaler.transform(spot2_sub))
print(scaled_df.shape)
scaled_df.head()

(16533, 15)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,-1.263904,-1.281671,1.817394,-2.708192,0.035412,-2.474926,2.723119,0.470905,-0.553594,-3.546638,-0.770616,-0.453322,-1.25998,-0.347991,-1.68702
1,0.857295,0.878483,-0.874509,1.47192,1.166094,0.370084,-0.418345,1.334998,-0.037353,0.863813,-0.770616,0.963844,-1.00527,-0.347991,1.32256
2,-1.456741,-0.954375,-0.987558,-1.036147,1.404249,0.742793,0.456639,-0.681219,0.108752,0.10867,-0.770616,-0.375749,1.842979,-0.347991,1.495619
3,1.628641,-0.56162,1.28066,-0.132505,-0.265258,-1.591992,0.903768,1.623029,-0.485411,-1.342881,1.297663,-0.527987,-0.980138,-0.347991,-0.494561
4,-1.456741,-0.365242,0.545701,-1.189828,-0.44729,-0.996515,-0.418345,-1.257281,-0.480541,-0.503159,-0.770616,-0.51732,-1.358072,-0.347991,-0.770631


## Spacy

In [86]:
from pandas import Panel
from tqdm import tqdm
tqdm.pandas()

In [87]:
# Get lemmas

spot2['lemmas'] = spot2['combined_text'].progress_apply(get_lemmas)
spot2['lemmas'].head()

100%|██████████| 165331/165331 [23:31<00:00, 117.17it/s]


0    [mary, j,  , bligebe,   , kendu, mix2YegxR5As7...
1             [rihannadesperado6KFaHC9G178beAp7P0Vi5S]
2    [yung, bleuice, baby,  , feat,  , kevin, gates...
3    [surfacesheaven, fall,   , fall, me7yhqozfsxyl...
4    [olivia, o, brienlove, myself4XzgjxGKqULifVf7m...
Name: lemmas, dtype: object

### TFIDF

In [11]:
# Set up TFIDF
# Instantiate vectorizer object

def tokenize(document):
    
    doc = nlp(document)
    
    return [token.lemma_.strip() for token in doc if (token.is_stop != True) and (token.is_punct != True)]

tfidf = TfidfVectorizer(
    stop_words = 'english',
    # tokenizer = tokenize,
    ngram_range = (1,1),
    min_df = 1, 
    max_df = 0.9,
    max_features = 1000)

In [12]:
# Create a vocabulary and tf-idf score per document
text = spot2['combined_text']
dtm = tfidf.fit_transform(text)

In [18]:
spot2.shape

(82666, 16)

In [19]:
# Get feature names to use as dataframe column headers
dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())

# View Feature Matrix as DataFrame
print(dtm.shape)
dtm.head()

(82666, 1000)


Unnamed: 0,10,11,12,120,13,14,15,16,17,18,20,2001,2002,2003,2006,2007,2009,2010,2011,2012,2013,2015,2016,2017,21,23,24,25,27,28,30,32,40,50,626,aaron,acoustic,act,adagio,adam,...,whiskey,white,whitney,wild,william,williams,willie,wilson,wind,wine,wish,wisin,wit,wiz,wolf,wolfgang,woman,women,won,wonder,wood,words,work,world,wrong,xavier,ya,yeah,year,years,yellow,yo,york,young,yung,zac,zero,ziggy,zion,zz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
combined_df=pd.concat([scaled_df, dtm], axis = 1)
combined_df.shape

(82666, 1015)

# Similarity Recommender
## A.  Nearest Neighbors

In [27]:
nn = NearestNeighbors(n_neighbors=6)

In [28]:
nn.fit(combined_df)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=6, p=2,
                 radius=1.0)

In [41]:
# looking for similar song to 1st entry in DF
a = nn.kneighbors([combined_df.iloc[0].values])

In [42]:
# Distances of the first 5 from inquired song
a[0]

array([[0.        , 0.77401582, 1.33124212, 1.3541948 , 1.39629676,
        1.40528003]])

In [74]:
# Song identifiers
a[1]

array([[    0, 11271,  7246, 59453, 68459, 69788]])

In [75]:
# Closest one

a[1][0][1]

11271

In [52]:
spot.columns

Index(['genre', 'artist_name', 'track_name', 'track_id', 'popularity',
       'acousticness', 'danceability', 'duration_ms', 'energy',
       'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'time_signature', 'valence'],
      dtype='object')

In [73]:
print(spot['artist_name'][0], spot['track_name'][0])  # inquiry

print(spot['artist_name'][11271], spot['track_name'][11271])  # similar 1
print(spot['artist_name'][11271], spot['track_name'][11271])
#print(spot['artist_name'][68459], spot['track_name'][68459])

Mary J. Blige Be Without You - Kendu Mix
Modest Mouse The Ground Walks, with Time in a Box
Modest Mouse The Ground Walks, with Time in a Box


# Similarity Recommender
## Cosine Similarity

In [13]:
# Calculate Distance of TF-IDF Vectors
from sklearn.metrics.pairwise import cosine_similarity

In [15]:
# Calculate Distance of TF-IDF Vectors
dist_matrix  = cosine_similarity(dtm)

In [16]:
# Turn it into a DataFrame
cosine_df = pd.DataFrame(dist_matrix)
print(cosine_df.shape)
cosine_df.head()

(16533, 16533)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,16493,16494,16495,16496,16497,16498,16499,16500,16501,16502,16503,16504,16505,16506,16507,16508,16509,16510,16511,16512,16513,16514,16515,16516,16517,16518,16519,16520,16521,16522,16523,16524,16525,16526,16527,16528,16529,16530,16531,16532
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.168212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.378948,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Verify it was added
leng = len(spot)-1
spot.iloc[leng]

genre                                           7
artist_name                                Tchami
track_name          Move Your Body (Future House)
track_id                   5fFEgmLAbNG75zKPn3hXOO
popularity                                     46
acousticness                              0.00424
danceability                                0.818
duration_ms                                219429
energy                                       0.87
instrumentalness                            0.196
key                                             0
liveness                                   0.0429
loudness                                   -5.162
mode                                            0
speechiness                                0.0493
tempo                                     123.999
time_signature                                  0
valence                                     0.741
Name: 25045, dtype: object

In [17]:
# # Grab the top 5 most similar strains to the custom strain at the start.
last_cosine = len(cosine_df)-1
cosine_results = cosine_df[cosine_df[0] < 1][last_cosine].sort_values(ascending=False)[1:6]
cosine_results =  pd.DataFrame(cosine_results)
cosine_results = cosine_results.reset_index()
cos_results = cosine_results['index'].values.tolist()
cos_results

[4678, 1803, 9873, 9472, 7437]

In [24]:
# Check results
print('----------------------------')
print('----------------------------')
print(f"Seed song:") 
print(f"{spot.iloc[leng]}")
print('----------------------------')
print('----------------------------')
print('Similar songs:')
print('----------------------------')
for each in cos_results:
  print(spot.iloc[each])

----------------------------
----------------------------
Seed song:
genre                                           7
artist_name                                Tchami
track_name          Move Your Body (Future House)
track_id                   5fFEgmLAbNG75zKPn3hXOO
popularity                                     46
acousticness                              0.00424
danceability                                0.818
duration_ms                                219429
energy                                       0.87
instrumentalness                            0.196
key                                             0
liveness                                   0.0429
loudness                                   -5.162
mode                                            0
speechiness                                0.0493
tempo                                     123.999
time_signature                                  0
valence                                     0.741
Name: 25045, dtype: object
----