# Vector Based Song Recommendation

## Initializing Qdrant Client

In [1]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

In [2]:
# Create the Qdrant client, pointing it at "localhost" and port 6333.
# NOTE(review): presumably a Qdrant server is already running there — confirm before executing.
client = QdrantClient("localhost", port=6333)

In [3]:
# Smoke test: (re)create a small "test_collection" of 4-dimensional vectors
# compared by cosine distance. It is only inspected in the next cell and is not
# used for the song data.
client.recreate_collection(
    collection_name="test_collection",
    vectors_config=VectorParams(size=4, distance=Distance.COSINE),
)

True

In [4]:
# Print the status and configuration the client returns for the test collection.
print(client.get_collection(collection_name="test_collection"))

status=<CollectionStatus.GREEN: 'green'> optimizer_status=<OptimizersStatusOneOf.OK: 'ok'> vectors_count=0 indexed_vectors_count=0 points_count=0 segments_count=8 config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=4, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None), shard_number=1, replication_factor=1, write_consistency_factor=1, on_disk_payload=True), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=1), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0), quantization_config=None) payload_schema={}


In [5]:
# Qdrant is up and running!

## Getting the Spotify Data

In [6]:
import pandas as pd

In [7]:
# index_col=0 makes the CSV's first column the DataFrame index, so it is not
# loaded as an extra data column.
df = pd.read_csv("./data_folder/data.csv", index_col=0)

In [8]:
# Number of (rows, columns) in the loaded frame.
df.shape

(2017, 16)

In [9]:
# List the column labels to see which features are available.
df.columns

Index(['acousticness', 'danceability', 'duration_ms', 'energy',
       'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'time_signature', 'valence', 'target',
       'song_title', 'artist'],
      dtype='object')

In [10]:
# Summary statistics of the raw numeric columns, before any scaling.
df.describe()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,target
count,2017.0,2017.0,2017.0,2017.0,2017.0,2017.0,2017.0,2017.0,2017.0,2017.0,2017.0,2017.0,2017.0,2017.0
mean,0.18759,0.618422,246306.2,0.681577,0.133286,5.342588,0.190844,-7.085624,0.612295,0.092664,121.603272,3.96827,0.496815,0.505702
std,0.259989,0.161029,81981.81,0.210273,0.273162,3.64824,0.155453,3.761684,0.487347,0.089931,26.685604,0.255853,0.247195,0.500091
min,3e-06,0.122,16042.0,0.0148,0.0,0.0,0.0188,-33.097,0.0,0.0231,47.859,1.0,0.0348,0.0
25%,0.00963,0.514,200015.0,0.563,0.0,2.0,0.0923,-8.394,0.0,0.0375,100.189,4.0,0.295,0.0
50%,0.0633,0.631,229261.0,0.715,7.6e-05,6.0,0.127,-6.248,1.0,0.0549,121.427,4.0,0.492,1.0
75%,0.265,0.738,270333.0,0.846,0.054,9.0,0.247,-4.746,1.0,0.108,137.849,4.0,0.691,1.0
max,0.995,0.984,1004627.0,0.998,0.976,11.0,0.969,-0.307,1.0,0.816,219.331,5.0,0.992,1.0


In [11]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2017 entries, 0 to 2016
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   acousticness      2017 non-null   float64
 1   danceability      2017 non-null   float64
 2   duration_ms       2017 non-null   int64  
 3   energy            2017 non-null   float64
 4   instrumentalness  2017 non-null   float64
 5   key               2017 non-null   int64  
 6   liveness          2017 non-null   float64
 7   loudness          2017 non-null   float64
 8   mode              2017 non-null   int64  
 9   speechiness       2017 non-null   float64
 10  tempo             2017 non-null   float64
 11  time_signature    2017 non-null   float64
 12  valence           2017 non-null   float64
 13  target            2017 non-null   int64  
 14  song_title        2017 non-null   object 
 15  artist            2017 non-null   object 
dtypes: float64(10), int64(4), object(2)
memory usag

In [12]:
# Peek at the first five rows.
df.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,target,song_title,artist
0,0.0102,0.833,204600,0.434,0.0219,2,0.165,-8.795,1,0.431,150.062,4.0,0.286,1,Mask Off,Future
1,0.199,0.743,326933,0.359,0.00611,1,0.137,-10.401,1,0.0794,160.083,4.0,0.588,1,Redbone,Childish Gambino
2,0.0344,0.838,185707,0.412,0.000234,2,0.159,-7.148,1,0.289,75.044,4.0,0.173,1,Xanny Family,Future
3,0.604,0.494,199413,0.338,0.51,5,0.0922,-15.236,1,0.0261,86.468,4.0,0.23,1,Master Of None,Beach House
4,0.18,0.678,392893,0.561,0.512,5,0.439,-11.648,0,0.0694,174.004,4.0,0.904,1,Parallel Lines,Junior Boys


## Vectorizing the data

In [13]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

In [14]:
# Feature columns that get rescaled before being turned into vectors.
# `target` is deliberately absent from this list, so it is not rescaled.
numeric_cols = [
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'instrumentalness',
    'key',
    'liveness',
    'loudness',
    'mode',
    'speechiness',
    'tempo',
    'time_signature',
    'valence',
]

# Text columns that describe a song rather than measure it.
non_numeric_cols = [
    'song_title',
    'artist',
]

In [15]:
# Min-max scale each column listed in numeric_cols and write the result back
# into df, overwriting the raw values. `target` is not in numeric_cols, so it is
# left as loaded.
scaler = MinMaxScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

In [16]:
# Re-check the summary statistics: each scaled column should now have min 0 and max 1.
df.describe()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,target
count,2017.0,2017.0,2017.0,2017.0,2017.0,2017.0,2017.0,2017.0,2017.0,2017.0,2017.0,2017.0,2017.0,2017.0
mean,0.18853,0.575895,0.232923,0.67817,0.136563,0.48569,0.181061,0.793272,0.612295,0.087734,0.430066,0.742067,0.482673,0.505702
std,0.261296,0.186809,0.082928,0.213866,0.279879,0.331658,0.1636,0.11472,0.487347,0.113421,0.155627,0.063963,0.258249,0.500091
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.009676,0.454756,0.186097,0.557567,0.0,0.181818,0.077352,0.75337,0.0,0.018161,0.305181,0.75,0.271835,0.0
50%,0.063615,0.590487,0.215681,0.712164,7.8e-05,0.545455,0.113871,0.818817,1.0,0.040106,0.429038,0.75,0.477643,1.0
75%,0.26633,0.714617,0.257227,0.845403,0.055328,0.818182,0.24016,0.864623,1.0,0.107075,0.524809,0.75,0.685541,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Upserting the data vectors into the Collection

In [17]:
from qdrant_client.http import models
from qdrant_client.http.models import PointStruct

We convert each row (each song) into a vector where the vector is made up of the numeric columns \
and the payload is the artist and the song name.

In [18]:
# Will hold one PointStruct per song.
points = []

In [19]:
# Build one Qdrant point per song:
#   id      -> the DataFrame index label of the row,
#   vector  -> every column except the last two (song_title, artist), i.e. the
#              13 scaled feature columns plus `target`, 14 values in total,
#   payload -> the artist and song title, looked up by column name.
# Values are selected with .iloc / column labels instead of integer keys such as
# `row[1][-1]`; integer keys on a label-indexed Series are deprecated positional
# access in recent pandas releases.
# The list is rebuilt from scratch, so re-running this cell never duplicates points.
# NOTE(review): `target` looks like a label rather than an audio feature, yet it
# is part of the vector — confirm this is intended before changing the slice.
points = [
    PointStruct(
        id=song_id,
        vector=[float(value) for value in song.iloc[:-2]],
        payload={
            "artist": song["artist"],
            "song_title": song["song_title"],
        },
    )
    for song_id, song in df.iterrows()
]

In [20]:
# Inspect the first three points.
points[:3]

[PointStruct(id=0, vector=[0.010248431261854055, 0.8248259860788862, 0.1907352427965223, 0.4263628966639544, 0.022438524590163933, 0.18181818181818182, 0.1538623447695222, 0.7411405916437938, 1.0, 0.5144406608651785, 0.596033171596529, 0.75, 0.2624320936063519, 1.0], payload={'artist': 'Future', 'song_title': 'Mask Off'}),
 PointStruct(id=1, vector=[0.19999771657639706, 0.7204176334106729, 0.31448079831273995, 0.35008136696501213, 0.006260245901639344, 0.09090909090909091, 0.12439486423910757, 0.6921622445867641, 1.0, 0.07100517089166353, 0.6544741998693665, 0.75, 0.5779356456330965, 1.0], payload={'artist': 'Childish Gambino', 'song_title': 'Redbone'}),
 PointStruct(id=2, vector=[0.034570108722722384, 0.8306264501160092, 0.17162408897565712, 0.40398698128559796, 0.0002397540983606557, 0.18181818181818182, 0.14754788465586194, 0.7913693199146081, 1.0, 0.33535124227519236, 0.1585390034524587, 0.75, 0.14437944003343084, 1.0], payload={'artist': 'Future', 'song_title': 'Xanny Family'})]

In [21]:
# Sanity check: how many points were built, then how many values one vector holds.
print(len(points), len(points[0].vector), sep="\n")

2017
14


In [22]:
# (Re)create the collection that stores the song vectors, compared by cosine
# distance. The vector size is read from the first point in `points` instead of
# being hard-coded, so it cannot drift out of sync with the columns that went
# into each vector.
client.recreate_collection(
    collection_name="spotify_collection",
    vectors_config=VectorParams(size=len(points[0].vector), distance=Distance.COSINE),
)

True

In [23]:
# Upload all points to the collection in one call, passing wait=True.
# NOTE(review): wait=True presumably makes the call block until the write has
# been applied — confirm against the qdrant-client documentation.
# operation_info is not read again in the cells shown here.
operation_info = client.upsert(
    collection_name="spotify_collection",
    wait=True,
    points = points
)

Data has been inserted into the Collection!

## Searching

In [24]:
# Helper that shows only each hit's song details and score in a readable format
def print_results(results):
    """Print one "(Song: ..., Artist: ...), Score = ..." line per hit.

    results: iterable of objects exposing `.payload` (a mapping with
        'song_title' and 'artist' keys) and `.score`.
    Returns None; the lines are written to standard output.
    """
    line_template = "(Song: {}, Artist: {}), Score = {}"
    for hit in results:
        details = hit.payload
        print(line_template.format(details['song_title'], details['artist'], hit.score))

In [25]:
# Query vector: the 14 scaled values of "Mask Off" by Future (point id 0),
# identical to the vector shown for that point in the points[:3] output.
FaceoffVector = [0.010248431261854055, 0.8248259860788862, 0.1907352427965223, 0.4263628966639544, 0.022438524590163933, 0.18181818181818182, 0.1538623447695222, 0.7411405916437938, 1.0, 0.5144406608651785, 0.596033171596529, 0.75, 0.2624320936063519, 1.0]

In [26]:
# Similarity search: ask the collection for the 3 songs whose vectors are
# closest to the "Mask Off" query vector, then print them with their scores.
search_result = client.search(
    collection_name="spotify_collection",
    query_vector=FaceoffVector,
    limit=3
)
print_results(search_result)

(Song: Mask Off, Artist: Future), Score = 1.0
(Song: Smell Yo D*ck, Artist: Riskay), Score = 0.98770213
(Song: River, Artist: Ibeyi), Score = 0.9871838


In [27]:
# As we can see, these are the top 3 songs with a similar vector to our input vector.
# We can increase limit to show more vectors.

In [28]:
# Payload filter that keeps only songs whose "artist" field matches
# "Childish Gambino"; the bare name on the last line displays the filter object.
childishGambino = models.Filter(
    must=[
        models.FieldCondition(
            key="artist",
            match=models.MatchValue(value="Childish Gambino"),
        ),
    ],
)
childishGambino

Filter(should=None, must=[FieldCondition(key='artist', match=MatchValue(value='Childish Gambino'), range=None, geo_bounding_box=None, geo_radius=None, values_count=None)], must_not=None)

In [29]:
# Run the "Mask Off" query again, this time restricted by the Childish Gambino
# filter; up to 10 hits are requested.
search_result = client.search(
    collection_name="spotify_collection",
    query_vector=FaceoffVector,
    limit=10,
    query_filter=childishGambino,
)
print_results(search_result)

(Song: Redbone, Artist: Childish Gambino), Score = 0.959617
(Song: III. Telegraph Ave. ("Oakland" by Lloyd), Artist: Childish Gambino), Score = 0.8333115


In [30]:
# As we can see, it finds the songs closest to the input that have artist = "Childish Gambino"
# Note: There are only two songs by Childish Gambino in this dataset, which is why it's only showing two results.

# Recommendation Engine

Now we can use the `client.recommend()` function, which takes our preferences as the IDs of songs we like (positive examples) and songs we dislike (negative examples) in order to find the optimal song for us!

Let's say, for example,\
My liked songs: 157, 421, 662\
My disliked songs: 5, 6, 37, 117.

In [33]:
# Ask for recommendations based on point ids: `positive` lists the liked songs,
# `negative` the disliked ones. No limit is passed here; the output for this
# cell shows 10 results.
search_result = client.recommend(
    collection_name="spotify_collection",
    positive=[157, 421, 662],
    negative=[5, 6, 37, 117], 
)
print_results(search_result)

(Song: I've Seen Footage, Artist: Death Grips), Score = 0.95813453
(Song: Monster, Artist: Kanye West), Score = 0.95658845
(Song: I Know There's Gonna Be (Good Times), Artist: Jamie xx), Score = 0.95479816
(Song: Auditorium, Artist: Mos Def), Score = 0.9527035
(Song: Capitol, Artist: TR/ST), Score = 0.9485413
(Song: Oh My Darling Don't Meow (Just Blaze Remix), Artist: Run The Jewels), Score = 0.94691026
(Song: Mary Jane (Jamie Xx - Girl Remix), Artist: Jamie xx), Score = 0.946759
(Song: Nightcall, Artist: Kavinsky), Score = 0.94642156
(Song: I Know There's Gonna Be (Good Times), Artist: Jamie xx), Score = 0.9458123
(Song: Lose My Mind, Artist: A-Trak), Score = 0.9424009


In [73]:
# Same liked/disliked songs as before, but only Backstreet Boys songs may be
# returned: the artist condition sits under `must`, acting as a whitelist.
search_result = client.recommend(
    collection_name="spotify_collection",
    positive=[157, 421, 662],
    negative=[5, 6, 37, 117],
    query_filter=models.Filter(
        must=[
            models.FieldCondition(
                key="artist",
                match=models.MatchValue(value="Backstreet Boys"),
            ),
        ],
    ),
    limit=3,
)
print_results(search_result)

(Song: Everybody (Backstreet's Back) - Radio Edit, Artist: Backstreet Boys), Score = 0.80196214
(Song: The Call, Artist: Backstreet Boys), Score = 0.7970424
(Song: I Want It That Way, Artist: Backstreet Boys), Score = 0.7836303


In [72]:
# Same liked/disliked songs as before, but Backstreet Boys songs are excluded:
# the artist condition sits under `must_not`, acting as a blacklist.
search_result = client.recommend(
    collection_name="spotify_collection",
    positive=[157, 421, 662],
    negative=[5, 6, 37, 117],
    query_filter=models.Filter(
        must_not=[
            models.FieldCondition(
                key="artist",
                match=models.MatchValue(value="Backstreet Boys"),
            ),
        ],
    ),
    limit=3,
)
print_results(search_result)

(Song: I've Seen Footage, Artist: Death Grips), Score = 0.95813453
(Song: Monster, Artist: Kanye West), Score = 0.95658845
(Song: I Know There's Gonna Be (Good Times), Artist: Jamie xx), Score = 0.95479816
