In [None]:
# Colab setup: the project lives on Google Drive, so mount the Drive and make the
# project folder the working directory. Data files are then opened by bare filename.
import os
from google.colab import drive

path = '/content/drive/MyDrive/AMPLab/LargeScaleDatasets/CollaborativeFiltering'
drive.mount('/content/drive')
os.chdir(path)

Mounted at /content/drive


In [None]:
from scipy import sparse

import implicit
import csv
import numpy as np
import pandas as pd

In [None]:
# I'll be using the implicit library to manage the sparse matrix.
!pip install implicit

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting implicit
  Downloading implicit-0.6.2-cp39-cp39-manylinux2014_x86_64.whl (18.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.6/18.6 MB[0m [31m43.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: implicit
Successfully installed implicit-0.6.2


In [None]:
# Peek at the first lines of the artist MBID -> name CSV: a `mbid,name` header row
# followed by one artist per line.
!head /content/drive/MyDrive/AMPLab/LargeScaleDatasets/CollaborativeFiltering/subset_artistmbid_name.csv

mbid,name
ca3f3ee1-c4a7-4bac-a16a-0b888a396c6b,The Silhouettes
b68a3969-319a-462f-942b-cd35581414fc,Evie Tamala
4c4b7c6f-9285-4d6a-bc10-e5c9e08045f8,wecamewithbrokenteeth
ab1b631b-9896-4433-bef9-7868bf8a42f3,Giant Tomo
66de1369-f9eb-43cb-ae4f-88582a47a624,Elvin Jones & Jimmy Garrison Sextet
3175ff46-8e09-4562-ace2-0e557e0fff42,Disappointment Incorporated
872a1247-c0d8-4970-b608-6752db0e532b,Stereobate
1c2d8a03-9a9f-4424-be44-61fd5634ea1d,We Barbarians
9047f366-3560-40d5-bbcc-fba04118737d,Nodesha


In [None]:
## Build lookup dictionaries in both directions: artist name -> MBIDs and MBID -> names.
## Values are lists so that repeated names or MBIDs in the CSV accumulate instead of
## overwriting each other.

artist_name_2_mbid = {}
artist_mbid_2_name = {}

# newline='' is what the csv module expects for the files it reads. The encoding is
# explicit because the notebook looks up non-ASCII artist names (file assumed to be UTF-8).
with open('subset_artistmbid_name.csv', 'r', newline='', encoding='utf-8') as fr:
    reader = csv.DictReader(fr)
    for line in reader:
        # setdefault creates the list the first time a key is seen and returns the
        # existing list afterwards. Each dictionary is checked against its OWN keys:
        # testing the MBID against the name dictionary would reset the MBID's list
        # on every row and never append.
        artist_mbid_2_name.setdefault(line['mbid'], []).append(line['name'])
        artist_name_2_mbid.setdefault(line['name'], []).append(line['mbid'])

In [None]:
# Check that it works with an artist name: show the first MBID recorded for 'Iron & Wine'.
artist_name_2_mbid['Iron & Wine'][0]

'c3f28da8-662d-4f09-bdc7-3084bf685930'

In [None]:
# Load the listening data from the tab-separated file into a pandas DataFrame.
# Only columns 0, 1 and 3 are kept and named user / artist_mbid / plays (the artist-name
# column is not loaded). The file's first row is skipped and these names are used instead.
# na_filter=False turns off pandas' missing-value detection while parsing.

df = pd.read_csv(
    'userID_artistmbid_artistName_plays.tsv',
    sep='\t',
    usecols=[0, 1, 3],
    skiprows=1,
    names=['user', 'artist_mbid', 'plays'],
    na_filter=False,
)

In [None]:
# Prepare the columns for the sparse matrix: user and artist_mbid become categorical,
# plays becomes an integer, and every distinct artist_mbid gets an integer artist_index
# (assigned by factorize) that can be used as a matrix coordinate.

df = df.astype({'user': 'category', 'artist_mbid': 'category', 'plays': int})
df['artist_index'] = df['artist_mbid'].factorize()[0]

In [None]:
# Two lookup tables for moving between artist_index and artist_mbid in either direction.

# index -> mbid, read straight off the DataFrame columns.
artist_index2artist_mbid = df.set_index('artist_index')['artist_mbid'].to_dict()

# mbid -> index, obtained by inverting the mapping above.
artist_mbid_2_index = {
    mbid: index for index, mbid in artist_index2artist_mbid.items()
}

In [None]:
# We check again that everything is working: artist index -> MBID -> list of names.
# 6245 is the index that the mbid -> index lookup in the following cell reports for
# 'Iron & Wine' in this run, so the expected result is ['Iron & Wine'].

artist_mbid_2_name[artist_index2artist_mbid[6245]]

['Iron & Wine']

In [None]:
# Reverse check: artist name -> first MBID recorded for that name -> artist index.
artist_mbid_2_index[artist_name_2_mbid['Iron & Wine'][0]]

6245

In [None]:
# Map every user ID to a contiguous integer index and build the per-row index arrays
# that serve as sparse-matrix coordinates.
# pd.factorize numbers users in order of first appearance, so the mapping is the same on
# every run (iterating over a set of IDs gives no such guarantee) and no Python-level
# loop over all rows is needed. artist_index is built with factorize as well.
user_indexes, unique_users = pd.factorize(df['user'])
userid_2_index = {user: index for index, user in enumerate(unique_users)}
artist_indexes = df['artist_index']

In [None]:
# Build the sparse play-count matrix in COO form from (values, (rows, columns)):
# rows are artist indexes, columns are user indexes, values are play counts as float32.
from scipy.sparse import coo_matrix, csr_matrix

play_counts = df['plays'].astype(np.float32)
plays = coo_matrix((play_counts, (artist_indexes, user_indexes)))

In [None]:
from implicit.nearest_neighbours import bm25_weight

# Re-weight the raw play counts with BM25 (K1=100, B=0.8). The input matrix has artists
# as rows and users as columns.
artist_user_plays = bm25_weight(plays, K1=100, B=0.8)

# Transpose to users x artists and convert to CSR; this is the matrix handed to model.fit.
user_plays = artist_user_plays.transpose().tocsr()

In [None]:
# Train a matrix-factorisation model with alternating least squares (ALS) on the
# BM25-weighted users x artists matrix.

from implicit.als import AlternatingLeastSquares

# random_state seeds the random initialisation of the factors, so a re-run starts from
# the same point instead of producing slightly different similarity rankings each time.
model = AlternatingLeastSquares(factors=1000, regularization=0.05, alpha=2.0, random_state=42)
model.fit(user_plays)

  0%|          | 0/15 [00:00<?, ?it/s]

In [None]:
# Check for a known artist: find the artists most similar to 'Iron & Wine'.
# (The index is not stored under the name `id`, which would shadow the builtin.)
query_index = artist_mbid_2_index[artist_name_2_mbid['Iron & Wine'][0]]
ids, scores = model.similar_items(query_index)

# Display the results using pandas for nicer formatting.
# Each returned artist index is resolved to its first recorded name BEFORE the frame is
# built, so no chained `recom[...].iloc[i] = ...` assignment is needed. That form raises
# SettingWithCopyWarning and is not guaranteed to write into `recom`.
recom = pd.DataFrame({
    "score": scores,
    "artist_name": [
        artist_mbid_2_name[artist_index2artist_mbid[artist_idx]][0] for artist_idx in ids
    ],
})
recom

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recom['artist_name'].iloc[i] = artist_mbid_2_name[artist_index2artist_mbid[recom['artist'].iloc[i]]][0]


Unnamed: 0,score,artist_name
0,1.0,Iron & Wine
1,0.422283,Gregory Alan Isakov
2,0.390881,Fleet Foxes
3,0.383695,The Shins
4,0.376312,Bon Iver
5,0.375864,José González
6,0.369226,Sufjan Stevens
7,0.365435,Lord Huron
8,0.360813,Death Cab for Cutie
9,0.341159,The Head and the Heart


In [None]:
# Check for Masayoshi Takanaka (高中正義): find the artists most similar to him.
# (The index is not stored under the name `id`, which would shadow the builtin.)
query_index = artist_mbid_2_index[artist_name_2_mbid['高中正義'][0]]
ids, scores = model.similar_items(query_index)

# Display the results using pandas for nicer formatting.
# Each returned artist index is resolved to its first recorded name BEFORE the frame is
# built, so no chained `recom[...].iloc[i] = ...` assignment is needed. That form raises
# SettingWithCopyWarning and is not guaranteed to write into `recom`.
recom = pd.DataFrame({
    "score": scores,
    "artist_name": [
        artist_mbid_2_name[artist_index2artist_mbid[artist_idx]][0] for artist_idx in ids
    ],
})
recom

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recom['artist_name'].iloc[i] = artist_mbid_2_name[artist_index2artist_mbid[recom['artist'].iloc[i]]][0]


Unnamed: 0,score,artist_name
0,1.0,高中正義
1,0.38614,Casiopea
2,0.328007,大貫妙子
3,0.326699,T‐SQUARE
4,0.301526,杏里
5,0.278965,Lamp
6,0.277665,Naniwa Express
7,0.277088,Piper
8,0.269268,Serge Ponsar
9,0.268987,TUCKER


In [None]:
# Check for The Band: find the artists most similar to them.
# (The index is not stored under the name `id`, which would shadow the builtin.)
query_index = artist_mbid_2_index[artist_name_2_mbid['The Band'][0]]
ids, scores = model.similar_items(query_index)

# Display the results using pandas for nicer formatting.
# Each returned artist index is resolved to its first recorded name BEFORE the frame is
# built, so no chained `recom[...].iloc[i] = ...` assignment is needed. That form raises
# SettingWithCopyWarning and is not guaranteed to write into `recom`.
recom = pd.DataFrame({
    "score": scores,
    "artist_name": [
        artist_mbid_2_name[artist_index2artist_mbid[artist_idx]][0] for artist_idx in ids
    ],
})
recom

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recom['artist_name'].iloc[i] = artist_mbid_2_name[artist_index2artist_mbid[recom['artist'].iloc[i]]][0]


Unnamed: 0,score,artist_name
0,1.0,The Band
1,0.518695,Van Morrison
2,0.494562,Crosby
3,0.464024,Bob Dylan
4,0.462077,Crosby
5,0.435122,Grateful Dead
6,0.42556,George Harrison
7,0.420686,Neil Young
8,0.418821,The Byrds
9,0.413197,Warren Zevon
