# Music Artist Recommender System using Apache Spark and Python

In [23]:
!pip install pyspark



In [24]:
from pyspark.mllib.recommendation import *
from operator import *
import random   #for randomsplit

In [25]:
from pyspark import SparkContext
sc =SparkContext()

In [26]:
artist_data=sc.textFile("artist_data_small.txt") #Load datasets into rdd
artist_alias=sc.textFile("artist_alias_small.txt")
user_artist_userdata=sc.textFile("user_artist_data_small.txt")
artist_dict={}

def splitbyspace(x):
    data = x.split(' ')
    tuple = (int(data[0]),artist_dict.get(int(data[1]),int(data[1])),int(data[2]))
    return tuple

def splitbytab(x):       #used for 2 files artist_data and artist_alias
    data=x.split("\t")
    if data[1].isnumeric()==True:   #for artist_alias
        tuple=(int(data[0]),int(data[1]))
        artist_dict[int(data[0])]=int(data[1])
    else:                           #for artist_data
        tuple=(int(data[0]),data[1])
    return tuple


artist_data=artist_data.map(splitbytab) #map all rows and store
artist_alias=artist_alias.map(splitbytab)
user_artist_userdata=user_artist_userdata.map(splitbyspace)

In [27]:
train_data, validation_data, test_data = user_artist_userdata.randomSplit([40,40,20],13)  #under pyspark lib 13-seed initial number to start the split
train_data.cache()
test_data.cache()
validation_data.cache()


PythonRDD[8] at RDD at PythonRDD.scala:53

In [29]:
def mod(data):
    return data.map(lambda x:(x[0],x[1])).groupByKey().map(lambda x:(x[0],list(x[1]))).collect()    #collects all users and artists 

def modelEval(model, data):     #Model Evaluation by fine tuning parameters
    trainDataMod=mod(train_data)
    trainDataDict={}
    for user,artist in trainDataMod:
        trainDataDict[user]=set(artist) #maps user with set of artists in training set
    
    train_artists= train_data.map(lambda x:x[1]).collect() #list of all artists in training set
    
    predDataMod=mod(data) # for validation set/testing set
    predDataDict={}
    for user,artist in predDataMod:
        predDataDict[user]=set(artist)  #maps user with set of artists in validation/testing set
    
    score=0.0   #calculate hit ratio
    
    for user in predDataDict.keys():    # to calculate hit ratio for each user
        trueArtists = set(train_artists) - trainDataDict[user] #set of artists not in this user's liked artists list
        X = len(predDataDict[user] - trainDataDict[user])   #obtain artists actually liked by user
        uArtists = [(user,artist) for artist in trueArtists]
        uArtists = sc.parallelize(uArtists)
        
        prediction = model.predictAll(uArtists) #predict artists for this user
        prediction = prediction.sortBy(lambda y:y[2], ascending = False).map(lambda y:y[1]).take(X)
        common = set(prediction).intersection(predDataDict[user]) #list of artists predicted as liked by user
        score += len(common)/float(X)
    
    score = score/float(len(predDataDict))  #return average of hit ratios
     
    return score

    
def buildModel(rank):
    return ALS.trainImplicit(train_data, rank, seed=345)

for i in [1,2,10,11,20,21]: #determine best rank parameter by training for each
    print ("The model score for rank " + str(i) + " is " + str(modelEval(buildModel(i),validation_data)))

The model score for rank 1 is 0.07625230985976236
The model score for rank 2 is 0.08616827592156559
The model score for rank 10 is 0.09441971719854263
The model score for rank 11 is 0.0877200294686427
The model score for rank 20 is 0.08408995233356337
The model score for rank 21 is 0.08181984026379828


In [30]:
model1 = ALS.trainImplicit(train_data, rank=10, seed=345) #setting the best paramater value and train the model
modelEval(model1, test_data)    #score for test set

0.0638164944176245

In [32]:
recommend = model1.recommendProducts(1059245, 5)    #function in pyspark.mllib.recommedation library
artist_list = map(lambda x:x[1], recommend) #returning artist name
flag=1
for artist in artist_list:
    for (i,j) in artist_data.collect():
        if i == artist:
            print ("Artist " + str(flag) + ": " + str(j))
            flag+=1
            break



Artist 1: Franz Ferdinand
Artist 2: Syrup16g
Artist 3: Rage Against the Machine
Artist 4: Pet Shop Boys
Artist 5: ゆらゆら帝国


In [None]:
sc.stop()