### University of California, Santa Barbara

### PSTAT 135 / 235 - Big Data Analytics

### Prof Tashman

### PURPOSE: Build a recommendation algorithm based on user listening data from Autoscrobbler

#### NOTE: This tutorial is for illustration of code and results only. It will not run, as data is not uploaded.

In [1]:
# import modules
import os

from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.mllib import recommendation
from pyspark.mllib.recommendation import *
import pandas as pd

In [2]:
# set configurations
conf = SparkConf().setMaster("local").setAppName("autoscrobbler")

In [3]:
# set context
sc = SparkContext.getOrCreate(conf=conf)

In [4]:
# pathing and params
dir_data = '/data/'

user_artist_data_file = 'user_artist_data.txt'
artist_data_file = 'artist_data.txt'
artist_alias_data_file  = 'artist_alias.txt'

user_artist_full_path = os.path.join('data/',user_artist_data_file)
artist_full_path = os.path.join('data/',artist_data_file)
artist_alias_full_path = os.path.join('data/',artist_alias_data_file)

numPartitions = 2
topk = 10

In [5]:
# read user_artist_data_file into RDD (417MB file, 24MM records of users’ plays of artists, along with count)
# specifically, each row holds: userID, artistID, count
rawDataRDD = sc.textFile(user_artist_full_path, numPartitions)
rawDataRDD.cache()

data/user_artist_data.txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0

In [6]:
# read artist_data_file
# artist = pd.read_csv( artist_full_path )
rawArtistRDD = sc.textFile(artist_full_path, numPartitions)
rawArtistRDD.cache()

data/artist_data.txt MapPartitionsRDD[3] at textFile at NativeMethodAccessorImpl.java:0

In [7]:
# read artist_alias_data_file
rawAliasRDD = sc.textFile(artist_alias_full_path, numPartitions)
rawAliasRDD.cache()

data/artist_alias.txt MapPartitionsRDD[5] at textFile at NativeMethodAccessorImpl.java:0

In [8]:
print(rawDataRDD.take(10))

['1000002 1 55', '1000002 1000006 33', '1000002 1000007 8', '1000002 1000009 144', '1000002 1000010 314', '1000002 1000013 8', '1000002 1000014 42', '1000002 1000017 69', '1000002 1000024 329', '1000002 1000025 1']


In [9]:
print(rawArtistRDD.take(10))

['1134999\t06Crazy Life', '6821360\tPang Nakarin', '10113088\tTerfel, Bartoli- Mozart: Don', '10151459\tThe Flaming Sidebur', '6826647\tBodenstandig 3000', '10186265\tJota Quest e Ivete Sangalo', '6828986\tToto_XX (1977', '10236364\tU.S Bombs -', '1135000\tartist formaly know as Mat', '10299728\tKassierer - Musik für beide Ohren']


In [10]:
def parseArtistIdNamePair(singlePair):
   splitPair = singlePair.rsplit('\t')
   # we should have two items in the list - id and name of the artist.
   if len(splitPair) != 2:
       #print singlePair
       return []
   else:
       try:
           return [(int(splitPair[0]), splitPair[1])]
       except:
           return []


In [11]:
artistByID = dict(rawArtistRDD.flatMap(lambda x: parseArtistIdNamePair(x)).collect())

In [12]:
artist_vals = artistByID.values()

In [13]:
# print topk values from artistByID
from collections import Counter
print( Counter(artist_vals).most_common( topk ) )

[('"', 3), ('Rio Natsuki', 2), ('•', 2), ('Einojuhani Rautavaara', 2), ('Ò', 2), ('川田妙子', 2), ('The Beatnuts', 2), ('Stone Temple Pilots', 2), ('{', 2), ('º', 2)]


In [14]:
def parseArtistAlias(alias):
    splitPair = alias.rsplit('\t')
    # we should have two items in the list - id and name of the artist.
    if len(splitPair) != 2:
        #print singlePair
        return []
    else:
        try:
            return [(int(splitPair[0]), int(splitPair[1]))]
        except:
            return []

In [15]:
artistAlias = rawAliasRDD.flatMap(lambda x: parseArtistAlias(x)).collectAsMap()

In [16]:
# turn the artistAlias into a broadcast variable.
# This will distribute it to worker nodes efficiently, so we save bandwidth.
artistAliasBroadcast = sc.broadcast( artistAlias )

In [17]:
artistAliasBroadcast.value.get(2097174)

1007797

In [18]:
# Print the number of records from the largest RDD, rawDataRDD
print( rawDataRDD.count() )

24296858


In [19]:
# Sample 10% of rawDataRDD using seed 314, to reduce runtime. Call it sample.
weights = [.1, .9]
seed = 314
sample, someOtherJunk = rawDataRDD.randomSplit( weights, seed )
sample.cache()

PythonRDD[11] at RDD at PythonRDD.scala:48

In [20]:
# take the first 5 records from the sample. each row represents userID, artistID, count.
sample.take(5)

['1000002 1000014 42',
 '1000002 1000088 157',
 '1000002 1000139 56',
 '1000002 1000140 95',
 '1000002 1000210 23']

In [21]:
# Based on sampled data, build the matrix for model training
def mapSingleObservation(x):
    # Returns Rating object represented as (user, product, rating) tuple.
    # [add line of code here to split each record into userID, artistID, count]
    userID, artistID, count = map(lambda lineItem: int(lineItem), x.split())
    # given possible aliasing, get finalArtistID
    finalArtistID = artistAliasBroadcast.value.get(artistID)
    if finalArtistID is None:
        finalArtistID = artistID
    return Rating(userID, finalArtistID, count)

In [22]:
trainData = sample.map(lambda x: mapSingleObservation(x))
trainData.cache()

PythonRDD[13] at RDD at PythonRDD.scala:48

In [23]:
# Take the first 5 records from trainData
trainData.take(5)

[Rating(user=1000002, product=1000014, rating=42.0),
 Rating(user=1000002, product=1000088, rating=157.0),
 Rating(user=1000002, product=1000139, rating=56.0),
 Rating(user=1000002, product=1000140, rating=95.0),
 Rating(user=1000002, product=1000210, rating=23.0)]

**The Alternating Least Squares Algo is popular for recommendation**

Parameters

`rank`  
The number of latent factors in the model

`iterations`  
The number of iterations that the factorization runs

`lambda`  
A standard overfitting parameter. Higher values resist overfitting.


In [25]:
# Train the ALS model, using seed 314, rank 10, iterations 5, lambda_ 0.01
model = ALS.trainImplicit(trainData, rank=10, iterations=5, alpha=0.01)

In [26]:
model

<pyspark.mllib.recommendation.MatrixFactorizationModel at 0x1076ffeb8>

In [27]:
# Model Evaluation

# fetch artists for a test user
testUserID = 1000002

# broadcast artistByID for speed
artistByIDBroadcast = sc.broadcast( artistByID )

# from trainData, collect the artists for the test user. Call the object artistsForUser.
# hint: you will need to apply .value.get(x.product) to the broadcast artistByID, where x is the Rating RDD.
# if you don't do this, you may see artistIDs. you want artist names.
artistsForUser = (trainData
                  .filter(lambda observation: observation.user == testUserID)
                  .map(lambda observation: artistByIDBroadcast.value.get(observation.product))
                  .collect())

In [28]:
print(artistsForUser)

['Pantera', 'Counting Crows', 'Muse', '(hed) Planet Earth', 'Eve 6', 'Meat Loaf', "Fun Lovin' Criminals", "Guns N' Roses", 'Mike & the Mechanics', 'Fugees', 'Apollo 440', 'The Seahorses', 'Steve Miller', 'John Mayer', 'Frankie Goes To Hollywood', 'Derek and the Dominos', '[unknown]', 'Jeno Jando', 'Eva Cassidy', 'Faith No More', 'Jimi Hendrix', 'Duke Ellington and Johnny Hodg', 'Eric Clapton', 'Tom Waits', 'Tindersticks', 'Phil Collins', 'David Bowie', 'Eurythmics', 'Dream Theater', 'Kid Rock']


In [33]:
# Make 10 recommendations for test user
num_recomm = 10
recommendationsForUser = map(lambda observation: artistByID.get(observation.product), model.call("recommendProducts", testUserID, num_recomm))
print(recommendationsForUser)

<map object at 0x11fdb9978>


In [30]:
# Train a second ALS model, same as first but with rank 20
model2 = ALS.trainImplicit(trainData, rank=20, iterations=5, alpha=0.01)

In [31]:
# Using the rank 20 model, make 10 recommendations for the same test user
recommendationsForUser_rank20 = map(lambda observation: artistByID.get(observation.product), model.call("recommendProducts", testUserID, num_recomm))
print(recommendationsForUser_rank20)

<map object at 0x11fd8dac8>
