In [1]:
import pyspark.conf
import pyspark.sql
# Short aliases for the two PySpark entry-point classes.
SparkConf, SparkSession = pyspark.conf.SparkConf, pyspark.sql.SparkSession

# Build (or reuse) the notebook's Spark session: 2g per executor, 4g for the
# driver, and cross joins explicitly enabled.
spark = (
    SparkSession.builder
    .appName("Intro")
    .config('spark.executor.memory', '2g')
    .config('spark.driver.memory', '4g')
    .config("spark.sql.crossJoin.enabled", "true")
    .getOrCreate()
)

In [2]:
from pyspark.sql.types import IntegerType
from pyspark.sql import functions as F
from pyspark.sql import Row
from pyspark.sql.types import IntegerType

In [3]:
from pyspark.ml.recommendation import ALS
import random

In [4]:
# Load the play-count file as plain text: one row per line, in a single
# string column named `value`.
raw_user_artist_path = "data/audioscrobbler_data/user_artist_data.txt"
raw_user_artist_data = spark.read.format("text").load(raw_user_artist_path)

In [5]:
# Each line is "<user id> <artist id> <play count>", separated by single spaces.
raw_user_artist_data.show(n=5)

+-------------------+
|              value|
+-------------------+
|       1000002 1 55|
| 1000002 1000006 33|
|  1000002 1000007 8|
|1000002 1000009 144|
|1000002 1000010 314|
+-------------------+
only showing top 5 rows



In [6]:
# Artist lookup file: each line is "<artist id>\t<artist name>".
raw_artist_data = spark.read.format("text").load("data/audioscrobbler_data/artist_data.txt")
raw_artist_data.show(n=5)

+--------------------+
|               value|
+--------------------+
|1134999\t06Crazy ...|
|6821360\tPang Nak...|
|10113088\tTerfel,...|
|10151459\tThe Fla...|
|6826647\tBodensta...|
+--------------------+
only showing top 5 rows



In [7]:
# Alias file: each line holds two tab-separated artist IDs.
raw_artist_alias = spark.read.format("text").load("data/audioscrobbler_data/artist_alias.txt")
raw_artist_alias.show(n=5)

+-----------------+
|            value|
+-----------------+
| 1092764\t1000311|
| 1095122\t1000557|
| 6708070\t1007267|
|10088054\t1042317|
| 1195917\t1042317|
+-----------------+
only showing top 5 rows



Collaborative filtering is a technique for recommending items that a user might like based on the reactions of similar users. It works by searching a large group of people to find a smaller set of users whose tastes are similar to those of a particular user.
For example, deciding that two users might both like the same song because they play many of the same other songs
is a form of collaborative filtering.

### Alternating Least Squares Algorithm
For further information:
http://yifanhu.net/PUB/cf.pdf and 
https://dl.acm.org/doi/10.1007/978-3-540-68880-8_32

In [9]:
from pyspark.sql.functions import split, min, max
from pyspark.sql.types import IntegerType, StringType
# Parse the space-separated raw line into three integer columns. The i-th
# field of the split becomes the i-th named column; the raw `value` column is
# not carried over.
user_artist_df = raw_user_artist_data.select(
    *[
        split(raw_user_artist_data['value'], ' ')
        .getItem(position)
        .cast(IntegerType())
        .alias(column_name)
        for position, column_name in enumerate(('user', 'artist', 'count'))
    ]
)

# Min and max user and artist ID.
# NOTE: `min` / `max` here are the pyspark.sql.functions aggregates imported
# in this cell, which shadow the Python builtins of the same name.
user_artist_df.select(
    min("user"), max("user"), min("artist"), max("artist")
).show()

+---------+---------+-----------+-----------+
|min(user)|max(user)|min(artist)|max(artist)|
+---------+---------+-----------+-----------+
|       90|  2443548|          1|   10794401|
+---------+---------+-----------+-----------+



In [12]:
from pyspark.sql.functions import col
# Split every "<id><whitespace><name>" line into at most two parts (limit=2),
# so whitespace inside the artist name stays part of the name.
# The pattern is a raw string: '\s' in a plain string literal is an invalid
# escape sequence (DeprecationWarning since Python 3.6, SyntaxWarning since 3.12).
artist_by_id = (
    raw_artist_data
    .withColumn('id', split(col('value'), r'\s+', 2).getItem(0).cast(IntegerType()))
    .withColumn('name', split(col('value'), r'\s+', 2).getItem(1).cast(StringType()))
    .drop('value')
)
artist_by_id.show(5)

+--------+--------------------+
|      id|                name|
+--------+--------------------+
| 1134999|        06Crazy Life|
| 6821360|        Pang Nakarin|
|10113088|Terfel, Bartoli- ...|
|10151459| The Flaming Sidebur|
| 6826647|   Bodenstandig 3000|
+--------+--------------------+
only showing top 5 rows



In [13]:
# Split every alias line on whitespace: field 0 becomes the integer `artist`
# column and field 1 the `alias` column, which is kept as a string here.
# The pattern is a raw string: '\s' in a plain string literal is an invalid
# escape sequence (DeprecationWarning since Python 3.6, SyntaxWarning since 3.12).
artist_alias = (
    raw_artist_alias
    .withColumn('artist', split(col('value'), r'\s+').getItem(0).cast(IntegerType()))
    .withColumn('alias', split(col('value'), r'\s+').getItem(1).cast(StringType()))
    .drop('value')
)
artist_alias.show(5)

+--------+-------+
|  artist|  alias|
+--------+-------+
| 1092764|1000311|
| 1095122|1000557|
| 6708070|1007267|
|10088054|1042317|
| 1195917|1042317|
+--------+-------+
only showing top 5 rows



In [14]:
# Look up both IDs of the first alias pair shown above to see how they are named.
artist_by_id.where(artist_by_id['id'].isin(1092764, 1000311)).show()

+-------+--------------+
|     id|          name|
+-------+--------------+
|1000311| Steve Winwood|
|1092764|Winwood, Steve|
+-------+--------------+



In [18]:
from pyspark.sql.functions import broadcast, when
# Attach the alias table (marked for a broadcast join) to every play record,
# replace `artist` with its alias whenever one exists, and cast the result
# back to an integer because `alias` is a string column.
train_data = (
    user_artist_df
    .join(broadcast(artist_alias), 'artist', how='left')
    .withColumn(
        'artist',
        when(col('alias').isNotNull(), col('alias')).otherwise(col('artist')),
    )
    .withColumn('artist', col('artist').cast(IntegerType()))
    .drop('alias')
)

# Mark the frame for caching; count() is an action, so it runs the pipeline
# and displays the number of training rows.
train_data.cache()
train_data.count()

24296858

In [19]:
from pyspark.ml.recommendation import ALS
# Fit an ALS model on the play counts, treated as implicit feedback
# (implicitPrefs=True), with a fixed seed.
model = ALS(
    userCol='user',
    itemCol='artist',
    ratingCol='count',
    implicitPrefs=True,
    rank=10,
    maxIter=5,
    regParam=0.1,
    alpha=1.0,
    seed=0,
).fit(train_data)

In [20]:
# Show one learned user-factor vector in full (no column truncation).
model.userFactors.show(n=1, truncate=False)

+---+-----------------------------------------------------------------------------------------------------------------------------+
|id |features                                                                                                                     |
+---+-----------------------------------------------------------------------------------------------------------------------------+
|90 |[0.16020626, 0.20717518, -0.17194685, 0.060384676, 0.0627277, 0.54658705, -0.40481892, 0.43657345, -0.10396776, -0.042728294]|
+---+-----------------------------------------------------------------------------------------------------------------------------+
only showing top 1 row



## Spot Checking

In [23]:
# User chosen for the spot check.
user_id = 2093760

# Collect the artist IDs this user has played into a plain Python list.
existing_artist_ids = [
    row[0]
    for row in train_data.filter(train_data['user'] == user_id).select("artist").collect()
]

# Resolve those IDs to artist names.
artist_by_id.filter(col('id').isin(existing_artist_ids)).show()

+-------+---------------+
|     id|           name|
+-------+---------------+
|   1180|     David Gray|
|    378|  Blackalicious|
|    813|     Jurassic 5|
|1255340|The Saw Doctors|
|    942|         Xzibit|
+-------+---------------+



In [24]:
# One-row frame holding just the spot-check user, as recommendForUserSubset
# takes a DataFrame of users.
user_subset = train_data.filter(col('user') == user_id).select('user').distinct()

# Top 5 recommended artists for that user.
top_predictions = model.recommendForUserSubset(user_subset, numItems=5)
top_predictions.show()

+-------+--------------------+
|   user|     recommendations|
+-------+--------------------+
|2093760|[{2814, 0.0294106...|
+-------+--------------------+



In [25]:
# NOTE(review): this cell repeats the previous cell's query verbatim; it
# rebuilds `user_subset` and `top_predictions` from the same inputs.
user_subset = train_data.filter(col('user') == user_id).select('user').distinct()
top_predictions = model.recommendForUserSubset(user_subset, numItems=5)
top_predictions.show()

+-------+--------------------+
|   user|     recommendations|
+-------+--------------------+
|2093760|[{2814, 0.0294106...|
+-------+--------------------+



In [28]:
# Bring the single-row recommendation frame to the driver as a pandas DataFrame.
top_predictions_pandas = top_predictions.toPandas()
print(top_predictions_pandas)

# The first element of each entry in row 0's `recommendations` list is the
# artist ID.
recommended_artist_ids = [
    recommendation[0]
    for recommendation in top_predictions_pandas["recommendations"][0]
]

# Resolve the recommended IDs to artist names.
artist_by_id.filter(col('id').isin(recommended_artist_ids)).show()

      user                                    recommendations
0  2093760  [(2814, 0.029410677030682564), (1300642, 0.028...
+-------+----------+
|     id|      name|
+-------+----------+
|   2814|   50 Cent|
|   4605|Snoop Dogg|
|1007614|     Jay-Z|
|1001819|      2Pac|
|1300642|  The Game|
+-------+----------+

