In [3]:
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col,mean,max,min

In [6]:
spark = SparkSession.builder.appName("rec").config("spark.executor.memory", "5g").config("spark.driver.memory", "5g").master("local[*]").getOrCreate()

In [7]:
artists = spark.read.csv('artist_data.txt',inferSchema=True,sep='\t')

In [8]:
artists = artists.select(col("_c0").alias("artist_id"), col("_c1").alias("artist_name"))

In [9]:
aliases = spark.read.csv('artist_alias.txt',inferSchema=True,sep='\t')

In [10]:
aliases = aliases.select(col("_c0").alias("wrong_id"), col("_c1").alias("correct_id"))

In [11]:
listen_counts = spark.read.csv('user_artist_data.txt',inferSchema=True,sep=' ')

In [12]:
listen_counts = listen_counts.select(col("_c0").alias("user_id"), col("_c1").alias("artist_id"), col('_c2').alias('counts'))

In [13]:
listen_counts.select(mean('counts'),max('counts'),min('counts')).show()

+-----------------+-----------+-----------+
|      avg(counts)|max(counts)|min(counts)|
+-----------------+-----------+-----------+
|15.29576248089362|     439771|          1|
+-----------------+-----------+-----------+



In [14]:
artists.count()

1848671

In [15]:
correct_artists = artists.join(aliases,artists.artist_id==aliases.correct_id,how='left').filter(col('wrong_id').isNotNull()).select('correct_id','wrong_id',col('artist_name').alias('true_name'))

In [16]:
correct_artists.head(10)

[Row(correct_id=1240105, wrong_id=2078895, true_name='André Visior'),
 Row(correct_id=1240105, wrong_id=1081871, true_name='André Visior'),
 Row(correct_id=6950366, wrong_id=2009391, true_name='the.5th.element'),
 Row(correct_id=1030848, wrong_id=6664739, true_name="Raver's Nature"),
 Row(correct_id=1030848, wrong_id=6802865, true_name="Raver's Nature"),
 Row(correct_id=1030848, wrong_id=1107458, true_name="Raver's Nature"),
 Row(correct_id=6671603, wrong_id=10052273, true_name='サエキトモ'),
 Row(correct_id=1147788, wrong_id=1106359, true_name='Mog Stunt Team'),
 Row(correct_id=1240185, wrong_id=1287345, true_name='Lexy & K. Paul'),
 Row(correct_id=1240185, wrong_id=1236792, true_name='Lexy & K. Paul')]

In [17]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
wrong_to_correct = udf(lambda artist_name,true_name: artist_name if true_name == None else true_name, StringType())

In [18]:
temp = artists.join(correct_artists,artists.artist_id==correct_artists.wrong_id,how='left')

In [19]:
artists = temp.withColumn('name',wrong_to_correct(temp.artist_name,temp.true_name)).select('artist_id','name')

In [20]:
artists.head(10)

[Row(artist_id='148', name='P.O.D.'),
 Row(artist_id='463', name='The Smiths'),
 Row(artist_id='471', name='The Beta Band'),
 Row(artist_id='496', name='DJ Hell'),
 Row(artist_id='833', name='Otis Redding'),
 Row(artist_id='1088', name='Cash Money & Marvelous'),
 Row(artist_id='1342', name='The Peechees'),
 Row(artist_id='1580', name='Bang The Party'),
 Row(artist_id='1645', name='Oz Artists'),
 Row(artist_id='1829', name='Agent Provocateur')]

In [21]:
from pyspark.mllib.recommendation import ALS

In [22]:
model = ALS.trainImplicit(listen_counts,10)

In [58]:
listen_counts.filter(col('user_id')==1000002).withColumnRenamed('artist_id','left_artist_id').join(artists,col('left_artist_id')==artists.artist_id).show()

+-------+--------------+------+---------+--------------------+
|user_id|left_artist_id|counts|artist_id|                name|
+-------+--------------+------+---------+--------------------+
|1000002|           833|     5|      833|        Otis Redding|
|1000002|       1001129|    23|  1001129|Hootie & the Blow...|
|1000002|       1003373|     3|  1003373|        Thom Rotella|
|1000002|       1007972|     1|  1007972|     Archers of Loaf|
|1000002|       1029443|    18|  1029443|Meanwhile, Back I...|
|1000002|       1076507|     1|  1076507|Run-D.M.C. vs. Ja...|
|1000002|       1318111|    23|  1318111|Joni Mitchell & J...|
|1000002|       1001307|     1|  1001307|           Eiffel 65|
|1000002|       1003926|    35|  1003926|     Collective Soul|
|1000002|          1270|    26|     1270|               Queen|
|1000002|          1303|     5|     1303|           The Kinks|
|1000002|       1001410|     5|  1001410|           [unknown]|
|1000002|       1235281|     6|  1235281|             T

In [59]:
for rec in model.recommendProducts(1000002,5):
    artists.filter(col('artist_id')== rec[1]).show()

+---------+-----------+
|artist_id|       name|
+---------+-----------+
|  1000113|The Beatles|
+---------+-----------+

+---------+-----+
|artist_id| name|
+---------+-----+
|     1270|Queen|
+---------+-----+

+---------+----------+
|artist_id|      name|
+---------+----------+
|       82|Pink Floyd|
+---------+----------+

+---------+------------+
|artist_id|        name|
+---------+------------+
|     1394|Led Zeppelin|
+---------+------------+

+---------+----+
|artist_id|name|
+---------+----+
|     1205|  U2|
+---------+----+



In [40]:
model.recommendProducts(2093760,5)

[Rating(user=2093760, product=1007614, rating=0.03323218874594814),
 Rating(user=2093760, product=4605, rating=0.030922344161561355),
 Rating(user=2093760, product=2814, rating=0.03076250545533085),
 Rating(user=2093760, product=829, rating=0.029528970564163405),
 Rating(user=2093760, product=1037970, rating=0.02865586575534429)]

In [56]:
listen_counts.filter(col('user_id')!=100000).head(10)

[Row(user_id=1000019, artist_id=1000010, counts=11),
 Row(user_id=1000019, artist_id=1000028, counts=5),
 Row(user_id=1000019, artist_id=1000033, counts=2),
 Row(user_id=1000019, artist_id=1000036, counts=5),
 Row(user_id=1000019, artist_id=1000054, counts=1),
 Row(user_id=1000019, artist_id=1000055, counts=1),
 Row(user_id=1000019, artist_id=1000062, counts=1),
 Row(user_id=1000019, artist_id=1000069, counts=3),
 Row(user_id=1000019, artist_id=1000071, counts=2),
 Row(user_id=1000019, artist_id=1000076, counts=10)]