# 1. LAST-FM: Basic ALS Recommender

This notebook implements a simple ALS recommender based on the LastFM user listening dataset. It uses spark and is written in Scala. 

## 1.1 Imports and set up 

Key libraries are imported, the spark session is initialised and the listening data is loaded in. 

In [14]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.recommendation.ALS
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.sql.types._

import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.recommendation.ALS
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.sql.types._


In [2]:
val spark:SparkSession = SparkSession.builder()
      .master("local[1]")
      .appName("lastfm")
      .getOrCreate() 

22/07/24 20:00:13 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@3442da76


In [3]:
var data_path:String = "../resources/lastfm-dataset-1K/userid-timestamp-artid-artname-traid-traname.tsv"

data_path: String = ../resources/lastfm-dataset-1K/userid-timestamp-artid-artname-traid-traname.tsv


In [4]:
val schema = new StructType()
            .add("user_id", StringType, true)
            .add("timestamp", StringType, true)
            .add("artist_id", StringType, true)
            .add("artist_name", StringType, true)
            .add("track_id", StringType, true)
            .add("track_name", StringType, true)

schema: org.apache.spark.sql.types.StructType = StructType(StructField(user_id,StringType,true),StructField(timestamp,StringType,true),StructField(artist_id,StringType,true),StructField(artist_name,StringType,true),StructField(track_id,StringType,true),StructField(track_name,StringType,true))


In [5]:
val listener_data = spark.read.option("header", false).schema(schema).option("sep", "\t").csv(data_path)
listener_data.show()

+-----------+--------------------+--------------------+---------------+--------------------+--------------------+
|    user_id|           timestamp|           artist_id|    artist_name|            track_id|          track_name|
+-----------+--------------------+--------------------+---------------+--------------------+--------------------+
|user_000001|2009-05-04T23:08:57Z|f1b1cf71-bd35-4e9...|      Deep Dish|                null|Fuck Me Im Famous...|
|user_000001|2009-05-04T13:54:10Z|a7f7df4a-77d8-4f1...|       坂本龍一|                null|Composition 0919 ...|
|user_000001|2009-05-04T13:52:04Z|a7f7df4a-77d8-4f1...|       坂本龍一|                null|Mc2 (Live_2009_4_15)|
|user_000001|2009-05-04T13:42:52Z|a7f7df4a-77d8-4f1...|       坂本龍一|                null|Hibari (Live_2009...|
|user_000001|2009-05-04T13:42:11Z|a7f7df4a-77d8-4f1...|       坂本龍一|                null|Mc1 (Live_2009_4_15)|
|user_000001|2009-05-04T13:38:31Z|a7f7df4a-77d8-4f1...|       坂本龍一|                null|To Stanford (Liv

listener_data: org.apache.spark.sql.DataFrame = [user_id: string, timestamp: string ... 4 more fields]


In [6]:
val df = listener_data.drop("timestamp").na.drop()
df.show()

+-----------+--------------------+----------------+--------------------+--------------------+
|    user_id|           artist_id|     artist_name|            track_id|          track_name|
+-----------+--------------------+----------------+--------------------+--------------------+
|user_000001|a7f7df4a-77d8-4f1...|        坂本龍一|f7c1f8f8-b935-45e...|The Last Emperor ...|
|user_000001|a7f7df4a-77d8-4f1...|        坂本龍一|475d4e50-cebb-4cd...|Tibetan Dance (Ve...|
|user_000001|ba2f4f3b-0293-4bc...|      Underworld|dc394163-2b78-4b5...|Boy, Boy, Boy (Sw...|
|user_000001|ba2f4f3b-0293-4bc...|      Underworld|340d9a0b-9a43-409...|Crocodile (Innerv...|
|user_000001|a16e47f5-aa54-47f...| Ennio Morricone|0b04407b-f517-4e0...|Ninna Nanna In Bl...|
|user_000001|463a94f1-2713-40b...|         Minus 8|4e78efc4-e545-47a...|      Elysian Fields|
|user_000001|ad0811ea-e213-451...|       Beanfield|fb51d2c4-cc69-412...|  Planetary Deadlock|
|user_000001|309e2dfc-678e-4d0...|        Dj Linus|4277434f-e3c2-41a

df: org.apache.spark.sql.DataFrame = [user_id: string, artist_id: string ... 3 more fields]


In [18]:
val df_agg = df.select("user_id", "track_id")
            .groupBy("user_id", "track_id")
            .agg(count("*")alias("count")).orderBy("user_id")
val df_agg_filtered = df_agg.limit(20000)
df_agg_filtered.show()

22/07/24 20:12:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/07/24 20:12:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/07/24 20:12:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/07/24 20:12:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/07/24 20:12:23 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
+-----------+--------------------+-----+
|    user_id|            track_id|count|
+-----------+--------------------+-----+
|user_000001|0025055f-39c3-43e...|    2|
|user_000001|6f71793f-5de1-499...|    7|
|user_000001|6fa928ac-af4f-4f1...|    4|
|user_000001|002e254d-4624-49f...|    3|
|user_000001|708e4199-4a97-4fe...|    3|
|user_000001|02fd11c5-7671-418...|    1|
|user_000001|70b41574-58aa-419...|    5|
|user

df_agg: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [user_id: string, track_id: string ... 1 more field]
df_agg_filtered: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [user_id: string, track_id: string ... 1 more field]


In [19]:
//revisit to make more efficient

val user_indexer = new StringIndexer()
    .setInputCol("user_id")
    .setOutputCol("user_index")
val track_indexer = new StringIndexer()
    .setInputCol("track_id")
    .setOutputCol("track_index")
val df_user_indexed = user_indexer.fit(df_agg_filtered).transform(df_agg_filtered)
val df_indexed = track_indexer.fit(df_user_indexed).transform(df_user_indexed)
df_indexed.show()

22/07/24 20:13:01 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/07/24 20:13:01 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/07/24 20:13:02 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/07/24 20:13:02 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/07/24 20:13:02 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/07/24 20:13:09 WARN DAGScheduler: Broadcasting large task binary with size 1126.5 KiB
+-----------+--------------------+-----+----------+-----------+
|    user_id|            track_id|count|user_index|track_index|
+-----------+--------------------+-----+----------+-----------+
|user_000001|0025055f-39c3-43e...|    2|       3.0|     1207.0|
|user_000001|6f71793f-5de1-499...|    7|       3.0|     8790.0|
|user_

user_indexer: org.apache.spark.ml.feature.StringIndexer = strIdx_083dd3149cbb
track_indexer: org.apache.spark.ml.feature.StringIndexer = strIdx_4f2654566917
df_user_indexed: org.apache.spark.sql.DataFrame = [user_id: string, track_id: string ... 2 more fields]
df_indexed: org.apache.spark.sql.DataFrame = [user_id: string, track_id: string ... 3 more fields]


In [20]:
val user_track_df = df_indexed.select("user_index", "track_index", "count").orderBy("user_index")
user_track_df.show()

22/07/24 20:13:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/07/24 20:13:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/07/24 20:13:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/07/24 20:13:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/07/24 20:13:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/07/24 20:13:29 WARN DAGScheduler: Broadcasting large task binary with size 1126.4 KiB
+----------+-----------+-----+
|user_index|track_index|count|
+----------+-----------+-----+
|       0.0|    11040.0|    4|
|       0.0|     4878.0|    4|
|       0.0|     6726.0|    7|
|       0.0|     4599.0|    5|
|       0.0|      451.0|    1|
|       0.0|       21.0|   11|
|       0.0|     6840.0|    1|
|       0.0|    

user_track_df: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [user_index: double, track_index: double ... 1 more field]


In [21]:
val Array(training, test) = user_track_df.randomSplit(Array[Double](0.7, 0.3), 18)

training: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [user_index: double, track_index: double ... 1 more field]
test: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [user_index: double, track_index: double ... 1 more field]


In [None]:
val als = new ALS()
  .setMaxIter(10)
  .setRegParam(0.01)
  .setUserCol("user_index")
  .setItemCol("track_index")
  .setRatingCol("count")
val model = als.fit(training)

val predictions = model.transform(test)

22/07/24 20:14:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/07/24 20:14:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/07/24 20:14:59 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/07/24 20:14:59 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/07/24 20:14:59 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


In [None]:
// Generate top 10 movie recommendations for each user
val userRecs = model.recommendForAllUsers(10)
// Generate top 10 user recommendations for each movie
val movieRecs = model.recommendForAllItems(10)