In [1]:
import glob
import os
import sys

# Environment for the PySpark bootstrap: submit args for an interactive shell,
# the Python interpreter PySpark should use, and the Spark 2 client location.
os.environ["PYSPARK_SUBMIT_ARGS"] = 'pyspark-shell'
os.environ["PYSPARK_PYTHON"] = 'python3'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the PySpark sources shipped with this Spark installation importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))
# The bundled py4j archive name carries a version that changes between Spark
# releases, so discover it instead of hard-coding one specific version.
for py4j_zip in glob.glob(os.path.join(spark_home, 'python', 'lib', 'py4j-*-src.zip')):
    sys.path.insert(0, py4j_zip)

# Execute PySpark's shell bootstrap in the notebook namespace (its banner
# reports that a SparkSession becomes available as `spark`). The file is read
# inside a context manager so the handle is closed, and compiled with its real
# path so tracebacks point at shell.py.
shell_path = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_path) as shell_file:
    exec(compile(shell_file.read(), shell_path, 'exec'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.3.0
      /_/

Using Python version 3.6.4 (default, Jan 28 2018 00:00:00)
SparkSession available as 'spark'.


In [2]:
spark

![ALS](pics/mf.png)

## Our goal is to represent the matrix of user-item ratings as the product of two LOW RANK matrices 
## $$R = U \times P^\top $$ 
## where 
## $$U \in \mathbb{R}^{n \times k}, k \ll n$$
## and 
## $$P \in \mathbb{R}^{m \times k}, k \ll m$$

## This means that
## $$R_{i,j} = u_i \bullet p_j$$
## recalling
## $$\cos(\theta) = \frac{a \bullet b}{\|a\|\|b\|}$$

## How do we find a solution? By minimizing the cost function!
## $$J = \|R - U \times P^\top\|_F^2 + \lambda(\|U\|_F^2 + \|P\|_F^2)$$

## How do we optimize this?
![GD](pics/gradient_descent.jpeg)

## There are two problems here though:
+ the number of parameters is $n \times k + m \times k$
+ the loss is non-convex (https://www.quora.com/Why-is-the-matrix-factorization-optimization-function-in-recommender-systems-not-convex)

## So what do we do? ALS (alternating least squares)
## Recall Ordinary Least Squares
## $$J(\beta) = \|y - X\beta\|_2^2$$
## $$\beta = (X^\top X)^{-1}X^\top y$$

## ALS is a two-step iterative optimization process
## $$ \forall u_i : J(u_i) = \|R_i - u_i \times P^\top\|_2^2 + \lambda\|u_i\|_2^2$$
## $$ \forall p_j : J(p_j) = \|R_j - U \times p^{\top}_{j}\|_2^2 + \lambda\|p_j\|_2^2$$
## The solution is
## $$u_i = (P^\top \times P + \lambda I)^{-1} \times P^\top \times R_i$$
## $$p_j = (U^\top \times U + \lambda I)^{-1} \times U^\top \times R_j$$

In [12]:
from pyspark.sql.types import *

In [27]:
# Explicit schema for the tab-separated rating files: three integer columns
# (user, item, rating) followed by a long timestamp.
schema = (
    StructType()
    .add("user", IntegerType())
    .add("item", IntegerType())
    .add("rating", IntegerType())
    .add("timestamp", LongType())
)

In [28]:
# Ratings used to fit the model: parse the TSV with the explicit schema, discard
# the timestamp column, spread the rows over 20 partitions and cache the result.
dataset = (
    spark.read
    .csv("/share/ml-100k/ua.base", sep="\t", schema=schema)
    .drop("timestamp")
    .repartition(20)
    .cache()
)

In [29]:
dataset.take(5)

[Row(user=297, item=1, rating=3),
 Row(user=119, item=924, rating=4),
 Row(user=373, item=230, rating=4),
 Row(user=900, item=508, rating=3),
 Row(user=13, item=828, rating=1)]

In [30]:
dataset.rdd.getNumPartitions()

20

In [20]:
from pyspark.ml.recommendation import ALS

In [21]:
# ALS estimator with 10 latent factors (rank), 5 iterations and a fixed seed.
# No column names are passed, so the estimator's defaults are used
# (presumably user/item/rating, matching the schema above; confirm against the ALS docs).
als = ALS(rank=10, maxIter=5, seed=5757)

In [31]:
model = als.fit(dataset)

In [32]:
model.rank

10

In [33]:
# Held-out ratings: same parsing as the training data (explicit schema, timestamp
# dropped), but spread over 4 partitions before caching.
test = (
    spark.read
    .csv("/share/ml-100k/ua.test", sep="\t", schema=schema)
    .drop("timestamp")
    .repartition(4)
    .cache()
)

In [35]:
predictions = model.transform(test)

In [41]:
predictions.take(5)

[Row(user=602, item=148, rating=4, prediction=3.5022647380828857),
 Row(user=447, item=148, rating=4, prediction=2.770799398422241),
 Row(user=396, item=148, rating=4, prediction=3.548232078552246),
 Row(user=203, item=148, rating=3, prediction=3.0412800312042236),
 Row(user=251, item=148, rating=2, prediction=3.0381240844726562)]

In [46]:
from pyspark.ml.evaluation import RegressionEvaluator

In [47]:
# RMSE evaluator that takes the true label from the "rating" column; the prediction
# column is not set here, so the evaluator's default name is used.
evaluator = RegressionEvaluator(labelCol="rating", metricName="rmse")

In [48]:
evaluator.evaluate(predictions)

nan

![wtf](pics/Jackie-Chan-WTF.jpg)

In [49]:
predictions.groupBy("rating").count().collect()

[Row(rating=1, count=542),
 Row(rating=3, count=2424),
 Row(rating=5, count=2153),
 Row(rating=4, count=3316),
 Row(rating=2, count=995)]

In [50]:
import pyspark.sql.functions as f

In [54]:
predictions.filter(f.isnan("prediction")).count()

2

In [55]:
predictions.filter(f.isnan("prediction")).collect()

[Row(user=675, item=1653, rating=5, prediction=nan),
 Row(user=405, item=1582, rating=1, prediction=nan)]

In [61]:
dataset.filter(dataset.user == 675).collect()

[Row(user=675, item=235, rating=1),
 Row(user=675, item=311, rating=3),
 Row(user=675, item=937, rating=1),
 Row(user=675, item=258, rating=3),
 Row(user=675, item=900, rating=4),
 Row(user=675, item=242, rating=4),
 Row(user=675, item=750, rating=4),
 Row(user=675, item=86, rating=4),
 Row(user=675, item=272, rating=3),
 Row(user=675, item=244, rating=3),
 Row(user=675, item=318, rating=5),
 Row(user=675, item=891, rating=2),
 Row(user=675, item=344, rating=4),
 Row(user=675, item=1628, rating=5),
 Row(user=675, item=1007, rating=4),
 Row(user=675, item=303, rating=5),
 Row(user=675, item=223, rating=1),
 Row(user=675, item=427, rating=5),
 Row(user=675, item=286, rating=4),
 Row(user=675, item=896, rating=5),
 Row(user=675, item=269, rating=5),
 Row(user=675, item=312, rating=2),
 Row(user=675, item=321, rating=2),
 Row(user=675, item=463, rating=5)]

In [62]:
dataset.filter(dataset.item == 1653).collect()

[]

In [58]:
# The second NaN prediction is for (user=405, item=1582). As with item 1653,
# check whether this item occurs anywhere in the training data; filtering on
# user 675 together with item 1582 would mix the two NaN rows and be trivially empty.
dataset.filter(dataset.item == 1582).collect()

[]

In [59]:
# Drop the rows with missing values (the two NaN predictions found above) so the
# evaluator is no longer handed NaN predictions.
predictions = predictions.dropna()

In [60]:
evaluator.evaluate(predictions)

0.9590533627741923

## But how do we deal with the cold start in Spark?!

In [65]:
# Refit the same estimator, overriding coldStartStrategy to "drop" for this fit only.
# Per the Spark ALS docs this makes transform() omit rows whose prediction would be NaN,
# which replaces the manual dropna() step.
model = als.fit(dataset, params={als.coldStartStrategy: "drop"})

In [69]:
model.getOrDefault("coldStartStrategy")

'drop'

In [70]:
predictions = model.transform(test)

In [71]:
evaluator.evaluate(predictions)

0.9590533627741922

## Can we do better?

In [93]:
# Same fit as before, but with maxIter overridden from 5 to 20 iterations.
model = als.fit(dataset, params={als.coldStartStrategy: "drop", als.maxIter: 20})

In [94]:
predictions = model.transform(test)

In [95]:
evaluator.evaluate(predictions)

0.9558872242636991

## Or even better?

In [97]:
# 20 iterations again, additionally overriding rank from 10 to 100 latent factors.
model = als.fit(dataset, params={als.coldStartStrategy: "drop", als.maxIter: 20, als.rank: 100})

In [98]:
predictions = model.transform(test)

In [99]:
evaluator.evaluate(predictions)

0.9475039684939921

## What else do we have here?

In [106]:
model.recommendForAllItems(5).take(5)

[Row(item=1342, recommendations=[Row(user=662, rating=3.8292653560638428), Row(user=849, rating=3.3050215244293213), Row(user=212, rating=3.2683603763580322), Row(user=369, rating=3.1982271671295166), Row(user=157, rating=3.150738000869751)]),
 Row(item=148, recommendations=[Row(user=507, rating=4.532631874084473), Row(user=907, rating=4.493520259857178), Row(user=849, rating=4.490138053894043), Row(user=164, rating=4.4547929763793945), Row(user=438, rating=4.371801376342773)]),
 Row(item=1088, recommendations=[Row(user=279, rating=3.4358346462249756), Row(user=223, rating=3.288766384124756), Row(user=644, rating=3.0603246688842773), Row(user=642, rating=3.019566059112549), Row(user=688, rating=2.9346206188201904)]),
 Row(item=1238, recommendations=[Row(user=519, rating=4.759654521942139), Row(user=36, rating=3.95890474319458), Row(user=341, rating=3.7593958377838135), Row(user=507, rating=3.7583060264587402), Row(user=512, rating=3.724473237991333)]),
 Row(item=471, recommendations=[R

In [107]:
model.recommendForAllUsers(5).take(5)

[Row(user=471, recommendations=[Row(item=932, rating=4.59334659576416), Row(item=8, rating=4.515803337097168), Row(item=422, rating=4.430621147155762), Row(item=102, rating=4.418323040008545), Row(item=465, rating=4.4073286056518555)]),
 Row(user=463, recommendations=[Row(item=19, rating=4.3579511642456055), Row(item=887, rating=4.278357982635498), Row(item=1449, rating=4.222947597503662), Row(item=221, rating=4.208576679229736), Row(item=253, rating=4.199032783508301)]),
 Row(user=833, recommendations=[Row(item=1597, rating=4.580961227416992), Row(item=1019, rating=4.381778240203857), Row(item=1187, rating=4.379821300506592), Row(item=589, rating=4.34868049621582), Row(item=488, rating=4.308185577392578)]),
 Row(user=496, recommendations=[Row(item=56, rating=4.302997589111328), Row(item=320, rating=4.296531677246094), Row(item=42, rating=4.181881427764893), Row(item=921, rating=4.170877456665039), Row(item=1240, rating=4.105301856994629)]),
 Row(user=148, recommendations=[Row(item=169