In [2]:
import glob
import os
import sys

# Bootstrap PySpark inside this notebook kernel: configure the environment,
# put Spark's Python sources on sys.path, then execute pyspark/shell.py in the
# notebook namespace (the cell output shows it leaves `spark` defined).

# NOTE(review): spark-csv_2.10 is an external CSV package; the cells below read
# CSV through spark.read.csv, so this --packages entry may no longer be needed.
# Confirm before removing.
os.environ["PYSPARK_SUBMIT_ARGS"] = '--packages com.databricks:spark-csv_2.10:1.2.0 pyspark-shell'
os.environ["PYSPARK_PYTHON"] = 'python3'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

sys.path.insert(0, os.path.join(spark_home, 'python'))

# Discover the bundled py4j archive instead of pinning one version: the file
# name changes between Spark releases, and a stale hard-coded path is silently
# ignored by the import system.
py4j_zips = sorted(glob.glob(os.path.join(spark_home, 'python', 'lib', 'py4j-*-src.zip')))
if not py4j_zips:
    # Nothing matched: keep the legacy path so behaviour is unchanged.
    py4j_zips = [os.path.join(spark_home, 'python/lib/py4j-0.10.4-src.zip')]
for py4j_zip in py4j_zips:
    sys.path.insert(0, py4j_zip)

# Read shell.py through a context manager so the file handle is closed, and
# compile it with its real path so tracebacks point at the actual file.
shell_path = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_path) as shell_file:
    shell_source = shell_file.read()
exec(compile(shell_source, shell_path, 'exec'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.3.0
      /_/

Using Python version 3.6.4 (default, Jan 28 2018 00:00:00)
SparkSession available as 'spark'.


In [3]:
import pyspark.sql.functions as F
from pyspark.ml.recommendation import ALS
from pyspark.sql.types import StructType,\
                              StructField,\
                              IntegerType,\
                              FloatType

In [4]:
# Load test.csv with an explicit schema (no type inference). The file has a
# header row, is comma separated, and all three columns are nullable.
test = (
    spark.read
         .schema(StructType()
                 .add("userId", IntegerType(), True)
                 .add("movieId", IntegerType(), True)
                 .add("rating", FloatType(), True))
         .options(sep=',', header=True)
         .csv('/labs/lab09data/test.csv')
)

In [5]:
# Load train.csv with an explicit schema (no type inference). The file has a
# header row, is comma separated, and all three columns are nullable.
train = (
    spark.read
         .schema(StructType()
                 .add("userId", IntegerType(), True)
                 .add("movieId", IntegerType(), True)
                 .add("rating", FloatType(), True))
         .options(sep=',', header=True)
         .csv('/labs/lab09data/train.csv')
)

# Number of training rows; reused further down as the ALS seed.
train_count = train.count()

In [6]:
# Mean training rating per movie, exposed as column `avg`.
train_avg = train.groupBy('movieId').agg(F.avg('rating').alias('avg'))

In [7]:
# ALS estimator configured through constructor keywords.
# The seed is the training row count, so it is fixed for a given train.csv.
als = ALS(
    maxIter=20,
    rank=26,
    seed=train_count,
    regParam=0.1,
    userCol='userId',
    itemCol='movieId',
    ratingCol='rating',
)
als

ALS_40138de8ca7720226665

In [8]:
# Fit the configured ALS estimator on the training ratings.
model = als.fit(dataset=train)

In [9]:
# Global mean training rating: the last-resort fallback for movies that never
# appear in train. For those the left join below yields avg = NULL, and
# nanvl(NaN, NULL) is NULL, which would otherwise leave an empty rating in the
# exported CSV. This costs one extra aggregation over `train`.
global_avg = train.agg(F.avg('rating')).first()[0]

# Score the test set and fill gaps in order of preference:
#   1. the ALS prediction,
#   2. the movie's mean training rating when the prediction is NaN,
#   3. the global mean training rating when neither is available.
# The result keeps only userId, movieId and the filled-in value as `rating`.
df = (
    model.transform(test)
         .join(train_avg.select('movieId', 'avg'), on='movieId', how='left')
         .select(
             'userId',
             'movieId',
             F.coalesce(
                 F.nanvl(F.col('prediction'), F.col('avg')),
                 F.lit(global_avg),
             ).alias('rating'))
         .orderBy('userId', 'movieId')
)

#write to HDFS
# df.coalesce(1)\
#   .write\
#   .format("com.databricks.spark.csv")\
#   .option("header", "true")\
#   .save('lab09.csv')

In [10]:
# Collect the predictions to the driver as a pandas DataFrame and write them
# to a CSV file in the local working directory, without the index column.
df.toPandas().to_csv(path_or_buf="lab09s.csv", index=False)