## Let's make predictions on the test set

First, some imports and helper functions

In [None]:
from pyspark.sql import functions as f, types as t, Window
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.recommendation import ALSModel
from pyspark.ml.feature import StringIndexer


def transform_genre_no_filter(
        ALS_model, ratings_dev, exploded_movies, genre_indexer, prediction_col, group_by_columns=None,
        aggregation_func=f.avg,
):
    """Apply a genre-level ALS model to ratings and aggregate back to one row per rating.

    Each rating is joined to the genre rows of its movie, the genres are
    passed through ``genre_indexer``, ``ALS_model`` is applied to every
    (rating, genre) row, and the resulting predictions are aggregated per
    rating.

    Args:
        ALS_model: Fitted model whose ``transform`` adds ``prediction_col``
            (presumably trained on userId and the indexer's output column;
            confirm against the training code).
        ratings_dev: DataFrame with ``userId``, ``movieId`` and ``timestamp``
            columns, plus every column named in ``group_by_columns``.
        exploded_movies: DataFrame with ``movieId`` and ``genre`` columns,
            one row per movie and genre. It is broadcast for the join.
        genre_indexer: Fitted transformer applied to the joined rows before
            the raw ``genre`` column is dropped.
        prediction_col: Name of the prediction column produced by
            ``ALS_model``; the aggregated column keeps the same name.
        group_by_columns: Optional iterable of extra column names to group
            by, which keeps them in the result. ``None`` means no extras.
        aggregation_func: Function applied to ``prediction_col`` to build
            the aggregate column (average by default).

    Returns:
        DataFrame with ``userId``, ``movieId``, ``timestamp``, the extra
        grouping columns and the aggregated ``prediction_col``.
    """
    # A None sentinel replaces the former shared mutable ``[]`` default.
    extra_keys = [] if group_by_columns is None else list(group_by_columns)

    # The join uses the default join type, so a rating whose movie has no
    # row in exploded_movies does not appear in the result.
    exploded_ratings = (
        ratings_dev
        .join(f.broadcast(exploded_movies), on='movieId')
    )
    exploded_ratings = genre_indexer.transform(exploded_ratings).drop('genre')
    return (
        ALS_model.transform(exploded_ratings)
        .groupBy('userId', 'movieId', 'timestamp', *extra_keys)
        .agg(aggregation_func(prediction_col).alias(prediction_col))
    )


def transform_actor_no_filter(
        ALS_model, ratings_dev, exploded_movies, prediction_col, group_by_columns=None,
        aggregation_func=f.avg,
):
    """Apply an actor-level ALS model to ratings and aggregate back to one row per rating.

    Each rating is left-joined to the actor rows of its movie, ``ALS_model``
    is applied to every (rating, actor) row, and the resulting predictions
    are aggregated per rating.

    Args:
        ALS_model: Fitted model whose ``transform`` adds ``prediction_col``
            (presumably trained on userId and ``actor``; confirm against
            the training code).
        ratings_dev: DataFrame with ``userId``, ``movieId`` and ``timestamp``
            columns, plus every column named in ``group_by_columns``.
        exploded_movies: DataFrame with ``movieId`` and ``actor`` columns,
            one row per movie and actor. It is broadcast for the join.
        prediction_col: Name of the prediction column produced by
            ``ALS_model``; the aggregated column keeps the same name.
        group_by_columns: Optional iterable of extra column names to group
            by, which keeps them in the result. ``None`` means no extras.
        aggregation_func: Function applied to ``prediction_col`` to build
            the aggregate column (average by default).

    Returns:
        DataFrame with ``userId``, ``movieId``, ``timestamp``, the extra
        grouping columns and the aggregated ``prediction_col``.
    """
    # A None sentinel replaces the former shared mutable ``[]`` default.
    extra_keys = [] if group_by_columns is None else list(group_by_columns)

    exploded_ratings = (
        ratings_dev
        # Left join keeps ratings whose movie has no actor rows ...
        .join(f.broadcast(exploded_movies), on='movieId', how='left')
        # ... and those rows get the placeholder actor id -1 instead of null.
        .withColumn('actor', f.when(f.isnull('actor'), -1).otherwise(f.col('actor')))
    )
    return (
        ALS_model.transform(exploded_ratings)
        .groupBy('userId', 'movieId', 'timestamp', *extra_keys)
        .agg(aggregation_func(prediction_col).alias(prediction_col))
    )


@f.udf(t.ArrayType(t.StringType()))
def split_genres(genres):
    """Split a string of genres joined with '|' into an array of genre names.

    A null input (``None``) yields an empty array instead of raising
    ``AttributeError``, in line with how ``split_actors`` treats missing
    values. Any non-null string is split exactly as before.
    """
    if genres is None:
        return []
    return genres.split('|')


@f.udf(t.ArrayType(t.IntegerType()))
def split_actors(actors):
    """Turn a string of concatenated actor ids ('nm<int>nm<int>...') into an array of ints.

    A null or empty input gives an empty array.
    """
    if not actors:
        return []
    # Whatever precedes the first 'nm' marker is discarded; every following
    # piece is parsed as an integer.
    _, *id_chunks = actors.split('nm')
    return [int(chunk) for chunk in id_chunks]


def load(paths, has_rating=True):
    """Read one or more ratings CSV files (with header) into a cached DataFrame.

    ``userId``, ``movieId`` and ``timestamp`` are cast to int and rows whose
    ``userId`` is null after the cast are removed. When ``has_rating`` is
    true the ``rating`` column is additionally cast to float.

    Args:
        paths: A single path or a list of paths handed to ``spark.read.csv``.
        has_rating: Whether to cast the ``rating`` column.

    Returns:
        The resulting DataFrame, marked for caching.
    """
    path_list = paths if isinstance(paths, list) else [paths]

    frame = spark.read.csv(path_list, header=True)
    for int_column in ('userId', 'movieId', 'timestamp'):
        frame = frame.withColumn(int_column, f.col(int_column).cast('int'))
    frame = frame.filter(f.col('userId').isNotNull())

    if has_rating:
        frame = frame.withColumn('rating', f.col('rating').cast('float'))
    return frame.cache()

Load the models and prepare the movie datasets

In [None]:
# Restore the three previously saved ALS models (user-movie, user-genre, user-actor).
user_movie_ALS = ALSModel.load('/user/mob2019014/user_movie_ALS_model.bin')
user_genre_ALS = ALSModel.load('/user/mob2019014/user_genre_ALS_model.bin')
user_actor_ALS = ALSModel.load('/user/mob2019014/user_actor_ALS_model.bin')

# has_rating=False skips the cast of the 'rating' column
# (presumably the test file has no ratings - confirm).
ratings_test = load('/data/MobodMovieLens/test/ratings.csv', has_rating=False)

# Movie metadata; no column is cast here.
movies = spark.read.csv('/data/MobodMovieLens/train/movies.csv', header=True).cache()
# One row per (movie, genre): drop the title, split the 'genres' string into
# an array and explode it into a single 'genre' column.
exploded_movies = (
    movies
    .drop('title')
    .withColumn('genres', split_genres(movies.genres))
    .withColumn('genre', f.explode('genres')).drop('genres')
)
# Fit the 'genre' -> 'genreId' indexer on the exploded movies.
# NOTE(review): the index assignment presumably has to match the one used when
# user_genre_ALS was trained - confirm it was fitted on the same data.
genre_indexer_creator = StringIndexer(inputCol='genre', outputCol='genreId')
genre_indexer = genre_indexer_creator.fit(exploded_movies)

# One row per (movieId, actor): parse the 'nconst' string into integer actor
# ids, explode them, and keep only 'movieId' and 'actor'.
exploded_by_actors_movies = (
    spark.read.csv('/user/mob2019014/movies_imdb.csv', header=True)
    .withColumn('movieId', f.col('movieId').cast('int'))
    .select('movieId', 'nconst')
    .withColumn('actors', split_actors('nconst'))
    .withColumn('actor', f.explode('actors'))
    .drop('actors', 'nconst')
    .cache()
)

Make ALS predictions

In [None]:
# Step 1: user-movie model applied directly to the test ratings.
# NOTE(review): the later group_by_columns assume this adds a column named
# 'user_movie_ALS' - confirm the saved model's prediction column name.
ratings_with_ALS = user_movie_ALS.transform(ratings_test)
# Step 2: genre-level prediction; grouping by 'user_movie_ALS' keeps the
# step-1 column in the aggregated result.
ratings_with_ALS = transform_genre_no_filter(
    user_genre_ALS, ratings_with_ALS, exploded_movies, genre_indexer, 'user_genre_ALS',
    group_by_columns=['user_movie_ALS'],
)
# Step 3: actor-level prediction; both earlier prediction columns are carried
# through as grouping columns.
ratings_with_ALS = transform_actor_no_filter(
    user_actor_ALS, ratings_with_ALS, exploded_by_actors_movies, 'user_actor_ALS',
    group_by_columns=['user_movie_ALS', 'user_genre_ALS'],
)
# Collapse to a single partition before writing; mode='overwrite' replaces any
# previous output at this path.
ratings_with_ALS.repartition(1).write.csv(
    'ratings_test_with_all_ALS_predictions.csv', header=True, mode='overwrite',
)

Copy the CSV with predictions from HDFS to the local disk

In [None]:
import subprocess

# Shell pipeline: list the HDFS output path, take field 8 of the third line of
# the listing, and cat that file into a local file of the same name.
# NOTE(review): this assumes the third listing line is the data part file
# (e.g. after a "Found N items" header and a _SUCCESS marker) - confirm.
command = (
    "hdfs dfs -cat "
    "$(hdfs dfs -ls ratings_test_with_all_ALS_predictions.csv | awk '{if (NR == 3) print $8;}')"
    " > ratings_test_with_all_ALS_predictions.csv"
)
# shell=True is needed for the $(...) substitution, the pipe and the redirect.
# The call returns the shell's exit status; it is not checked here.
subprocess.call(command, shell=True)