In [2]:
import pandas as pd
import numpy as np
from scipy import sparse
from datetime import datetime
from catboost import CatBoostRegressor
import pickle

Imports and functions to compute ALS predictions

In [None]:
from pyspark.sql import functions as f, types as t, Window
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.recommendation import ALSModel
from pyspark.ml.feature import StringIndexer


def transform_genre_no_filter(
        ALS_model, ratings_dev, exploded_movies, genre_indexer, prediction_col, group_by_columns=[],
        aggregation_func=f.avg,
):
    """Score ratings with a genre-level ALS model and collapse them back to one row per rating.

    Args:
        ALS_model: fitted model whose ``transform`` is applied to the genre-exploded ratings.
        ratings_dev: ratings DataFrame with ``userId``, ``movieId`` and ``timestamp`` columns.
        exploded_movies: DataFrame joined on ``movieId`` (inner join, broadcast);
            it must carry a ``genre`` column.
        genre_indexer: fitted transformer applied before the ``genre`` column is dropped.
        prediction_col: column that is aggregated and re-emitted under the same name
            (presumably the ALS model's prediction column - confirm against the model).
        group_by_columns: extra columns to keep in the grouping key.
        aggregation_func: aggregation applied to ``prediction_col`` (average by default).

    Returns:
        DataFrame grouped by ``userId``, ``movieId``, ``timestamp`` and ``group_by_columns``
        with the aggregated ``prediction_col``.
    """
    ratings_per_genre = ratings_dev.join(f.broadcast(exploded_movies), on='movieId')
    indexed_ratings = genre_indexer.transform(ratings_per_genre).drop('genre')
    scored_ratings = ALS_model.transform(indexed_ratings)

    grouping_keys = ['userId', 'movieId', 'timestamp', *group_by_columns]
    aggregated_prediction = aggregation_func(prediction_col).alias(prediction_col)
    return scored_ratings.groupBy(*grouping_keys).agg(aggregated_prediction)


def transform_actor_no_filter(
        ALS_model, ratings_dev, exploded_movies, prediction_col, group_by_columns=[],
        aggregation_func=f.avg,
):
    """Score ratings with an actor-level ALS model and collapse them back to one row per rating.

    Args:
        ALS_model: fitted model whose ``transform`` is applied to the actor-exploded ratings.
        ratings_dev: ratings DataFrame with ``userId``, ``movieId`` and ``timestamp`` columns.
        exploded_movies: DataFrame joined on ``movieId`` (left join, broadcast);
            it must carry an ``actor`` column.
        prediction_col: column that is aggregated and re-emitted under the same name
            (presumably the ALS model's prediction column - confirm against the model).
        group_by_columns: extra columns to keep in the grouping key.
        aggregation_func: aggregation applied to ``prediction_col`` (average by default).

    Returns:
        DataFrame grouped by ``userId``, ``movieId``, ``timestamp`` and ``group_by_columns``
        with the aggregated ``prediction_col``.
    """
    # Ratings whose movie has no actor row survive the left join with a null actor;
    # those nulls are replaced with the placeholder id -1.
    actor_or_placeholder = f.when(f.isnull('actor'), -1).otherwise(f.col('actor'))
    ratings_per_actor = (
        ratings_dev
        .join(f.broadcast(exploded_movies), on='movieId', how='left')
        .withColumn('actor', actor_or_placeholder)
    )
    scored_ratings = ALS_model.transform(ratings_per_actor)

    grouping_keys = ['userId', 'movieId', 'timestamp', *group_by_columns]
    return scored_ratings.groupBy(*grouping_keys).agg(
        aggregation_func(prediction_col).alias(prediction_col)
    )


@f.udf(t.ArrayType(t.StringType()))
def split_genres(genres):
    """Split a string of genres concatenated with '|' into an array of genre names.

    A null input yields an empty array (the same result ``split_actors`` gives for
    missing input) instead of raising ``AttributeError`` inside the UDF.
    """
    if genres is None:
        return []
    return genres.split('|')


@f.udf(t.ArrayType(t.IntegerType()))
def split_actors(actors):
    """Turn a string of concatenated actor ids ('nm<int>nm<int>...') into an array of ints.

    A null or empty input yields an empty array.
    """
    if not actors:
        return []
    # Everything before the first 'nm' marker is discarded by the [1:] slice.
    return [int(actor_id) for actor_id in actors.split('nm')[1:]]


def load(paths, has_rating=True):
    """Read ratings CSV file(s) into a cached Spark DataFrame with typed columns.

    Args:
        paths: a single path or a list of paths; anything that is not a list is wrapped in one.
        has_rating: when true, the ``rating`` column is cast to float as well.

    Returns:
        Cached DataFrame with ``userId``, ``movieId`` and ``timestamp`` cast to int and
        rows with a null ``userId`` removed.
    """
    path_list = paths if isinstance(paths, list) else [paths]

    frame = spark.read.csv(path_list, header=True)
    for int_column in ('userId', 'movieId', 'timestamp'):
        frame = frame.withColumn(int_column, f.col(int_column).cast('int'))
    # Rows whose userId is null after the cast are dropped.
    frame = frame.filter(f.col('userId').isNotNull())

    if has_rating:
        frame = frame.withColumn('rating', f.col('rating').cast('float'))
    return frame.cache()

Load model and data

In [None]:
# Fitted ALS models: user-movie, user-genre and user-actor.
user_movie_ALS = ALSModel.load('/user/mob2019014/user_movie_ALS_model.bin')
user_genre_ALS = ALSModel.load('/user/mob2019014/user_genre_ALS_model.bin')
user_actor_ALS = ALSModel.load('/user/mob2019014/user_actor_ALS_model.bin')

# Test ratings carry no `rating` column, so it is not cast.
ratings_test = load('/data/MobodMovieLens/test/ratings.csv', has_rating=False)

movies = spark.read.csv('/data/MobodMovieLens/train/movies.csv', header=True).cache()
# One row per (movieId, genre).
# The CSV is read without a schema, so every column is a string; movieId is cast to int
# explicitly so the join key has the same type as in `load()` and in
# `exploded_by_actors_movies` instead of relying on implicit coercion in the join.
exploded_movies = (
    movies
    .drop('title')
    .withColumn('movieId', f.col('movieId').cast('int'))
    .withColumn('genres', split_genres(movies.genres))
    .withColumn('genre', f.explode('genres')).drop('genres')
)
# The genre -> genreId mapping is re-fitted here on the exploded movies.
# NOTE(review): presumably this must reproduce the mapping used when the genre ALS model
# was trained - confirm.
genre_indexer_creator = StringIndexer(inputCol='genre', outputCol='genreId')
genre_indexer = genre_indexer_creator.fit(exploded_movies)

# One row per (movieId, actor); actor ids are parsed from the `nconst` column.
exploded_by_actors_movies = (
    spark.read.csv('/user/mob2019014/movies_imdb.csv', header=True)
    .withColumn('movieId', f.col('movieId').cast('int'))
    .select('movieId', 'nconst')
    .withColumn('actors', split_actors('nconst'))
    .withColumn('actor', f.explode('actors'))
    .drop('actors', 'nconst')
    .cache()
)

Make predictions

In [None]:
# Step 1: user-movie ALS predictions on the raw test ratings.
ratings_with_ALS = user_movie_ALS.transform(ratings_test)

# Step 2: add the genre-level prediction, keeping the step-1 column in the grouping key.
ratings_with_ALS = transform_genre_no_filter(
    ALS_model=user_genre_ALS,
    ratings_dev=ratings_with_ALS,
    exploded_movies=exploded_movies,
    genre_indexer=genre_indexer,
    prediction_col='user_genre_ALS',
    group_by_columns=['user_movie_ALS'],
)

# Step 3: add the actor-level prediction, keeping both earlier prediction columns.
ratings_with_ALS = transform_actor_no_filter(
    ALS_model=user_actor_ALS,
    ratings_dev=ratings_with_ALS,
    exploded_movies=exploded_by_actors_movies,
    prediction_col='user_actor_ALS',
    group_by_columns=['user_movie_ALS', 'user_genre_ALS'],
)

# Write everything as one partition so the output directory holds a single part file.
(
    ratings_with_ALS
    .repartition(1)
    .write
    .csv('ratings_test_with_all_ALS_predictions.csv', header=True, mode='overwrite')
)

Copy the predictions from HDFS to the local disk

In [None]:
import subprocess

# Shell pipeline: list the Spark output directory on HDFS, take the path (field 8) from
# the third line of the listing and stream that file into a local CSV of the same name.
# NOTE(review): `NR == 3` presumably skips the "Found N items" header and the _SUCCESS
# marker so that the single part file is selected - confirm against the actual listing.
command = (
    "hdfs dfs -cat "
    "$(hdfs dfs -ls ratings_test_with_all_ALS_predictions.csv | awk '{if (NR == 3) print $8;}')"
    " > ratings_test_with_all_ALS_predictions.csv"
)
# check_call raises CalledProcessError on a non-zero exit status, so a failed copy stops
# the notebook here instead of leaving an empty or truncated local file for later cells.
subprocess.check_call(command, shell=True)

Code to download data from Google Drive

In [14]:
# code from stackoverflow
# https://stackoverflow.com/questions/38511444/python-download-files-from-google-drive-using-url
  
import requests


def download_file_from_google_drive(file_id, destination):
    """Download a Google Drive file by id and save it to ``destination``.

    A first request is made with only the file id. If the response sets a
    ``download_warning*`` cookie, its value is sent back as the ``confirm`` parameter
    in a second request, and that second response is the one saved.

    Args:
        file_id: Google Drive file identifier.
        destination: local path the content is written to.

    Raises:
        requests.HTTPError: if the response to be saved has an error status, so an
            error page is never written to ``destination`` as if it were the data.
    """
    URL = "https://docs.google.com/uc?export=download"

    # One session is used for both requests so cookies carry over; the context manager
    # closes it (and its connections) when the download is finished.
    with requests.Session() as session:
        response = session.get(URL, params={'id': file_id}, stream=True)
        token = get_confirm_token(response)

        if token:
            # The first response body is not needed; release its connection.
            response.close()
            params = {'id': file_id, 'confirm': token}
            response = session.get(URL, params=params, stream=True)

        response.raise_for_status()
        save_response_content(response, destination)

def get_confirm_token(response):
    """Return the value of the first cookie whose name starts with 'download_warning'.

    Returns ``None`` when the response carries no such cookie.
    """
    matching_values = (
        cookie_value
        for cookie_name, cookie_value in response.cookies.items()
        if cookie_name.startswith('download_warning')
    )
    return next(matching_values, None)

def save_response_content(response, destination):
    """Stream the body of ``response`` into the file at ``destination``.

    The body is read in chunks of 32768 bytes; empty chunks are skipped.
    """
    chunk_size = 32768

    with open(destination, "wb") as out_file:
        out_file.writelines(
            chunk
            for chunk in response.iter_content(chunk_size)
            if chunk  # filter out keep-alive new chunks
        )

In [5]:
# Read the local CSV of ALS predictions into pandas; the first row is used as the header.
als_final_test = pd.read_csv('ratings_test_with_all_ALS_predictions.csv')

Download IMDb dataset

In [None]:
download_file_from_google_drive('1j9JLdhT8ud5ps44CK6TJiT9bugJ4NPs_', 'movies_imdb.csv')
movies_imdb = pd.read_csv('movies_imdb.csv')
# Halve the IMDb rating.
# NOTE(review): presumably this rescales it to the range of the ratings being predicted - confirm.
movies_imdb['averageRating'] = movies_imdb['averageRating'] / 2
# Split the comma-separated genre string and pad with the literal 'None' to at least
# three entries, so the first three positions always exist.
genres = [
    parts + ['None'] * (3 - len(parts))
    for parts in (x.split(',') for x in movies_imdb['genres_imdb'])
]
movies_imdb['genres_imdb_0'] = [el[0] for el in genres]
movies_imdb['genres_imdb_1'] = [el[1] for el in genres]
movies_imdb['genres_imdb_2'] = [el[2] for el in genres]
# Replace the '\N' missing-value marker with -1. A single .loc assignment is used
# instead of chained indexing, which may write to a temporary copy and raises
# SettingWithCopyWarning.
movies_imdb.loc[movies_imdb['runtimeMinutes'] == '\\N', 'runtimeMinutes'] = -1

Download linear models and make their predictions

In [None]:
# Option 1 (disabled): download the linear-model inference bundle and run it locally
# on the ALS prediction file. It may take around 30 minutes.
# download_file_from_google_drive('1L8Ed-yVotKTgTTUki_58mfUIQXVgUAmy', 'inference.zip')
# ! mkdir linear_model_inference && mv inference.zip linear_model_inference
# ! cd linear_model_inference && unzip inference.zip
# ! cd linear_model_inference && ./inference.sh ../ratings_test_with_all_ALS_predictions.csv
# ! mv linear_model_inference/predictions.csv linear_predictions.csv

# Option 2 (used here): simply download the linear model scores from Google Drive.
download_file_from_google_drive('1oKeLczBEaFYi_gJQRQ1GzyXSAUwa8Pql', 'linear_predictions.csv')

Merge linear predictions

In [23]:
# The linear-model output has no header row; name its first five columns by position.
lin_features_final_test = (
    pd.read_csv('linear_predictions.csv', header=None)
    .rename(columns={0: 'fold', 1: 'userId', 2: 'movieId', 3: 'timestamp', 4: 'lin_pred'})
)
# Left join keeps every test rating even when it has no linear prediction.
als_final_test = als_final_test.merge(
    lin_features_final_test,
    how='left',
    on=['userId', 'movieId', 'timestamp'],
)

Merge movies_imdb to test data and add time features

In [7]:
# Attach the IMDb columns to every test rating; ratings whose movie is missing from
# movies_imdb keep NaN in those columns. The categorical IMDb columns then get the
# literal 'None' for missing values. fillna is applied to the frame and assigned back
# rather than called in place on a selected column, which may act on a temporary copy.
als_final_test = (
    pd.merge(als_final_test, movies_imdb, how='left', on=['movieId'])
    .fillna({
        'genres_imdb_0': 'None',
        'genres_imdb_1': 'None',
        'genres_imdb_2': 'None',
        'titleType': 'None',
    })
)
# Time features derived from the rating timestamp.
# NOTE: datetime.fromtimestamp converts to the machine's local time zone, so the
# derived year and month depend on where the notebook runs.
als_final_test['curr_date'] = [
    datetime.fromtimestamp(ts).isoformat() for ts in als_final_test['timestamp']
]
# ISO format is 'YYYY-MM-DD...': characters 0-3 are the year, characters 5-6 the month.
als_final_test['curr_year'] = als_final_test['curr_date'].str[:4].astype(float)
als_final_test['curr_month'] = als_final_test['curr_date'].str[5:7]
# Difference between the rating year and the movie's `year` column.
als_final_test['movie_age'] = als_final_test['curr_year'] - als_final_test['year']

In [8]:
# Columns passed to the model, in this order: the three ALS predictions, the IMDb
# metadata, the derived movie age and the linear-model prediction.
# NOTE(review): the order presumably has to match the one used when the model was
# trained - confirm before reordering.
feature_columns = [
    'user_movie_ALS', 
    'user_genre_ALS', 
    'user_actor_ALS', 
    'averageRating', 
    'isAdult', 
    'runtimeMinutes', 
    'numVotes', 
    'movie_age',
    'lin_pred'
]

In [10]:
# Download the pickled model file from Google Drive.
download_file_from_google_drive('1eByw002RVPc0Fi7791PGBp7gHyA14TiW', 'catboost_model_v4.pkl')
# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file, and this
# file comes from a remote download - only run this cell if the source is trusted.
with open('catboost_model_v4.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [11]:
# Score every test row using only the columns listed in feature_columns.
als_final_test['prediction'] = model.predict(als_final_test[feature_columns])

In [12]:
# Write the final predictions: only the rating key columns and the prediction,
# without the DataFrame index.
als_final_test.to_csv(
    'prediction.csv',
    index=False,
    columns=['movieId', 'userId', 'timestamp', 'prediction']
)