# Model to Predict Movie Ratings

## Construct Model

In [33]:
# import libraries

from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS, ALSModel
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from cold_start import get_cold_start_rating
import pyspark
import time

In [2]:
# Instantiate a SparkSession in local mode. getOrCreate() returns the session
# that is already running, if any, so re-running this cell is safe.
spark = (
    pyspark.sql.SparkSession
    .builder
    .master("local[*]")
    .getOrCreate()
)

In [3]:
# Load the ratings JSON file into a Spark DataFrame
ratings_reader = spark.read
movie_ratings = ratings_reader.json('data/ratings.json')

In [4]:
# Print the schema of the loaded ratings (column names, types, nullability)
# to confirm the JSON was parsed as expected
movie_ratings.printSchema()

root
 |-- movie_id: long (nullable = true)
 |-- rating: long (nullable = true)
 |-- timestamp: double (nullable = true)
 |-- user_id: long (nullable = true)



In [5]:
# Pull the ratings into a pandas DataFrame so the timestamp column can be
# parsed as datetimes and the per-column non-null counts inspected
all_rating_columns = movie_ratings.select('*')
movies_df = all_rating_columns.toPandas()
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 719949 entries, 0 to 719948
Data columns (total 4 columns):
movie_id     719949 non-null int64
rating       719949 non-null int64
timestamp    719949 non-null float64
user_id      719949 non-null int64
dtypes: float64(1), int64(3)
memory usage: 22.0 MB


In [6]:
# Interpret the timestamps as epoch seconds, reduce them to calendar years and
# count ratings per year; every rating turns out to be from 2000
rating_datetimes = pd.to_datetime(movies_df['timestamp'], unit='s')
date = rating_datetimes.dt.year
date.value_counts()

2000    719949
Name: timestamp, dtype: int64

In [7]:
# Drop the timestamp column for now: every rating is from the year 2000, so the
# year carries no signal. Note that this rebinds movie_ratings to the
# timestamp-free DataFrame.

movie_ratings = movie_ratings.drop('timestamp')

In [8]:
# Split data into training (80%) and test (20%) sets.
# The split is seeded so the same rows land in each set on every run; without
# a seed the RMSE reported below changes on every re-run even though ALS
# itself is seeded.
(training, test) = movie_ratings.randomSplit([.8, .2], seed=42)

In [9]:
# Configure the ALS recommender and fit it on the training split.
# coldStartStrategy is not set here; the later cells look for null (NaN)
# predictions and fill them with the cold-start helper.
als_settings = dict(
    maxIter=10,
    rank=10,
    userCol='user_id',
    itemCol='movie_id',
    ratingCol='rating',
    seed=42,
)
als = ALS(**als_settings)

model = als.fit(training)

In [10]:
# Generate predictions for the held-out test split

predictions = model.transform(test)
# persist() so Spark can reuse the computed predictions when they are
# collected into pandas in the next cell
predictions.persist()

DataFrame[movie_id: bigint, rating: bigint, user_id: bigint, prediction: float]

In [17]:
# Collect the test-set predictions into a pandas DataFrame
all_predictions = predictions.select('*')
pred_df = all_predictions.toPandas()

# Count the predictions that came back null (NaN)
null_prediction_mask = pred_df['prediction'].isna()
print(null_prediction_mask.sum())

# Fallback helpers (used before the cold-start function was complete): fill a null prediction with the user's average prediction
def user_average(user, df):
    """Return the mean of the 'prediction' column over the rows of ``df``
    whose 'user_id' equals ``user``.

    Falls back to 3 when that mean is NaN (the user has no rows, or only
    null predictions).
    """
    belongs_to_user = df['user_id'] == user
    mean_prediction = df.loc[belongs_to_user, 'prediction'].mean()
    return 3 if np.isnan(mean_prediction) else mean_prediction
    
def compute_user_average_if_null(row):
    """Return the row's prediction, or the user's average when it is null.

    ``row`` must provide 'prediction' and 'user_id'. The average is computed
    by ``user_average`` over the notebook-level ``pred_df``.
    """
    prediction = row['prediction']
    if not np.isnan(prediction):
        return prediction
    return user_average(row['user_id'], pred_df)
    
# Load the lookup tables that the cold-start function takes as arguments
user_df = pd.read_csv('data/user_cluster.csv', index_col=0)
u_clusters = pd.read_csv('data/u_info.csv', index_col=0)
ratings_df = pd.read_csv('data/movie_cluster_avg.csv', index_col=0)

# Print how many predictions are still null before the cold-start fill
needs_cold_start = pred_df['prediction'].isna()
print(needs_cold_start.sum())

# Fill each null prediction with the cold-start function's rating
for row_label, null_row in pred_df[needs_cold_start].iterrows():
    cold_start_rating = get_cold_start_rating(
        null_row['user_id'],
        null_row['movie_id'],
        user_df,
        u_clusters,
        ratings_df,
    )
    pred_df.loc[row_label, 'prediction'] = cold_start_rating

# Check that all nulls are gone (prints False when none remain)
print(pred_df['prediction'].isna().any())

# Convert back to a Spark DataFrame so the evaluator can consume it
predictions = spark.createDataFrame(pred_df)

In [18]:
# Score the filled-in test predictions against the true ratings with RMSE
evaluator = RegressionEvaluator(
    labelCol='rating',
    predictionCol='prediction',
    metricName='rmse',
)

rmse = evaluator.evaluate(predictions)
print(rmse)

0.8772095007527893


In [None]:
# Create a parameter grid.
# coldStartStrategy is pinned to 'drop' for every candidate: a fold's
# validation split can contain users/movies that are absent from that fold's
# training split, ALS predicts NaN for those rows, and a NaN RMSE makes it
# impossible to rank the candidates.
params = (ParamGridBuilder()
          .baseOn({als.coldStartStrategy: 'drop'})
          .addGrid(als.regParam, [1, 0.01, 0.001, 0.1])
          .addGrid(als.maxIter, [5, 10, 20])
          .addGrid(als.rank, [4, 10, 50])
          .build())

# Cross validate for best hyperparameters (seeded so the folds are repeatable)
cv = CrossValidator(estimator=als, estimatorParamMaps=params, evaluator=evaluator,
                    parallelism=4, seed=42)

# Fit and store model
best_model = cv.fit(movie_ratings)
# NOTE: the winning model was fitted with coldStartStrategy='drop', so its
# transform() leaves out rows for unseen users/movies instead of returning NaN.
# Keep that in mind before feeding it to the cold-start fill used elsewhere.
als_model = best_model.bestModel

## Use Model to Predict Ratings From Requests Data

In [19]:
# Load requests json file into a spark dataframe
requests = spark.read.json("data/requests.json")

# Score the requests with the ALS model fitted earlier (`model`) and move the
# result into a pandas DataFrame
requests_scored = model.transform(requests)
requests_predictions = requests_scored.toPandas()

# Report whether ALS left any predictions null
print(requests_predictions['prediction'].isna().any())

# Fill the null predictions with the cold-start model
requests_missing = requests_predictions['prediction'].isna()
for row_label, request_row in requests_predictions[requests_missing].iterrows():
    requests_predictions.loc[row_label, 'prediction'] = get_cold_start_rating(
        request_row['user_id'],
        request_row['movie_id'],
        user_df,
        u_clusters,
        ratings_df,
    )

# Report whether any nulls survived the cold-start fill
print(requests_predictions['prediction'].isna().any())

# Fall back to a rating of 3 for anything that is still null
still_missing = requests_predictions['prediction'].isna()
requests_predictions.loc[still_missing, ['prediction']] = 3

# Check that no nulls remain
print(requests_predictions['prediction'].isna().any())

# Put the output columns in a fixed order and export one JSON record per line
cols = ['user_id','movie_id', 'rating', 'timestamp', 'prediction']
requests_predictions = requests_predictions[cols]
requests_predictions.to_json(r"data/predictions.json", orient='records', lines=True)

## Generate Top 10 User Recommendations

In [None]:
def get_recommendation_for_user(user_id, predictions_path="data/predictions.json", top_n=10):
    """Return the user's top recommendations, highest predicted rating first.

    Reads a file with one JSON record per line; each record must carry
    'user_id', 'movie_id' and 'prediction'.

    Parameters
    ----------
    user_id : value compared for equality with each record's 'user_id'.
    predictions_path : path of the predictions file
        (defaults to "data/predictions.json").
    top_n : maximum number of recommendations returned (defaults to 10).

    Returns
    -------
    list of dict
        Up to ``top_n`` dicts of the form {'movie_id': ..., 'rating': ...},
        where 'rating' is the record's predicted rating, sorted in descending
        order of 'rating'. Empty when the user has no records.
    """
    import json

    recommendation_list = []
    # Stream the file line by line inside the `with` block so the handle is
    # closed as soon as reading finishes.
    with open(predictions_path, "r") as predictions_file:
        for line in predictions_file:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            record = json.loads(line)
            if record['user_id'] == user_id:
                recommendation_list.append({'movie_id': record['movie_id'],
                                            'rating': record['prediction']})

    sorted_list = sorted(recommendation_list, key=lambda k: k['rating'], reverse=True)
    return sorted_list[:top_n]