# Problem 1: Collaborative Filtering Approach

## Implementing the ALS approach for the Netflix data

### Loading the data

In [None]:
from pyspark import SparkContext
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession ,Row

In [3]:
# Root location of the ratings files. Change this to wherever the data lives.
dbfs_dir = 's3://archanamaroldsde.bucket/'
# Strip any trailing slash from the root before appending the file name so
# each URI contains exactly one separator; plain concatenation produced
# '...bucket//TestingRatings.txt'.
test = dbfs_dir.rstrip('/') + '/TestingRatings.txt'
train = dbfs_dir.rstrip('/') + '/TrainingRatings.txt'

In [4]:
from pyspark.sql.functions import regexp_extract
from pyspark.sql.types import *
# Read both ratings files as DataFrames of raw text lines (one string column,
# `value`, which the parsing cell splits). `sqlContext` is supplied by the
# notebook environment.
tests, trains = (sqlContext.read.text(path) for path in (test, train))

In [5]:
import pyspark.sql.functions as f
def _split_ratings(raw_df):
    """Turn a DataFrame of raw text lines into movieID / userID / rating columns.

    Each line in ``raw_df.value`` is split on "," and the first three fields
    are exposed as string columns. The work is expressed as column
    expressions, so rows are not routed through an RDD and Python just to be
    renamed.

    NOTE(review): a line with fewer than three fields is no longer rejected
    by ``toDF``; depending on Spark settings the missing fields come back as
    null. Confirm the input files are always well-formed.
    """
    fields = f.split(raw_df.value, ",")
    return raw_df.select(
        fields.getItem(0).alias("movieID"),
        fields.getItem(1).alias("userID"),
        fields.getItem(2).alias("rating"),
    )


test_data = _split_ratings(tests)
train_data = _split_ratings(trains)


In [6]:
def _with_numeric_types(ratings_df):
    """Return ``ratings_df`` with movieID/userID cast to int and rating to float."""
    target_types = (
        ("movieID", IntegerType()),
        ("userID", IntegerType()),
        ("rating", FloatType()),
    )
    for column_name, target_type in target_types:
        # withColumn with an existing name replaces that column in place.
        ratings_df = ratings_df.withColumn(
            column_name, ratings_df[column_name].cast(target_type)
        )
    return ratings_df


# Convert the parsed columns of both datasets to numeric types.
test_data = _with_numeric_types(test_data)
train_data = _with_numeric_types(train_data)

In [7]:
# Mark both DataFrames for caching (cache() returns the same DataFrame) and
# verify that the flag was set on each.
test_data, train_data = test_data.cache(), train_data.cache()
assert test_data.is_cached and train_data.is_cached

In [8]:
# Count the rows of each dataset and report the totals.
train_data_count, test_data_count = train_data.count(), test_data.count()
print('There are %s rows in the train datasets' % train_data_count)
print('There are %s rows in the test datasets' % test_data_count)

There are 3255352 rows in the train datasets
There are 100478 rows in the test datasets


### Split the test data into validation and test sets

In [11]:
# Fixed random seed; the ALS estimator below is configured with it via setSeed(seed).
seed = 1800009193

In [15]:
# Split the held-out ratings 50/50: test_data1 is used to choose the ALS rank,
# test_data2 is kept for the final RMSE. Use the shared `seed` variable rather
# than repeating the literal, so the seed is defined in exactly one place.
test_data1, test_data2 = test_data.randomSplit([0.5, 0.5], seed=seed)

### ALS model 

In [16]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

# ALS estimator with default settings; the following cell creates a fresh
# instance and configures it, so this one is not used for training.
als = ALS()

In [17]:
from pyspark.ml.recommendation import ALS

# Configure the ALS estimator once; only the rank changes during the search.
als = (
    ALS()
    .setMaxIter(5)
    .setSeed(seed)
    .setRegParam(0.1)
    .setUserCol("userID")
    .setItemCol("movieID")
    .setRatingCol("rating")
)

# RMSE between the "rating" label and the model's "prediction" column.
reg_eval = RegressionEvaluator(predictionCol="prediction", labelCol="rating", metricName="rmse")

tolerance = 0.03  # not referenced in this cell; kept in case another cell reads it
ranks = [4, 8, 12]
# Size the result lists from `ranks` so changing the candidate list cannot
# cause an IndexError (they used to be hard-coded to three slots).
errors = [0] * len(ranks)
models = [0] * len(ranks)
min_error = float('inf')
best_rank = -1  # index into `ranks` of the best model, not the rank value itself
for err, rank in enumerate(ranks):
  # Set the rank for this candidate model.
  als.setRank(rank)
  # Fit the model on the training ratings.
  model = als.fit(train_data)
  # Predict ratings for the validation half of the held-out data.
  predict_df = model.transform(test_data1)

  # Drop rows whose prediction is NaN so they do not turn the RMSE into NaN.
  # isnan() states the intent directly instead of relying on Spark SQL
  # treating NaN as equal to NaN in a `!=` comparison.
  predicted_ratings_df = predict_df.filter(~f.isnan(predict_df.prediction))
  error = reg_eval.evaluate(predicted_ratings_df)
  errors[err] = error
  models[err] = model
  print('For rank %s the RMSE is %s' % (rank, error))
  if error < min_error:
    min_error = error
    best_rank = err
# Leave the estimator configured with the winning rank and keep its model.
als.setRank(ranks[best_rank])
print('The best model was trained with rank %s' % ranks[best_rank])
my_model = models[best_rank]

For rank 4 the RMSE is 0.870632365922693
For rank 8 the RMSE is 0.8595219560970704
For rank 12 the RMSE is 0.8662888532087158
The best model was trained with rank 8


### Prediction and RMSE evaluator

In [19]:
# Score the second half of the held-out ratings with the selected model.
predict_df = my_model.transform(test_data2)

# Exclude NaN predictions before evaluating. isnan() makes the intent explicit
# instead of depending on Spark SQL's NaN-equals-NaN comparison semantics.
predicted_test_df = predict_df.filter(~f.isnan(predict_df.prediction))

test_RMSE = reg_eval.evaluate(predicted_test_df)
print('The model had a RMSE on the test set of {0}'.format(test_RMSE))

The model had a RMSE on the test set of 0.8637122515990271
