#### Imports

In [72]:
import os
from pyspark.mllib.recommendation import ALS
import math

### Loading the Netflix dataset

In [73]:
netflix_dataset_path = "./../../Datasets/Netflix-prize-data/"

In [74]:
# Paths of the four rating files, combined_data_1.txt ... combined_data_4.txt,
# all located inside the dataset directory.
(combined_data_1_location,
 combined_data_2_location,
 combined_data_3_location,
 combined_data_4_location) = (
    os.path.join(netflix_dataset_path, f"combined_data_{part}.txt")
    for part in range(1, 5)
)

# Comma-separated list of the four paths (file 1 first, file 4 last).
help_location = ",".join((
    combined_data_1_location,
    combined_data_2_location,
    combined_data_3_location,
    combined_data_4_location,
))

Combine all data into the same Spark context:

In [75]:
combined_data = sc.textFile(help_location).cache()

In [76]:
print(f'There are {combined_data.count()} elements in the combined RDD')

There are 100498277 elements in the combined RDD


The available data consist of a movie id, followed by many reviews from different users. We wish to structure the data and group them by each individual user, since our target is to predict the rating that each user would assign to a movie. The following example shows that, for the movie with id 1, the user with id 1488844 has rated it with 3 stars out of 5, with the rating date being the 6th of September, 2005. The next lines follow the same pattern until all reviews are exhausted; then a single line containing '2:' shows up, followed by the corresponding reviews for the second movie, etc.

In order to efficiently perform the Alternating Least Squares algorithm, we will need to re-map these lines into the pattern 'user_id, movie_id, rating'. We do not care about the date of the ratings, since we do not intend to use them in our model.

#### Spark transformations

Remove the dates of ratings from the data:

In [77]:
def remove_dates(single_line):
    """
    Strip the trailing rating date from an already-split line.

    Inputs:
        single_line -> One line of the original RDD, represented as a list of strings.

    Outputs:
        The very same list when it holds exactly one element (a movie id line such as ['1:']);
        otherwise a new list with every element except the last one (the date).
    """

    # Movie id lines carry a single element and have no date to remove.
    is_movie_id_line = len(single_line) == 1
    return single_line if is_movie_id_line else single_line[:-1]

In [78]:
# Split every raw text line on commas, then drop the date field from the rating lines.
no_dates_rdd = (
    combined_data
    .map(lambda line: line.split(","))
    .map(remove_dates)
    .cache()
)

In [79]:
no_dates_rdd.take(10)

[['1:'],
 ['1488844', '3'],
 ['822109', '5'],
 ['885013', '4'],
 ['30878', '4'],
 ['823519', '3'],
 ['893988', '3'],
 ['124105', '4'],
 ['1248029', '3'],
 ['1842128', '4']]

#### Assign a movie id to the rating

In [80]:
def append_acc(file_path, acc):
    """
    Appends to acc, for every rating line of a data file, the id of the movie it belongs to.

    Inputs:
        file_path -> Path of a text file in which a line containing ':' introduces a movie id
                     and every other line is a rating of that movie.
        acc -> The list to extend. It is modified in place.

    Outputs:
        acc -> The same list, with one movie id appended per rating line, in file order.
               Movie ids are kept as strings (the text before the first ':').
               Rating lines met before any movie id line are recorded as -1.
    """

    current_movie_id = -1
    with open(file_path, 'r') as data_file:
        for raw_line in data_file:
            if ':' not in raw_line:
                # Rating line: tag it with the most recently seen movie id.
                acc.append(current_movie_id)
                continue
            # Movie id line: remember the id for the ratings that follow.
            current_movie_id = raw_line.partition(':')[0]

    return acc

In [82]:
# Walk the four data files in order, extending one shared list with the
# movie id of every rating line.
acc = []
for data_location in (
    combined_data_1_location,
    combined_data_2_location,
    combined_data_3_location,
    combined_data_4_location,
):
    acc = append_acc(data_location, acc)

In [83]:
len(acc)

100480507

From inspecting the 'combined_data_1.txt' file we can see that the movie id 2 is found in line 549. Taking into account that lines are indexed from number 1, and the first line contains the movie id 1, we expect to meet 547 reviews for the movie with id 1. Furthermore, since the accumulator list is 0-indexed, we expect index 546 to hold the last review of movie 1 and index 547 (the 548th review overall) to be the first review of the movie with id 2.

In [84]:
acc[546]

'1'

In [85]:
acc[547]

'2'

In order to be completely sure that no mistake was made, we check whether there is an unexpected value among the first 547 elements (all the reviews of movie 1):

In [86]:
def find_unexpected(lst, correct_elem):
    """
    Reports the first element of a list that differs from the expected one.

    Inputs:
        lst -> The list under consideration.
        correct_elem -> The value every element of the list is expected to equal.

    Returns:
        None

    Outputs:
        Prints the first unexpected element together with its position,
        or a confirmation message when every element matches.
    """

    # Lazily locate the first (position, element) pair that does not match.
    first_mismatch = next(
        ((position, item) for position, item in enumerate(lst) if item != correct_elem),
        None,
    )

    if first_mismatch is None:
        print("No unexpected element was found!")
        return

    index, elem = first_mismatch
    print(f'Found unexpected element: {elem} in position {index}')

In [87]:
find_unexpected(acc[:547], '1')

No unexpected element was found!


In [88]:
find_unexpected(acc[:548], '1')

Found unexpected element: 2 in position 547


Remove the movie id elements from the RDD

In [89]:
no_movie_ids_rdd = no_dates_rdd.filter(lambda elem: ':' not in elem[0]).cache()

We need to ensure that the number of elements in this RDD is the same as the number of elements in the accumulator:

In [91]:
no_movie_ids_rdd.count()

100480507

In [19]:
no_movie_ids_rdd.take(5)

[['1488844', '3'],
 ['822109', '5'],
 ['885013', '4'],
 ['30878', '4'],
 ['823519', '3']]

Since it is the same, we can proceed to the next steps!

Zip the RDDs with their index:

In [20]:
indexed_rdd = no_movie_ids_rdd.zipWithIndex().cache()

In [97]:
indexed_rdd.count()

100480507

In [21]:
indexed_rdd.take(5)

[(['1488844', '3'], 0),
 (['822109', '5'], 1),
 (['885013', '4'], 2),
 (['30878', '4'], 3),
 (['823519', '3'], 4)]

Now we can use the accumulator along with the indexes to assign a movie ID to each review, in the form of 
(user_id, movie_id, rating):

In [98]:
# `acc` is a driver-side Python list with one entry per review (~100M elements).
# Referencing it directly inside the lambda would serialize the whole list as part
# of the function's closure; a broadcast variable ships it to the executors as
# read-only shared data instead, and the lambda only captures the small handle.
acc_broadcast = sc.broadcast(acc)

# Each element of indexed_rdd is ([user_id, rating], index); the index selects the
# movie id recorded for that review, giving (user_id, movie_id, rating) as integers.
joint_rdd = indexed_rdd.map(
    lambda tpl: (int(tpl[0][0]), int(acc_broadcast.value[tpl[1]]), int(tpl[0][1]))
).cache()

In [99]:
joint_rdd.count()

100480507

In [115]:
joint_rdd.take(5)

[(1488844, 1, 3),
 (822109, 1, 5),
 (885013, 1, 4),
 (30878, 1, 4),
 (823519, 1, 3)]

Next step is to reorganize the reviews based on the user id. First we need to create a key-value pair schema:

In [116]:
key_value_schema = joint_rdd.map(lambda elem: (elem[0], (elem[1], elem[2]))).cache()

In [117]:
key_value_schema.count()

100480507

In [118]:
key_value_schema.take(5)

[(1488844, (1, 3)),
 (822109, (1, 5)),
 (885013, (1, 4)),
 (30878, (1, 4)),
 (823519, (1, 3))]

Now we just need to sort by key (the user id) and revert to the required form:

In [123]:
sorted_rdd = key_value_schema.sortByKey().cache()

In [124]:
sorted_rdd.count()

100480507

In [125]:
sorted_rdd.take(5)

[(6, (30, 3)), (6, (157, 3)), (6, (173, 4)), (6, (175, 5)), (6, (191, 2))]

Restore the sorted rdd in the requested format:

In [126]:
final_rdd = sorted_rdd.map(lambda elem: (elem[0], elem[1][0], elem[1][1])).cache()

In [129]:
final_rdd.count()

100480507

In [128]:
final_rdd.take(5)

[(6, 30, 3), (6, 157, 3), (6, 173, 4), (6, 175, 5), (6, 191, 2)]

## Alternating Least Squares in the Netflix Dataset

We split the data into training, validation and test sets. We have a huge amount of data, so the split will be performed in an 80%-10%-10% ratio respectively.

In [130]:
training_set, validation_set, test_set = final_rdd.randomSplit([8,1,1], seed= 400)

In [131]:
training_set.take(5)

[(6, 30, 3), (6, 175, 5), (6, 191, 2), (6, 197, 3), (6, 241, 3)]

In [132]:
validation_set.take(5)

[(6, 564, 4), (6, 658, 3), (6, 825, 3), (6, 1145, 3), (6, 1180, 3)]

In [133]:
test_set.take(5)

[(6, 157, 3), (6, 173, 4), (6, 445, 3), (6, 501, 3), (6, 705, 3)]

We also need to have a second instance of the validation and test sets without their ratings, so that we can predict the reviews and compare with the real ones:

In [134]:
validation_set_to_predict = validation_set.map(lambda elem: (elem[0], elem[1]))
test_set_to_predict = test_set.map(lambda elem: (elem[0], elem[1]))

#### Setting a handful of training parameters:

In [135]:
iterations = 10
regularization_parameters = [0.01, 0.05, 0.1]
ranks = [2, 4, 6, 8, 10, 12]

In [136]:
min_error = 1e3
best_rank = -1
best_regularization = -1

In [137]:
# Grid search: train one ALS model per (rank, regularization) pair and keep the
# pair that yields the lowest RMSE on the validation set.
for rank in ranks:
    
    for regularization_parameter in regularization_parameters:
    
        # Fit ALS on the training ratings with the current hyper-parameters
        # (fixed seed and iteration count for every run).
        model = ALS.train(training_set, rank, seed=0, iterations=iterations,
                          lambda_=regularization_parameter)
        # Predict the rating of every pair in validation_set_to_predict and key each
        # prediction by its first two fields (presumably (user, movie) - confirm against
        # the predictAll documentation).
        predictions = model.predictAll(validation_set_to_predict).map(lambda elem: ((elem[0], elem[1]), elem[2]))
        # Key the true validation ratings the same way and join on the key, producing
        # (key, (true_rating, predicted_rating)).
        true_and_predicted_rates = validation_set.map(lambda elem: ((int(elem[0]), int(elem[1])), float(elem[2]))).join(predictions)
        # Root mean squared error between the true and the predicted ratings.
        error = math.sqrt(true_and_predicted_rates.map(lambda elem: (elem[1][0] - elem[1][1])**2).mean())
        print('--------------------------------------')
        print (f'For rank {rank} and amount of regularization {regularization_parameter} the RMSE is {error}')
        # Remember the best configuration seen so far.
        if error < min_error:
            min_error = error
            best_rank = rank
            best_regularization = regularization_parameter

--------------------------------------
For rank 2 and amount of regularization 0.01 the RMSE is 0.8880375533537901
--------------------------------------
For rank 2 and amount of regularization 0.05 the RMSE is 0.8874903673135274
--------------------------------------
For rank 2 and amount of regularization 0.1 the RMSE is 0.89035637791651
--------------------------------------
For rank 4 and amount of regularization 0.01 the RMSE is 0.8757235966026625


KeyboardInterrupt: 