#### Imports

In [24]:
import os

### Small Movie Lens Dataset

In [25]:
small_dataset_path = './../../Datasets/movieLenSmall'

#### Ratings

In [26]:
small_ratings_path = os.path.join(small_dataset_path, 'ratings.csv')
small_ratings = sc.textFile(name=small_ratings_path)

Remove the header line from the ratings

In [27]:
small_ratings_header = small_ratings.take(1)[0]

In [28]:
small_ratings_header

'userId,movieId,rating,timestamp'

In [29]:
small_ratings_data = small_ratings.filter(lambda line: line!=small_ratings_header)\
    .map(lambda line: line.split(",")).map(lambda tokens: (tokens[0],tokens[1],tokens[2])).cache()

In [30]:
small_ratings_data.take(3)

[('1', '1', '4.0'), ('1', '3', '4.0'), ('1', '6', '4.0')]

#### Movies

In [31]:
small_movies_path = os.path.join(small_dataset_path, 'movies.csv')
small_movies = sc.textFile(name=small_movies_path)

In [32]:
small_movies_header = small_movies.take(1)[0]

In [33]:
small_movies_header

'movieId,title,genres'

In [34]:
small_movies_data = small_movies.filter(lambda line: line!=small_movies_header)\
    .map(lambda line: line.split(",")).map(lambda tokens: (tokens[0],tokens[1])).cache()

In [35]:
small_movies_data.take(3)

[('1', 'Toy Story (1995)'),
 ('2', 'Jumanji (1995)'),
 ('3', 'Grumpier Old Men (1995)')]

#### Declare Datasets

In [36]:
training_small_ratings, validation_small_ratings_temp, test_small_ratings_temp = small_ratings_data.randomSplit([6,2,2], seed= 400)

In [37]:
validation_small_ratings = validation_small_ratings_temp.map(lambda x: (x[0], x[1]))

In [38]:
test_small_ratings = test_small_ratings_temp.map(lambda x: (x[0], x[1]))

In [39]:
validation_small_ratings.take(5)

[('1', '47'), ('1', '50'), ('1', '110'), ('1', '333'), ('1', '362')]

### Training a recommender with Aternarting Least Squares

In [40]:
from pyspark.mllib.recommendation import ALS
import math

#### Setting the training parameters

In [22]:
iterations = 10
regularization_parameter = 0.1
ranks = [4, 8, 12]
errors = [0, 0, 0]
err = 0

In [18]:
min_error = 1e3
best_rank = -1
best_iteration = -1

In [19]:
for rank in ranks:
    
    model = ALS.train(training_small_ratings, rank, seed=0, iterations=iterations,
                      lambda_=regularization_parameter)
    predictions = model.predictAll(validation_small_ratings).map(lambda r: ((r[0], r[1]), r[2]))
    rates_and_preds = validation_small_ratings_temp.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
    error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
    errors[err] = error
    err += 1
    print (f'For rank {rank} the RMSE is {error}')
    if error < min_error:
        min_error = error
        best_rank = rank

For rank 4 the RMSE is 0.9004611692709928
For rank 8 the RMSE is 0.9035024272363643
For rank 12 the RMSE is 0.9068175717565644


In [20]:
print(f'The best model was derived from rank:{best_rank} with a minimum error of {min_error}')

The best model was derived from rank:4 with a minimum error of 0.9004611692709928


In [31]:
predictions.take(3)

[((232, 45208), 1.4357010972030038),
 ((368, 3272), 3.131348041802032),
 ((610, 52328), 4.125442899653168)]

In [32]:
model = ALS.train(training_small_ratings, rank, seed=5, iterations=iterations,
                  lambda_=regularization_parameter)
predictions = model.predictAll(test_small_ratings).map(lambda r: ((r[0], r[1]), r[2]))
rates_and_preds = test_small_ratings_temp.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())

In [33]:
print(f'Test set RMSE error: {error}')

Test set RMSE error: 0.918218211778835


#### Making predictions

Loading the movies file:

In [41]:
small_movies_file = os.path.join(small_dataset_path, 'movies.csv')
small_movies = sc.textFile(small_movies_file)
small_movies_header = small_movies.take(1)[0]

In [42]:
small_movies_data = small_movies.filter(lambda line: line!=small_movies_header)\
    .map(lambda line: line.split(",")).map(lambda tokens: (int(tokens[0]),tokens[1],tokens[2])).cache()

In [43]:
small_movies_titles = small_movies_data.map(lambda x: (int(x[0]),x[1]))

In [44]:
small_movies_titles.take(10)

[(1, 'Toy Story (1995)'),
 (2, 'Jumanji (1995)'),
 (3, 'Grumpier Old Men (1995)'),
 (4, 'Waiting to Exhale (1995)'),
 (5, 'Father of the Bride Part II (1995)'),
 (6, 'Heat (1995)'),
 (7, 'Sabrina (1995)'),
 (8, 'Tom and Huck (1995)'),
 (9, 'Sudden Death (1995)'),
 (10, 'GoldenEye (1995)')]

In [45]:
def avg(lst):
    
    return sum(lst)/len(lst)

small_movie_ID_with_ratings_RDD = (small_ratings_data.map(lambda x: (x[1], x[2])).groupByKey()).map(lambda x : (x[0], [float(elem) for elem in list(x[1])]))
small_movie_rating_counts_RDD = small_movie_ID_with_ratings_RDD.map(lambda x: (int(x[0]), (avg(x[1]), len(x[1]))))

In [46]:
small_movie_rating_counts_RDD.take(3)

[(1, (3.9209302325581397, 215)),
 (50, (4.237745098039215, 204)),
 (70, (3.5090909090909093, 55))]

Create a fake user with ID 0 and assign some ratings

In [148]:
new_user_ID = 0

# The format of each line is (userID, movieID, rating)
new_user_ratings = [
     (0,1,5), # Toy Story (1995)
     (0,2,5), # Jumanji (1995)
     (0,3,2), # Grumpier Old Men (1995)
     (0,4,1), # Waiting to Exhale (1995)
     (0,5,1), # Father of the Bride Part II (1995)
     (0,6,3), # Heat (1995)
     (0,7,1), # Sabrina (1995)
     (0,8,2), # Tom and Huck (1995)
     (0,9,3) , # Sudden Death (1995)
     (0,10,4) # GoldenEye (1995)
    ]
new_user_ratings_RDD = sc.parallelize(new_user_ratings)

In [149]:
complete_data_with_new_ratings_RDD = small_ratings_data.union(new_user_ratings_RDD)

In [129]:
from time import time

start = time()

new_ratings_model = ALS.train(complete_data_with_new_ratings_RDD, best_rank, seed=0, 
                              iterations=iterations, lambda_=regularization_parameter)

print(f'Model trained in {time() - start} seconds')

Model trained in 1.9609730243682861 seconds


In [150]:
new_user_ratings_ids = map(lambda x: x[1], new_user_ratings) # get just movie IDs
new_user_unrated_movies_RDD = (small_movies_data.filter(lambda x: x[0] not in new_user_ratings_ids).map(lambda x: (new_user_ID, x[0])))

# Use the input RDD, new_user_unrated_movies_RDD, with new_ratings_model.predictAll() to predict new ratings for the movies
new_user_recommendations_RDD = new_ratings_model.predictAll(new_user_unrated_movies_RDD)

Show the extracted recommendations

In [153]:
new_user_recommendations_rating_title_and_count_RDD.take(2)

[(69720, ((3.845444347349426, 'Hood of Horror (2006)'), (4.5, 1))),
 (3240, ((3.0143997470382344, '"Big Tease'), (3.25, 2)))]

In [154]:
new_user_recommendations_rating_RDD = new_user_recommendations_RDD.map(lambda x: (x.product, x.rating))
new_user_recommendations_rating_title_and_count_RDD = \
    new_user_recommendations_rating_RDD.join(small_movies_titles).join(small_movie_rating_counts_RDD)
new_user_recommendations_rating_title_and_count_RDD.take(5) 

[(69720, ((3.845444347349426, 'Hood of Horror (2006)'), (4.5, 1))),
 (3240, ((3.0143997470382344, '"Big Tease'), (3.25, 2))),
 (98160, ((1.2273797070898143, 'Nature Calls (2012)'), (1.5, 1))),
 (1320,
  ((2.8746905379925787, 'Alien³ (a.k.a. Alien 3) (1992)'),
   (3.148936170212766, 47))),
 (2040,
  ((3.0644923806019424, '"Computer Wore Tennis Shoes'),
   (2.857142857142857, 7)))]

In [155]:
new_user_recommendations_rating_title_and_count_RDD = \
    new_user_recommendations_rating_title_and_count_RDD.map(lambda r: (r[1][0][1], r[1][0][0], r[1][1]))

Take top recommendations with more than 20 reviews:

In [169]:
top_movies = new_user_recommendations_rating_title_and_count_RDD.filter(lambda r: r[2][1]>=20).takeOrdered(10, key=lambda x: -x[1])

In [170]:
for elem in top_movies:
    
    print('-----------')
    print(f'{elem[0]}, Rate {elem[2][0]} from {elem[2][1]} users, with predicted review: {elem[1]}')

-----------
Three Colors: Blue (Trois couleurs: Bleu) (1993), Rate 3.8958333333333335 from 24 users, with predicted review: 4.5125877441539695
-----------
The Lego Movie (2014), Rate 3.870967741935484 from 31 users, with predicted review: 4.465857609239915
-----------
Moonrise Kingdom (2012), Rate 3.7758620689655173 from 29 users, with predicted review: 4.430650711775922
-----------
"Philadelphia Story, Rate 4.310344827586207 from 29 users, with predicted review: 4.420493800894658
-----------
Little Big Man (1970), Rate 4.145833333333333 from 24 users, with predicted review: 4.366707043305809
-----------
Thor: Ragnarok (2017), Rate 4.025 from 20 users, with predicted review: 4.358617201286712
-----------
Logan (2017), Rate 4.28 from 25 users, with predicted review: 4.350759028924431
-----------
Dr. Horrible's Sing-Along Blog (2008), Rate 3.9166666666666665 from 24 users, with predicted review: 4.350511012919956
-----------
Dogville (2003), Rate 4.025 from 20 users, with predicted revie

### MovieLens Big dataset 

In [171]:
full_dataset_path = './../../Datasets/MovieLensFull'
ratings_path = os.path.join(full_dataset_path, 'ratings.csv')
ratings = sc.textFile(name=ratings_path)

In [None]:
ratings_header = ratings.take(1)[0]
ratings_data = ratings.filter(lambda line: line!=ratings_header)\
    .map(lambda line: line.split(",")).map(lambda tokens: (int(tokens[0]),int(tokens[1]),float(tokens[2]))).cache()

In [178]:
movies_file = os.path.join(full_dataset_path, 'movies.csv')
movies = sc.textFile(small_movies_file)
movies_header = small_movies.take(1)[0]
movies_data = movies.filter(lambda line: line!=movies_header)\
    .map(lambda line: line.split(",")).map(lambda tokens: (int(tokens[0]),tokens[1],tokens[2])).cache()

In [181]:
movies_titles = movies_data.map(lambda x: (int(x[0]),x[1]))

In [182]:
movie_ID_with_ratings_RDD = (ratings_data.map(lambda x: (x[1], x[2])).groupByKey()).map(lambda x : (x[0], [float(elem) for elem in list(x[1])]))
movie_rating_counts_RDD = movie_ID_with_ratings_RDD.map(lambda x: (int(x[0]), (avg(x[1]), len(x[1]))))

In [183]:
movie_rating_counts_RDD.take(1)

[(1449, (3.918377748652978, 6867))]

#### Create a user with reviews

In [205]:
full_user_ID = 0

# The format of each line is (userID, movieID, rating)
full_new_user_ratings = [
     (0,260,4), # Star Wars (1977)
     (0,1,3), # Toy Story (1995)
     (0,16,3), # Casino (1995)
     (0,25,4), # Leaving Las Vegas (1995)
     (0,32,4), # Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
     (0,335,1), # Flintstones, The (1994)
     (0,379,1), # Timecop (1994)
     (0,296,3), # Pulp Fiction (1994)
     (0,858,5) , # Godfather, The (1972)
     (0,50,4) # Usual Suspects, The (1995)
    ]
full_new_user_ratings_RDD = sc.parallelize(full_new_user_ratings)

In [201]:
complete_ratings = ratings_data.union(full_new_user_ratings_RDD)

#### Full model

In [202]:
training_RDD, test_RDD = complete_ratings.randomSplit([7, 3])

In [203]:
start = time()

full_model = ALS.train(training_RDD, best_rank, seed=0, 
                              iterations=iterations, lambda_=regularization_parameter)

print(f'Full model trained in {time() - start} seconds')

Full model trained in 54.59347486495972 seconds


In [211]:
full_user_ratings_ids = map(lambda x: x[1], full_new_user_ratings) # get just movie IDs
full_user_unrated_movies_RDD = (movies_data.filter(lambda x: x[0] not in full_user_ratings_ids).map(lambda x: (full_user_ID, x[0])))

# Use the input RDD, new_user_unrated_movies_RDD, with new_ratings_model.predictAll() to predict new ratings for the movies
full_user_recommendations_RDD = full_model.predictAll(full_user_unrated_movies_RDD)

In [None]:
full_user_recommendations_rating_RDD = full_user_recommendations_RDD.map(lambda x: (x.product, x.rating))
full_user_recommendations_rating_title_and_count_RDD = \
    full_user_recommendations_rating_RDD.join(small_movies_titles).join(movie_rating_counts_RDD)

In [None]:
full_user_recommendations_rating_title_and_count_RDD = \
    full_user_recommendations_rating_title_and_count_RDD.map(lambda r: (r[1][0][1], r[1][0][0], r[1][1]))

#### Extract full dataset top recommendations

In [None]:
top_movies_full = full_user_recommendations_rating_title_and_count_RDD.filter(lambda r: r[2][1]>=20).takeOrdered(10, key=lambda x: -x[1])

In [None]:
for elem in top_movies_full:
    
    print('-----------')
    print(f'{elem[0]}, Rate {elem[2][0]} from {elem[2][1]} users, with predicted review: {elem[1]}')