In [1]:
sc

In [2]:
#pip install sklearn

In [3]:
#pip install seaborn

In [4]:
from pyspark.sql.types import *
from pyspark.sql.functions import regexp_extract
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

In [5]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [6]:
import numpy as np
import math
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import scipy.sparse as sp
from scipy.sparse.linalg import svds

## Problem 1: Collaborative Filtering Approach

#### Data Loading Entire Process

In [7]:
dbfs_dir = 's3://dsci6007bucket/Netflix/'
training = dbfs_dir + '/TrainingRatings.txt'
test = dbfs_dir + '/TestingRatings.txt'
movie = dbfs_dir + '/movie_titles.txt'

In [8]:
trainingstructure = StructType([StructField('MovieID', IntegerType()),StructField('UniqueID', IntegerType()),StructField('Ratings',DoubleType())])
teststructure = StructType([StructField('MovieID', IntegerType()),StructField('UniqueID', IntegerType()),StructField('Ratings',DoubleType())])
moviestructure = StructType([StructField('MovieID', IntegerType()),StructField('ReleaseYear', IntegerType()),StructField('Title', StringType())])

In [9]:
trainingfile = sc.textFile(training)
testfile = sc.textFile(test)
moviefile = sc.textFile(movie)

In [10]:
trainingdata = sqlContext.read.format('txt').options(inferSchema = True).schema(trainingstructure).csv(training)
testdata = sqlContext.read.format('txt').options(inferSchema = True).schema(teststructure).csv(test)
moviedata = sqlContext.read.format('txt').options(inferSchema = True).schema(moviestructure).csv(movie)

In [11]:
print(trainingdata.cache())
print(testdata.cache())
print(moviedata.cache())

DataFrame[MovieID: int, UniqueID: int, Ratings: double]
DataFrame[MovieID: int, UniqueID: int, Ratings: double]
DataFrame[MovieID: int, ReleaseYear: int, Title: string]


In [12]:
assert trainingdata.is_cached
assert testdata.is_cached
assert moviedata.is_cached

In [13]:
trainingdf = trainingdata.toPandas()
print(trainingdf.head())

                                                                                

   MovieID  UniqueID  Ratings
0        8   1744889      1.0
1        8   1395430      2.0
2        8   1205593      4.0
3        8   1488844      4.0
4        8   1447354      1.0


In [14]:
testdf = testdata.toPandas()
print(testdf.head())

   MovieID  UniqueID  Ratings
0        8    573364      1.0
1        8   2149668      3.0
2        8   1089184      3.0
3        8   2465894      3.0
4        8    534508      1.0


In [15]:
train, testing= trainingdata.randomSplit([0.75, 0.25])
als = ALS(maxIter=5, regParam=0.01, userCol="UniqueID", itemCol="MovieID", ratingCol="Ratings",coldStartStrategy="drop")
model = als.fit(train)

                                                                                

#### RMSE Comparison of 3 Ranks

In [16]:
rankrange = [4, 8, 12]
for i in range(len(rankrange)):
    print(i)
    als = ALS(maxIter = 10, rank = rankrange[i], regParam = 0.01, userCol="UniqueID", 
    itemCol = "MovieID", ratingCol = "Ratings",coldStartStrategy="drop")
    model = als.fit(train)
    forecast = model.transform(testing)
    confirm = RegressionEvaluator(metricName="rmse", labelCol="Ratings",predictionCol="prediction")
    rmse = confirm.evaluate(forecast)
    print("The Root Mean Square Error is", str(rmse))
    forecast.show()

0


                                                                                

The Root Mean Square Error is 0.8648067575422841


                                                                                

+-------+--------+-------+----------+
|MovieID|UniqueID|Ratings|prediction|
+-------+--------+-------+----------+
|     28|  446160|    3.0| 4.0298233|
|     28| 1629521|    3.0|  2.837424|
|     28| 2250628|    2.0|  3.257722|
|    111|  675056|    3.0| 3.2367122|
|    111| 1497891|    2.0|  2.368916|
|    122|   15846|    4.0| 3.1624823|
|    156|  279120|    3.0| 2.9683099|
|    305| 1552084|    5.0| 4.1221414|
|    359|   15846|    5.0|  4.079527|
|    361|  128389|    2.0| 3.8315132|
|    442|  279120|    3.0| 3.5625172|
|    442| 1434507|    5.0|  5.007493|
|    442| 2250628|    3.0| 3.2701845|
|    442| 2311863|    4.0|  3.798782|
|    443|  637596|    3.0| 3.4835799|
|    443|  973051|    5.0| 4.6220527|
|    443| 1189060|    5.0| 3.7346563|
|    443| 1497891|    4.0| 3.5812035|
|    443| 2531111|    5.0| 3.8557158|
|    452|  637596|    3.0|  2.806849|
+-------+--------+-------+----------+
only showing top 20 rows

1


                                                                                

The Root Mean Square Error is 0.8577371588780843


                                                                                

+-------+--------+-------+----------+
|MovieID|UniqueID|Ratings|prediction|
+-------+--------+-------+----------+
|     28|  446160|    3.0|  3.516647|
|     28| 1629521|    3.0| 3.0581145|
|     28| 2250628|    2.0| 3.4961662|
|    111|  675056|    3.0| 3.5252223|
|    111| 1497891|    2.0|  2.107324|
|    122|   15846|    4.0| 3.4785147|
|    156|  279120|    3.0| 3.0542338|
|    305| 1552084|    5.0|  4.062698|
|    359|   15846|    5.0| 3.2768033|
|    361|  128389|    2.0| 3.6015518|
|    442|  279120|    3.0| 3.5616462|
|    442| 1434507|    5.0| 5.1433578|
|    442| 2250628|    3.0| 3.0847418|
|    442| 2311863|    4.0|    3.9226|
|    443|  637596|    3.0| 3.9304638|
|    443|  973051|    5.0|  4.726842|
|    443| 1189060|    5.0| 3.7877574|
|    443| 1497891|    4.0| 3.3820739|
|    443| 2531111|    5.0|  3.888372|
|    452|  637596|    3.0|  3.132868|
+-------+--------+-------+----------+
only showing top 20 rows

2


                                                                                

The Root Mean Square Error is 0.8716361185269648


                                                                                

+-------+--------+-------+----------+
|MovieID|UniqueID|Ratings|prediction|
+-------+--------+-------+----------+
|     28|  446160|    3.0|  3.632871|
|     28| 1629521|    3.0| 2.6459694|
|     28| 2250628|    2.0|  3.273481|
|    111|  675056|    3.0| 3.5345328|
|    111| 1497891|    2.0| 2.1757822|
|    122|   15846|    4.0|  3.069762|
|    156|  279120|    3.0| 2.8434906|
|    305| 1552084|    5.0|  4.236683|
|    359|   15846|    5.0| 3.5888903|
|    361|  128389|    2.0| 3.7081928|
|    442|  279120|    3.0| 3.7547154|
|    442| 1434507|    5.0| 5.0243587|
|    442| 2250628|    3.0|  3.063021|
|    442| 2311863|    4.0| 3.8958054|
|    443|  637596|    3.0| 4.1032968|
|    443|  973051|    5.0| 4.8048162|
|    443| 1189060|    5.0|  3.874553|
|    443| 1497891|    4.0|   4.00542|
|    443| 2531111|    5.0|  4.032094|
|    452|  637596|    3.0| 3.7973113|
+-------+--------+-------+----------+
only showing top 20 rows



#### Result:
0, 1, 2 correspond to ranks 4, 8, and 12 respectively. We can see that Rank 8 is the best model because it has the lowest RMSE of all the 3 ranks. Rank 8 RMSE is about 0.857.

#### Model Prediction and Evaluation on RMSE

In [17]:
forecast = model.transform(testing)
confirm = RegressionEvaluator(metricName="rmse", labelCol="Ratings",predictionCol="prediction")
rmse = confirm.evaluate(forecast)
print("The Root Mean Square Error on the test set is", str(rmse))



The Root Mean Square Error on the test set is 0.8716361185269645


                                                                                

## Problem 2: Analyzing the Netflix Data

#### Distinct items and how many distinct users present in the test set

In [18]:
len(testdf['UniqueID'].unique())

27555

In [19]:
len(testdf['MovieID'].unique())

1701

#### movie_for_user rating process

In [20]:
def get_the_mean_value(user):
    total = 0
    count = 0
    for y in range(len(user)):
        if(user[y]==0):
            total = total + 0
        else:
            total = user[y] +total
            count = 1 + count
    average = total/count
    return average

In [21]:
movie_for_user = trainingdf.pivot_table(index='UniqueID', columns='MovieID', values='Ratings')
movie_for_user.head()

MovieID,8,28,43,48,61,64,66,92,96,111,...,17654,17660,17689,17693,17706,17725,17728,17734,17741,17742
UniqueID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7,5.0,4.0,,,,,,,,,...,,,,,,,,,,
79,,,,,,,,,,,...,,,,,,,,,,
199,,,,,,,,,,4.0,...,,,,,,,,,,
481,,,,,,,,,,5.0,...,,,,,,,,,,
769,,,,,,,,,,,...,,,,,,,,,,


In [22]:
movie_for_user.fillna(0, inplace= True)
movie_for_user.head()

MovieID,8,28,43,48,61,64,66,92,96,111,...,17654,17660,17689,17693,17706,17725,17728,17734,17741,17742
UniqueID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7,5.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
199,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
481,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
769,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
movie_for_user[8915]

UniqueID
7          0.0
79         0.0
199        4.0
481        0.0
769        3.0
          ... 
2648869    4.0
2648885    4.0
2649120    4.0
2649267    0.0
2649285    3.0
Name: 8915, Length: 28978, dtype: float64

In [24]:
user1664010= list(movie_for_user.loc[1664010])
avg1664010= get_the_mean_value(user1664010)
print(avg1664010)

4.2384364820846905


In [25]:
user2439493= list(movie_for_user.loc[2439493])
avg2439493= get_the_mean_value(user2439493)
print(avg2439493)

1.225609756097561


In [26]:
user305344= list(movie_for_user.loc[305344])
avg305344= get_the_mean_value(user305344)
print(avg305344)

1.904382470119522


In [27]:
user387418 = list(movie_for_user.loc[387418])
avg387418 = get_the_mean_value(user387418)
print(avg387418)

1.8405963302752293


In [28]:
user1314869 = list(movie_for_user.loc[1314869])
avg1314869 = get_the_mean_value(user1314869)
print(avg1314869)

2.970984455958549


In [29]:
estimate_item_user = (avg1314869+avg387418+avg305344+avg2439493+avg1664010)/5
print("From these 5 users, The Average Movie Rating by user", estimate_item_user)

From these 5 users, The Average Movie Rating by user 2.4360018989071106


#### user_for_movie rating process

In [30]:
user_for_movie = trainingdf.pivot_table(index='MovieID', columns='UniqueID', values='Ratings')
user_for_movie.head()

UniqueID,7,79,199,481,769,906,1310,1333,1427,1442,...,2648572,2648589,2648730,2648734,2648853,2648869,2648885,2649120,2649267,2649285
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,5.0,,,,,,,3.0,,,...,,,,,,,,,,
28,4.0,,,,,3.0,3.0,2.0,,4.0,...,,3.0,4.0,,2.0,,4.0,,,4.0
43,,,,,,,,,,,...,,,,,,,,,,
48,,,,,,,,,,,...,,,,,,,,,,
61,,,,,,,,,,,...,,,,,,,,,,


In [31]:
user_for_movie.fillna(0, inplace= True)
user_for_movie.head()

UniqueID,7,79,199,481,769,906,1310,1333,1427,1442,...,2648572,2648589,2648730,2648734,2648853,2648869,2648885,2649120,2649267,2649285
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,5.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28,4.0,0.0,0.0,0.0,0.0,3.0,3.0,2.0,0.0,4.0,...,0.0,3.0,4.0,0.0,2.0,0.0,4.0,0.0,0.0,4.0
43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
48,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
61,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
movie6971 = list(user_for_movie.loc[6971])
avg6971 = get_the_mean_value(movie6971)
print(avg6971)

4.071815611748076


In [33]:
movie4640 = list(user_for_movie.loc[4640])
avg4640 = get_the_mean_value(movie4640)
print(avg4640)

4.047438894792774


In [34]:
movie6287 = list(user_for_movie.loc[6287])
avg6287 = get_the_mean_value(movie6287)
print(avg6287)

3.7261099495756977


In [35]:
movie9728 = list(user_for_movie.loc[9728])
avg9728 = get_the_mean_value(movie9728)
print(avg6971)

4.071815611748076


In [36]:
movie8915 = list(user_for_movie.loc[8915])
avg8915 = get_the_mean_value(movie8915)
print(avg8915)

3.9663373050469826


In [37]:
estimate_user_item = avg8915+avg6971+avg6287+avg4640+avg6971
print("From these 5 users, The Average User Rating", estimate_user_item/5)

From these 5 users, The Average User Rating 3.976703474582321


#### Which way of implementing the collaborative filtering approach is best for the given evaluation task?

It would seem like the movie (item of interest) rated by the user is the more objective measure. The reason is that different
users will provide different ratings. At the end, the movie rating is the average of all the users, in this case 5 users. Although one is free to disagree with the average movie ratings, this rating does give an idea of how the users view the movie.
On the other hand, the user that rate a movie (item of interest) may be very bias depending on personal taste, likes, and preference which are all highly subjective matters.

#### Normalized Ratings

In [38]:
matrix_normalization =  movie_for_user - np.asarray([(np.mean(movie_for_user, 1))]).T
matrix_normalization.head()

MovieID,8,28,43,48,61,64,66,92,96,111,...,17654,17660,17689,17693,17706,17725,17728,17734,17741,17742
UniqueID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7,4.777046,3.777046,-0.222954,-0.222954,-0.222954,-0.222954,-0.222954,-0.222954,-0.222954,-0.222954,...,-0.222954,-0.222954,-0.222954,-0.222954,-0.222954,-0.222954,-0.222954,-0.222954,-0.222954,-0.222954
79,-0.16749,-0.16749,-0.16749,-0.16749,-0.16749,-0.16749,-0.16749,-0.16749,-0.16749,-0.16749,...,-0.16749,-0.16749,-0.16749,-0.16749,-0.16749,-0.16749,-0.16749,-0.16749,-0.16749,-0.16749
199,-0.153762,-0.153762,-0.153762,-0.153762,-0.153762,-0.153762,-0.153762,-0.153762,-0.153762,3.846238,...,-0.153762,-0.153762,-0.153762,-0.153762,-0.153762,-0.153762,-0.153762,-0.153762,-0.153762,-0.153762
481,-0.176826,-0.176826,-0.176826,-0.176826,-0.176826,-0.176826,-0.176826,-0.176826,-0.176826,4.823174,...,-0.176826,-0.176826,-0.176826,-0.176826,-0.176826,-0.176826,-0.176826,-0.176826,-0.176826,-0.176826
769,-0.171884,-0.171884,-0.171884,-0.171884,-0.171884,-0.171884,-0.171884,-0.171884,-0.171884,-0.171884,...,-0.171884,-0.171884,-0.171884,-0.171884,-0.171884,-0.171884,-0.171884,-0.171884,-0.171884,-0.171884


## Problem 3: Collaborative Filtering Implementation

#### Getting the data ready...

In [39]:
trainingtable, testtable = train_test_split(trainingdf, test_size = 0.25)
trainarray = trainingtable.pivot_table(index = 'UniqueID', columns = 'MovieID', values = 'Ratings')
trainarray.fillna(0, inplace = True)
testarray = testtable.pivot_table(index = 'UniqueID', columns = 'MovieID', values = 'Ratings')
testarray.fillna(0, inplace = True)
traindimension = trainarray.to_numpy()
ratings_of_the_users_average = np.mean(traindimension, axis = 1)
train_reshape = traindimension - ratings_of_the_users_average.reshape(-1, 1)

#### SVD Stage (stands for Singular Value Decomposition)

In [40]:
U, S, V_Transp = svds(train_reshape)
S = np.diag(S)
users_totality = np.dot(np.dot(U, S), V_Transp) + ratings_of_the_users_average.reshape(-1, 1)

In [41]:
forecastings = pd.DataFrame(users_totality, columns = trainarray.columns, 
                            index= trainarray.index)
forecastings.tail()

MovieID,8,28,43,48,61,64,66,92,96,111,...,17654,17660,17689,17693,17706,17725,17728,17734,17741,17742
UniqueID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2648869,0.522585,0.231504,-0.003525,-0.086089,0.002356,-0.002428,-0.006135,-0.00642,-0.002542,0.476015,...,-0.131968,0.003044,-0.001055,-0.201421,-0.007765,0.301805,0.011801,0.005803,0.023196,0.021788
2648885,0.27705,1.683538,0.021706,0.452267,0.013704,0.013908,0.02138,0.028802,0.033697,0.402557,...,0.221535,0.012956,0.015952,0.309657,0.045311,0.275144,0.018462,0.014191,0.214611,0.011063
2649120,0.349837,0.050659,-0.018566,0.1551,-0.023863,-0.024746,-0.020364,-0.022851,-0.008233,-0.401182,...,-0.083384,-0.024414,-0.023578,-0.092587,-0.008872,0.22483,-0.012888,-0.01868,0.084731,-0.014136
2649267,0.229961,1.397211,0.014349,-0.008613,0.021715,0.018866,0.013281,0.020276,0.01014,1.402492,...,0.148803,0.021408,0.018165,0.124554,0.013146,0.130964,0.018434,0.017598,0.062568,0.030099
2649285,0.156677,2.211554,-0.038692,0.146915,-0.035583,-0.03839,-0.038361,-0.036402,-0.033038,1.081855,...,0.300799,-0.0382,-0.04083,0.360806,-0.033768,-0.004399,-0.041461,-0.040503,0.066474,-0.012897


In [42]:
moviedf = moviedata.toPandas()
print(moviedf.head())

   MovieID  ReleaseYear                         Title
0      1.0       2003.0               Dinosaur Planet
1      2.0       2004.0    Isle of Man TT 2004 Review
2      3.0       1997.0                     Character
3      4.0       1994.0  Paula Abdul's Get Up & Dance
4      5.0       2004.0      The Rise and Fall of ECW


In [43]:
#Very Important and Needed Movie Recommendation Function
def movie_testimony(expected, UniqueIdentifier, moviedata, ratingsinitial, 
                     recomm_count = 10):
    startplace = UniqueIdentifier
    predictionsordered = expected.loc[startplace].sort_values(ascending=False) # UserID starts at 1
    
    customers = ratingsinitial[ratingsinitial.UniqueID == (UniqueIdentifier)]
    consumers = (customers.merge(moviedata, how = 'left', left_on = 'MovieID', right_on = 'MovieID').
                     sort_values(['Ratings'], ascending=False)
                 )

    print ('User {0} has already rated {1} movies.'.format(UniqueIdentifier, consumers.shape[0]))
    print ('Recommending highest {0} predicted ratings movies not already rated.'.format(recomm_count))
    
    # Recommend the highest predicted rating movies that the user hasn't seen yet.
    testimony = (moviedata[~moviedata['MovieID'].isin(consumers['MovieID'])].
         merge(pd.DataFrame(predictionsordered).reset_index(), how = 'left',left_on = 'MovieID',right_on = 'MovieID').
         rename(columns = {startplace: 'Predictions'}).
         sort_values('Predictions', ascending = False).iloc[:recomm_count, :-1])
    return consumers, testimony

In [44]:
#Calling the defined function above
prior_rating, predictions = movie_testimony(forecastings, 738747, moviedf, trainingdf, 10)

User 738747 has already rated 125 movies.
Recommending highest 10 predicted ratings movies not already rated.


In [45]:
#Training Data Mean Absolute error
train_MAE = math.sqrt(mean_squared_error(users_totality, train_reshape))
print ('The Training Data Mean Absolute error is ', train_MAE)

The Training Data Mean Absolute error is  0.6537363004987473


Lets try predicting first for our smaller test set.

In [46]:
tested = testarray.to_numpy()
user_ratings_mean_test = np.mean(tested, axis = 1)
tested_demean = tested - user_ratings_mean_test.reshape(-1, 1)

In [47]:
U, S, V_Transp = svds(tested_demean)
S = np.diag(S)
all_user_predicted_ratings = np.dot(np.dot(U, S), V_Transp) + user_ratings_mean_test.reshape(-1, 1)
math.sqrt(mean_squared_error(all_user_predicted_ratings,tested_demean))

0.4239759839684019

In [48]:
usertest_movie_rating = testdf.pivot_table(index='UniqueID', columns='MovieID', values='Ratings')
usertest_movie_rating.fillna(0, inplace= True)
usertest_movie_rating.head()

MovieID,8,28,43,48,61,64,66,92,96,111,...,17653,17654,17689,17693,17706,17725,17728,17734,17741,17742
UniqueID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
199,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
481,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
769,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
finaltest = usertest_movie_rating.to_numpy()
fusertest_ratings_mean = np.mean(finaltest, axis = 1)
ftest_demean = finaltest - fusertest_ratings_mean.reshape(-1, 1)

In [50]:
U, S, V_Transp = svds(ftest_demean)
S = np.diag(S)
final_all_user_predicted_ratings = np.dot(np.dot(U, S), V_Transp) + fusertest_ratings_mean.reshape(-1, 1)

Evaluate the approach for our overall testing data.

In [51]:
#RMSE of test data
math.sqrt(mean_squared_error(final_all_user_predicted_ratings,ftest_demean))

0.16349229229029635

In [52]:
#MAE of test data
mean_absolute_error(ftest_demean,final_all_user_predicted_ratings)

0.02178685635406887