# Problem 2: Analyzing the Netflix Data

In [1]:
# Change to the location of data files
dbfs_dir = 's3://archanamaroldsde.bucket/'
books = dbfs_dir + '/amazon_Books.csv'


In [2]:
books

's3://archanamaroldsde.bucket//amazon_Books.csv'

In [3]:
from pyspark.sql.types import *

book_df_schema = StructType(
  [StructField('userId', StringType()),
   StructField('bookId', StringType()),
   StructField('rating', DoubleType())]
)

In [4]:
from pyspark.sql.functions import regexp_extract
from pyspark.sql.types import *

book_df = sqlContext.read.format('csv').options(header=True, inferSchema=False).schema(book_df_schema).load(books)
book_df.cache()


DataFrame[userId: string, bookId: string, rating: double]

In [5]:
book_df.show(5)

+--------------+------+------+
|        userId|bookId|rating|
+--------------+------+------+
|A2IIIDRK3PRRZY|   116|   1.0|
|A1TADCM7YWPQ8M|   868|   4.0|
| AWGH7V0BDOJKB| 13714|   4.0|
|A3UTQPQPM4TQO0| 13714|   5.0|
| A8ZS0I5L5V31B| 13714|   5.0|
+--------------+------+------+
only showing top 5 rows



In [48]:
book_df.count()

1048575

In [49]:
1048575*0.8

838860.0

In [6]:
train, test =book_df.randomSplit([0.8, 0.2])

In [51]:
test.count()

209976

In [7]:
train.show(3)

+--------------------+----------+------+
|              userId|    bookId|rating|
+--------------------+----------+------+
|A00039763E5V43M02...|000727405X|   4.0|
|A00369122K166QHZJ...|   7124015|   5.0|
|A00652961QVJY2CGH...|   2007770|   4.0|
+--------------------+----------+------+
only showing top 3 rows



## User-based Collaborative Filtering 

The process for creating a User Based recommendation system is as follows:
1.	First select a user with the movies the user has watched

2.	Based on his rating to movies, find the top X neighbours/ similar users

3.	Get the watched movie record of the user for each neighbour.

4.	Calculate a similarity score using the formula

5.	Recommend the items with the highest score



In [8]:
#test_data1=test_data.toPandas()
test_data_matrix=test1.pivot(index='userId', columns='bookId', values='rating')
test_data_matrix1=test_data_matrix.fillna(0)
test_data_matrix2=test_data_matrix1.values

In [9]:
test_data_matrix2

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [10]:
train_data_matrix=train1.pivot(index='userId', columns='bookId', values='rating')
train_data_matrix1=train_data_matrix.fillna(0)
train_data_matrix2=train_data_matrix1.values

In [11]:
train_data_matrix2

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [12]:
user_similarity1=pairwise_distances(train_data_matrix2, metric='correlation')

In [13]:
item_similarity1 = pairwise_distances(train_data_matrix2.T, metric='correlation')

In [14]:
def predict(rating, similarity, type1):
    if type1=='user':
        mean_user_rating=rating.mean(axis=1)
        #print('rating',rating.head(2))
        #print(mean_user_rating.shape)
        rating_diff=(rating-mean_user_rating[:, np.newaxis])
        #print(rating_diff.shape)
        pred=mean_user_rating[:,np.newaxis] + similarity.dot(rating_diff)/np.array([np.abs(similarity).sum(axis=1)]).T
    elif type1 == 'item':
        pred = rating.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [18]:
user_pred=predict(train_data_matrix2, user_similarity1, type1='user')


In [19]:
item_pred = predict(train_data_matrix2, item_similarity1, type1='item')

In [26]:
#print(user_pred)
print(item_pred)

[[ 0.00095057  0.00095034  0.00095034 ...,  0.00095034  0.00095034
   0.00095034]
 [ 0.00118757  0.00118761  0.00118761 ...,  0.00118761  0.00118761
   0.00118761]
 [ 0.00118769  0.00118767  0.00118767 ...,  0.00118767  0.00118767
   0.00118767]
 ..., 
 [ 0.00118994  0.0011888   0.0011888  ...,  0.0011888   0.0011888
   0.0011888 ]
 [ 0.00118757  0.00118761  0.00118761 ...,  0.00118761  0.00118761
   0.00118761]
 [ 0.00118769  0.00118767  0.00118767 ...,  0.00118767  0.00118767
   0.00118767]]


In [27]:
import math
from math import sqrt
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth)), mean_squared_error(prediction, ground_truth)

In [22]:
rmse,mse=rmse(user_pred, test_data_matrix2)
print('User-based MSE : ' ,mse)
print('User-based RMSE : ' ,rmse)

User-based MSE :  19.2464876858
User-based RMSE :  4.387081910084406


In [30]:
rmse1,mse1=rmse(item_pred, test_data_matrix2)
print('Item-based MSE : ' ,mse1)
print('Item-based RMSE : ',rmse1)

Item-based MSE :  18.564849
Item-based RMSE : 4.30869458188904
