# Problem 3: Collaborative Filtering Implementation

### Table of content
#### 1) Loading the files
#### 2) User-based Collaborative Filtering :
#####     Pivot 
#####     converting to matrix 
#####     calculating the similarity
#####     calculating rmse 

In [1]:
# Change to the location of data files
dbfs_dir = 's3://archanamaroldsde.bucket/'
test = dbfs_dir + '/TestingRatings.txt'
train = dbfs_dir + '/TrainingRatings.txt'
names= dbfs_dir + '/movie_titles.txt'

In [2]:
test

's3://archanamaroldsde.bucket//TestingRatings.txt'

In [3]:
#MovieID,UserID,Rating

In [4]:
from pyspark.sql.functions import regexp_extract
from pyspark.sql.types import *
import pandas
tests=sqlContext.read.text(test)
trains=sqlContext.read.text(train)
movie_names=sqlContext.read.text(names)

In [5]:
import pyspark.sql.functions as f
test_data = tests.select(f.split(tests.value,",")).rdd.flatMap(lambda x: x).toDF(schema=["movieID","userID", "rating"])
train_data = trains.select(f.split(trains.value,",")).rdd.flatMap(lambda x: x).toDF(schema=["movieID","userID", "rating"])
movie_name = movie_names.select(f.split(movie_names.value,",")).rdd.flatMap(lambda x: x).toDF(schema=["movieID","year", "title"])

In [6]:
#converting types
test_data = test_data.withColumn("movieID",test_data["movieID"].cast(IntegerType()))
test_data = test_data.withColumn("userID",test_data["userID"].cast(IntegerType()))
test_data = test_data.withColumn("rating",test_data["rating"].cast(FloatType()))

train_data = train_data.withColumn("movieID",train_data["movieID"].cast(IntegerType()))
train_data = train_data.withColumn("userID",train_data["userID"].cast(IntegerType()))
train_data = train_data.withColumn("rating",train_data["rating"].cast(FloatType()))

movie_name = movie_name.withColumn("movieID",movie_name["movieID"].cast(IntegerType()))
movie_name = movie_name.withColumn("year",movie_name["year"].cast(IntegerType()))
movie_name = movie_name.withColumn("title",movie_name["title"].cast(StringType()))

In [7]:
print(test_data.show(3))
print(train_data.show(3))
print(movie_name.show(3))

+-------+-------+------+
|movieID| userID|rating|
+-------+-------+------+
|      8| 573364|   1.0|
|      8|2149668|   3.0|
|      8|1089184|   3.0|
+-------+-------+------+
only showing top 3 rows

None
+-------+-------+------+
|movieID| userID|rating|
+-------+-------+------+
|      8|1744889|   1.0|
|      8|1395430|   2.0|
|      8|1205593|   4.0|
+-------+-------+------+
only showing top 3 rows

None
+-------+----+--------------------+
|movieID|year|               title|
+-------+----+--------------------+
|      1|2003|     Dinosaur Planet|
|      2|2004|Isle of Man TT 20...|
|      3|1997|           Character|
+-------+----+--------------------+
only showing top 3 rows

None


In [8]:
test_data.cache()
train_data.cache()
assert test_data.is_cached
assert train_data.is_cached

In [9]:
train_data_count = train_data.count()
test_data_count = test_data.count()
print('There are %s rows in the train datasets' % (train_data_count))
print('There are %s rows in the test datasets' % (test_data_count))

There are 3255352 rows in the train datasets
There are 100478 rows in the test datasets


## User-based Collaborative Filtering 

In [None]:
import sklearn
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics import mean_squared_error
import math
import pandas as pd
from math import sqrt
from sklearn.metrics.pairwise import pairwise_distances

#### Pivoting the data for user-user filtering

In [79]:
test_data_matrix=test_data.pivot(index='userID', columns='movieID', values='rating')
test_data_matrix1=test_data_matrix.fillna(0)


#### Converting the data into a sparse matrix

In [82]:
test_data_matrix2=test_data_matrix1.values
test_data_matrix2

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

#### pivoting and converting to matrix for train dataset

In [81]:
train_data_matrix=train_data.pivot(index='userID', columns='movieID', values='rating')
train_data_matrix1=train_data_matrix.fillna(0)
train_data_matrix2=train_data_matrix1.values

In [83]:
train_data_matrix2

array([[ 5.,  4.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  4.,  0., ...,  0.,  0.,  0.]])

#### calculating the similiarity using correlation metric 
#### User-similarity and Item-similarity

In [84]:
user_similarity1=pairwise_distances(train_data_matrix2, metric='correlation')

In [85]:
item_similarity1 = pairwise_distances(train_data_matrix2.T, metric='correlation')

In [106]:
def predict(rating, similarity, type1):
    if type1=='user':
        mean_user_rating=rating.mean(axis=1)
        #print('rating',rating.head(2))
        #print(mean_user_rating.shape)
        rating_diff=(rating-mean_user_rating[:, np.newaxis])
        #print(rating_diff.shape)
        pred=mean_user_rating[:,np.newaxis] + similarity.dot(rating_diff)/np.array([np.abs(similarity).sum(axis=1)]).T
    elif type1 == 'item':
        pred = rating.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [107]:
user_pred=predict(train_data_matrix2, user_similarity1, type1='user')


In [108]:
item_pred = predict(train_data_matrix2, item_similarity1, type1='item')

In [110]:
print(user_pred)
#print(item_pred)

[[ 0.30873919  1.55873792  0.01425974 ...,  0.01367812  0.13005357
   0.03352018]
 [ 0.26556504  1.4820479  -0.04185526 ..., -0.04224655  0.07490007
  -0.02278229]
 [ 0.24052187  1.55577568 -0.05571908 ..., -0.05610593  0.0554129
  -0.03475516]
 ..., 
 [ 0.2393442   1.59472163 -0.0460224  ..., -0.04676259  0.06388082
  -0.02584937]
 [ 0.26144077  1.49979728 -0.03859018 ..., -0.03912957  0.07507935
  -0.0195167 ]
 [ 0.26647638  1.44693597 -0.03846407 ..., -0.03871254  0.07485795
  -0.01942068]]


#### RMSE and MSE

In [111]:
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth)), mean_squared_error(prediction, ground_truth)

In [92]:
rmse,mse=rmse(user_pred, test_data_matrix2)
print('User-based MSE : ' ,mse)
print('User-based RMSE : ' ,rmse)

User-based MSE :  12.0061213538
User-based RMSE :  3.4649850438084355


In [113]:
rmse1,mse1=rmse(item_pred, test_data_matrix2)
print('Item-based MSE : ' ,mse1)
print('Item-based RMSE : ',rmse1)

Item-based MSE :  11.8230753587
Item-based RMSE :  3.438469915343497
