# Movie Recommender

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

## Import the data

### Find one user or aggregate many users  
    1) find the most similar user or aggregate user
    2) identify that user's highly rated movies

### Metrics to judge similarity
        - based on aggregate rating of a movie
        - based on what rating the user gave a movie
        - based on what genres they watched the most of
        - based on how frequently they rate movies
        - something with tags on the movie? Sentiment analysis?
        - scrape data from imdb for a critic's review
        - timestamps?

### Misc  
    - User liked a movie = T/F if their rating > aggregate movie rating
    - 

In [1]:
# Set up the SparkContext in local mode for the "Movie Recommender" app.
from pyspark import SparkConf, SparkContext
conf = SparkConf().setMaster("local[*]").setAppName("Movie Recommender")
# getOrCreate returns the already-running context if there is one, so re-running
# this cell no longer fails with "Cannot run multiple SparkContexts at once".
# Note: when a context already exists, `conf` is not re-applied to it.
sc = SparkContext.getOrCreate(conf=conf)

In [42]:
# Read the ratings CSV as lines of text and split every line on commas,
# giving one tuple of string fields per line (the header line is still included here).
ratingsRDD = sc.textFile("Data/ratings.csv").map(lambda line: tuple(line.split(',')))

In [43]:
# Drop the header row: skip the first record of partition 0 and pass every
# other partition through untouched.
from itertools import islice
ratingsRDD = ratingsRDD.mapPartitionsWithIndex(
    lambda part_idx, rows: rows if part_idx != 0 else islice(rows, 1, None)
)

In [44]:
# Cast the string fields to numbers: fields 0, 1 and 3 become int, field 2 becomes float.
ratingsRDD = ratingsRDD.map(
    lambda fields: (int(fields[0]), int(fields[1]), float(fields[2]), int(fields[3]))
)

In [45]:
# Preview the first five typed records to sanity-check the parsing and casting steps.
ratingsRDD.take(5)

[(1, 31, 2.5, 1260759144),
 (1, 1029, 3.0, 1260759179),
 (1, 1061, 3.0, 1260759182),
 (1, 1129, 2.0, 1260759185),
 (1, 1172, 4.0, 1260759205)]

In [46]:
# Build (movieid, rating) pairs from fields 1 and 2 of each rating record.
ratingsbymovieid = ratingsRDD.map(lambda record: (record[1], record[2]))

In [47]:
# Preview the first five (movieid, rating) pairs.
ratingsbymovieid.take(5)

[(31, 2.5), (1029, 3.0), (1061, 3.0), (1129, 2.0), (1172, 4.0)]

In [48]:
# Per movie, accumulate (sum of ratings, number of ratings):
#   1. pair every rating with a count of 1
#   2. add the pairs element-wise for each movieid
averagemovierating = (
    ratingsbymovieid
    .mapValues(lambda rating: (rating, 1))
    .reduceByKey(lambda left, right: (left[0] + right[0], left[1] + right[1]))
)

In [49]:
# Reduce the previous RDD from (movieid, (sumofratings, numberofratings))
# to (movieid, averagerating).
# mapValues only touches the value, so the movieid key is passed through unchanged and
# the partitioning produced by reduceByKey is retained (plain map would discard it).
averagemovierating = averagemovierating.mapValues(lambda totals: totals[0] / totals[1])

In [50]:
# Convert to key-value pairs: (movieid, full rating record), keyed on field 1.
ratingsRDD = ratingsRDD.keyBy(lambda record: record[1])

In [51]:
# Join every keyed rating record with the average movie rating on the movieid key.
# Each element becomes (movieid, (rating_record, averagerating)); the next cell
# flattens that nested value.
ratingsRDD = ratingsRDD.join(averagemovierating)

In [53]:
# Flatten the join output from (movieid, (rating_record, averagerating))
# to (movieid, (userid, movieid, rating, timestamp, averagerating)).
# mapValues leaves the movieid key alone and keeps the partitioning created by the join;
# tuple concatenation replaces the manual field-by-field re-indexing.
ratingsRDD = ratingsRDD.mapValues(lambda joined: joined[0] + (joined[1],))

In [54]:
# Append a "liked" flag to every record: True when the user's rating (index 2) is
# strictly greater than the movie's average rating (index 4), otherwise False.
# mapValues leaves the movieid key alone and keeps the existing partitioning.
ratingsRDD = ratingsRDD.mapValues(lambda record: record + (record[2] > record[4],))

In [55]:
# Preview five records with the average rating and the "liked" flag appended.
ratingsRDD.take(5)

[(1172, (1, 1172, 4.0, 1260759205, 4.260869565217392, False)),
 (1172, (23, 1172, 5.0, 1148670101, 4.260869565217392, True)),
 (1172, (38, 1172, 4.5, 1389867840, 4.260869565217392, True)),
 (1172, (56, 1172, 2.0, 1470350810, 4.260869565217392, False)),
 (1172, (94, 1172, 3.5, 1291781459, 4.260869565217392, False))]

In [10]:
def rec_func(data):
    """Placeholder for the recommendation routine; not implemented yet.

    Args:
        data: Currently unused.

    Returns:
        None.
    """
    return None