# Tutorial: Taming Big Data With Apache Spark and Python - Hands On!
## Exercise 8 - Movie Similarity (Collaborative Filtering)

### Setup

FindSpark

This will circumvent many issues with your system finding spark

In [1]:
import findspark
findspark.init('c:/users/andy/spark')

Load Libraries

In [2]:
import sys
from pyspark import SparkConf, SparkContext
from math import sqrt

Set the file path

In [3]:
data_folder = "C:/Users/Andy/Dropbox/FactoryFloor/Repositories/Tutorial_Udemy_SparkPython/Course_Resources/ml-100k/"

Create the Spark Context

In [4]:
# configure your Spark context; master node is local machine
conf = SparkConf().setMaster("local[*]").setAppName("MovieSimilarities")

# create a spark context object
sc = SparkContext(conf = conf)

### Define Functions

In [5]:
def loadMovieNames():
    movieNames = {} # create a dict
    file_to_open = data_folder + "u.ITEM" #file path
    with open(file_to_open, encoding = 'ascii', errors = 'ignore') as f: # open file
        for line in f:
            fields = line.split('|') # break the lines
            movieNames[int(fields[0])] = fields[1] # create key-value
    return movieNames

In [6]:
def filterDuplicates(userRatings):
    ratings = userRatings[1] # the value ((movieID, rating), (movieID, rating))
    (movie1, rating1) = ratings[0] 
    (movie2, rating2) = ratings[1]
    return movie1 < movie2 # return only those entries where movieID 2 is greater than movieID 1

In [7]:
#Python 3 doesn't let you pass arond unpacked tuples,
# so we explicitly extract the ratings now.
def makePairs(userRatings):
    ratings = userRatings[1] # the value ((movieID, rating), (movieID, rating))
    (movie1, rating1) = ratings[0]
    (movie2, rating2) = ratings[1]
    return ((movie1, movie2), (rating1, rating2)) #format so its pair of movies and pair of ratings

In [8]:
def computeCosineSimilarity(ratingPairs):
    numPairs = 0
    sum_xx = sum_yy = sum_xy = 0
    for ratingX, ratingY in ratingPairs:
        sum_xx += ratingX * ratingX
        sum_yy += ratingY * ratingY
        sum_xy += ratingX * ratingY
        numPairs += 1
    
    numerator = sum_xy
    denominator = sqrt(sum_xx) * sqrt(sum_yy)
    
    score = 0
    if (denominator):
        score = (numerator / (float(denominator)))
        
    return (score, numPairs)

### The Program

Create a dictionary with movieID and movieNames.

In [10]:
print("\nLoading movie names...")
nameDict = loadMovieNames()


Loading movie names...


Bring in the movie ratings data.

In [11]:
data = sc.textFile(data_folder + "u.data")

For each entry in 'data' split into elements. For each entry return the first, second and third elements as int (0,1) and float(2). Map ratings to key / value pairs: user ID => movie ID, rating.

In [12]:
ratings = data.map(lambda l: l.split()).map(lambda l: (int(l[0]), (int(l[1]), float(l[2]))))

Emit every movie rated together by the same user. Self-join to find every combination.

In [13]:
joinedRatings = ratings.join(ratings)

The RDD consists of userID => ((movieID, rating), (movieID, rating))

Filter out duplicate pairs

In [14]:
uniqueJoinedRatings = joinedRatings.filter(filterDuplicates)

Now key by (movie1, movie2) pairs.

In [15]:
moviePairs = uniqueJoinedRatings.map(makePairs)

We now have (movie1, movie2) => (rating1, rating2). Collect all ratings for each movie pair.

In [16]:
moviePairRatings = moviePairs.groupByKey()

We now have (movie1, movie2) => (rating1, rating2), (rating1, rating2) ... We can now compute similarities.

In [18]:
moviePairSimilarities = moviePairRatings.mapValues(computeCosineSimilarity).cache()

# Save the results if desired
#moviePairSimilarities.sortByKey()
#moviePairSimilarities.saveAsTextFile("movie-sims")

Extract similarities for the movie we care about that are "good".

In [19]:
if (len(sys.argv) > 1):
    
    scoreThreshold = 0.97
    coOccurenceThreshold = 50
    
    movieID = int(sys.argv[1])
    
    # Filter for movies with this sim that are "good" as defined by 
    # our quality thresholds above
    filteredResults = moviePairSimilarities.filter(lambda pairSim: \
                                                  (pairSim[0][0] == movieID or pairSim[0][1] == movieID) \
                                                  and pairSim[1][0] > scoreThreshold and pairsim[1][1] > coOccurenceThreshold)
    
    # Sort by quality score.
    results = filteredResults.map(lambda pairSim: (pairSim[1], pairSim[0])).sortByKey(ascending= False).take(10)
    
    print("Top 10 similar movies for " + nameDict[movieID])
    for result in results:
        (sim, pair) = result
        # Display the similarity result that isn't the movie we're looking at
        similarMovieID = pair[0]
        if (similarMovieID == movieID):
            similarMovieID == pair[1]
        print(nameDict[similarMovieID] + "\t score: " + str(sim[0]) + "\t strength: " + str(sim[1]))

ValueError: invalid literal for int() with base 10: '-f'