In [1]:
import findspark
findspark.init()

import sys
from pyspark import SparkContext, SparkConf
from scipy import spatial
import numpy as np

In [2]:
# use a function to load the movie names corresponding to the movie IDs
def loadMovieNames(path='ml-100k/u.item'):
    """Build a dictionary mapping each movie ID to its title.

    The file is expected to be pipe-delimited, with the integer movie ID in
    the first field of every line and the movie title in the second field.

    Args:
        path: Location of the item file. Defaults to 'u.item' inside the
            'ml-100k' folder, relative to the working directory. Forward
            slashes are used so the same path works on every platform.

    Returns:
        dict: int movie ID -> title, with any non-ASCII bytes dropped from
        the title.
    """
    movieNames = {}
    # Read raw bytes and decode explicitly: this behaves the same on Python 2
    # and Python 3 and never fails on titles containing non-ASCII characters.
    with open(path, 'rb') as item_file:
        for line in item_file:
            # A blank line (e.g. at the end of the file) carries no record.
            if not line.strip():
                continue
            fields = line.split(b'|')
            movieNames[int(fields[0])] = fields[1].decode('ascii', 'ignore')

    return movieNames

In [3]:
# Helper functions that we will apply to the various RDDs we create along the way. The function names are self-explanatory.
def filterDuplicates(user_pairs):
    """Tell whether a joined record holds a movie pair in ascending ID order.

    Used as a filter predicate: a record is kept only when the first movie ID
    is strictly lower than the second, which drops both the self-pairs
    (same movie twice) and the mirrored copy of every pair.

    Args:
        user_pairs: A (userid, ((movie1, rating1), (movie2, rating2))) tuple.

    Returns:
        bool: True when movie1 < movie2, otherwise False.
    """
    # Unpack inside the body: tuple parameters in the signature are not
    # valid syntax on Python 3, while this form works on both versions.
    _userid, pairs = user_pairs
    movie1 = pairs[0][0]
    movie2 = pairs[1][0]
    return movie1 < movie2
            
    
def similarityFunc(list_of_ratings):
    """Score a movie pair from the rating pairs collected for it.

    Args:
        list_of_ratings: Sequence of (rating1, rating2) pairs; the first
            elements form one vector and the second elements form the other.

    Returns:
        tuple: (cosine similarity of the two vectors as a float,
        number of rating pairs the score is based on).
    """
    ratings = np.array(list_of_ratings)
    first_movie = ratings[:, 0]
    second_movie = ratings[:, 1]

    # scipy reports the cosine *distance*; the similarity is its complement.
    similarity = 1 - spatial.distance.cosine(first_movie, second_movie)

    return (float(similarity), len(list_of_ratings))
 
def makeItemPairs(user_pairs):
    """Re-key a joined record by its movie pair instead of by user.

    Args:
        user_pairs: A (userid, ((movie1, rating1), (movie2, rating2))) tuple.

    Returns:
        tuple: ((movie1, movie2), (rating1, rating2)); the user ID is dropped.
    """
    # Unpack inside the body: tuple parameters in the signature are not
    # valid syntax on Python 3, while this form works on both versions.
    _userid, pairs = user_pairs
    (movie1, rating1) = pairs[0]
    (movie2, rating2) = pairs[1]

    return ((movie1, movie2), (rating1, rating2))

In [4]:
# The dataset has 100,000 records, so we use all the cores available on the machine
# by setting the master to 'local[*]'.
conf = SparkConf().setMaster('local[*]').setAppName('MovieSimi_ver1')
# getOrCreate reuses a context that is already running, so this cell can be re-run
# without failing because a second SparkContext cannot be started.
sc = SparkContext.getOrCreate(conf = conf)

# Load the movie names dataset into a large dictionary called names.
names = loadMovieNames()

# Forward slashes keep the path usable on every platform.
data = sc.textFile('ml-100k/u.data')

# Map the ratings to the form: userid as key and (movieid, rating) as value.
# Each input line is tab-separated; only its first three fields are used.
user_ratings = data.map(lambda x: x.split('\t')).map(lambda parts: (int(parts[0]), (int(parts[1]), float(parts[2]))))

# Joining the ratings with themselves on userid pairs up every two movies rated by the same user.
# Each record has the form: userid as key and ((movieid1, rating1), (movieid2, rating2)) as value.
movie_ratings = user_ratings.join(user_ratings)

# Remove duplicates: the self-join also pairs each movie with itself and produces every
# movie pair in both orders. We only keep the records where the first id is less than the second id.
movie_ratings_clean = movie_ratings.filter(filterDuplicates)

# The records are still keyed by userid; re-key them so that
# (movieid1, movieid2) is the key and (rating1, rating2) is the value.
movie_pairs = movie_ratings_clean.map(makeItemPairs)

# Group by movie pair to collect all available rating pairs for each unique movie pair.
# mapValues(list) materialises each group as a list and leaves the key untouched.
movie_pairs_group = movie_pairs.groupByKey().mapValues(list)

# For each movie pair, the value is now the list of rating pairs collected from all users.
# We treat the first and second ratings as two vectors and compute the similarity of those vectors.
movie_pair_simi = movie_pairs_group.mapValues(similarityFunc).persist() # we cache this rdd as it will be used later

# Each record is now of the form (movieid1, movieid2) as key and (similarity, number of rating pairs) as value.
print(movie_pair_simi.take(10))

[((197, 1097), (0.9758729093599599, 7)), ((42, 364), (0.9093486560398836, 18)), ((773, 1409), (1.0, 1)), ((273, 617), (0.9652953599007105, 7)), ((372, 974), (1.0, 1)), ((789, 865), (0.9897475249773018, 3)), ((496, 1314), (0.976416832356179, 4)), ((246, 1008), (0.9449324793284705, 18)), ((856, 1006), (0.9686648999069224, 10)), ((747, 795), (0.8172062695283986, 6))]


In [5]:
# Now we get the movies most similar to the movie id provided by the user.
# Let's choose movie ID 50, which is a Star Wars movie.

user_choice = 50
num_of_reco = 20 # number of similar movie suggestions to be sent to the output

sim_thresh = 0.95 # a suggestion must be similar to the chosen movie by at least this amount
num_of_ratings = 50 # a suggestion must be backed by at least this number of rating pairs

# Keep the movie pairs that contain the chosen movie and satisfy both thresholds.
# Each record is ((movieid1, movieid2), (similarity, number of rating pairs)), so it is
# indexed explicitly: tuple parameters in a lambda are not valid syntax on Python 3.
results = movie_pair_simi.filter(
    lambda record: (user_choice in record[0])
    and (record[1][0] >= sim_thresh)
    and (record[1][1] >= num_of_ratings)
)

# Swap key and value so the (similarity, count) tuple becomes the sort key, then take the best matches.
results_final = results.map(lambda record: (record[1], record[0])).sortByKey(ascending = False).take(num_of_reco)


print('Top {} similar movies for the movie:  {}\n'.format(num_of_reco, names[user_choice]))
for val in results_final:
    ((sim, num), pair) = val
    # The chosen movie can sit on either side of the pair; report the other one.
    similarMovies = pair[0]
    if similarMovies == user_choice:
        similarMovies = pair[1]

    print(names[similarMovies] + ' with score: {:.2} from {} ratings'.format(sim, num))

Top 20 similar movies for the movie:  Star Wars (1977)

Empire Strikes Back, The (1980) with score: 0.99 from 345 ratings
Return of the Jedi (1983) with score: 0.99 from 480 ratings
Raiders of the Lost Ark (1981) with score: 0.98 from 380 ratings
20,000 Leagues Under the Sea (1954) with score: 0.98 from 68 ratings
12 Angry Men (1957) with score: 0.98 from 109 ratings
Close Shave, A (1995) with score: 0.98 from 92 ratings
African Queen, The (1951) with score: 0.98 from 138 ratings
Sting, The (1973) with score: 0.98 from 204 ratings
Wrong Trousers, The (1993) with score: 0.97 from 103 ratings
Wallace & Gromit: The Best of Aardman Animation (1996) with score: 0.97 from 58 ratings
Indiana Jones and the Last Crusade (1989) with score: 0.97 from 304 ratings
North by Northwest (1959) with score: 0.97 from 156 ratings
Philadelphia Story, The (1940) with score: 0.97 from 87 ratings
Bridge on the River Kwai, The (1957) with score: 0.97 from 145 ratings
Casablanca (1942) with score: 0.97 from 214

In [10]:
# We do see some rather irrelevant movies, like "Usual Suspects" and "Glory", which are not science fiction and are definitely
# not related to space and technology. In the next part we will try to improve the current algorithm by incorporating more
# information.