In [3]:
import pandas as pd
import numpy as np

In [4]:
class MovieLensData:
    """Load the MovieLens movie and rating files into one merged table.

    After construction, ``self.allratings`` is a DataFrame with one row per
    rating and the columns ``movie_id``, ``title``, ``genres``, ``user_id``,
    ``rating`` and ``timestamp``.
    """

    def __init__(self, data_dir='../data/1M', encoding=None):
        """Read ``movies.dat`` and ``ratings.dat`` from ``data_dir`` and merge them.

        Parameters
        ----------
        data_dir : str
            Directory holding the ``::``-separated ``movies.dat`` and
            ``ratings.dat`` files. Defaults to the original hard-coded
            location.
        encoding : str or None
            Passed straight to pandas. ``None`` keeps the pandas default.
            NOTE(review): the 1M files are presumably not UTF-8; try
            ``encoding='latin-1'`` if decoding fails -- confirm against the data.
        """
        movie_cols = ['movie_id', 'title', 'genres']
        # No `usecols` here: the file has exactly the three columns named
        # above, and requesting five positions is rejected by recent pandas.
        movie_data = pd.read_table('{}/movies.dat'.format(data_dir), sep='::',
                                   names=movie_cols, header=None,
                                   engine='python', encoding=encoding)

        rating_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
        rating_data = pd.read_table('{}/ratings.dat'.format(data_dir), sep='::',
                                    names=rating_cols, header=None,
                                    engine='python', encoding=encoding)

        # Name the join key explicitly instead of relying on "all shared columns".
        self.allratings = pd.merge(movie_data, rating_data, on='movie_id')


In [5]:
# Read the movie and rating files from disk and merge them into one table.
movielens = MovieLensData()

In [7]:
# Peek at the first ten merged rows.
movielens.allratings.head(10)

Unnamed: 0,movie_id,title,genres,user_id,rating,timestamp
0,1,Toy Story (1995),Animation|Children's|Comedy,1,5,978824268
1,1,Toy Story (1995),Animation|Children's|Comedy,6,4,978237008
2,1,Toy Story (1995),Animation|Children's|Comedy,8,4,978233496
3,1,Toy Story (1995),Animation|Children's|Comedy,9,5,978225952
4,1,Toy Story (1995),Animation|Children's|Comedy,10,5,978226474
5,1,Toy Story (1995),Animation|Children's|Comedy,18,4,978154768
6,1,Toy Story (1995),Animation|Children's|Comedy,19,5,978555994
7,1,Toy Story (1995),Animation|Children's|Comedy,21,3,978139347
8,1,Toy Story (1995),Animation|Children's|Comedy,23,4,978463614
9,1,Toy Story (1995),Animation|Children's|Comedy,26,3,978130703


In [179]:
class MovieLensHelper:
    """Stateless helpers that reshape the merged MovieLens ratings table.

    All methods are static. Arguments named ``movielens`` must expose an
    ``allratings`` DataFrame with the columns ``title``, ``user_id``,
    ``rating``, ``movie_id``, ``genres`` and ``timestamp``.
    """

    def __init__(self):
        pass

    @staticmethod
    def getRatingCountOfMovies(movielens):
        """Return a Series mapping each title to its number of ratings."""
        return movielens.allratings['title'].value_counts()

    @staticmethod
    def getMoviesFilteredByRatingCount(movielens, rating_greater_than=1):
        """Keep only movies rated more than ``rating_greater_than`` times.

        Returns
        -------
        (movie_titles, ratings)
            ``movie_titles`` holds the distinct surviving titles in order of
            first appearance; ``ratings`` is indexed by ``title`` and keeps
            only the ``user_id`` and ``rating`` columns.
        """
        ratings = movielens.allratings
        counts = ratings['title'].value_counts()
        popular_titles = counts[counts > rating_greater_than].index

        # Filter on the column first, then index by title; this replaces the
        # removed `.ix` boolean lookup and preserves the original row order.
        kept = ratings[ratings['title'].isin(popular_titles)].set_index('title')

        movie_titles = kept.index.unique()
        ratings_slim = kept.drop(['timestamp', 'genres', 'movie_id'], axis=1)

        return movie_titles, ratings_slim

    @staticmethod
    def getAverageRatingsOfUsersByMovieId(moveielens):
        """Return each user's mean rating as a one-column frame.

        The result is indexed by ``user_id`` with a single ``avg_rating``
        column. Despite the method name, the grouping key is the user.

        The parameter keeps its original (misspelled) name so keyword callers
        continue to work; the body now reads from it instead of from a
        notebook-level global.
        """
        ratings = moveielens.allratings
        # Average only the rating column so text columns never reach mean().
        return ratings.groupby('user_id')['rating'].mean().to_frame('avg_rating')

    @staticmethod
    def mergeDataSets(dataset1, dataset2, index_column=''):
        """Merge two frames on their shared columns and re-index the result.

        Both inputs have their index moved back into columns first, so index
        levels take part in the merge. The merge uses pandas' defaults (all
        common column names as keys).

        If ``index_column`` is left empty the merged frame is returned with a
        default integer index rather than failing on a nonexistent '' column.
        """
        merged_dataset = pd.merge(dataset1.reset_index(), dataset2.reset_index())
        if index_column is None or (isinstance(index_column, (str, list, tuple))
                                    and len(index_column) == 0):
            return merged_dataset

        return merged_dataset.set_index(index_column)

    @staticmethod
    def setIndex(dataset, index_columns=[]):
        """Return ``dataset`` re-indexed by ``index_columns``.

        The current index is reset into columns first so that it can be part
        of the new index. With no columns given, the reset frame is returned
        as is. ``index_columns`` is only read, never mutated.
        """
        flat = dataset.reset_index()
        if len(index_columns) == 0:
            return flat

        return flat.set_index(index_columns)

In [180]:
# Number of ratings per title (value_counts lists the most-rated first).
MovieLensHelper.getRatingCountOfMovies(movielens)

American Beauty (1999)                                                   3428
Star Wars: Episode IV - A New Hope (1977)                                2991
Star Wars: Episode V - The Empire Strikes Back (1980)                    2990
Star Wars: Episode VI - Return of the Jedi (1983)                        2883
Jurassic Park (1993)                                                     2672
Saving Private Ryan (1998)                                               2653
Terminator 2: Judgment Day (1991)                                        2649
Matrix, The (1999)                                                       2590
Back to the Future (1985)                                                2583
Silence of the Lambs, The (1991)                                         2578
Men in Black (1997)                                                      2538
Raiders of the Lost Ark (1981)                                           2514
Fargo (1996)                                                    

In [181]:
# Keep only titles with more than 100 ratings, then preview the slimmed table.
movie_titles, ratings_filtered = MovieLensHelper.getMoviesFilteredByRatingCount(
    movielens, rating_greater_than=100
)
ratings_filtered.head(10)

Unnamed: 0_level_0,user_id,rating
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Toy Story (1995),1,5
Toy Story (1995),6,4
Toy Story (1995),8,4
Toy Story (1995),9,5
Toy Story (1995),10,5
Toy Story (1995),18,4
Toy Story (1995),19,5
Toy Story (1995),21,3
Toy Story (1995),23,4
Toy Story (1995),26,3


In [182]:
# Distinct titles that survived the rating-count filter.
movie_titles

array(['Toy Story (1995)', 'Jumanji (1995)', 'Grumpier Old Men (1995)',
       ..., 'Meet the Parents (2000)', 'Requiem for a Dream (2000)',
       'Contender, The (2000)'], dtype=object)

In [183]:
# Mean rating given by each user, indexed by user_id.
avg_user_rating = MovieLensHelper.getAverageRatingsOfUsersByMovieId(movielens)
avg_user_rating.head(10)

Unnamed: 0_level_0,avg_rating
user_id,Unnamed: 1_level_1
1,4.188679
2,3.713178
3,3.901961
4,4.190476
5,3.146465
6,3.901408
7,4.322581
8,3.884892
9,3.735849
10,4.114713


In [184]:
# Attach every user's average rating to each of their individual ratings.
merged_ratings = MovieLensHelper.mergeDataSets(
    ratings_filtered, avg_user_rating, index_column='title'
)
merged_ratings.head(10)

Unnamed: 0_level_0,user_id,rating,avg_rating
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Toy Story (1995),1,5,4.188679
Pocahontas (1995),1,5,4.188679
Apollo 13 (1995),1,5,4.188679
Star Wars: Episode IV - A New Hope (1977),1,4,4.188679
Schindler's List (1993),1,5,4.188679
"Secret Garden, The (1993)",1,4,4.188679
Aladdin (1992),1,4,4.188679
Snow White and the Seven Dwarfs (1937),1,4,4.188679
Beauty and the Beast (1991),1,5,4.188679
Fargo (1996),1,4,4.188679


In [191]:
# Index by (title, user_id) so one movie's ratings can be selected by title.
rs = MovieLensHelper.setIndex(merged_ratings, ['title','user_id'])
# `.loc` replaces the removed `.ix` indexer for this label-based lookup.
rs.loc['Fargo (1996)'].head(10)

Unnamed: 0_level_0,rating,avg_rating
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,4,4.188679
8,5,3.884892
9,4,3.735849
23,4,3.315789
28,4,3.757009
36,5,4.19943
48,4,3.068562
56,5,3.970149
65,3,4.347107
76,5,4.172414


In [190]:
class SimilarityFinder:
    """Item-to-item similarity scores computed from co-rated movies.

    Every method expects ``rs`` to be a DataFrame indexed by
    ``(title, user_id)`` with ``rating`` and ``avg_rating`` columns.
    """

    def __init__(self):
        pass

    @staticmethod
    def _ratingsFor(rs, movie_title):
        """Return one movie's rows with ``user_id`` moved back into a column."""
        return rs.loc[movie_title].reset_index()

    @staticmethod
    def _mergeOnUser(movie1, movie2):
        """Inner-join two per-movie frames on ``user_id`` and drop the key."""
        return pd.merge(movie1, movie2, on='user_id').drop('user_id', axis=1)

    @staticmethod
    def _pearson(data):
        """Pearson correlation of ``rating_x`` and ``rating_y`` in a joined frame."""
        first = data['rating_x'].astype('float64')
        second = data['rating_y'].astype('float64')
        # Correlate just the two columns instead of building a full matrix.
        return first.corr(second, method='pearson')

    @staticmethod
    def _pair(data):
        """Cosine of the user-mean-centred rating vectors, limited to [-1, 1]."""
        centred_x = data['rating_x'] - data['avg_rating_x']
        centred_y = data['rating_y'] - data['avg_rating_y']

        denominator = np.sqrt(centred_x.dot(centred_x) * centred_y.dot(centred_y))
        # No common users, or a movie whose ratings never deviate from the
        # users' averages: the similarity is undefined, so report NaN
        # explicitly instead of relying on 0/0. This also catches NaN input.
        if not denominator > 0:
            return np.nan

        sim = centred_x.dot(centred_y) / denominator
        # Floating-point rounding can overshoot on either side.
        return float(min(1.0, max(-1.0, sim)))

    @staticmethod
    def join(rs, movie1_title, movie2_title):
        """Return the ratings of users who rated both movies.

        Columns of the first movie carry the ``_x`` suffix and those of the
        second the ``_y`` suffix; ``user_id`` itself is dropped.
        """
        return SimilarityFinder._mergeOnUser(
            SimilarityFinder._ratingsFor(rs, movie1_title),
            SimilarityFinder._ratingsFor(rs, movie2_title),
        )

    @staticmethod
    def findPearsonSimilarityValue(rs, movie1_title, movie2_title):
        """Pearson correlation of the two movies' ratings over common raters.

        Returns NaN when the correlation is undefined (for example, when one
        side has no variance).
        """
        return SimilarityFinder._pearson(
            SimilarityFinder.join(rs, movie1_title, movie2_title))

    @staticmethod
    def findPairSimilarityValue(rs, movie1_title, movie2_title, verbose=False):
        """Similarity of two movies after subtracting each rater's average.

        ``verbose`` is accepted for interface compatibility and has no effect.
        Returns a float in [-1, 1], or NaN when the value is undefined.
        """
        return SimilarityFinder._pair(
            SimilarityFinder.join(rs, movie1_title, movie2_title))

    @staticmethod
    def findBestMatches(rs, movie_title, movie_titles, ntop=100, similarity_function_name='pearson'):
        """Rank ``movie_titles`` by similarity to ``movie_title``.

        Parameters
        ----------
        rs : DataFrame indexed by (title, user_id).
        movie_title : title to compare against; skipped if it appears in
            ``movie_titles``.
        movie_titles : iterable of candidate titles.
        ntop : maximum number of results.
        similarity_function_name : ``'pearson'`` or ``'pair'``.

        Returns
        -------
        list of ``(similarity, title)`` tuples, best first. Candidates whose
        similarity is NaN are left out.

        Raises
        ------
        ValueError
            If ``similarity_function_name`` is not a known scorer.
        """
        scorers = {
            'pearson': SimilarityFinder._pearson,
            'pair': SimilarityFinder._pair,
        }
        if similarity_function_name not in scorers:
            raise ValueError(
                "unknown similarity_function_name %r; expected one of %s"
                % (similarity_function_name, sorted(scorers)))
        scorer = scorers[similarity_function_name]

        # The target's ratings do not change between candidates, so extract
        # them once, and only when there is something to compare against.
        target = None
        score = []

        for movie_title2 in movie_titles:
            if movie_title == movie_title2:
                continue
            if target is None:
                target = SimilarityFinder._ratingsFor(rs, movie_title)

            candidate = SimilarityFinder._ratingsFor(rs, movie_title2)
            corr = scorer(SimilarityFinder._mergeOnUser(target, candidate))
            if pd.isnull(corr):
                continue

            score.append((corr, movie_title2))

        score.sort(reverse=True)

        return score[0:ntop]
    

In [138]:
# Pearson correlation of the two movies' ratings over users who rated both.
value = SimilarityFinder.findPearsonSimilarityValue(rs, 'Fargo (1996)', 'Saving Private Ryan (1998)')
value

0.095240647626515379

In [139]:
# Same pair of movies, scored after subtracting each user's average rating.
value = SimilarityFinder.findPairSimilarityValue(rs, 'Fargo (1996)', 'Saving Private Ryan (1998)')
value

0.38085497648191285

In [168]:
time score = SimilarityFinder.findBestMatches(rs, 'Deer Hunter, The (1978)', movie_titles)

Wall time: 1min 15s


In [169]:
# (similarity, title) pairs, most similar first.
score

[(0.68519671766984935, 'Bank Dick, The (1940)'),
 (0.61754996298072617, 'Ninotchka (1939)'),
 (0.58451876847105411, 'Carnival of Souls (1962)'),
 (0.57004087035588757, 'Return of the Fly (1959)'),
 (0.56932079497565791, 'Killer, The (Die xue shuang xiong) (1989)'),
 (0.53851196036005333, 'Chungking Express (1994)'),
 (0.52656766771370478, 'Crucible, The (1996)'),
 (0.52629989901984342, 'Murder, My Sweet (1944)'),
 (0.52556544432949681, "All the King's Men (1949)"),
 (0.51608782605151282, 'Brokedown Palace (1999)'),
 (0.51584122112135078, 'Marty (1955)'),
 (0.51576899775442098, 'Fighting Seabees, The (1944)'),
 (0.50709719633499062, 'Girlfight (2000)'),
 (0.50672844621884661, 'Invisible Man, The (1933)'),
 (0.50664915892853168, "Wes Craven's New Nightmare (1994)"),
 (0.50065652153557283, 'Taking of Pelham One Two Three, The (1974)'),
 (0.49793634606326415, 'True Crime (1995)'),
 (0.4903332116500338, 'Suspicion (1941)'),
 (0.48720657037290921, 'Outside Providence (1999)'),
 (0.4847424699