In [1]:
import traceback

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation

In [2]:
# Load the three CSV tables the rest of the notebook works from.
links = pd.read_csv('./ml-latest-small/links.csv')
ratings = pd.read_csv('./ml-latest-small/ratings.csv')
# Attach the link columns to every movie row (default merge: inner join on
# the columns the two frames have in common).
movies = pd.read_csv('./ml-latest-small/movies.csv').merge(links)
# One row per (movie, rating) pair, again joined on the shared columns.
movie_ratings = movies.merge(ratings)

In [3]:
# Average rating per movie. The result has two-level columns, so the means are
# addressed as movie_stats['rating']['mean'].
# The aggregation is named by string: handing the np.mean callable to
# groupby().agg() is deprecated in recent pandas releases, while 'mean'
# produces the same column labels without the warning.
movie_stats = movie_ratings.groupby('movieId').agg({'rating': ['mean']})
movie_stats.head(5)

Unnamed: 0_level_0,rating
Unnamed: 0_level_1,mean
movieId,Unnamed: 1_level_2
1,3.87247
2,3.401869
3,3.161017
4,2.384615
5,3.267857


In [4]:
# Keep only the ratings whose movie also appears in `links` (inner join on the
# shared columns); the link columns ride along with each rating.
ratings = ratings.merge(links)
# Movie-by-user grid of ratings. The movieId index is dropped, so rows are
# addressed purely by position from here on; cells without a rating become 0.
ratings_matrix = (
    ratings
    .pivot_table(values='rating', index=['movieId'], columns=['userId'])
    .reset_index(drop=True)
    .fillna(0)
)
ratings_matrix.head()

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,4.0,0.0,...,0.0,4.0,3.5,0.0,0.0,0.0,0.0,0.0,4.0,5.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Cosine similarity between every pair of movie rating vectors
# (similarity = 1 - cosine distance).
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
movie_similarity = 1 - pairwise_distances(ratings_matrix.values, metric="cosine")
# Zero the diagonal so a movie is never ranked as its own closest neighbour
# when the rows are sorted later.
np.fill_diagonal(movie_similarity, 0)
# NOTE(review): from this point `ratings_matrix` holds the movie-to-movie
# similarity scores, not the movie-by-user ratings. Later cells rely on that
# name, so it is kept; re-running this cell alone would compute similarities
# of similarities.
ratings_matrix = pd.DataFrame(movie_similarity)
ratings_matrix.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9056,9057,9058,9059,9060,9061,9062,9063,9064,9065
0,0.0,0.394511,0.306516,0.133614,0.245102,0.377086,0.278629,0.063031,0.117499,0.310689,...,0.055829,0.031902,0.079755,0.079755,0.079755,0.079755,0.079755,0.0,0.0,0.055829
1,0.394511,0.0,0.217492,0.164651,0.278476,0.222003,0.207299,0.223524,0.113669,0.418124,...,0.0,0.055038,0.068797,0.082557,0.082557,0.137594,0.068797,0.0,0.0,0.0
2,0.306516,0.217492,0.0,0.177012,0.370732,0.247499,0.435648,0.127574,0.306717,0.191255,...,0.0,0.0,0.0,0.116226,0.116226,0.0,0.0,0.0,0.0,0.0
3,0.133614,0.164651,0.177012,0.0,0.179556,0.072518,0.184626,0.501513,0.25463,0.111447,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.245102,0.278476,0.370732,0.179556,0.0,0.272645,0.388476,0.194113,0.367941,0.246846,...,0.0,0.176845,0.0,0.117897,0.117897,0.0,0.0,0.0,0.0,0.0


In [6]:
# Plain-text dump of the last five rows of the similarity frame.
print(ratings_matrix.tail(n=5))

          0         1     2     3     4         5         6     7     8     \
9061  0.079755  0.137594   0.0   0.0   0.0  0.000000  0.000000   0.0   0.0   
9062  0.079755  0.068797   0.0   0.0   0.0  0.111103  0.000000   0.0   0.0   
9063  0.000000  0.000000   0.0   0.0   0.0  0.000000  0.000000   0.0   0.0   
9064  0.000000  0.000000   0.0   0.0   0.0  0.000000  0.000000   0.0   0.0   
9065  0.055829  0.000000   0.0   0.0   0.0  0.061724  0.079399   0.0   0.0   

          9     ...   9056  9057  9058  9059  9060  9061  9062  9063  9064  \
9061  0.000000  ...    0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   
9062  0.076835  ...    0.0   0.0   1.0   0.0   0.0   0.0   0.0   0.0   0.0   
9063  0.000000  ...    0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   1.0   
9064  0.000000  ...    0.0   0.0   0.0   0.0   0.0   0.0   0.0   1.0   0.0   
9065  0.000000  ...    1.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   

      9065  
9061   0.0  
9062   0.0  
9063   0.0  
9064   0.0

In [7]:
from pymongo import MongoClient

# Connect to the MongoDB server on localhost and select the collection that
# the movie documents are written to.
client = MongoClient(host='localhost', port=27017)
db = client['Movielens']
collection = db['movies']

In [13]:
# Build one MongoDB document per movie: its id, title, mean rating and the ids
# of its ten most similar movies.
#
# Row i of the similarity frame describes the i-th *rated* movie in ascending
# movieId order (the row order pivot_table produced before the movieId index
# was dropped). `movies` has its own row numbering, which only matches while
# every listed movie has a rating, so the matrix is addressed through an
# explicit movieId -> row mapping instead of through `movies` row numbers.
rated_movie_ids = np.sort(ratings.dropna(subset=['rating'])['movieId'].unique()).tolist()
if len(rated_movie_ids) != len(ratings_matrix):
    raise ValueError(
        "similarity matrix has %d rows but %d rated movies were found; "
        "re-run the cells that build ratings_matrix"
        % (len(ratings_matrix), len(rated_movie_ids))
    )
row_of_movie = {movie_id: row for row, movie_id in enumerate(rated_movie_ids)}

# new_id of the movie behind each matrix row (None if the movie is not listed
# in `movies`).
new_id_of_movie = dict(zip(movies['movieId'], movies['new_id']))
new_id_by_row = [new_id_of_movie.get(movie_id) for movie_id in rated_movie_ids]

similarity_values = ratings_matrix.values
mean_by_movie = movie_stats['rating']['mean']

try:
    for movieId, title, new_id in zip(movies['movieId'], movies['title'], movies['new_id']):
        # Movies without any rating have no matrix row and keep an empty list.
        similarity = []
        row = row_of_movie.get(movieId)
        if row is not None:
            # Ten highest scores; the stable sort keeps ties in row order.
            # A movie's own score was zeroed on the diagonal, so it does not
            # rank itself first.
            best_rows = np.argsort(-similarity_values[row], kind='stable')[:10]
            similarity = [new_id_by_row[j] for j in best_rows]
        if movieId in mean_by_movie.index:
            mean = float(mean_by_movie[movieId])
        else:
            mean = 0
        print(title)
        print(similarity)
        movie_obj = {"id": new_id,
                     "name": title,
                     "average": mean,
                     "similar": similarity,
                     "posts": []}
        collection.insert_one(movie_obj)
except Exception:
    # Report the failure instead of aborting the cell; the loop stops at the
    # first movie that fails.
    traceback.print_exc()

Toy Story (1995)
['tt0120363', 'tt0076759', 'tt0109830', 'tt0116629', 'tt0107048', 'tt0088763', 'tt0107290', 'tt0126029', 'tt0086190', 'tt0110912']
Jumanji (1995)
['tt0107614', 'tt0110357', 'tt0111070', 'tt0110475', 'tt0101414', 'tt0103639', 'tt0109040', 'tt0099653', 'tt0107290', 'tt0111257']
Grumpier Old Men (1995)
['tt0117333', 'tt0117108', 'tt0116253', 'tt0117218', 'tt0114319', 'tt0116629', 'tt0117500', 'tt0117998', 'tt0116213', 'tt0116731']
Waiting to Exhale (1995)
['tt0112302', 'tt0109361', 'tt0110971', 'tt0162983', 'tt0107566', 'tt0111333', 'tt0113199', 'tt0115742', 'tt0113967', 'tt0112435']
Father of the Bride Part II (1995)
['tt0117979', 'tt0116253', 'tt0117333', 'tt0116213', 'tt0110997', 'tt0116130', 'tt0117913', 'tt0114319', 'tt0114924', 'tt0113862']
Heat (1995)
['tt0117500', 'tt0113627', 'tt0114746', 'tt0116213', 'tt0116629', 'tt0115759', 'tt0116282', 'tt0117998', 'tt0117060', 'tt0114369']
Sabrina (1995)
['tt0117333', 'tt0117979', 'tt0116213', 'tt0116253', 'tt0117998', 'tt01

In [None]:
# Print the titles of the ten movies most similar to one movie, chosen by its
# imdbId.
imdbId = 114709

# Row i of the similarity frame describes the i-th rated movie in ascending
# movieId order, so the matrix is addressed through a movieId -> row mapping
# rather than through row numbers of `movies`.
rated_movie_ids = np.sort(ratings.dropna(subset=['rating'])['movieId'].unique()).tolist()
if len(rated_movie_ids) != len(ratings_matrix):
    raise ValueError(
        "similarity matrix has %d rows but %d rated movies were found; "
        "re-run the cells that build ratings_matrix"
        % (len(ratings_matrix), len(rated_movie_ids))
    )
row_of_movie = {movie_id: row for row, movie_id in enumerate(rated_movie_ids)}

matching_movie_ids = movies.loc[movies['imdbId'] == imdbId, 'movieId'].tolist()
row = row_of_movie.get(matching_movie_ids[0]) if matching_movie_ids else None

similarity = []
if row is None:
    # Unknown imdbId, or a movie without ratings and therefore without a row.
    print("Sorry, the movie is not in the database!")
else:
    title_of_movie = dict(zip(movies['movieId'], movies['title']))
    # The diagonal of the matrix is already zero, so the ranking starts at the
    # best match; skipping the first entry would discard the closest movie.
    best_rows = np.argsort(-ratings_matrix.values[row], kind='stable')[:10]
    similarity = [title_of_movie.get(rated_movie_ids[j]) for j in best_rows]
    print(similarity)