In [1]:
import os
import traceback

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pymongo import MongoClient
from scipy.spatial.distance import cosine, correlation
from sklearn.metrics import pairwise_distances

In [2]:
# Load the MovieLens "latest-small" tables from the local dataset directory.
ratings = pd.read_csv('./ml-latest-small/ratings.csv')
movies = pd.read_csv('./ml-latest-small/movies.csv')
links = pd.read_csv('./ml-latest-small/links.csv')

# Inner-join the external ids from links.csv onto the movies
# (pd.merge joins on the columns the two frames share).
movies = pd.merge(movies, links)

# Later cells read movies['new_id'], but no cell in this notebook created it, so a
# fresh "Restart & Run All" failed with KeyError. The documents stored in MongoDB use
# ids such as 'tt0114709' for imdbId 114709, i.e. 'tt' + the id zero-padded to 7 digits.
# NOTE(review): format inferred from the stored document shown further down - confirm.
movies['new_id'] = 'tt' + movies['imdbId'].astype(str).str.zfill(7)

# One row per (movie, rating) pair; used for the per-movie statistics.
movie_ratings = pd.merge(movies, ratings)

In [3]:
# Mean rating per movie. The aggregation is named by string ('mean') instead of
# passing the np.mean callable, which newer pandas releases deprecate inside .agg().
# The resulting column is still addressed as movie_stats['rating']['mean'].
movie_stats = movie_ratings.groupby('movieId').agg({'rating': ['mean']})
movie_stats.head(5)

Unnamed: 0_level_0,rating
Unnamed: 0_level_1,mean
movieId,Unnamed: 1_level_2
1,3.87247
2,3.401869
3,3.161017
4,2.384615
5,3.267857


In [4]:
# Join the link ids onto the ratings, then lay the ratings out as a movie x user grid.
ratings = pd.merge(ratings, links)
ratings_matrix = (
    ratings
    .pivot_table(values='rating', index=['movieId'], columns=['userId'])
    .reset_index(drop=True)  # discard the movieId labels; rows become positional
    .fillna(0)               # a (movie, user) pair with no rating is encoded as 0
)

In [5]:
# Item-item cosine similarity between the rows (movies) of the ratings grid.
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
movie_similarity = 1 - pairwise_distances(ratings_matrix.values, metric="cosine")

# Zero the diagonal so that, when scores are sorted later, a movie does not
# rank as its own best match.
np.fill_diagonal(movie_similarity, 0)

# NOTE: from here on `ratings_matrix` holds the movie x movie similarity scores, not
# the ratings; later cells index it positionally. Re-running this cell without first
# re-running the cell that builds the ratings grid would feed similarities back in.
ratings_matrix = pd.DataFrame(movie_similarity)

In [6]:
# Plain-text dump of the last five rows of the similarity frame.
print(ratings_matrix.tail(n=5))

          0         1     2     3     4         5         6     7     8     \
9061  0.079755  0.137594   0.0   0.0   0.0  0.000000  0.000000   0.0   0.0   
9062  0.079755  0.068797   0.0   0.0   0.0  0.111103  0.000000   0.0   0.0   
9063  0.000000  0.000000   0.0   0.0   0.0  0.000000  0.000000   0.0   0.0   
9064  0.000000  0.000000   0.0   0.0   0.0  0.000000  0.000000   0.0   0.0   
9065  0.055829  0.000000   0.0   0.0   0.0  0.061724  0.079399   0.0   0.0   

          9     ...   9056  9057  9058  9059  9060  9061  9062  9063  9064  \
9061  0.000000  ...    0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   
9062  0.076835  ...    0.0   0.0   1.0   0.0   0.0   0.0   0.0   0.0   0.0   
9063  0.000000  ...    0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   1.0   
9064  0.000000  ...    0.0   0.0   0.0   0.0   0.0   0.0   0.0   1.0   0.0   
9065  0.000000  ...    1.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   

      9065  
9061   0.0  
9062   0.0  
9063   0.0  
9064   0.0

In [6]:
# Handles to the local MongoDB instance: database "Movielens", collection "movies".
# MongoClient is imported in the notebook's top import cell rather than here, so the
# notebook's dependencies are all visible in one place.
client = MongoClient('localhost', 27017)
db = client.Movielens
collection = db.movies

In [12]:
# Store, for every movie, its ten most similar movies in the local `collection`.
#
# Fixes relative to the previous version of this cell:
#   * `except(e):` evaluated an undefined name (NameError) and `e.traceback()` does not
#     exist; the traceback is now printed with traceback.print_exc().
#   * Row/column k of the similarity frame belongs to the k-th smallest *rated* movieId
#     (the pivot's movieId index was dropped with reset_index), but it was being read as
#     the k-th row of `movies`, which also lists unrated movies. Positions are now
#     translated through the sorted rated movieIds.
#   * The scores were sorted twice per movie and written into `movies` as a side effect.
try:
    top_n = 10

    # Assumes ratings_matrix is the similarity frame built from `ratings` earlier in
    # this notebook; the length check below catches a stale or mismatched frame.
    rated_movie_ids = np.sort(ratings['movieId'].unique())
    if len(rated_movie_ids) != len(ratings_matrix):
        raise ValueError("ratings_matrix does not have one row per rated movie; "
                         "re-run the cells that build it")
    matrix_row = {movie_id: row for row, movie_id in enumerate(rated_movie_ids)}

    # Metadata for each similarity column, as plain Python values so they can be stored.
    catalog = movies.drop_duplicates('movieId').set_index('movieId')
    column_new_ids = catalog['new_id'].reindex(rated_movie_ids).tolist()
    column_titles = catalog['title'].reindex(rated_movie_ids).tolist()

    for movieId, new_id in zip(movies['movieId'], movies['new_id']):
        similarity = []  # movies without any rating keep an empty list
        row = matrix_row.get(movieId)
        if row is not None:
            scores = ratings_matrix.iloc[row].values
            # Stable descending sort; the diagonal is zeroed upstream, so the movie
            # itself does not outrank real matches.
            best_columns = np.argsort(-scores, kind='mergesort')[:top_n]
            similarity = [(column_new_ids[col], column_titles[col]) for col in best_columns]
        collection.update_one({'id': new_id}, {'$set': {'similar': similarity}})
except Exception:
    # Report the failure without raising out of the cell.
    traceback.print_exc()

In [None]:
# Ad-hoc lookup: print the titles of the movies most similar to one IMDb id.
#
# The previous version passed a whole Index to .iloc and assigned the resulting frame
# to a single column; the bare `except:` then reported every failure as "not in the
# database". It also skipped the first result although the diagonal of the similarity
# matrix is already zeroed, and read matrix positions as positions in `movies`.
imdbId = 114709
top_n = 10

# Row/column k of the similarity frame belongs to the k-th smallest rated movieId.
# Assumes ratings_matrix was built from `ratings` earlier in this notebook; the length
# check catches a stale or mismatched frame.
rated_movie_ids = np.sort(ratings['movieId'].unique())
if len(rated_movie_ids) != len(ratings_matrix):
    raise ValueError("ratings_matrix does not have one row per rated movie; "
                     "re-run the cells that build it")
matrix_row = {movie_id: row for row, movie_id in enumerate(rated_movie_ids)}

matching_movie_ids = movies.loc[movies['imdbId'] == imdbId, 'movieId']
row = None if matching_movie_ids.empty else matrix_row.get(matching_movie_ids.iloc[0])

if row is None:
    # Unknown IMDb id, or a movie with no ratings and therefore no similarity row.
    print("Sorry, the movie is not in the database!")
else:
    scores = ratings_matrix.iloc[row].values
    best_columns = np.argsort(-scores, kind='mergesort')[:top_n]  # stable, descending
    title_by_movie = movies.drop_duplicates('movieId').set_index('movieId')['title']
    similarity = title_by_movie.reindex(rated_movie_ids[best_columns]).tolist()
    print(similarity)

In [13]:
# Connect to the hosted MongoDB cluster. The connection string (which embeds the
# user name and password) is read from the environment instead of being committed
# in the notebook; a missing variable fails fast with KeyError.
# SECURITY: the password that used to be hardcoded here has been exposed and
# should be rotated.
MOVIES_MONGODB_URI = os.environ['MOVIES_MONGODB_URI']
new_client = MongoClient(MOVIES_MONGODB_URI)
new_db = new_client.test

In [16]:
# Handle to the "movieinfos" collection; fetch one document to inspect its fields.
new_collection = new_db['movieinfos']
new_collection.find_one()

{'__v': 0,
 '_id': ObjectId('5a332ffa695f461b408b2d6d'),
 'average': 3.8724696356275303,
 'createdAt': datetime.datetime(2017, 12, 15, 2, 14, 18, 257000),
 'imdbId': 'tt0114709',
 'movieId': 1,
 'posts': [],
 'similar': [['tt0120363', 'Toy Story 2 (1999)'],
  ['tt0076759', 'Star Wars: Episode IV - A New Hope (1977)'],
  ['tt0109830', 'Forrest Gump (1994)'],
  ['tt0116629', 'Independence Day (a.k.a. ID4) (1996)'],
  ['tt0107048', 'Groundhog Day (1993)'],
  ['tt0088763', 'Back to the Future (1985)'],
  ['tt0107290', 'Jurassic Park (1993)'],
  ['tt0126029', 'Shrek (2001)'],
  ['tt0086190', 'Star Wars: Episode VI - Return of the Jedi (1983)'],
  ['tt0110912', 'Pulp Fiction (1994)']],
 'title': 'Toy Story (1995)',
 'updatedAt': datetime.datetime(2017, 12, 15, 2, 14, 18, 257000)}

In [17]:
# Store, for every movie, its ten most similar movies and its mean rating in
# `new_collection` (documents are matched on their 'imdbId' field).
#
# Fixes relative to the previous version of this cell:
#   * `except(e):` evaluated an undefined name (NameError) and `e.traceback()` does not
#     exist; the traceback is now printed with traceback.print_exc().
#   * Row/column k of the similarity frame belongs to the k-th smallest *rated* movieId
#     (the pivot's movieId index was dropped with reset_index), but it was being read as
#     the k-th row of `movies`, which also lists unrated movies. Positions are now
#     translated through the sorted rated movieIds.
#   * The scores were sorted twice per movie and written into `movies` as a side effect.
try:
    top_n = 10

    # Assumes ratings_matrix is the similarity frame built from `ratings` earlier in
    # this notebook; the length check below catches a stale or mismatched frame.
    rated_movie_ids = np.sort(ratings['movieId'].unique())
    if len(rated_movie_ids) != len(ratings_matrix):
        raise ValueError("ratings_matrix does not have one row per rated movie; "
                         "re-run the cells that build it")
    matrix_row = {movie_id: row for row, movie_id in enumerate(rated_movie_ids)}

    # Metadata for each similarity column, as plain Python values so they can be stored.
    catalog = movies.drop_duplicates('movieId').set_index('movieId')
    column_new_ids = catalog['new_id'].reindex(rated_movie_ids).tolist()
    column_titles = catalog['title'].reindex(rated_movie_ids).tolist()

    mean_by_movie = movie_stats['rating']['mean']

    for movieId, new_id in zip(movies['movieId'], movies['new_id']):
        similarity = []  # movies without any rating keep an empty list
        row = matrix_row.get(movieId)
        if row is not None:
            scores = ratings_matrix.iloc[row].values
            # Stable descending sort; the diagonal is zeroed upstream, so the movie
            # itself does not outrank real matches.
            best_columns = np.argsort(-scores, kind='mergesort')[:top_n]
            similarity = [(column_new_ids[col], column_titles[col]) for col in best_columns]

        # Movies that nobody rated get an average of 0.
        mean = mean_by_movie.get(movieId, 0)

        new_collection.update_one({'imdbId': new_id},
                                  {'$set': {'similar': similarity, 'average': mean}})
except Exception:
    # Report the failure without raising out of the cell.
    traceback.print_exc()