In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

In [2]:
ratings = pd.read_csv('train.csv')
ratings.head()

Unnamed: 0,userID,movieID,rating
0,4490,2109,4
1,5839,3471,4
2,5382,150,3
3,1262,1237,5
4,6005,2273,4


In [3]:
global_mean = ratings.rating.mean()

user_means = ratings.groupby('userID').rating.mean()
movie_means = ratings.groupby('movieID').rating.mean()

ratings = ratings.merge(pd.DataFrame(user_means), on='userID', how='left')
ratings = ratings.merge(pd.DataFrame(movie_means), on='movieID', how='left')
ratings.columns = ['userID', 'movieID', 'rating', 'user_mean', 'movie_mean']

movie_pivot = ratings.pivot(index="movieID", columns= "userID", values="rating")
movie_scaled_df = movie_pivot - movie_pivot.mean(axis=1).values.reshape(-1, 1)
movie_scaled_df.fillna(0, inplace=True)
movie_similiarities = cosine_similarity(movie_scaled_df,movie_scaled_df)
movie_similarities_df = pd.DataFrame(movie_similiarities,columns=movie_pivot.index,index=movie_pivot.index)

user_pivot = ratings.pivot(index="userID", columns= "movieID", values="rating")
user_scaled_df = user_pivot - user_pivot.mean(axis=1).values.reshape(-1, 1)
user_scaled_df.fillna(0, inplace=True)
user_similiarities = cosine_similarity(user_scaled_df,user_scaled_df)
user_similarities_df = pd.DataFrame(user_similiarities,columns=user_pivot.index,index=user_pivot.index)

In [4]:
def predict_rating(user_id, movie_id):
    top10_movies = movie_similarities_df.loc[movie_id,ratings.loc[ratings.userID == user_id].movieID.to_list()].sort_values(ascending=False)[1:11].index.to_list()
    top10_movies_mean = ratings[ratings.movieID.isin(top10_movies)].drop_duplicates(subset="movieID",keep="first").movie_mean.mean()
    
    if len(ratings[(ratings.movieID.isin(top10_movies)) & (ratings.userID == user_id)]) > 0:
        users_top10_ratings_mean = ratings[(ratings.movieID.isin(top10_movies)) & (ratings.userID == user_id)].rating.mean()
    else:
        users_top10_ratings_mean = ratings[ratings.userID == user_id].rating.mean()
    
    top10_users = user_similarities_df.loc[user_id].sort_values(ascending=False)[1:11].index.to_list()
    top10_users_mean = ratings[(ratings.userID.isin(top10_users)) & (ratings.movieID.isin(top10_movies))].rating.mean()
      
    predicted_rating = ((top10_movies_mean + users_top10_ratings_mean + top10_users_mean) / 3) + movie_scaled_df[user_id].mean()

    return predicted_rating

In [5]:
test = ratings[:10000]
test = test[["userID","movieID"]]

preds = []
for user,movie in test.itertuples(index=False):
    preds.append(predict_rating(user,movie))

In [6]:
for i in range(len(preds)): 
    if not preds[i] <= 5 or not preds[i] >= 1:
        preds[i] = global_mean

In [7]:
for i in range(len(preds)): 
    if preds[i] > 5:
        preds[i] = 5
    elif preds[i] < 1:
        preds[i] = 1

In [8]:
mean_squared_error(preds,ratings[:10000].rating.to_list(),squared=False)

0.8883906325542718

In [9]:
def predict_rating(user_id, movie_id):
    top10_movies = movie_similarities_df.loc[movie_id,ratings.loc[ratings.userID == user_id].movieID.to_list()].sort_values(ascending=False)[1:11].index.to_list()
    top10_movies_mean = ratings[ratings.movieID.isin(top10_movies)].drop_duplicates(subset="movieID",keep="first").movie_mean.mean()
    
    if len(ratings[(ratings.movieID.isin(top10_movies)) & (ratings.userID == user_id)]) > 0:
        users_top10_ratings_mean = ratings[(ratings.movieID.isin(top10_movies)) & (ratings.userID == user_id)].rating.mean()
    else:
        users_top10_ratings_mean = ratings[ratings.userID == user_id].rating.mean()
    
    top10_users = user_similarities_df.loc[movie_id,ratings.loc[ratings.userID == user_id].movieID.to_list()].sort_values(ascending=False)[1:11].index.to_list()
    top10_users_mean = ratings[(ratings.userID.isin(top10_users)) & (ratings.movieID.isin(top10_movies))].rating.mean()
      
    predicted_rating = (top10_movies_mean * 0.2) + (users_top10_ratings_mean * 0.4) + (top10_users_mean * 0.4) + movie_scaled_df[user_id].mean()

    return predicted_rating

In [10]:
test = ratings[:10000]
test = test[["userID","movieID"]]

preds = []
for user,movie in test.itertuples(index=False):
    preds.append(predict_rating(user,movie))

In [11]:
for i in range(len(preds)): 
    if preds[i] > 5:
        preds[i] = 5
    elif preds[i] < 1:
        preds[i] = 1

In [12]:
for i in range(len(preds)): 
    if not preds[i] <= 5 or not preds[i] >= 1:
        preds[i] = global_mean

In [13]:
mean_squared_error(preds,ratings[:10000].rating.to_list(),squared=False)

0.9281127636512154

In [14]:
def predict_rating(user_id, movie_id):
    top5_movies = movie_similarities_df.loc[movie_id,ratings.loc[ratings.userID == user_id].movieID.to_list()].sort_values(ascending=False)[1:6].index.to_list()
    top5_movies_mean = ratings[ratings.movieID.isin(top5_movies)].drop_duplicates(subset="movieID",keep="first").movie_mean.mean()
    
    if len(ratings[(ratings.movieID.isin(top5_movies)) & (ratings.userID == user_id)]) > 0:
        users_top5_ratings_mean = ratings[(ratings.movieID.isin(top5_movies)) & (ratings.userID == user_id)].rating.mean()
    else:
        users_top5_ratings_mean = ratings[ratings.userID == user_id].rating.mean()
    
    top5_users = user_similarities_df.loc[user_id].sort_values(ascending=False)[1:5].index.to_list()
    top5_users_mean = ratings[(ratings.userID.isin(top5_users)) & (ratings.movieID.isin(top5_movies))].rating.mean()
      
    predicted_rating = ((top5_movies_mean + users_top5_ratings_mean + top5_users_mean) / 3) + movie_scaled_df[user_id].mean()

    return predicted_rating

In [15]:
test = ratings[:2000]
test = test[["userID","movieID"]]

preds = []
for user,movie in test.itertuples(index=False):
    preds.append(predict_rating(user,movie))

In [16]:
for i in range(len(preds)): 
    if preds[i] > 5:
        preds[i] = 5
    elif preds[i] < 1:
        preds[i] = 1
        
for i in range(len(preds)): 
    if not preds[i] <= 5 or not preds[i] >= 1:
        preds[i] = global_mean

In [17]:
mean_squared_error(preds,ratings[:2000].rating.to_list(),squared=False)

0.8995867738682565

In [18]:
def predict_rating(user_id, movie_id):
    top5_movies = movie_similarities_df.loc[movie_id,ratings.loc[ratings.userID == user_id].movieID.to_list()].sort_values(ascending=False)[1:11].index.to_list()
    top5_movies_mean = ratings[ratings.movieID.isin(top5_movies)].drop_duplicates(subset="movieID",keep="first").movie_mean.mean()
    
    if len(ratings[(ratings.movieID.isin(top5_movies)) & (ratings.userID == user_id)]) > 0:
        users_top5_ratings_mean = ratings[(ratings.movieID.isin(top5_movies)) & (ratings.userID == user_id)].rating.mean()
    else:
        users_top5_ratings_mean = ratings[ratings.userID == user_id].rating.mean()
    
    top5_users = user_similarities_df.loc[user_id].sort_values(ascending=False)[1:6].index.to_list()
    top5_users_mean = ratings[(ratings.userID.isin(top5_users)) & (ratings.movieID.isin(top5_movies))].rating.mean()
      
    predicted_rating = ((top5_movies_mean + users_top5_ratings_mean + top5_users_mean) / 3) + movie_scaled_df[user_id].mean()

    return predicted_rating

In [19]:
test = ratings[:2000]
test = test[["userID","movieID"]]

preds = []
for user,movie in test.itertuples(index=False):
    preds.append(predict_rating(user,movie))

In [20]:
for i in range(len(preds)): 
    if preds[i] > 5:
        preds[i] = 5
    elif preds[i] < 1:
        preds[i] = 1
        
for i in range(len(preds)): 
    if not preds[i] <= 5 or not preds[i] >= 1:
        preds[i] = global_mean

In [21]:
mean_squared_error(preds,ratings[:2000].rating.to_list(),squared=False)

0.8887136590855075