In [1]:
import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# so library warnings raised by later cells will not be shown.
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Single place to point the notebook at the directory holding movies.csv and ratings.csv.
# NOTE(review): this is a machine-specific absolute path; change DATA_DIR when running elsewhere.
DATA_DIR = '/home/maria/Django-Onboarding/recommendation/data'

movies = pd.read_csv(f'{DATA_DIR}/movies.csv')
ratings = pd.read_csv(f'{DATA_DIR}/ratings.csv')

In [4]:
# Attach every rating to its movie; the left join keeps movies that have no ratings at all.
# NOTE(review): fillna(0) turns the missing userId/rating of such movies into 0, which
# later counts treat as a real user and a real rating -- confirm this is intended.
dataset = movies.merge(ratings, how='left', on='movieId').fillna(0)

In [5]:
# Peek at the last six rows of the merged frame.
dataset.tail(6)

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
105337,146878,Le Grand Restaurant (1966),Comedy,622.0,2.5,1447515000.0
105338,148238,A Very Murray Christmas (2015),Comedy,475.0,3.0,1451213000.0
105339,148626,The Big Short (2015),Drama,458.0,4.0,1452015000.0
105340,148626,The Big Short (2015),Drama,576.0,4.5,1451688000.0
105341,148626,The Big Short (2015),Drama,668.0,4.5,1451148000.0
105342,149532,Marco Polo: One Hundred Eyes (2015),(no genres listed),475.0,4.0,1451223000.0


In [6]:
# The timestamp column is not used below, so drop it.
# errors='ignore' keeps this cell re-runnable: once the column is gone, a second
# run is a no-op instead of raising KeyError.
dataset = dataset.drop(columns=['timestamp'], errors='ignore')

In [7]:
# Report the dimensions of the merged frame
row_total, column_total = dataset.shape
print('The total number of rows :', row_total)
print('The total number of columns :', column_total)

The total number of rows : 105343
The total number of columns : 5


In [8]:
# Number of distinct user ids and movie ids present in the merged frame
distinct_counts = dataset[['userId', 'movieId']].nunique()

print('Total unique users in the dataset', distinct_counts['userId'])
print('Total unique movie in the dataset', distinct_counts['movieId'])

Total unique users in the dataset 669
Total unique movie in the dataset 10329


In [9]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

In [10]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable.
train_data, test_data = train_test_split(
    dataset,
    test_size=0.30,
    random_state=10,
)

print(f'Training set has {train_data.shape[0]} rows and {train_data.shape[1]} columns')
print(f'Testing set has {test_data.shape[0]} rows and {test_data.shape[1]} columns')

Training set has 73740 rows and 5 columns
Testing set has 31603 rows and 5 columns


# Popularity Based Model

    The goal of this recommendation model is to first choose the movies that have received the most ratings.
    Once the popular movies have been extracted, I fetch the movies that haven't been rated by the given userId yet.
    This model suggests those popular movies that have not been rated by a user.
    Presumably, if a movie has not been rated by the user, it has not been seen by the user either.
    
    ^^

In [11]:
class popularity_model:
    """Non-personalised recommender that ranks movies by how many ratings they received.

    ``fit`` counts, per movie, the non-null entries of the user column in the
    training data and turns those counts into a ranking. ``recommend`` returns
    the top of that ranking after removing every movie the given user already
    has a row for in the training data.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Rows used to build the ranking; must contain the ``user_id`` and
        ``movie_id`` columns.
    test_data : pandas.DataFrame
        Stored on the instance; not used by ``fit`` or ``recommend``.
    user_id : str
        Name of the column identifying users.
    movie_id : str
        Name of the column identifying movies.
    """

    def __init__(self, train_data, test_data, user_id, movie_id):
        self.train_data = train_data
        self.test_data = test_data
        self.user_id = user_id
        self.movie_id = movie_id
        # Filled in by fit(); stays None until then.
        self.popularity_recommendation = None

    def fit(self):
        """Build the popularity ranking from ``self.train_data``.

        The result is stored in ``self.popularity_recommendation`` with the
        columns ``<movie_id>``, ``score`` (number of non-null user entries for
        the movie) and ``Rank`` (1.0 is the highest score).
        """
        # Count from the frame handed to the constructor, not from a notebook-level
        # global, and name the count column directly so the configured user column
        # name does not have to be 'userId'.
        rating_counts = (
            self.train_data
            .groupby(self.movie_id)[self.user_id]
            .count()
            .reset_index(name='score')
        )

        # Most-rated movies first; equal scores are ordered by ascending movie id.
        ranked = rating_counts.sort_values(
            ['score', self.movie_id], ascending=[False, True]
        )

        # method='first' breaks ties by row order, i.e. the sort order above,
        # so every movie gets a distinct rank.
        ranked['Rank'] = ranked['score'].rank(ascending=False, method='first')

        self.popularity_recommendation = ranked

    def recommend(self, user_id, n=5):
        """Return the ``n`` best-ranked movies the user has no training row for.

        Parameters
        ----------
        user_id
            Value looked up in the user column of the training data.
        n : int, default 5
            Maximum number of rows to return.

        Returns
        -------
        pandas.DataFrame
            Up to ``n`` rows of the fitted ranking (``<movie_id>``, ``score``,
            ``Rank``) followed by a ``user_id`` column holding the requested user.

        Raises
        ------
        RuntimeError
            If ``fit`` has not been called yet.
        """
        if self.popularity_recommendation is None:
            raise RuntimeError('popularity_model.recommend() called before fit()')

        ranking = self.popularity_recommendation

        # Movies that already have a row for this user in the training data.
        is_target_user = self.train_data[self.user_id] == user_id
        already_rated = self.train_data.loc[is_target_user, self.movie_id]

        # Keep only ranking rows whose movie is NOT in that set.
        unseen = ranking[~ranking[self.movie_id].isin(already_rated)]

        # assign() builds a new frame, so the stored ranking is never modified and
        # no chained-assignment warning is triggered; the column is appended last.
        return unseen.head(n).assign(user_id=user_id)
    

In [12]:
# Build the recommender on the train/test split, naming the user and movie key columns.
pr = popularity_model(train_data=train_data, test_data=test_data, user_id='userId', movie_id='movieId')

In [13]:
# Build the ranking: after this call pr.popularity_recommendation holds the movies
# ordered by how many ratings they received (most-rated first).
pr.fit()

In [14]:
# Top recommendations (default n=5) for userId 628.
# NOTE(review): the variable name says "user1" but the query is for user 628.
result_pop_user1 = pr.recommend(628)
result_pop_user1

Unnamed: 0,movieId,score,Rank,user_id
249,296,229,1.0,628
304,356,212,3.0,628
410,480,205,5.0,628
453,527,175,8.0,628
1982,2571,175,9.0,628


In [15]:
# train_data.groupby('movieId').agg('userId').count().reset_index(name='userIdCount')