# Contents:



*   [About Data](https://www.kaggle.com/datasets/prajitdatta/movielens-100k-dataset)
*   [Splitting Data](#spli)
*   [Baseline Model](#base)
*   [Standard Mean](#smean)
*   [Weighted Mean](#wmean)
*   [Demographics](#demog)
*   [KNN](#knn)

# Libraries:

In [1]:
#!pip	install	scikit-surprise

In [2]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from surprise import Dataset, KNNBasic, Reader
from surprise.model_selection import KFold, cross_validate

warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", category=DeprecationWarning)


# Data:

In [3]:
#!unzip "/content/drive/MyDrive/archive.zip" -d "/content/drive/MyDrive/"

In [4]:
# Column names for the pipe-delimited u.user file (the file has no header row)
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(
    '/content/drive/MyDrive/ml-100k/u.user',
    sep='|',
    names=u_cols,
    encoding='latin-1',
)
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [5]:
# Load the u.item file into a dataframe.
# Column names: five descriptive fields followed by one column per genre.
i_cols = [
    'movie_id', 'title', 'release date', 'video release date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

movies = pd.read_csv(
    '/content/drive/MyDrive/ml-100k/u.item',
    sep='|',
    names=i_cols,
    encoding='latin-1',
)

movies.head()

Unnamed: 0,movie_id,title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [6]:
# Keep only the identifier and the title; the remaining columns are not used below
movies = movies.loc[:, ['movie_id', 'title']]
movies.head()

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [7]:
# Load the tab-delimited u.data file into a dataframe (no header row in the file)
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']

ratings = pd.read_csv(
    '/content/drive/MyDrive/ml-100k/u.data',
    sep='\t',
    names=r_cols,
    encoding='latin-1',
)

ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [8]:
# Drop the timestamp column.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# has already been removed is a no-op instead of a KeyError.
ratings = ratings.drop(columns='timestamp', errors='ignore')

<a name = 'spli'></a>
# Splitting Data:

In [9]:
# X keeps every column (including the rating itself);
# y holds the user ids and is what the split below stratifies on
X, y = ratings.copy(), ratings['user_id']

In [10]:
# Hold out 25% of the ratings, stratified on y, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

In [11]:
def rmse(y_true, y_pred):
  """Return the root-mean-squared error between true and predicted values."""
  mse = mean_squared_error(y_true, y_pred)
  return np.sqrt(mse)

In [12]:
# User x movie matrix built from the training ratings only;
# user-movie pairs without a training rating show up as NaN
r_matrix = X_train.pivot_table(
    index='user_id',
    columns='movie_id',
    values='rating',
)
r_matrix.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1671,1672,1673,1674,1676,1677,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


In [13]:
def score(cf_model):
  """Return the RMSE of a rating predictor on the test set X_test.

  cf_model is called as cf_model(user_id, movie_id) once per row of X_test
  and must return the predicted rating for that pair.
  """
  # One prediction per (user, movie) pair, in the row order of X_test
  y_pred = np.array([
      cf_model(user, movie)
      for user, movie in zip(X_test['user_id'], X_test['movie_id'])
  ])

  # Ratings the users actually gave in the test data
  y_true = X_test['rating'].to_numpy()

  return rmse(y_true, y_pred)

<a name = 'base'></a>

# Baseline Model:
The baseline simply predicts a rating of 3 for every user–movie pair.

In [14]:
def baseline(user_id, movie_id):
  """Predict a constant rating of 3.0, ignoring both the user and the movie."""
  constant_rating = 3.0
  return constant_rating

In [15]:
# RMSE of the constant-rating baseline on the test set
score(baseline)

1.2488234462885457

<a name = 'smean'></a>

# Standard Mean:
Outputs the mean rating given to the movie by all users who have rated it; each user is assigned equal weight.

In [16]:
#User	Based	Collaborative	Filter	using	Mean	Ratings
def cf_user_mean(user_id, movie_id):
  """Predict the unweighted mean of the ratings stored in r_matrix for the movie.

  user_id is accepted for interface compatibility but is not used.
  Returns 3.0 when movie_id is not a column of r_matrix.
  """
  # Nothing is known about the movie: fall back to the default rating
  if movie_id not in r_matrix:
    return 3.0

  # Series.mean() skips the NaN entries of users who have not rated the movie
  return r_matrix[movie_id].mean()


In [17]:
# Compute the RMSE of the mean-rating model on the test set
score(cf_user_mean)

1.0300824802393536

The mean-rating collaborative recommender achieves a lower RMSE than the baseline model.

<a name = 'wmean'></a>

# Weighted Mean:
Gives users who are similar to the user in question more weight than users whose ratings are dissimilar.

## computing similarity:

In [18]:
# Zero-filled variant of r_matrix for the similarity computation;
# fillna returns a new frame, so r_matrix itself keeps its NaNs
r_matrix_dummy = r_matrix.fillna(0)

In [19]:
# Pairwise cosine similarity between the rows (users) of the zero-filled matrix;
# with a single argument the rows are compared against themselves
cosine_sim = cosine_similarity(r_matrix_dummy)

In [20]:
# Label both axes with the user ids so similarities can be looked up by user_id
cosine_sim = pd.DataFrame(
    data=cosine_sim,
    index=r_matrix_dummy.index,
    columns=r_matrix_dummy.index,
)

## Building Model:

In [21]:
#User	Based	Collaborative	Filter	using	Weighted	Mean	Ratings
def cf_user_wmean(user_id, movie_id):
  """Predict a rating as the similarity-weighted mean of the movie's ratings.

  Each rating of movie_id found in r_matrix is weighted by the cosine
  similarity (from cosine_sim) between its author and user_id.

  Returns 3.0 when the movie is not a column of r_matrix or the user has no
  entry in cosine_sim. When every rater has zero similarity to the user, the
  unweighted mean of the movie's ratings is returned; dividing by a small
  epsilon in that case would produce a prediction of about 0.
  """
  # No information about the movie or about the user's similarities
  if movie_id not in r_matrix or user_id not in cosine_sim:
    return 3.0

  # Ratings of the movie from the users who actually rated it
  m_ratings = r_matrix[movie_id].dropna()

  # Similarities between the target user and exactly those raters,
  # selected by label so both series share the same order
  sim_scores = cosine_sim.loc[m_ratings.index, user_id]

  weight_total = sim_scores.sum()
  if weight_total <= 0:
    # No rater resembles the user: weights carry no signal, use the plain mean
    return m_ratings.mean()

  # Exact weighted mean (no epsilon in the denominator, which would bias
  # every prediction slightly towards zero)
  return np.dot(sim_scores, m_ratings) / weight_total


In [22]:
# RMSE of the similarity-weighted mean model on the test set
score(cf_user_wmean)

1.0236623800413516

We get a slight improvement.

<a name = 'demog'></a>

# Adding User Demographics:
Testing whether users of the same demographic tend
to have similar tastes.

In [23]:
# Attach the demographics of the rating's author to every training rating.
# The join key is named explicitly instead of being inferred from the columns
# the two frames happen to share; `on` also accepts an index level name, so the
# cell still works if it is re-run after `users` has been indexed by user_id.
merged_df = pd.merge(X_train, users, on='user_id')
merged_df.head()

Unnamed: 0,user_id,movie_id,rating,age,sex,occupation,zip_code
0,862,177,4,25,M,executive,13820
1,862,416,3,25,M,executive,13820
2,862,1093,5,25,M,executive,13820
3,862,168,4,25,M,executive,13820
4,862,568,3,25,M,executive,13820


## Gender:

In [24]:
# Mean training rating of every movie, computed separately for each sex;
# the result is a Series indexed by (movie_id, sex)
gender_mean = (
    merged_df
    .groupby(['movie_id', 'sex'])['rating']
    .mean()
)
gender_mean

movie_id  sex
1         F      3.797872
          M      3.888446
2         F      3.285714
          M      3.202703
3         F      2.916667
                   ...   
1677      F      3.000000
1679      M      3.000000
1680      M      2.000000
1681      M      3.000000
1682      M      3.000000
Name: rating, Length: 3047, dtype: float64

In [25]:
# Index users by user_id so the demographic filters can use users.loc[user_id].
# Guarded so that re-running this cell is a no-op rather than a KeyError
# (after the first run 'user_id' is the index, no longer a column).
if users.index.name != 'user_id':
  users = users.set_index('user_id')

In [26]:
#Gender	Based	Collaborative	Filter	using	Mean	Ratings
def cf_gender(user_id, movie_id):
  """Predict the mean rating given to the movie by users of the same sex.

  Looks up the user's sex in `users` and the per-(movie, sex) mean in
  `gender_mean`. Returns 3.0 when the movie is not a column of r_matrix or
  when gender_mean has no entry for that movie and sex.
  """
  # Movie not present in the training matrix: default rating
  if movie_id not in r_matrix:
    return 3.0

  gender = users.loc[user_id, 'sex']

  # Means of this movie, indexed by sex
  movie_means = gender_mean[movie_id]

  # No user of this sex has rated the movie: default rating
  if gender not in movie_means:
    return 3.0

  return movie_means[gender]

In [27]:
# RMSE of the gender-based mean model on the test set
score(cf_gender)

1.0392906999935203

It performs worse than the standard mean-rating collaborative filter. This indicates that a user's gender is not a strong indicator of that user's taste.

## Gender and occupation:

In [28]:
# Mean training rating per movie (rows) for every occupation/sex pair (columns);
# combinations without any rating are NaN
gen_occ_mean = merged_df.pivot_table(
    index='movie_id',
    columns=['occupation', 'sex'],
    values='rating',
    aggfunc='mean',
)
gen_occ_mean.head()

occupation,administrator,administrator,artist,artist,doctor,educator,educator,engineer,engineer,entertainment,...,salesman,salesman,scientist,scientist,student,student,technician,technician,writer,writer
sex,F,M,F,M,M,F,M,F,M,F,...,F,M,F,M,F,M,F,M,F,M
movie_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,3.9375,3.75,5.0,3.4,3.666667,3.25,3.884615,4.0,4.083333,4.0,...,,4.0,3.5,4.0,4.043478,3.796296,4.0,3.75,4.0,3.0
2,3.0,3.666667,,,,4.0,3.5,,3.066667,,...,,,,3.0,2.666667,3.277778,,2.714286,,2.333333
3,3.5,4.0,,,,,2.0,,3.777778,,...,,,,,3.0,3.391304,,4.25,,1.0
4,3.666667,3.6,,4.666667,3.0,2.5,3.8,4.0,3.65,,...,4.0,4.0,,3.4,3.25,3.777778,,3.333333,4.25,3.25
5,4.0,2.333333,,,,4.0,2.333333,,3.5,,...,,,,4.0,4.333333,3.111111,,3.333333,4.0,2.0


In [29]:
#Gender	and	Occupation	Based	Collaborative	Filter	using	Mean	Ratings
def cf_gen_occ(user_id, movie_id):
  """Predict the mean rating given to the movie by the user's occupation/sex group.

  Reads the user's sex and occupation from `users` and the group mean from
  `gen_occ_mean`. Returns 3.0 whenever the movie, the occupation or the sex
  is missing from gen_occ_mean, or the stored mean is NaN.
  """
  # Movie unknown to the group-mean table: default rating
  if movie_id not in gen_occ_mean.index:
    return 3.0

  user = users.loc[user_id]
  gender, occ = user['sex'], user['occupation']

  # Row of group means for this movie, indexed by (occupation, sex)
  movie_means = gen_occ_mean.loc[movie_id]
  if occ not in movie_means:
    return 3.0

  # Means for the user's occupation, indexed by sex
  occ_means = movie_means[occ]
  if gender not in occ_means:
    return 3.0

  rating = occ_means[gender]

  # The group exists as a column but nobody in it rated this movie
  return 3.0 if np.isnan(rating) else rating

In [30]:
# RMSE of the gender-and-occupation mean model on the test set
score(cf_gen_occ)

1.1419651376788005

This model performs the worst of all the filters. This strongly suggests that tinkering with user
demographic data may not be the best way to go forward with the data that we
are currently using.

<a name = 'knn'></a>

# KNN-based Collaborative Filtering:
1. Find the k-nearest neighbors of u who have rated movie m
2. Output the average rating of the k users for the movie m

In [31]:
# Define a Reader object.
# The Reader helps in parsing the dataframe containing the ratings;
# the rating scale is spelled out rather than left to the library default.
reader = Reader(rating_scale=(1, 5))
# Create the dataset to be used for building the filter.
# load_from_df expects exactly three columns ordered (user, item, rating), so
# they are selected by name instead of relying on the current layout of `ratings`.
data = Dataset.load_from_df(ratings[['user_id', 'movie_id', 'rating']], reader)
# Define the algorithm object; in this case kNN
knn = KNNBasic()
# Seeded 5-fold split so that the reported RMSE is reproducible across runs
folds = KFold(n_splits=5, random_state=42, shuffle=True)
# Evaluate the performance in terms of RMSE
cross_validate(knn, data, measures=['RMSE'], cv=folds)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


{'fit_time': (0.37975382804870605,
  0.35933470726013184,
  0.373443603515625,
  0.384674072265625,
  0.3691110610961914),
 'test_rmse': array([0.98121798, 0.98113727, 0.98165626, 0.97369217, 0.97585894]),
 'test_time': (3.350297689437866,
  3.34989595413208,
  3.435908079147339,
  3.337796688079834,
  3.451113224029541)}

This is the best model so far, achieving the lowest RMSE (about 0.98, averaged over the five cross-validation folds).