Task 04

Create a basic recommendation system using
collaborative filtering.

Task Details: Develop a recommendation system that
suggests items to users based on their past
interactions and preferences.

Dataset: MovieLens Dataset

# Item Based Collaborative filtering

## data preprocessing

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [None]:
ratings = pd.read_csv('https://s3-us-west-2.amazonaws.com/recommender-tutorial/ratings.csv')
ratings.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247


In [None]:
movies = pd.read_csv('https://s3-us-west-2.amazonaws.com/recommender-tutorial/movies.csv')
movies.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [None]:
df = movies.merge(ratings, how="left", on="movieId")
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,964982700.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,847435000.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.5,1106636000.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.5,1510578000.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17.0,4.5,1305696000.0


## creating user-movie dataframe

In [None]:
comment_counts = pd.DataFrame(df['title'].value_counts())
comment_counts

Unnamed: 0_level_0,count
title,Unnamed: 1_level_1
Forrest Gump (1994),329
"Shawshank Redemption, The (1994)",317
Pulp Fiction (1994),307
"Silence of the Lambs, The (1991)",279
"Matrix, The (1999)",278
...,...
Pride (2007),1
We're Back! A Dinosaur's Story (1993),1
Black Book (Zwartboek) (2006),1
Are We Done Yet? (2007),1


In [None]:
rare_movies = comment_counts[comment_counts['count'] <= 1000]
rare_movies

Unnamed: 0_level_0,count
title,Unnamed: 1_level_1
Forrest Gump (1994),329
"Shawshank Redemption, The (1994)",317
Pulp Fiction (1994),307
"Silence of the Lambs, The (1991)",279
"Matrix, The (1999)",278
...,...
Pride (2007),1
We're Back! A Dinosaur's Story (1993),1
Black Book (Zwartboek) (2006),1
Are We Done Yet? (2007),1


In [None]:
common_movies = df[~df["title"].isin(rare_movies)]
common_movies

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,9.649827e+08
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,8.474350e+08
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.5,1.106636e+09
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.5,1.510578e+09
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17.0,4.5,1.305696e+09
...,...,...,...,...,...,...
100849,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,184.0,4.0,1.537109e+09
100850,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,184.0,3.5,1.537110e+09
100851,193585,Flint (2017),Drama,184.0,3.5,1.537110e+09
100852,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,184.0,3.5,1.537110e+09


In [None]:
user_movie_df = common_movies.pivot_table(index=["userId"],
                                          columns=["title"],
                                          values="rating")
print(user_movie_df.columns)

Index([''71 (2014)', ''Hellboy': The Seeds of Creation (2004)',
       ''Round Midnight (1986)', ''Salem's Lot (2004)',
       ''Til There Was You (1997)', ''Tis the Season for Love (2015)',
       ''burbs, The (1989)', ''night Mother (1986)',
       '(500) Days of Summer (2009)', '*batteries not included (1987)',
       ...
       'Zulu (2013)', '[REC] (2007)', '[REC]² (2009)',
       '[REC]³ 3 Génesis (2012)',
       'anohana: The Flower We Saw That Day - The Movie (2013)',
       'eXistenZ (1999)', 'xXx (2002)', 'xXx: State of the Union (2005)',
       '¡Three Amigos! (1986)', 'À nous la liberté (Freedom for Us) (1931)'],
      dtype='object', name='title', length=9719)


In [None]:
user_movie_df

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,,,,,,,,,,,...,,,,,,,,,4.0,
2.0,,,,,,,,,,,...,,,,,,,,,,
3.0,,,,,,,,,,,...,,,,,,,,,,
4.0,,,,,,,,,,,...,,,,,,,,,,
5.0,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606.0,,,,,,,,,,,...,,,,,,,,,,
607.0,,,,,,,,,,,...,,,,,,,,,,
608.0,,,,,,,,,,,...,,,,,,4.5,3.5,,,
609.0,,,,,,,,,,,...,,,,,,,,,,


## item based movie recommendation

In [None]:
movie_name = 'Limitless (2011)'
movie_name = user_movie_df[movie_name]
user_movie_df.corrwith(movie_name).sort_values(ascending=False).head(10)


  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c /= stddev[:, None]
  c /= stddev[None, :]


Unnamed: 0_level_0,0
title,Unnamed: 1_level_1
Pan (2015),1.0
San Andreas (2015),1.0
Safe House (2012),1.0
Runner Runner (2013),1.0
Runaway Jury (2003),1.0
"Best Offer, The (Migliore offerta, La) (2013)",1.0
Big Eyes (2014),1.0
Remo Williams: The Adventure Begins (1985),1.0
Reign Over Me (2007),1.0
Red Planet (2000),1.0


In [None]:
def check_film(keyword, user_movie_df):
  return [col for col in user_movie_df.columns if keyword in col]

check_film("Limit", user_movie_df)

['Darjeeling Limited, The (2007)',
 "Hotel Chevalier (Part 1 of 'The Darjeeling Limited') (2007)",
 'Limitless (2011)',
 'Limits of Control, The (2009)',
 'Sunset Limited, The (2011)',
 'Vertical Limit (2000)']

#  Model Based Collaborative filtering

In [1]:
!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl.metadata (327 bytes)
Collecting scikit-surprise (from surprise)
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357284 sha256=2f662507edd82c549524b2d08bd5b43f853582273db11b79bad262bdd7303da4
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54
Successfully built scikit-surprise
Install

In [10]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [11]:
ratings = pd.read_csv('https://s3-us-west-2.amazonaws.com/recommender-tutorial/ratings.csv')
ratings.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247


In [12]:
movies = pd.read_csv('https://s3-us-west-2.amazonaws.com/recommender-tutorial/movies.csv')
movies.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [13]:
df = movies.merge(ratings, how="left", on="movieId")
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,964982700.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,847435000.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.5,1106636000.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.5,1510578000.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17.0,4.5,1305696000.0


In [14]:
# names and ids of 4 movies in terms of traceability
movie_ids = [1,5,6,7]
movies = ["Toy Story (1995)",
          "Father of the Bride Part II (1995)",
          "Heat (1995)",
          "Sabrina (1995)	"]

sample_df = df[df.movieId.isin(movie_ids)]
sample_df.head()

user_movie_df = sample_df.pivot_table(index=['userId'], columns=['title'], values='rating')
user_movie_df

title,Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),Toy Story (1995)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0,,4.0,,4.0
5.0,,,,4.0
6.0,5.0,4.0,4.0,
7.0,,,,4.5
11.0,,5.0,,
...,...,...,...,...
606.0,,,2.5,2.5
607.0,,,,4.0
608.0,,,,2.5
609.0,,,,3.0


In [15]:
from surprise import Reader, SVD, Dataset, accuracy

In [16]:
reader = Reader(rating_scale=(1,5))
data = Dataset.load_from_df(sample_df[["userId", "movieId","rating"]], reader)
data

<surprise.dataset.DatasetAutoFolds at 0x7917f57536d0>

In [17]:
from surprise.model_selection import GridSearchCV, train_test_split, cross_validate

train, test = train_test_split(data, test_size=0.25)
svd_model = SVD()
svd_model.fit(train)
prediction = svd_model.test(test)
print(prediction)

[Prediction(uid=601.0, iid=1, r_ui=4.0, est=3.9037057719614747, details={'was_impossible': False}), Prediction(uid=150.0, iid=6, r_ui=4.0, est=4.024940100303031, details={'was_impossible': False}), Prediction(uid=5.0, iid=1, r_ui=4.0, est=3.9037057719614747, details={'was_impossible': False}), Prediction(uid=147.0, iid=5, r_ui=4.5, est=2.936478976824014, details={'was_impossible': False}), Prediction(uid=357.0, iid=1, r_ui=5.0, est=3.9037057719614747, details={'was_impossible': False}), Prediction(uid=169.0, iid=7, r_ui=4.5, est=3.0125790877579655, details={'was_impossible': False}), Prediction(uid=347.0, iid=1, r_ui=5.0, est=3.9037057719614747, details={'was_impossible': False}), Prediction(uid=42.0, iid=7, r_ui=3.0, est=3.1171894427080185, details={'was_impossible': False}), Prediction(uid=493.0, iid=6, r_ui=4.0, est=3.9129656314054664, details={'was_impossible': False}), Prediction(uid=32.0, iid=6, r_ui=3.0, est=3.8391835350781514, details={'was_impossible': False}), Prediction(uid=

In [None]:
# r_ui = real rate
# est = estimated rate
# iid = item id
# uid = user id

In [18]:

accuracy.rmse(prediction)

RMSE: 0.8431


0.8431464519839251

In [19]:
sample_df[sample_df["userId"] == 1]

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,964982703.0
433,6,Heat (1995),Action|Crime|Thriller,1.0,4.0,964982224.0


In [20]:
svd_model.predict(uid=1.0, iid=1, verbose=True)

user: 1.0        item: 1          r_ui = None   est = 3.75   {'was_impossible': False}


Prediction(uid=1.0, iid=1, r_ui=None, est=3.748754956025655, details={'was_impossible': False})

In [21]:
svd_model.predict(uid=1.0, iid=6, verbose=True)

user: 1.0        item: 6          r_ui = None   est = 3.90   {'was_impossible': False}


Prediction(uid=1.0, iid=6, r_ui=None, est=3.904844557485123, details={'was_impossible': False})

In [22]:
svd_model.predict(uid=1.0, iid=5, verbose=True)

user: 1.0        item: 5          r_ui = None   est = 2.88   {'was_impossible': False}


Prediction(uid=1.0, iid=5, r_ui=None, est=2.884883862675717, details={'was_impossible': False})

In [23]:
svd_model.predict(uid=1.0, iid=7, verbose=True)

user: 1.0        item: 7          r_ui = None   est = 3.18   {'was_impossible': False}


Prediction(uid=1.0, iid=7, r_ui=None, est=3.1751013619513215, details={'was_impossible': False})

In [24]:
svd_model.predict(uid=5.0, iid=1, verbose=True)

user: 5.0        item: 1          r_ui = None   est = 3.90   {'was_impossible': False}


Prediction(uid=5.0, iid=1, r_ui=None, est=3.9037057719614747, details={'was_impossible': False})

In [25]:
param_grid = {"n_epochs": [5, 10, 20],
              "lr_all": [0.002, 0.005, 0.007]}

gs = GridSearchCV(SVD,
                  param_grid,
                  measures=["rmse", "mae"],
                  cv=3,
                  n_jobs=-1,
                  joblib_verbose=True)

gs.fit(data)

gs.best_score["rmse"]
gs.best_params["rmse"]

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    1.1s finished


{'n_epochs': 20, 'lr_all': 0.005}

In [26]:
svd_model_tuned = SVD(**gs.best_params["rmse"])

In [27]:
# train the whole dataset
data = data.build_full_trainset()
svd_model_tuned.fit(data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7917f5751720>

In [28]:
svd_model_tuned.predict(uid=1.0, iid=1, verbose=True)

user: 1.0        item: 1          r_ui = None   est = 3.97   {'was_impossible': False}


Prediction(uid=1.0, iid=1, r_ui=None, est=3.9722809029318737, details={'was_impossible': False})

In [29]:
svd_model.predict(uid=1.0, iid=6, verbose=True)

user: 1.0        item: 6          r_ui = None   est = 3.90   {'was_impossible': False}


Prediction(uid=1.0, iid=6, r_ui=None, est=3.904844557485123, details={'was_impossible': False})

In [30]:
svd_model.predict(uid=1.0, iid=5, verbose=True)

user: 1.0        item: 5          r_ui = None   est = 2.88   {'was_impossible': False}


Prediction(uid=1.0, iid=5, r_ui=None, est=2.884883862675717, details={'was_impossible': False})

In [31]:
svd_model.predict(uid=1.0, iid=7, verbose=True)

user: 1.0        item: 7          r_ui = None   est = 3.18   {'was_impossible': False}


Prediction(uid=1.0, iid=7, r_ui=None, est=3.1751013619513215, details={'was_impossible': False})

In [32]:
svd_model.predict(uid=5.0, iid=1, verbose=True)

user: 5.0        item: 1          r_ui = None   est = 3.90   {'was_impossible': False}


Prediction(uid=5.0, iid=1, r_ui=None, est=3.9037057719614747, details={'was_impossible': False})