In [22]:
import pandas as pd
import numpy as np
from datetime import datetime
from functools import reduce
from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import coo_matrix, hstack

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

%matplotlib inline

from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


from surprise import KNNWithMeans, KNNBasic
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split

import pandas as pd
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

In [2]:
!pip install surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise (from surprise)
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3095482 sha256=a1ce9e1d8011d7090de7fa6ee83b2b12b6f4116916cf01be47fc81fbb4f1ada1
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.

In [15]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [16]:
movies_with_tags = movies.join(tags.set_index('movieId'), on='movieId')

In [17]:
movies_with_tags_ratings = pd.merge(movies_with_tags, ratings,  how = 'left',
                                    left_on = ['movieId','userId'], right_on = ['movieId','userId'])


In [18]:
movies_with_tags_ratings.drop(['timestamp_x','timestamp_y'], axis = 1, inplace = True)
movies_with_tags_ratings.dropna(inplace = True)
movies_with_tags_ratings

Unnamed: 0,movieId,title,genres,userId,tag,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,4.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,4.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,3.5
3,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy,4.0
4,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game,4.0
...,...,...,...,...,...,...
11818,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62.0,star wars,4.0
11840,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184.0,anime,3.5
11841,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184.0,comedy,3.5
11842,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184.0,gintama,3.5


In [19]:
dataset = pd.DataFrame({
    'uid': movies_with_tags_ratings.userId,
    'iid': movies_with_tags_ratings.title,
    'rating': movies_with_tags_ratings.rating
})
dataset.head()

Unnamed: 0,uid,iid,rating
0,336.0,Toy Story (1995),4.0
1,474.0,Toy Story (1995),4.0
2,567.0,Toy Story (1995),3.5
3,62.0,Jumanji (1995),4.0
4,62.0,Jumanji (1995),4.0


In [20]:
reader = Reader(rating_scale = (0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

In [21]:
algo = KNNWithMeans(k = 50, sim_options = {'name': 'pearson_baseline', 'user_based': True})
cross_validate(algo, data, measures = ['RMSE', 'MAE'], cv = 5, verbose = True)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.5520  0.5530  0.5018  0.5854  0.5933  0.5571  0.0323  
MAE (testset)     0.2168  0.2050  0.1915  0.2290  0.2418  0.2168  0.0176  
Fit time          0.02    0.02    0.02    0.02    0.03    0.02    0.00    
Test time         0.03    0.03    0.04    0.05    0.

{'test_rmse': array([0.55198513, 0.55302574, 0.50176038, 0.585375  , 0.59330009]),
 'test_mae': array([0.2167587 , 0.20497301, 0.19152369, 0.22904737, 0.24181454]),
 'fit_time': (0.01750040054321289,
  0.01944279670715332,
  0.015237092971801758,
  0.015403270721435547,
  0.02529287338256836),
 'test_time': (0.0349881649017334,
  0.03199625015258789,
  0.04479837417602539,
  0.04864215850830078,
  0.040184736251831055)}