In [2]:
from surprise import KNNWithMeans, KNNBasic
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split

import pandas as pd

In [3]:
movies = pd.read_csv('ml-latest-small/movies.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')

In [3]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
movies_with_ratings = movies.join(ratings.set_index('movieId'), on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)

In [5]:
movies_with_ratings[movies_with_ratings.userId == 2.0].title.unique()

array(['Shawshank Redemption, The (1994)', 'Tommy Boy (1995)',
       'Good Will Hunting (1997)', 'Gladiator (2000)',
       'Kill Bill: Vol. 1 (2003)', 'Collateral (2004)',
       'Talladega Nights: The Ballad of Ricky Bobby (2006)',
       'Departed, The (2006)', 'Dark Knight, The (2008)',
       'Step Brothers (2008)', 'Inglourious Basterds (2009)',
       'Zombieland (2009)', 'Shutter Island (2010)',
       'Exit Through the Gift Shop (2010)', 'Inception (2010)',
       'Town, The (2010)', 'Inside Job (2010)',
       'Louis C.K.: Hilarious (2010)', 'Warrior (2011)',
       'Dark Knight Rises, The (2012)',
       'Girl with the Dragon Tattoo, The (2011)',
       'Django Unchained (2012)', 'Wolf of Wall Street, The (2013)',
       'Interstellar (2014)', 'Whiplash (2014)', 'The Drop (2014)',
       'Ex Machina (2015)', 'Mad Max: Fury Road (2015)',
       'The Jinx: The Life and Deaths of Robert Durst (2015)'],
      dtype=object)

In [6]:
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})

In [7]:
dataset.head()

Unnamed: 0,uid,iid,rating
0,1.0,Toy Story (1995),4.0
1,5.0,Toy Story (1995),4.0
2,7.0,Toy Story (1995),4.5
3,15.0,Toy Story (1995),2.5
4,17.0,Toy Story (1995),4.5


In [8]:
ratings.rating.min()

0.5

In [9]:
ratings.rating.max()

5.0

In [7]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)
# data_1 = Dataset.load_from_df(movies_with_ratings, reader)

In [8]:
trainset, testset = train_test_split(data, test_size=.15)

------------------------------

## SVDpp
## RMSE: 0.8553

In [9]:
from surprise import SVDpp

In [10]:
algo = SVDpp()
# algo.fit(trainset)

In [11]:
from surprise.model_selection import cross_validate
cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=True)

Evaluating RMSE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8646  0.8659  0.8567  0.8582  0.8573  0.8606  0.0039  
Fit time          706.99  748.16  747.68  682.46  695.42  716.15  27.08   
Test time         11.86   13.90   11.80   11.68   11.43   12.13   0.90    


{'test_rmse': array([0.86464404, 0.86594953, 0.85673488, 0.85821721, 0.85725045]),
 'fit_time': (706.9949908256531,
  748.1624240875244,
  747.6817917823792,
  682.4644799232483,
  695.4216659069061),
 'test_time': (11.861483097076416,
  13.902275800704956,
  11.796839952468872,
  11.680114984512329,
  11.42816710472107)}