In [1]:
import pandas as pd
import numpy as np

from numpy import dot
from numpy.linalg import norm

# Utils

In [2]:
def get_ratings_by_item(
    df, users_df, items, rating_column='Rating', user_column='UserId', item_column='ItemId'):
    
    aux = df.loc[df[item_column].isin(items)]
    aux = aux.pivot_table(index=user_column, columns=item_column, values=rating_column)
    users_df = users_df.merge(aux, how='left', on=[user_column])
    users_df = users_df.fillna('0')

    return users_df

In [3]:
def get_items_by_users(
    df, users, user_column='UserId', item_column='ItemId'):
    """
    Get items id evaluates by user target.

    Args:
      df: ps.DataFrame
          predict rating value
      user: str
          user id
      user_column: str, default UserId
          name of column with users id
      item_column: str, default ItemId
      
    Returns: 
    
    list of strings with all items ids evaluated by user target
    """
    
    items = [list(df.loc[df[user_column] == user][item_column]) for user in users]
    items = sorted(set(sum(items, [])))
    
    return items

In [4]:
def get_common_users_items(
    df, item, user_column='UserId', item_column='ItemId'):
    """
    Get the ids of users that evaluate a item.

    Args:
      df: ps.DataFrame
          predict rating value
      item: str
          item id
      user_column: str, default UserId
          name of column with users id
      item_column: str, default ItemId
          name of column with items id
    Returns: 
    
    list of strings with all users ids that evaluate a target item
    """
    
    return list(df.loc[df[item_column] == item][user_column])

In [5]:
def cosine_similarity(x, y):
    """
    Calculates cosine similarity between two vectors.

    Args:
      x: np.array
         vector of values
      y: np.array
         vector of valuesitem id
         
    Returns: 
    
    cosine similarity between x and y
    """
    
    return np.dot(x, y)/(norm(x)*norm(y))

# Item-based collaborative filtering

Users who like an item tend to like similar items

In [6]:
def combinations(items):
    
    comb = []
    k = 0
    for i in items:
        for j in range(1 + k, len(items)):
            comb.append((i, items[j]))
        k+=1

    return comb

In [7]:
def get_item_vector(item, users_df, ratings_df):
    
    aux = ratings_df.loc[ratings_df['ItemId'] == item][['UserId', 'Rating']]
    df = users_df.merge(aux, how='left', on=['UserId']).fillna(0)
    
    return df

In [8]:
def pairwise_item_sim(comb, users_df, ratings_df):
    
    similarities = []
    
    for i in range(len(comb)):
    
        i1 = get_item_vector(comb[i][0], users_df, ratings_df)['Rating']
        i2 = get_item_vector(comb[i][1], users_df, ratings_df)['Rating']

        similarities.append(cosine_similarity(i1, i2))
        
    df = pd.DataFrame({"items": comb, "similarity": similarities})

    return df

In [9]:
def get_item_neighbors(item_sim, item_target, ratings_df):
    
    common_users = get_common_users_items(ratings_df, item=item_target)
    
    if common_users == []:
        return None
        
    candidate_items = get_items_by_users(ratings_df, users=common_users)
    
    if candidate_items == []:
        return None
    
    neighbors = item_sim[item_sim['items'].apply(lambda x: item_target in x)]
    neighbors = neighbors.loc[neighbors['items'].apply(lambda x: any(candidate in x for candidate in candidate_items))]
    neighbors = neighbors.reset_index(drop=True)
    
    return neighbors

In [10]:
def calc_rmse(y_pred, y_true):
    """
    Calculate root mean squared error.

    Args:
      y_pred: float
          predict rating value
      y_true: float
          true rating value
    Returns: 
    
    rmse: float
        root mean squared error between predict rating values and true ratings values
    """
    
    return np.sqrt(((y_pred - y_true) ** 2).mean())

In [11]:
def calc_mae(y_pred, y_true):
    """
    Calculate mean absolute error.

    Args:
      y_pred: float
          predict rating value
      y_true: float
          true rating value
    
    Returns: 
    
    mae: float
        mean absolute error between predict rating values and true ratings values
    """
    return np.absolute(y_pred - y_true).mean()

In [12]:
def train_test_split(X, test_split=0.2):
    """
    Split dataset in train and test.

    Args:
      X: np.array
          ratings 
      test_split: float
          percentage of data for the test dataset 
    
    Returns: 
    
    train: np.array
        train dataset
    test:
        test dataset
    """
    
    size = int(X.shape[0])
    all_indexes = list(range(size))
                       
    indexes_test = list(np.random.choice(np.arange(0,size), int(size*test_split), replace=False))
    indexes_train = set(all_indexes) - set(indexes_test)
    
    train = X.loc[indexes_train].reset_index(drop=True)
    test = X.loc[indexes_test].reset_index(drop=True)
    
    return train, test

In [13]:
def clip(pred, max_pred=5, min_pred=1):
    """
    Clip predict values in range [min_pred, max_pred].

    Args:
      pred : float
          predict value
      max_pred : int
          max value of rating
      min_pred : int
          min value of rating

    Returns: 

    pred: int
        predict value cliped
    """
    
    pred = max_pred if pred > max_pred else pred
    pred = min_pred if pred < min_pred else pred
    
    return pred

In [16]:
def predict(X, pu, qi, bu, bi, num_factors, mean, max_pred=5, min_pred=1):
    """
    Predict rating for all pairs users items.

    Args:
      X: np.array
          rating matrix
      pu: np.array
          user matrix
      qi: np.array
          item matrix
      num_factors: int
          number of latent factors
      mean: float
          mean ratings of all users
      max_pred: int
          highest possible prediction
      min_pred: int
          lowest possible prediction

    Returns: 

    y_pred: np.array
        predict values
    y_true: np.array
        targets values
    """
    
    y_pred = []
    y_true = []
    
    for idx in range(X.shape[0]):
        
        user, item, rui = X[idx, 0], X[idx, 1], X[idx, 2]

        if (user > -1) and (item > -1):
            pred = one_predict(pu, qi, bu, bi, user, item, num_factors, mean)
            
        pred = clip(pred, max_pred, min_pred)
        
        y_pred.append(pred)
        y_true.append(rui)
    
    return np.asarray(y_pred), np.asarray(y_true)

In [17]:
def update_weights(pu, qi, user, item, eui, num_factors, alpha=0.1, lamb=0.02):
    """
    Update user and items matrix weights.

    Args:
      pu: np.array
          user matrix
      qi: np.array
          item matrix
      user: str
          user code
      item: str
          item code
      eui: float
          predict error
      num_factor: int
          number of latent factors
      alpha: float
          learning rate

    Returns: 

    pu: np.array
        user matrix updated
    qi: np.array
          item matrix updated
    """
    
    for k in range(num_factors):
        
        puf = alpha * (eui * qi[k, item] - lamb * pu[user, k])
        qif = alpha * (eui * pu[user, k] - lamb * qi[k, item]) 
        
        pu[user, k] +=  puf
        qi[k, item] +=  qif  
        
            
    return pu, qi

In [141]:
def update_bias(bu, bi, user, item, eui, alpha, lamb):
    """"
    Update bias weights.

    Args:
      pu: np.array
          user matrix
      qi: np.array
          item matrix
      user: str
          user code
      item: str
          item code
      eui: float
          predict error
      num_factor: int
          number of latent factors
      alpha: float
          learning rate

    Returns: 

    pu: np.array
        user matrix updated
    qi: np.array
          item matrix updated
    """
    
    bu[user] += alpha * (eui * bu[user] - lamb * bi[item])
    bi[item] += alpha * (eui * bi[item] - lamb * bi[item])
    
    return bu, bi

In [19]:
def one_predict(pu, qi, bu, bi, user, item, num_factors, mean):
    """
    Predict rating to a pair user item.

    Args:
      pu: np.array
          user matrix
      qi: np.array
          item matrix
      user: str
          user code
      item: str
          item code
      num_factor: int
          number of latent factors
      mean: float
          mean ratings of all users

    Returns: 

    float: predicted value      
    """
    
    return sum([pu[user, k] * qi[k, item] for k in range(num_factors)]) + mean + bu[user] + bi[item]

In [20]:
def inicialization(n, m, num_factors):
    """
    Initialize weights.

    Args:
      n: int
          number of unique users in ratings data
      m: int
          number of unique itens in ratings data
      num_factors: int
          number of latent factors

    Returns: 

    pu: np.array
        user matrix random initialize
    qi: np.array
        item matrix random initialize
    bu: np.array
        bias vector for user weights initialize with zeros
    bi: np.array
        bias vector for item weights initialize with zeros
    """
    
    pu = np.random.rand(n, num_factors)
    qi = np.random.rand(m, num_factors)
    
    bu = np.zeros(n)
    bi = np.zeros(m)
    
    return pu, qi, bu, bi  

In [21]:
def scheduler(epoch, alpha):
    """
    Learning rating scheduler.
    
    Args:
      epoch: int
          actual epoch in training
      alpha: int
          learning rating

    Returns: 

    learning rate update 
    """
    
    if epoch < 5:
        return alpha
    else:
        return alpha * np.exp(-0.1)

In [142]:
def SGD(X, num_factors, n, m, mean, alpha=0.1, lamb=0.02):
    """
    Stochastic Gradiend Descent.

    Args:
      X: np.array
          ratings data 
      num_factor: int
          number of latent factors
      n: int
          number of unique users in ratings data
      m: int
          number of unique itens in ratings data
      alpha: float
          learning rate

    Returns: 

    pu: np.array
        user matrix factored
    qi: np.array
        item matrix factored
    bu: np.array
        bias vector for user weights
    bi: np.array
        bias vector for item weights
    """
    
    pu, qi, bu, bi = inicialization(n, m, num_factors)
    
    qi = qi.T
    
    for idx in range(X.shape[0]):
        
        user, item, rui = X[idx, 0], X[idx, 1], X[idx, 2]
        
        #predict rating
        pred = one_predict(pu, qi, bu, bi, user, item, num_factors, mean)
        
        #calculate error
        eui = rui - pred
        
        #update bias
        bu, bi = update_bias(bu, bi, user, item, eui, alpha, lamb)
        
        #Adjust weights
        pu, qi = update_weights(pu, qi, user, item, eui, num_factors, alpha, lamb)
        
        
    return pu, qi, bu, bi

In [87]:
def fit(X_train, num_factors, n, m, alpha=0.1, lamb=0.02, epochs=10, verbose=False):
    """
    Fit Stochastic Gradiend Descent.

    Args:
      X_train: np.array
          ratings data used to create the factored matrixes
      num_factor: int
          number of latent factors
      n: int
          number of unique users in ratings data
      m: int
          number of unique itens in ratings data
      alpha: float
          learning rate
      epochs: int
          number of steps 
      verbose: boolean
          if true, show all informations printed in functions

    Returns: 

    pu: np.array
        user matrix factored
    qi: np.array
        item matrix factored   
    rmse: float
        root mean squared error between predict rating values and true ratings values
    mae: float
        mean absolute error between predict rating values and true ratings values
    """
    
    mean = np.mean(X_train[:, 2])
    
    for epoch in range(epochs):
        
        alpha = scheduler(epoch, alpha)
        
        pu, qi, bu, bi = SGD(X_train, num_factors, n, m, mean, alpha=alpha, lamb=lamb)
        
        y_pred, y_true = predict(X_train, pu, qi, bu, bi, num_factors, mean)
        
        rmse = calc_rmse(y_pred, y_true)
        mae = calc_mae(y_pred, y_true)
        
        if rmse < 0.01:
            break
            
        if verbose:
            print("Epoch: {} - RMSE: {:.5f} - MAE: {:.5f}".format(epoch, rmse, mae))
            
    return pu, qi, bu, bi, rmse, mae

In [24]:
def create_df (df, users, items, user_column='UserId', item_column='ItemId'):
    """
    Create a new rating dataframe where all users and itens are mapped for a continuous integer value
    and.

    Args:
      df: pd.DataFrame
          ratings_df
      user: list
          list with unique users code
      items: str
          list with unique items code
      user_column: str, defaul UserId
          column name of users
      item_column: str, default ItemId
          column name of items

    Returns: 

    df: pandas DataFrame with all users items ratings
    dict_users: dictionary mapped users and your new code
    dict_items: dictionary mapped items and your new code     
    """
    
    dict_users = dict(zip(users, range(len(users))))
    dict_items = dict(zip(items, range(len(items))))

    df[user_column] = df[user_column].map(dict_users)
    df[item_column] = df[item_column].map(dict_items)

    df = df.fillna(-1)
    
    return np.asarray(df), df, dict_users, dict_items

In [25]:
def pairwise_item_sim(comb, users_df, ratings_df, qi):
    """
    Calculate the cosine similarity between two pairs itens.

    Args:
      pu: np.array
          user matrix
      qi: np.array
          item matrix
      user: str
          user code
      item: str
          item code
      num_factor: int
          number of latent factors
      mean: float
          mean ratings of all users

    Returns: 

    float: predicted value      
    """
    
    similarities = []
    
    for i in range(len(comb)):
    
        i1 = qi[dict_items.get(comb[i][0])]
        i2 = qi[dict_items.get(comb[i][1])]

        similarities.append(cosine_similarity(i1, i2))
        
    df = pd.DataFrame({"items": comb, "similarity": similarities})

    return df

In [86]:
def main_matrix_factorization(
    num_factors=100, alpha=0.05, lamb=0.02, epochs=20,
    user_column='UserId', item_column='ItemId', verbose=True):
    
    users = pd.unique(ratings_df[user_column]).tolist()
    items = pd.unique(ratings_df[item_column]).tolist()

    num_users = len(users)
    num_items = len(items)

    r, r_df, dict_users, dict_items = create_df (
        ratings_df.copy(), users, items, user_column=user_column, item_column=item_column)

    pu, qi, bu, bi, rmse, mae = fit(
        r, num_factors=num_factors, n=num_users, m=num_items, alpha=alpha, lamb=lamb, epochs=epochs, verbose=True)
    
    return pu, qi, bu, bi, dict_users, dict_items, r_df, rmse, mae

In [134]:
def predict_one_rating(row, pu, qi, bu, bi, user_mean):

    user_target = dict_users.get(str(row[0]))
    item_target = dict_items.get(str(row[1]))

    pred = np.dot(qi[item_target], pu[user_target])
    pred = clip(np.round(pred + user_mean + bu[user_target] + bi[item_target]))
    
    return pred

In [96]:
def predict_all_ratings(
    df, ratings_df, pu, qi, bu, bi, dict_users, dict_items,
    n_neighboors, user_column, item_column, rating_column):
  
    user_mean = np.mean(ratings_df['Rating'])
    
    df['ui'] = list(zip(df[user_column], df[item_column]))
    
    vfunc = np.vectorize(predict_one_rating, excluded=['pu', 'qi', 'bu', 'bi', 'user_mean'])
    
    df[rating_column] = vfunc(row=df['ui'], pu=pu, qi=qi, bu=bu, bi=bi, user_mean=user_mean)
    
    return df[[user_column, item_column, rating_column]]

In [None]:
import time
user_column = 'UserId'
item_column = 'ItemId'
rating_column = 'Rating'

start_time = time.time()

ratings = pd.read_csv("ratings.csv")
ratings['UserId:ItemId'] = ratings['UserId:ItemId'].str.split(':')
ratings_df = pd.DataFrame(ratings['UserId:ItemId'].to_list(), columns=['UserId', 'ItemId'])
ratings_df['Rating'] = ratings['Rating']

targets = pd.read_csv("targets.csv")
targets['UserId:ItemId'] = targets['UserId:ItemId'].str.split(':')
targets_df = pd.DataFrame(targets['UserId:ItemId'].to_list(), columns=['UserId', 'ItemId'])

pu, qi, bu, bi, dict_users, dict_items, r_df, rmse, mae = main_matrix_factorization(num_factors=100, alpha=0.05, epochs=10)
qi = qi.T

result = predict_all_ratings(
    targets_df, ratings_df, pu, qi, bu, bi, dict_users, dict_items,
    n_neighboors, user_column, item_column, rating_column)

result.to_csv("predict.csv", index=False)

print("--- %s seconds ---" % (time.time() - start_time))

Epoch: 0 - RMSE: 2.19264 - MAE: 1.55658
Epoch: 1 - RMSE: 2.19668 - MAE: 1.56145
Epoch: 2 - RMSE: 2.19037 - MAE: 1.55331
Epoch: 3 - RMSE: 2.19558 - MAE: 1.56019
Epoch: 4 - RMSE: 2.19140 - MAE: 1.55398
Epoch: 5 - RMSE: 2.04673 - MAE: 1.41796
Epoch: 6 - RMSE: 1.97192 - MAE: 1.34993
Epoch: 7 - RMSE: 1.96954 - MAE: 1.35519
Epoch: 8 - RMSE: 2.00407 - MAE: 1.40166


In [211]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

# Load the movielens-100k dataset (download it if needed).
data = Dataset.load_builtin('ml-100k')

# Use the famous SVD algorithm.
algo = SVD()

# Run 5-fold cross-validation and print results.
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Dataset ml-100k could not be found. Do you want to download it? [Y/n] y
Trying to download dataset from http://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /scratch/cinthiasouza/.surprise_data/ml-100k
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9296  0.9371  0.9438  0.9356  0.9320  0.9356  0.0049  
MAE (testset)     0.7346  0.7387  0.7440  0.7353  0.7349  0.7375  0.0036  
Fit time          8.04    8.32    7.55    7.50    7.54    7.79    0.33    
Test time         9.20    0.31    0.28    0.28    0.28    2.07    3.57    


{'test_rmse': array([0.92963382, 0.93709741, 0.94383365, 0.9355879 , 0.93198892]),
 'test_mae': array([0.73459561, 0.73868008, 0.74398445, 0.73528318, 0.73489192]),
 'fit_time': (8.041144609451294,
  8.323009490966797,
  7.553421258926392,
  7.499321699142456,
  7.544216632843018),
 'test_time': (9.200531482696533,
  0.30553770065307617,
  0.2824246883392334,
  0.2825336456298828,
  0.27904582023620605)}

In [29]:
def fill_ratings_by_user(
   ratings_df, item_ratings, candidate_items, users, dict_users, dict_items,
    user_target, user_column, item_column):
    
    pred_ratings = {}

    for item in candidate_items:
        
        aux = {}
        
        for user in users:
            
            user_ratings = item_ratings.loc[item_ratings[user_column] == user]
            user_mean = np.mean(ratings_df.loc[ratings_df[user_column] == user]['Rating'])
          
            rui = user_ratings.loc[user_ratings[item_column] == item]
    
            if not rui.empty:
                aux[user] = int(rui['Rating'])
            else:
                aux[user] = clip(np.round(pu[dict_users.get(user)].dot(qi[dict_items.get(item)].T) + user_mean))

        pred_ratings[item]= aux
        
    return pred_ratings

In [27]:
def fill_user_target_ratings(
    known_ratings, candidate_items, dict_users, dict_items, user_target, user_mean, item_column):
    
    pred_ratings = {}

    for j in range(1, len(candidate_items)):
    
        x = known_ratings.loc[known_ratings[item_column] == candidate_items[j]]
    
        if not x.empty:
            pred_ratings[candidate_items[j]] = int(x['Rating'])
        else:
            pred_ratings[candidate_items[j]] = clip(np.round(pu[dict_users.get(user_target)].dot(qi[dict_items.get(candidate_items[j])].T) + user_mean))
        
    return pred_ratings

In [28]:
def items_similarity():
    
    item_sim = {}
    target = aux_item[0] -user_mean
    
    for j in range(1, len(candidate_items)):
        
        candidate = aux_item[0] - user_mean
        
    item_sim[candidate_items[j]] = cosine_similarity(aux_item[0], aux_item[j]) 

In [30]:
def calculate_unknow_rating(user_target_ratings, item_sim, candidate_items):
    
    x = [np.dot(user_target_ratings[candidate_items[j]], item_sim[candidate_items[j]]) for j in range(1, len(candidate_items))]
    d = [np.abs(i) for i in item_sim.values()]

    pred = clip(np.round(sum(x)/sum(d)))
    
    return pred

In [31]:
def calculate_similarity(pred_ratings, item_target):

    item_target_vector = np.asarray(list(pred_ratings[item_target].values()))

    item_sim = {}

    for key in  pred_ratings.keys():
    
        if key != item_target:
            item_sim[key] =  cosine_similarity(item_target_vector, np.asarray(list(pred_ratings[key].values())))
            
    return item_sim

In [33]:
def item_based_predictv2(
    aux, ratings_df, pu, qi, dict_users, dict_items,
    n_neighboors, user_column, item_column):
    
  
    results = {'user': [], 'item': [], 'pred': []}

    for  idx, row in aux.iterrows():

        user_target = row[user_column]
        item_target = row[item_column]

        results['user'].append(user_target)
        results['item'].append(item_target)

        common_users = get_common_users_items(ratings_df, item=item_target)

        candidate_items = get_items_by_users(ratings_df, users=common_users)
        candidate_items.remove(item_target)
        candidate_items = candidate_items[:n_neighboors]
        candidate_items.insert(0, item_target)

        item_ratings = ratings_df.loc[ratings_df[item_column].isin(candidate_items)].reset_index(drop=True)
        pred_ratings = fill_ratings_by_user(
                ratings_df, item_ratings, candidate_items, common_users, dict_users,
                dict_items, user_target, user_column, item_column)

        item_sim = calculate_similarity(pred_ratings, item_target)

        #item_sim = {candidate_items[j]: cosine_similarity(qi[dict_items.get(candidate_items[0])], qi[dict_items.get(candidate_items[j])]) for j in range(1, len(candidate_items))}

        known_ratings = ratings_df.loc[ratings_df[user_column] == user_target].reset_index(drop=True)
        user_mean = np.mean(ratings_df['Rating'])
        user_target_ratings = fill_user_target_ratings(
                known_ratings, candidate_items, dict_users, dict_items, user_target, user_mean, item_column)

        try:
            pred = calculate_unknow_rating(user_target_ratings, item_sim, candidate_items)
            results['pred'].append(pred)
            print(pred)
        except ZeroDivisionError:
            results['pred'].append(0)
            
    return results


In [234]:
import pandas as pd
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import NormalPredictor
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import KNNBaseline
from surprise import SVD
from surprise import BaselineOnly
from surprise import SVDpp
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering
from surprise.accuracy import rmse
from surprise import accuracy
from surprise.model_selection import train_test_split

In [236]:
print('Using ALS')
bsl_options = {'method': 'als',
               'n_epochs': 5,
               'reg_u': 12,
               'reg_i': 5
               }
algo = BaselineOnly(bsl_options=bsl_options)

Using ALS


In [216]:
reader = Reader(rating_scale=(1, 5))

In [238]:
data = Dataset.load_from_df(ratings_df, reader)

In [242]:
aux20 = targets_df
aux20['Ratings'] = [0]*(len(aux20))

In [243]:
test = Dataset.load_from_df(aux20, reader)

In [264]:
trainset, testset = train_test_split(data, test_size=0.1)
algo = BaselineOnly(bsl_options=bsl_options)
predictions = algo.fit(trainset).test(testset)
accuracy.rmse(predictions)

Estimating biases using als...
RMSE: 1.1562


1.1561779894931252

In [265]:
y_pred = algo.test(testset)

In [266]:
new_preds = [[i.uid, i.iid, i.r_ui] for i in y_pred]

In [273]:
aux = pd.DataFrame(new_preds)
aux = aux.rename(columns={0: user_column, 1: item_column, 2: "Rating"})

In [None]:
aux 

In [None]:
results_df

In [223]:
# Use the famous SVD algorithm.
algo = SVD()

# Run 5-fold cross-validation and print results.
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=2, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 2 split(s).

                  Fold 1  Fold 2  Mean    Std     
RMSE (testset)    1.1780  1.1749  1.1765  0.0015  
MAE (testset)     0.8948  0.8943  0.8946  0.0003  
Fit time          22.96   22.22   22.59   0.37    
Test time         3.62    3.06    3.34    0.28    


{'test_rmse': array([1.178034  , 1.17493987]),
 'test_mae': array([0.89484386, 0.89431522]),
 'fit_time': (22.964514017105103, 22.22034525871277),
 'test_time': (3.6192076206207275, 3.0629162788391113)}

In [227]:
predictions = algo.test(data)
accuracy.rmse(predictions, verbose=True)

TypeError: 'DatasetAutoFolds' object is not iterable

In [224]:
# recommender.py

from surprise import KNNWithMeans

# To use item-based cosine similarity
sim_options = {
    "name": "cosine",
    "user_based": False,  # Compute  similarities between items
}
algo = KNNWithMeans(sim_options=sim_options)

In [230]:
trainingSet = data.build_full_trainset()
algo.fit(trainingSet)


Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7fbbf9a56d00>

In [231]:
prediction = algo.predict('E', 2)
prediction.est

4.19

In [232]:
prediction

Prediction(uid='E', iid=2, r_ui=None, est=4.19, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'})