In [1]:
import pandas as pd
import numpy as np
import time

from numpy import dot
from numpy.linalg import norm

# Utils

In [2]:
def get_ratings_by_item(
    df, users_df, items, rating_column='Rating', user_column='UserId', item_column='ItemId'):
    """
    Build a wide user x item rating table for a subset of items.

    Args:
      df: pd.DataFrame
          ratings in long format (one row per rating)
      users_df: pd.DataFrame
          frame containing user_column; its rows define the rows of the result
      items: list
          item ids to keep as columns
      rating_column: str, default Rating
          name of column with ratings
      user_column: str, default UserId
          name of column with users id
      item_column: str, default ItemId
          name of column with items id

    Returns: 

    pd.DataFrame: users_df with one extra column per selected item holding the
    rating of each user; missing values are filled with 0
    """
    
    selected = df.loc[df[item_column].isin(items)]
    # pivot_table aggregates duplicated (user, item) pairs with its default aggfunc (mean)
    wide = selected.pivot_table(index=user_column, columns=item_column, values=rating_column)
    merged = users_df.merge(wide, how='left', on=[user_column])

    # Fill with the number 0, not the string '0', so rating columns stay numeric
    return merged.fillna(0)

In [3]:
def get_items_by_users(
    df, users, user_column='UserId', item_column='ItemId'):
    """
    Get the ids of every item evaluated by at least one of the given users.

    Args:
      df: pd.DataFrame
          ratings in long format (one row per rating)
      users: list
          users ids
      user_column: str, default UserId
          name of column with users id
      item_column: str, default ItemId
          name of column with items id
      
    Returns: 
    
    sorted list, without duplicates, of the items ids evaluated by the users
    """
    
    # One vectorized membership test instead of one DataFrame scan per user
    # followed by a quadratic sum(list_of_lists, [])
    rated_items = df.loc[df[user_column].isin(users), item_column]
    
    return sorted(set(rated_items))

In [4]:
def get_common_users_items(
    df, item, user_column='UserId', item_column='ItemId'):
    """
    Get the ids of the users that evaluated an item.

    Args:
      df: pd.DataFrame
          ratings in long format (one row per rating)
      item: str
          item id
      user_column: str, default UserId
          name of column with users id
      item_column: str, default ItemId
          name of column with items id

    Returns: 
    
    list with the users ids of every row whose item is the target item
    (row order preserved, one entry per matching row)
    """
    
    rated_target = df[item_column] == item
    raters = df.loc[rated_target, user_column]
    
    return list(raters)

In [5]:
def cosine_similarity(x, y):
    """
    Calculates cosine similarity between two vectors.

    Args:
      x: np.array
         vector of values
      y: np.array
         vector of values, same length as x
         
    Returns: 
    
    cosine similarity between x and y; 0.0 when either vector has zero norm
    """
    
    denominator = norm(x) * norm(y)
    
    # A zero vector has no direction: return 0.0 instead of 0/0 -> nan
    # (which also emits a RuntimeWarning and poisons later comparisons)
    if denominator == 0:
        return 0.0
    
    return np.dot(x, y) / denominator

# Item-based collaborative filtering

Core assumption: users who like an item tend to like similar items.

In [6]:
def combinations(items):
    """
    Build every unordered pair of elements of a sequence.

    Args:
      items: list
          indexable sequence of elements

    Returns: 

    list of tuples (items[i], items[j]) for all i < j, ordered by i and then j
    """
    
    total = len(items)

    return [
        (first, items[position])
        for start, first in enumerate(items)
        for position in range(start + 1, total)
    ]

In [7]:
def get_item_vector(item, users_df, ratings_df):
    """
    Build the rating vector of an item over a fixed set of users.

    Args:
      item: str
          item id
      users_df: pd.DataFrame
          frame with a UserId column; its rows define the vector positions
      ratings_df: pd.DataFrame
          ratings with columns UserId, ItemId and Rating

    Returns: 

    pd.DataFrame: users_df left-joined with the ratings of the item (column
    Rating), missing values filled with 0
    """
    
    is_target = ratings_df['ItemId'] == item
    item_ratings = ratings_df.loc[is_target, ['UserId', 'Rating']]
    
    vector_df = users_df.merge(item_ratings, how='left', on='UserId')
    
    return vector_df.fillna(0)

In [8]:
def pairwise_item_sim(comb, users_df, ratings_df):
    """
    Calculate the cosine similarity of every pair of items.

    Args:
      comb: list
          list of tuples (item, item) to compare
      users_df: pd.DataFrame
          frame with a UserId column; defines the positions of the rating vectors
      ratings_df: pd.DataFrame
          ratings with columns UserId, ItemId and Rating

    Returns: 

    pd.DataFrame with columns items (the tuple) and similarity
    """
    
    # Each item takes part in many pairs; build its rating vector (a filter plus
    # a merge) only once. This trades memory, one vector per item, for time.
    vectors = {}

    def rating_vector(item):
        if item not in vectors:
            vectors[item] = get_item_vector(item, users_df, ratings_df)['Rating']
        return vectors[item]

    similarities = [
        cosine_similarity(rating_vector(pair[0]), rating_vector(pair[1]))
        for pair in comb
    ]
        
    return pd.DataFrame({"items": comb, "similarity": similarities})

In [9]:
def get_item_neighbors(item_sim, item_target, ratings_df):
    """
    Select the similarity rows that link a target item to its candidate neighbors.

    A candidate neighbor is any other item evaluated by at least one user that
    also evaluated the target item.

    Args:
      item_sim: pd.DataFrame
          pairwise similarities with a column items holding tuples (item, item)
      item_target: str
          item id
      ratings_df: pd.DataFrame
          ratings with columns UserId and ItemId

    Returns: 

    pd.DataFrame with the rows of item_sim whose pair contains the target item
    and a candidate neighbor (index reset), or None when nobody evaluated the
    target item or its raters evaluated nothing else
    """
    
    common_users = get_common_users_items(ratings_df, item=item_target)
    
    if not common_users:
        return None
        
    candidate_items = set(get_items_by_users(ratings_df, users=common_users))
    # The target is always among the items evaluated by its own raters. If it
    # stays in the candidates, every pair containing the target trivially
    # matches and the candidate filter does nothing.
    candidate_items.discard(item_target)
    
    if not candidate_items:
        return None
    
    def links_target_to_candidate(pair):
        if item_target not in pair:
            return False
        return any(other in candidate_items for other in pair if other != item_target)
    
    # Plain boolean array so that an empty item_sim is handled as well
    mask = np.array([links_target_to_candidate(pair) for pair in item_sim['items']], dtype=bool)
    
    return item_sim.loc[mask].reset_index(drop=True)

In [10]:
def calc_rmse(y_pred, y_true):
    """
    Calculate root mean squared error.

    Args:
      y_pred: np.array
          predicted rating values
      y_true: np.array
          true rating values, same shape as y_pred

    Returns: 
    
    rmse: float
        square root of the mean of the squared differences between y_pred and y_true
    """
    
    residuals = y_pred - y_true
    mean_squared_error = (residuals ** 2).mean()
    
    return np.sqrt(mean_squared_error)

In [11]:
def calc_mae(y_pred, y_true):
    """
    Calculate mean absolute error.

    Args:
      y_pred: np.array
          predicted rating values
      y_true: np.array
          true rating values, same shape as y_pred
    
    Returns: 
    
    mae: float
        mean of the absolute differences between y_pred and y_true
    """
    residuals = y_pred - y_true
    
    return np.abs(residuals).mean()

In [12]:
def train_test_split(X, test_split=0.2):
    """
    Randomly split a dataset in train and test.

    Args:
      X: pd.DataFrame
          ratings 
      test_split: float
          fraction of the rows placed in the test dataset 
    
    Returns: 
    
    train: pd.DataFrame
        rows not sampled for test, in their original order, index reset
    test: pd.DataFrame
        randomly sampled rows, in sampling order, index reset
    """
    
    size = int(X.shape[0])
    test_size = int(size * test_split)
                       
    test_positions = np.random.choice(np.arange(0, size), test_size, replace=False)
    # setdiff1d returns the remaining positions sorted, so train keeps row order
    train_positions = np.setdiff1d(np.arange(size), test_positions)
    
    # Positional indexing: works for any index of X. The previous .loc lookup
    # needed a 0..n-1 index and was given a set, which pandas rejects as indexer.
    train = X.iloc[train_positions].reset_index(drop=True)
    test = X.iloc[test_positions].reset_index(drop=True)
    
    return train, test

In [13]:
def clip(pred, max_pred=5, min_pred=1):
    """
    Clip a predicted value to the range [min_pred, max_pred].

    Args:
      pred : float
          predict value
      max_pred : int
          max value of rating
      min_pred : int
          min value of rating

    Returns: 

    pred unchanged when it is inside the range, otherwise the bound it crossed
    (the bound object itself is returned, so the result may be an int)
    """
    
    # Upper bound first, then lower bound: same order as two chained comparisons
    capped = min(pred, max_pred)
    
    return max(capped, min_pred)

In [14]:
def update_weights(
    pu, qi, user, item, eui, num_factors, alpha=0.1, lamb=0.02):
    """
    Update, in place, the latent factors of one user and one item.

    Args:
      pu: np.array
          user matrix, indexed as pu[user, factor]
      qi: np.array
          item matrix, indexed as qi[factor, item]
      user: int
          user code (row of pu)
      item: int
          item code (column of qi)
      eui: float
          predict error
      num_factors: int
          number of latent factors to update
      alpha: float
          learning rate
      lamb: float
          regularization factor

    Returns: 

    pu: np.array
        user matrix updated (same object that was passed in)
    qi: np.array
        item matrix updated (same object that was passed in)
    """
    
    factors = slice(0, num_factors)
    
    # Snapshot both vectors first: each step must be computed from the values
    # that were in place before any of the two updates is applied
    user_vector = pu[user, factors].copy()
    item_vector = qi[factors, item].copy()
    
    pu[user, factors] += alpha * (eui * item_vector - lamb * user_vector)
    qi[factors, item] += alpha * (eui * user_vector - lamb * item_vector)
            
    return pu, qi

In [15]:
def update_bias(
    bu, bi, user, item, eui, alpha, lamb):
    """
    Update, in place, the bias of one user and one item.

    Each bias takes a gradient step on the regularized squared error:
    b += alpha * (eui - lamb * b).

    Args:
      bu: np.array
          user bias vector
      bi: np.array
          item bias vector
      user: int
          user code (position in bu)
      item: int
          item code (position in bi)
      eui: float
          predict error
      alpha: float
          learning rate
      lamb: float
          regularization factor

    Returns: 

    bu: np.array
        user bias vector updated (same object that was passed in)
    bi: np.array
        item bias vector updated (same object that was passed in)
    """
    
    # The error must enter the step on its own. Multiplying it by the bias
    # (as before) keeps zero-initialized biases at zero forever.
    bu[user] += alpha * (eui - lamb * bu[user])
    bi[item] += alpha * (eui - lamb * bi[item])
    
    return bu, bi

In [16]:
def one_svd_predict(
    pu, qi, bu, bi, user, item, num_factors, mean):
    """
    Predict the rating of one user-item pair.

    Args:
      pu: np.array
          user matrix, indexed as pu[user, factor]
      qi: np.array
          item matrix, indexed as qi[factor, item]
      bu: np.array
          user bias vector
      bi: np.array
          item bias vector
      user: int
          user code
      item: int
          item code
      num_factors: int
          number of latent factors
      mean: float
          baseline rating added to the prediction

    Returns: 

    float: factor interaction + mean + user bias + item bias (not clipped)
    """
    
    # Sequential accumulation over the first num_factors latent dimensions
    interaction = 0
    for factor in range(num_factors):
        interaction += pu[user, factor] * qi[factor, item]
    
    return interaction + mean + bu[user] + bi[item]

In [17]:
def svd_predict(
    X, pu, qi, bu, bi, num_factors, mean_ratings, max_pred=5, min_pred=1):
    """
    Predict rating for all pairs users items of a rating matrix.

    Args:
      X: np.array
          rating matrix; each row is (user code, item code, rating)
      pu: np.array
          user matrix, indexed as pu[user, factor]
      qi: np.array
          item matrix, indexed as qi[factor, item]
      bu: np.array
          user bias vector
      bi: np.array
          item bias vector
      num_factors: int
          number of latent factors
      mean_ratings: dict
          baseline rating looked up by item code
      max_pred: int
          highest possible prediction
      min_pred: int
          lowest possible prediction

    Returns: 

    y_pred: np.array
        predict values, clipped to [min_pred, max_pred]
    y_true: np.array
        targets values
    """
    
    y_pred = []
    y_true = []
    
    for idx in range(X.shape[0]):
        
        # X is a float matrix whenever the ratings are floats, and numpy
        # rejects float indices, so cast the codes back to int
        user, item, rui = int(X[idx, 0]), int(X[idx, 1]), X[idx, 2]

        mean = mean_ratings.get(item)

        pred = one_svd_predict(pu, qi, bu, bi, user, item, num_factors, mean)
        
        y_pred.append(pred)
        y_true.append(rui)
    
    # Honor the documented bounds, which were previously accepted but ignored
    y_pred = np.clip(np.asarray(y_pred, dtype=float), min_pred, max_pred)
    
    return y_pred, np.asarray(y_true)

In [18]:
def inicialization(n, m, num_factors):
    """
    Initialize the weights of the factorization.

    Args:
      n: int
          number of unique users in ratings data
      m: int
          number of unique itens in ratings data
      num_factors: int
          number of latent factors

    Returns: 

    pu: np.array
        user matrix (n, num_factors) drawn from a normal(0, 0.1)
    qi: np.array
        item matrix (m, num_factors) drawn from a normal(0, 0.1)
    bu: np.array
        user bias vector (n) of zeros
    bi: np.array
        item bias vector (m) of zeros
    """
    
    # Draw order (users, then items) is kept so seeded runs stay reproducible
    user_factors = np.random.normal(loc=0, scale=.1, size=(n, num_factors))
    item_factors = np.random.normal(loc=0, scale=.1, size=(m, num_factors))
    
    user_bias, item_bias = np.zeros(n), np.zeros(m)
    
    return user_factors, item_factors, user_bias, item_bias

In [19]:
def scheduler(epoch, alpha):
    """
    Learning rate scheduler.
    
    Args:
      epoch: int
          actual epoch in training
      alpha: float
          current learning rate

    Returns: 

    alpha during the first two epochs (0 and 1), 0.05 afterwards
    """
    
    warmup_epochs = 2
    
    return alpha if epoch < warmup_epochs else 0.05

In [20]:
def SGD(
    X, num_factors, n, m, mean_ratings, alpha=0.001, lamb=0.002):
    """
    Stochastic Gradient Descent: one pass over the ratings.

    Every call starts from freshly initialized weights and visits each row of
    X once, in order.

    Args:
      X: np.array
          ratings data; each row is (user code, item code, rating)
      num_factors: int
          number of latent factors
      n: int
          number of unique users in ratings data
      m: int
          number of unique itens in ratings data
      mean_ratings: dict
          baseline rating looked up by item code
      alpha: float
          learning rate
      lamb: float
          regularization factor

    Returns: 

    pu: np.array
        user matrix factored, indexed as pu[user, factor]
    qi: np.array
        item matrix factored, indexed as qi[factor, item]
    bu: np.array
        bias vector for user weights
    bi: np.array
        bias vector for item weights
    """
    
    pu, qi, bu, bi = inicialization(n, m, num_factors)
    
    # From here on the item matrix is factor-major: qi[factor, item]
    qi = qi.T
    
    for idx in range(X.shape[0]):
        
        # X is a float matrix whenever the ratings are floats, and numpy
        # rejects float indices, so cast the codes back to int
        user, item, rui = int(X[idx, 0]), int(X[idx, 1]), X[idx, 2]
        
        # baseline rating, looked up by item code
        mean = mean_ratings.get(item)

        #predict rating
        pred = one_svd_predict(pu, qi, bu, bi, user, item, num_factors, mean)
        
        #calculate error
        eui = rui - pred
        
        #update bias
        bu, bi = update_bias(bu, bi, user, item, eui, alpha, lamb)
        
        #Adjust weights
        pu, qi = update_weights(pu, qi, user, item, eui, num_factors, alpha, lamb)
        
    return pu, qi, bu, bi

In [21]:
def _sgd_epoch(X, pu, qi, bu, bi, num_factors, mean_ratings, alpha, lamb):
    """Run one SGD pass over every row of X, updating the given weights in place."""
    
    for idx in range(X.shape[0]):
        
        # X is a float matrix whenever the ratings are floats, and numpy
        # rejects float indices, so cast the codes back to int
        user, item, rui = int(X[idx, 0]), int(X[idx, 1]), X[idx, 2]
        
        # baseline rating, looked up by item code
        mean = mean_ratings.get(item)
        
        eui = rui - one_svd_predict(pu, qi, bu, bi, user, item, num_factors, mean)
        
        bu, bi = update_bias(bu, bi, user, item, eui, alpha, lamb)
        pu, qi = update_weights(pu, qi, user, item, eui, num_factors, alpha, lamb)
        
    return pu, qi, bu, bi


def fit(
    X_train, mean_ratings, num_factors, n, m, alpha=0.001, lamb=0.002, epochs=20, verbose=False):
    """
    Fit the factorization with Stochastic Gradient Descent.

    The weights are initialized once and then refined epoch after epoch, with
    a constant learning rate. Training stops early when the train RMSE drops
    below 0.01.

    Args:
      X_train: np.array
          ratings data used to create the factored matrixes; each row is
          (user code, item code, rating)
      mean_ratings: dict
          baseline rating looked up by item code
      num_factors: int
          number of latent factors
      n: int
          number of unique users in ratings data
      m: int
          number of unique itens in ratings data
      alpha: float
          learning rate
      lamb: float
          regularization factor
      epochs: int
          maximum number of passes over X_train
      verbose: boolean
          if true, show error values at all steps of training

    Returns: 

    pu: np.array
        user matrix factored, indexed as pu[user, factor]
    qi: np.array
        item matrix factored, indexed as qi[factor, item]
    bu: np.array
        fitted user bias vector
    bi: np.array
        fitted item bias vector
    rmse: float
        root mean squared error on X_train after the last epoch (nan if epochs is 0)
    mae: float
        mean absolute error on X_train after the last epoch (nan if epochs is 0)
    """
    
    # Initialize ONCE. Re-initializing inside the epoch loop discards what the
    # previous epochs learned, so only the last single pass would ever count.
    pu, qi, bu, bi = inicialization(n, m, num_factors)
    
    # Factor-major item matrix: qi[factor, item]
    qi = qi.T
    
    rmse = mae = float('nan')
    
    for epoch in range(epochs):
        
        pu, qi, bu, bi = _sgd_epoch(
            X_train, pu, qi, bu, bi, num_factors, mean_ratings, alpha, lamb)
        
        y_pred, y_true = svd_predict(X_train, pu, qi, bu, bi, num_factors, mean_ratings)
        
        rmse = calc_rmse(y_pred, y_true)
        mae = calc_mae(y_pred, y_true)
        
        # Report before the early stop so the final epoch is shown too
        if verbose:
            print("Epoch: {} - RMSE: {:.5f} - MAE: {:.5f}".format(epoch, rmse, mae))
            
        if rmse < 0.01:
            break
            
    return pu, qi, bu, bi, rmse, mae

In [22]:
def create_df(df, users, items, user_column='UserId', item_column='ItemId'):
    """
    Create a new rating dataframe where all users and itens are mapped to
    continuous integer codes. The dataframe passed in is not modified.

    Args:
      df: pd.DataFrame
          ratings_df
      users: list
          list with unique users code
      items: list
          list with unique items code
      user_column: str, defaul UserId
          column name of users
      item_column: str, default ItemId
          column name of items

    Returns: 

    matrix: np.array
        the mapped dataframe as a numpy array
    df: pd.DataFrame
        copy of the ratings with users and items replaced by their codes
    dict_users: 
        dictionary mapped users and your new code
    dict_items: 
        dictionary mapped items and your new code     
    """
    
    dict_users = dict(zip(users, range(len(users))))
    dict_items = dict(zip(items, range(len(items))))

    # Work on a copy so the caller's frame keeps its original ids
    mapped = df.copy()
    mapped[user_column] = mapped[user_column].map(dict_users)
    mapped[item_column] = mapped[item_column].map(dict_items)

    # Ids missing from users/items map to NaN and become -1 here.
    # NOTE(review): -1 used as a numpy index selects the last row - confirm
    # that callers never pass unmapped ids on to the weight matrices.
    mapped = mapped.fillna(-1)
    
    return np.asarray(mapped), mapped, dict_users, dict_items

In [35]:
def agg_ratings(df, dict_ratings, agg_metric='mean', agg_by='ItemId', rating_column='Rating'):
    """
    Aggregate the ratings of each group (by default, of each item).

    Args:
      df: pd.DataFrame
          ratings data
      dict_ratings: dict
          dictionary mapping the ids of the agg_by column to their integer code
      agg_metric: str, default mean
          aggregation to apply, 'mean' or 'median'
      agg_by: str, default ItemId
          column name used to group the ratings
      rating_column: str, default Rating
          column name of ratings

    Returns: 

    mean_ratings: pd.DataFrame
        one row per group with columns agg_by (mapped to codes) and rating_column
    dict_mean_ratings: dict
        dictionary mapping each group code to its aggregated rating

    Raises:

    ValueError: if agg_metric is neither 'mean' nor 'median'
    """
    
    if agg_metric not in ("mean", "median"):
        raise ValueError(
            "agg_metric must be 'mean' or 'median', got {!r}".format(agg_metric))
    
    # Aggregate only the rating column: the id columns may hold strings, which
    # cannot be averaged
    grouped_ratings = df.groupby(agg_by)[rating_column]
    
    if agg_metric == "mean":
        mean_ratings = grouped_ratings.mean().reset_index()
    else:
        mean_ratings = grouped_ratings.median().reset_index()
        
    mean_ratings[agg_by] = mean_ratings[agg_by].map(dict_ratings)
    
    dict_mean_ratings = dict(zip(mean_ratings[agg_by], mean_ratings[rating_column]))
    
    return mean_ratings, dict_mean_ratings

In [36]:
def main_matrix_factorization(
    train_df, num_factors=100, alpha=0.001, lamb=0.002, epochs=20,
    agg_metric='mean', agg_by='ItemId',
    user_column='UserId', item_column='ItemId', rating_column='Rating', verbose=False):
    """
    Run the training pipeline:
        1. Format df (map users and items to integer codes)
        2. Aggregate the ratings used as prediction baseline
        3. Matrix Factorization

    Args:
      train_df: pd.DataFrame
          ratings data used to create the factored matrixes
      num_factors: int
          number of latent factors
      alpha: float
          learning rate
      lamb: float
          regularization factor
      epochs: int
          number of steps 
      agg_metric: str, default mean
          aggregation forwarded to agg_ratings
      agg_by: str, default ItemId
          grouping column forwarded to agg_ratings
          NOTE(review): the groups are always mapped with the items dictionary,
          so grouping by another column is presumably unsupported - confirm
      user_column: str, defaul UserId
          column name of users
      item_column: str, default ItemId
          column name of items
      rating_column: str, default Rating
          column name of ratings
      verbose: boolean, default False
          if true, show all steps

    Returns: 

    pu: np.array
        user matrix factored
    qi: np.array
        item matrix factored, as returned by fit
    bu: np.array
        user bias vector
    bi: np.array
        item bias vector
    dict_users: 
        dictionary mapped users and your new code
    dict_items: 
        dictionary mapped items and your new code 
    dict_mean_ratings:
        dictionary mapping each group code to its aggregated rating
    r_df: pd.DataFrame
        Format data
    rmse: float
        root mean squared error between predict rating values and true ratings values
    mae: float
        mean absolute error between predict rating values and true ratings values
    """
    
    def log(message):
        # Progress messages are shown only in verbose mode
        if verbose:
            print(message)
    
    unique_users = pd.unique(train_df[user_column]).tolist()
    unique_items = pd.unique(train_df[item_column]).tolist()

    log("\tFormatting data")
    ratings_matrix, r_df, dict_users, dict_items = create_df(
        train_df.copy(), unique_users, unique_items,
        user_column=user_column, item_column=item_column)

    log("\tAggregating ratings")
    aggregated = agg_ratings(
        train_df, dict_items, agg_metric=agg_metric, agg_by=agg_by, rating_column=rating_column)
    dict_mean_ratings = aggregated[1]
      
    log("\tFit SVD... Please waiting...\n")
    pu, qi, bu, bi, rmse, mae = fit(
        ratings_matrix, mean_ratings=dict_mean_ratings, num_factors=num_factors,
        n=len(unique_users), m=len(unique_items),
        alpha=alpha, lamb=lamb, epochs=epochs, verbose=verbose)
    
    return pu, qi, bu, bi, dict_users, dict_items, dict_mean_ratings, r_df, rmse, mae

In [43]:
def predict_one_rating(row, pu, qi, bu, bi, dict_users, dict_items, mean_ratings, agg_by="item"):
    """
    Predict rating for a pair user-item

    Users or items that were not seen in training (absent from the
    dictionaries) contribute no bias and no factor interaction; the prediction
    then falls back to the remaining terms.

    Args:
      row: tuple
          pair (user id, item id)
      pu: np.array
          user matrix, indexed as pu[user]
      qi: np.array
          item matrix, indexed as qi[item]
      bu: np.array
          user bias vector
      bi: np.array
          item bias vector
      dict_users: dict
          dictionary mapped users and your code
      dict_items: dict
          dictionary mapped items and your code
      mean_ratings: dict
          baseline rating looked up by item code (agg_by='item') or by user
          code (agg_by='user')
      agg_by: str, default item
          which code is used to look up the baseline, 'item' or 'user'

    Returns: 

    pred: float
        predicted value, clipped by clip() with its default bounds

    Raises:

    ValueError: if agg_by is neither 'item' nor 'user'
    """
    
    if agg_by not in ('item', 'user'):
        raise ValueError("agg_by must be 'item' or 'user', got {!r}".format(agg_by))
    
    user_target = dict_users.get(str(row[0]))
    item_target = dict_items.get(str(row[1]))
    
    mean_key = item_target if agg_by == 'item' else user_target
    baseline = mean_ratings.get(mean_key) if mean_key is not None else None
    
    if baseline is None:
        # Cold start: no baseline for this id, use the average of the known ones
        baseline = np.mean(list(mean_ratings.values())) if mean_ratings else 0.0
    
    # A None code used as an index would silently add an axis (pu[None]), so
    # unknown ids must be skipped explicitly
    user_bias = bu[user_target] if user_target is not None else 0.0
    item_bias = bi[item_target] if item_target is not None else 0.0
    
    if user_target is not None and item_target is not None:
        interaction = np.dot(pu[user_target], qi[item_target])
    else:
        interaction = 0.0
    
    return clip(interaction + baseline + user_bias + item_bias)

In [45]:
def predict_all_ratings(
    test,  mean_ratings, pu, qi, bu, bi, dict_users, dict_items,
    user_column='UserId', item_column='ItemId', rating_column='Rating', agg_by="item"):
    """
    Predict ratings for all pair user-item in dataframe.

    The dataframe passed in is not modified.

    Args:
      test: pd.DataFrame, columns default, [UserId, ItemId]
          pairs user-item to predict
      mean_ratings: dict
          baseline rating forwarded to predict_one_rating
      pu: np.array
          user matrix
      qi: np.array
          item matrix
      bu: np.array
          user bias vector
      bi: np.array
          item bias vector
      dict_users: dict
          dictionary mapped users and your code
      dict_items: dict
          dictionary mapped items and your code
      user_column: str, defaul UserId
          column name of users
      item_column: str, default ItemId
          column name of items
      rating_column: str, default Rating
          column name that receives the predictions
      agg_by: str, default item
          forwarded to predict_one_rating

    Returns: 

    DataFrame with columns [user_column, item_column, rating_column] holding
    all predict values as floats
    """
    
    # A plain loop instead of np.vectorize: vectorize infers the output dtype
    # from the FIRST result, so a first prediction clipped to an int bound
    # would truncate every later prediction to an integer; it also fails on
    # an empty input.
    predictions = [
        float(predict_one_rating(
            (user, item), pu, qi, bu, bi, dict_users, dict_items, mean_ratings, agg_by=agg_by))
        for user, item in zip(test[user_column], test[item_column])
    ]
    
    result = test[[user_column, item_column]].copy()
    result[rating_column] = pd.Series(predictions, index=result.index, dtype=float)
    
    return result

In [27]:
def round_(x):
    """
    Round a number to the nearest integer, with ties rounded up.

    Args:
      x: float
          value to round

    Returns: 

    int: nearest integer to x (2.5 -> 3, -1.2 -> -1)
    """
    
    # floor, not int(): int() truncates toward zero and mis-rounds negatives
    return int(np.floor(x + 0.5))

In [28]:
def format_and_save(
    result, name_file='out.csv', user_column='UserId', item_column='ItemId',
    rating_column='Rating', round_preds=False):
    """
    Save the predictions as lines "user:item,rating".

    The dataframe passed in is not modified.

    Args:
      result: pd.DataFrame
          predictions with the user, item and rating columns
      name_file: str, default out.csv
          path of the file to write
      user_column: str, defaul UserId
          column name of users
      item_column: str, default ItemId
          column name of items
      rating_column: str, default Rating
          column name of ratings
      round_preds: boolean, default False
          if true, round the ratings to integers with round_ before saving
    """
    
    ratings = result[rating_column]
    if round_preds:
        ratings = ratings.apply(round_)
        
    # Build the "item,rating" field in a new Series. Writing it back into
    # result[item_column] would corrupt the caller's frame and make a second
    # call append the rating again.
    item_and_rating = result[item_column].astype(str) + "," + ratings.astype(str)
    rows = pd.DataFrame({'user': result[user_column], 'item': item_and_rating}).to_numpy()
    
    # NOTE(review): np.savetxt prefixes the header with '# ' (its default
    # `comments` argument) - confirm the consumer of the file expects that.
    np.savetxt(name_file, rows, fmt='%s', delimiter=':', header='UserId:ItemId,Rating')
    

In [57]:
def main(name_train='ratings', name_test='targets', user_column='UserId',
         item_column='ItemId', rating_column='Rating', path_to_read='data', verbose=True):
    """
    Run the whole pipeline: read the datasets, fit the matrix factorization,
    predict the target ratings and save them.

    Args:
      name_train: str, default ratings
          name (without extension) of the csv file with the train ratings
      name_test: str, default targets
          name (without extension) of the csv file with the pairs to predict
      user_column: str, defaul UserId
          column name of users
      item_column: str, default ItemId
          column name of items
      rating_column: str, default Rating
          column name of ratings
      path_to_read: str, default data
          folder that holds both csv files
      verbose: boolean, default True
          if true, show all steps
    """
    
    def log(message):
        # Progress messages are shown only in verbose mode
        if verbose:
            print(message)
    
    def read_pairs(name):
        # Users and items arrive fused in a single "user:item" column
        raw = pd.read_csv("{}/{}.csv".format(path_to_read, name))
        raw[columns] = raw[columns].str.split(':')
        pairs = pd.DataFrame(raw[columns].to_list(), columns=[user_column, item_column])
        return raw, pairs
    
    start_time = time.time()
    
    columns = '{}:{}'.format(user_column, item_column)
    
    log("Read datasets")
    train, train_df = read_pairs(name_train)
    train_df[rating_column] = train[rating_column]
    _, test_df = read_pairs(name_test)

    log("Matrix Factorization")
    factorization = main_matrix_factorization(
        train_df, num_factors=15, alpha=0.001, epochs=3, verbose=verbose)
    pu, qi, bu, bi, dict_users, dict_items, dict_mean_ratings, r_df, rmse, mae = factorization
    
    # The predictor indexes the item matrix by item, so transpose it back
    qi = qi.T
    
    log("Predict Ratings")
    predictions = predict_all_ratings(
        test_df, dict_mean_ratings, pu, qi, bu, bi, dict_users=dict_users,
        dict_items=dict_items, user_column=user_column, item_column=item_column,
        rating_column=rating_column)

    log("Save Results")
    format_and_save(predictions)
    
    elapsed_time = (time.time() - start_time)/60
    print("Time executioon: {} minutes".format(elapsed_time))

In [None]:
# Run the whole pipeline with the default file names, columns and folder
main()

Read datasets
Matrix Factorization
	Formatting data
	Aggregating ratings
	Fit SVD... Please waiting...

