# Matrix Factorization
credit to [Predicting Anime Ratings using Matrix Factorization](https://jovian.ai/aakanksha-ns/anime-ratings-matrix-factorization)

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import altair as alt
from vega_datasets import data
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.model_selection import train_test_split
from scipy import sparse

In [2]:
df_review = pd.read_json('yelp_dataset/yelp_academic_dataset_review.json', lines=True)
df_review = df_review[["user_id", "business_id", "stars"]]

In [3]:
# df_review = df_review.sample(1000000)

In [4]:
df_review.head(3)

Unnamed: 0,user_id,business_id,stars
0,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3
1,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5
2,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3


In [5]:
Counter(df_review.stars)

Counter({3: 691934, 5: 3231627, 4: 1452918, 1: 1069561, 2: 544240})

## Data Preprocessing

### Indexing users and businesses

In [6]:
def encode_column(column):
    """ Encodes a pandas column with continous IDs"""
    keys = column.unique()
    key_to_id = {key:idx for idx,key in enumerate(keys)}
    return key_to_id, np.array([key_to_id[x] for x in column]), len(keys)

def encode_df(df_review):
    """Encodes rating data with continuous user and anime ids"""
    
    buz_ids, df_review['business_id'], num_buz = encode_column(df_review['business_id'])
    user_ids, df_review['user_id'], num_users = encode_column(df_review['user_id'])
    return df_review, num_users, num_buz, user_ids, buz_ids

In [7]:
train_df, test_df = train_test_split(df_review, test_size=0.2)
train_df = train_df.reset_index()[["user_id", "business_id", "stars"]]
test_df = test_df.reset_index()[["user_id", "business_id", "stars"]]
test_df = test_df.reset_index()[["user_id", "business_id", "stars"]]

In [41]:
train_df.shape[0]

5592224

In [8]:
train_df, num_users, num_buz, user_ids, buz_ids = encode_df(train_df)
print("Number of users :", num_users)
print("Number of businesses :", num_buz)
train_df.head()

Number of users : 1746531
Number of businesses : 150340


Unnamed: 0,user_id,business_id,stars
0,0,0,5
1,1,1,5
2,2,2,4
3,3,3,5
4,4,4,5


In [6]:
um_size = 1746531 * 150340
non_zero = 5592224
print(f"Sparsity of utility matrix in training: {(um_size - non_zero)/um_size:%}")

Sparsity of utility matrix in training: 99.997870%


In [9]:
def encode_new_data(valid_df, user_ids, anime_ids):
    """ Encodes valid_df with the same encoding as train_df.
    """
    df_val_chosen = valid_df['business_id'].isin(anime_ids.keys()) & valid_df['user_id'].isin(user_ids.keys())
    valid_df = valid_df[df_val_chosen]
    valid_df['business_id'] =  np.array([anime_ids[x] for x in valid_df['business_id']])
    valid_df['user_id'] = np.array([user_ids[x] for x in valid_df['user_id']])
    return valid_df

In [10]:
print("before encoding:", test_df.shape)
test_df = encode_new_data(test_df, user_ids, buz_ids)
print("after encoding:", test_df.shape)

before encoding: (1398056, 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_df['business_id'] =  np.array([anime_ids[x] for x in valid_df['business_id']])


after encoding: (1140887, 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_df['user_id'] = np.array([user_ids[x] for x in valid_df['user_id']])


In [40]:
1398056 - 1140887

257169

## Training

Initializing user and item embeddings

In [11]:
def create_embeddings(n, K):
    """
    Creates a random numpy matrix of shape n, K with uniform values in (0, 11/K)
    n: number of items/users
    K: number of factors in the embedding 
    """
    return 11*np.random.random((n, K)) / K

Creating sparse utility matrix

In [12]:
def create_sparse_matrix(df, rows, cols, column_name="stars"):
    """ Returns a sparse utility matrix""" 
    return sparse.csc_matrix((df[column_name].values,(df['user_id'].values, df['business_id'].values)),shape=(rows, cols))

In [13]:
Y = create_sparse_matrix(train_df, num_users, num_buz)

In [14]:
def predict(df, emb_user, emb_buz):
    """ This function computes df["prediction"] without doing (U*V^T).
    
    Computes df["prediction"] by using elementwise multiplication of the corresponding embeddings and then 
    sum to get the prediction u_i*v_j. This avoids creating the dense matrix U*V^T.
    """
    df['prediction'] = np.sum(np.multiply(emb_buz[df['business_id']],emb_user[df['user_id']]), axis=1)
    return df

In [15]:
def cost(df, emb_user, emb_buz):
    """ Computes mean square error"""
    Y = create_sparse_matrix(df, emb_user.shape[0], emb_buz.shape[0])
    predicted = create_sparse_matrix(predict(df, emb_user, emb_buz), emb_user.shape[0], emb_buz.shape[0], 'prediction')
    return np.sum((Y-predicted).power(2))/df.shape[0] 

In [16]:
def gradient(df, emb_user, emb_buz, lmbda=0.0002):
    """ Computes the gradient for user and anime embeddings"""
    Y = create_sparse_matrix(df, emb_user.shape[0], emb_buz.shape[0])
    predicted = create_sparse_matrix(predict(df, emb_user, emb_buz), emb_user.shape[0], emb_buz.shape[0], 'prediction')
    delta =(Y-predicted)
    grad_user = (-2/df.shape[0])*(delta*emb_buz) + 2*lmbda*emb_user
    grad_anime = (-2/df.shape[0])*(delta.T*emb_user) + 2*lmbda*emb_buz
    return grad_user, grad_anime

In [17]:
def gradient_descent(df, emb_user, emb_buz, iterations=2000, learning_rate=0.01, df_val=None, lmbda=0.0002):
    """ 
    Computes gradient descent with momentum (0.9) for given number of iterations.
    emb_user: the trained user embedding
    emb_buz: the trained business embedding
    """
    Y = create_sparse_matrix(df, emb_user.shape[0], emb_buz.shape[0])
    beta = 0.9
    grad_user, grad_anime = gradient(df, emb_user, emb_buz)
    v_user = grad_user
    v_anime = grad_anime
    metrics = []
    for i in range(iterations):
        metric_record = {}
        grad_user, grad_anime = gradient(df, emb_user, emb_buz, lmbda)
        v_user = beta*v_user + (1-beta)*grad_user
        v_anime = beta*v_anime + (1-beta)*grad_anime
        emb_user = emb_user - learning_rate*v_user
        emb_buz = emb_buz - learning_rate*v_anime
        if (not (i+1)%20) or i==0:
            metric_record["iteration"] = i + 1
            print("\niteration", i+1, ":")
            loss_train = cost(df, emb_user, emb_buz)
            metric_record["loss_train"] = loss_train
            print("train mse:", loss_train)
            if df_val is not None:
                loss_val = cost(df_val, emb_user, emb_buz)
                metric_record["loss_val"] = loss_val
                print("validation mse:", loss_val)
            metrics.append(metric_record)
    metrics = pd.DataFrame(data=metrics)
    return emb_user, emb_buz, metrics

In [18]:
%%time
emb_user = create_embeddings(num_users, 3)
emb_buz = create_embeddings(num_buz, 3)
emb_user, emb_buz, mf_metrics = gradient_descent(train_df, emb_user, emb_buz, iterations=1000, learning_rate=2, df_val=test_df)


iteration 1 :
train mse: 73.1280641684877
validation mse: 68.69694094661259

iteration 20 :
train mse: 61.87827358335583
validation mse: 58.0312416294933

iteration 40 :
train mse: 52.95455411977385
validation mse: 49.571662839403686

iteration 60 :
train mse: 45.8984362271514
validation mse: 42.891359580022744

iteration 80 :
train mse: 40.1096588579106
validation mse: 37.42037781863893

iteration 100 :
train mse: 35.26162516379324
validation mse: 32.84646947874954

iteration 120 :
train mse: 31.147275416072425
validation mse: 28.97132741594937

iteration 140 :
train mse: 27.623054261262673
validation mse: 25.657493181216157

iteration 160 :
train mse: 24.583658244773908
validation mse: 22.804205039189178

iteration 180 :
train mse: 21.94887477313825
validation mse: 20.334775526355635

iteration 200 :
train mse: 19.65591438475341
validation mse: 18.189231574026422

iteration 220 :
train mse: 17.654586186974306
validation mse: 16.319682107390708

iteration 240 :
train mse: 15.90409324

In [20]:
mf_metrics.to_json("mf_metrics.json", orient="records")

In [21]:
mf_metrics = pd.read_json("mf_metrics.json")

In [35]:
chart = alt.Chart(mf_metrics.melt("iteration")).mark_line().encode(
    x="iteration",
    y=alt.Y("value", title="MSE"),
    color="variable",
)
chart

## Prediction

In [24]:
train_mse = cost(train_df, emb_user, emb_buz)
val_mse = cost(test_df, emb_user, emb_buz)
print(train_mse, val_mse)

6.549223817460769 6.318851820949318


In [25]:
test_df

Unnamed: 0,user_id,business_id,stars,prediction
0,209518,10362,3,1.542711
1,983640,29061,5,2.426133
2,780252,78722,5,5.333369
3,168580,18625,5,2.220688
5,384367,96741,5,2.555199
...,...,...,...,...
1398051,35322,14394,4,1.900280
1398052,54534,58163,4,2.437889
1398053,569440,2277,2,0.925534
1398054,1035269,26214,5,0.944470


## Hyperparameter Tuning
* lmbda
* lr
* embedding size
* iterations