# Implicit matrix factorization

In [1]:
!ls

01_explict_CF_intro.ipynb	04_lightFM.ipynb  ml-100k      rec-a-sketch
02_explicit_ALS_SGD.ipynb	explict_CF.py	  __pycache__
03_implict_ALS_sketchlab.ipynb	helpers.py	  README.md


In [2]:
!pip install implicit

Collecting implicit
  Downloading implicit-0.2.6.tar.gz (260kB)
[K    100% |████████████████████████████████| 266kB 2.4MB/s 
Building wheels for collected packages: implicit
  Running setup.py bdist_wheel for implicit ... [?25l- \ | / - done
[?25h  Stored in directory: /home/ihong/.cache/pip/wheels/b8/72/24/e572345d776fb340193d0dd3b902a9f81ad39de7b9d61387ef
Successfully built implicit
Installing collected packages: implicit
Successfully installed implicit-0.2.6


In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.sparse as sparse
import pickle
import csv
import implicit
import itertools
import copy

In [4]:
!head -5 ./rec-a-sketch/model_feats.psv

mid|type|value
5dcebcfaedbd4e7b8a27bd1ae55f1ac3|category|Characters
5dcebcfaedbd4e7b8a27bd1ae55f1ac3|category|Gaming
5dcebcfaedbd4e7b8a27bd1ae55f1ac3|tag|3dsmax
5dcebcfaedbd4e7b8a27bd1ae55f1ac3|tag|noel


In [3]:
# Load the likes table: a pipe-separated file that uses a backslash as its
# quote character.
df = pd.read_csv(
    './rec-a-sketch/model_likes_anon.psv',
    quotechar='\\',
    quoting=csv.QUOTE_MINIMAL,
    sep='|',
)
df.head()

Unnamed: 0,modelname,mid,uid
0,3D fanart Noel From Sora no Method,5dcebcfaedbd4e7b8a27bd1ae55f1ac3,7ac1b40648fff523d7220a5d07b04d9b
1,3D fanart Noel From Sora no Method,5dcebcfaedbd4e7b8a27bd1ae55f1ac3,2b4ad286afe3369d39f1bb7aa2528bc7
2,3D fanart Noel From Sora no Method,5dcebcfaedbd4e7b8a27bd1ae55f1ac3,1bf0993ebab175a896ac8003bed91b4b
3,3D fanart Noel From Sora no Method,5dcebcfaedbd4e7b8a27bd1ae55f1ac3,6484211de8b9a023a7d9ab1641d22e7c
4,3D fanart Noel From Sora no Method,5dcebcfaedbd4e7b8a27bd1ae55f1ac3,1109ee298494fbd192e27878432c718a


In [4]:
# Report how many fully duplicated rows exist, then remove them from `df`.
print('Duplicated rows: {}'.format(df.duplicated().sum()))
print("That's weird - let's just drop them")
df.drop_duplicates(inplace=True)

Duplicated rows: 155
That's weird - let's just drop them


In [5]:
df = df[['uid', 'mid']]
df.head()

Unnamed: 0,uid,mid
0,7ac1b40648fff523d7220a5d07b04d9b,5dcebcfaedbd4e7b8a27bd1ae55f1ac3
1,2b4ad286afe3369d39f1bb7aa2528bc7,5dcebcfaedbd4e7b8a27bd1ae55f1ac3
2,1bf0993ebab175a896ac8003bed91b4b,5dcebcfaedbd4e7b8a27bd1ae55f1ac3
3,6484211de8b9a023a7d9ab1641d22e7c,5dcebcfaedbd4e7b8a27bd1ae55f1ac3
4,1109ee298494fbd192e27878432c718a,5dcebcfaedbd4e7b8a27bd1ae55f1ac3


In [6]:
# Count distinct users and models, then report what percentage of the
# user x model grid is covered by a like.
n_users = len(df.uid.unique())
n_items = len(df.mid.unique())

for label, count in (('users', n_users), ('models', n_items)):
    print('Number of {}: {}'.format(label, count))
print('Sparsity: {:4.3f}%'.format(len(df) / float(n_users * n_items) * 100))

Number of users: 62583
Number of models: 28806
Sparsity: 0.035%


In [7]:
# Single filtering pass: drop every user who liked fewer than `mid_min` models.
print('starting shape: {}'.format(len(df)))
mid_min = 3
mid_counts = df.groupby('uid').mid.count()
df = df.loc[~df.uid.isin(mid_counts[mid_counts < mid_min].index)]
print('iter shape :{}'.format(len(df)))

starting shape: 632677
iter shape :584307


In [8]:
def _print_likes_info(title, df):
    """Print `title` followed by user count, model count and fill percentage of `df`."""
    n_users = df.uid.unique().shape[0]
    n_items = df.mid.unique().shape[0]
    n_cells = n_users * n_items
    # An empty frame has no cells at all; report 0% instead of dividing by zero.
    sparsity = float(df.shape[0]) / float(n_cells) * 100 if n_cells else 0.0
    print(title)
    print('Number of users: {}'.format(n_users))
    print('Number of models: {}'.format(n_items))
    print('Sparsity: {:4.3f}%'.format(sparsity))


def threshold_likes(df, uid_min, mid_min):
    """
    Iteratively drop sparse users and models from a likes table.

    Params
    ------
    df : pandas.DataFrame
        One row per like, with `uid` (user id) and `mid` (model id) columns.
    uid_min : int
        Minimum number of likes a model must have received to be kept.
    mid_min : int
        Minimum number of likes a user must have given to be kept.

    Returns
    -------
    pandas.DataFrame
        The filtered rows. May be empty when no user/model satisfies both
        thresholds.

    Removing users can push models below their threshold and vice versa, so
    both filters are re-applied until the row count stops changing.
    """
    _print_likes_info('Starting likes info', df)

    while True:
        starting_shape = df.shape[0]
        # Likes given per user -> drop users below mid_min.
        mid_counts = df.groupby('uid').mid.count()
        df = df[~df.uid.isin(mid_counts[mid_counts < mid_min].index.tolist())]
        # Likes received per model -> drop models below uid_min.
        uid_counts = df.groupby('mid').uid.count()
        df = df[~df.mid.isin(uid_counts[uid_counts < uid_min].index.tolist())]
        if df.shape[0] == starting_shape:
            break

    # min() of an empty count Series is NaN, which would fail these checks
    # even though an empty result trivially satisfies both thresholds.
    if df.shape[0] > 0:
        assert(df.groupby('uid').mid.count().min() >= mid_min)
        assert(df.groupby('mid').uid.count().min() >= uid_min)

    _print_likes_info('Ending likes info', df)
    return df


In [9]:
# Keep only models liked by at least 5 users (uid_min=5) and users who liked
# at least 5 models (mid_min=5).
df_lim = threshold_likes(df, 5, 5)

Starting likes info
Number of users: 23055
Number of models: 28769
Sparsity: 0.088%
Ending likes info
Number of users: 15274
Number of models: 25655
Sparsity: 0.140%


In [10]:
df_lim.head()

Unnamed: 0,uid,mid
1,2b4ad286afe3369d39f1bb7aa2528bc7,5dcebcfaedbd4e7b8a27bd1ae55f1ac3
2,1bf0993ebab175a896ac8003bed91b4b,5dcebcfaedbd4e7b8a27bd1ae55f1ac3
4,1109ee298494fbd192e27878432c718a,5dcebcfaedbd4e7b8a27bd1ae55f1ac3
6,8626c70d4b85af57804a8fc1173cbbe0,5dcebcfaedbd4e7b8a27bd1ae55f1ac3
7,e1527fdfa8782e70d499e177efc28605,5dcebcfaedbd4e7b8a27bd1ae55f1ac3


In [11]:
# Build lookup tables between raw ids and contiguous matrix indices.
# Indices follow the order of first appearance in `df_lim`.
idx_to_mid = dict(enumerate(df_lim.mid.unique().tolist()))
mid_to_idx = {mid: idx for idx, mid in idx_to_mid.items()}

idx_to_uid = dict(enumerate(df_lim.uid.unique().tolist()))
uid_to_idx = {uid: idx for idx, uid in idx_to_uid.items()}

In [12]:

# # Don't do this!
# num_users = df_lim.uid.unique().shape[0]
# num_items = df_lim.mid.unique().shape[0]
# likes = sparse.csr_matrix((num_users, num_items), dtype=np.float64)
# for row in df_lim.itertuples():
#     likes[uid_to_idx[uid], mid_to_idx[row.mid]] = 1.0



In [13]:
def map_ids(row, mapper):
    """Return the matrix index that `mapper` (a dict) holds for the raw id `row`."""
    return mapper[row]

# Row (user) and column (model) index of every like.
# `.values` replaces `Series.as_matrix()`, which was deprecated in pandas 0.23
# and removed in pandas 1.0.
I = df_lim.uid.apply(map_ids, args=(uid_to_idx,)).values
J = df_lim.mid.apply(map_ids, args=(mid_to_idx,)).values
# Every like is stored as 1.0.
V = np.ones(I.shape[0])
# Assemble in COO form (cheap to build from triplets), then convert to CSR
# for row slicing.
likes = sparse.coo_matrix((V, (I, J)), dtype=np.float64)
likes = likes.tocsr()

In [14]:
likes

<15274x25655 sparse matrix of type '<class 'numpy.float64'>'
	with 547477 stored elements in Compressed Sparse Row format>

In [15]:
def train_test_split(ratings, split_count, fraction=None):
    """
    Split recommendation data into train and test sets

    Params
    ------
    ratings : scipy.sparse matrix
        Interactions between users (rows) and items (columns).
    split_count : int
        Number of user-item-interactions per user to move
        from training to test set.
    fraction : float
        Fraction of users to split off some of their
        interactions into test set. If None, then all
        users are considered.

    Returns
    -------
    train : scipy.sparse.csr_matrix
        `ratings` with the held-out interactions removed.
    test : scipy.sparse.csr_matrix
        Only the held-out interactions.
    user_index : list or range
        Row indices of the users that contributed test interactions.

    Raises
    ------
    ValueError
        If `fraction` asks for more users than have at least
        2 * split_count interactions.
    """
    # Row slicing below needs CSR; accept any sparse format from the caller.
    ratings = ratings.tocsr()
    # Note: likely not the fastest way to do things below.
    train = ratings.copy().tocoo()
    test = sparse.lil_matrix(train.shape)

    if fraction:
        # Only users with at least 2 * split_count interactions are eligible,
        # so each keeps at least split_count interactions in the training set.
        eligible_users = np.where(np.bincount(train.row) >= split_count * 2)[0]
        try:
            user_index = np.random.choice(
                eligible_users,
                replace=False,
                size=np.int32(np.floor(fraction * train.shape[0]))
            ).tolist()
        except ValueError:
            # np.random.choice refuses to sample more users than are eligible.
            print(('Not enough users with > {} '
                   'interactions for fraction of {}')
                  .format(2 * split_count, fraction))
            raise
    else:
        user_index = range(train.shape[0])

    train = train.tolil()

    for user in user_index:
        test_ratings = np.random.choice(ratings.getrow(user).indices,
                                        size=split_count,
                                        replace=False)
        train[user, test_ratings] = 0.
        # These are just 1.0 right now
        test[user, test_ratings] = ratings[user, test_ratings]

    train = train.tocsr()
    test = test.tocsr()
    # Test and training are truly disjoint
    assert(train.multiply(test).nnz == 0)
    return train, test, user_index

In [16]:
# Hold out 5 likes for each of a randomly chosen 20% of users. No random seed
# is set in the visible cells, so the chosen users/likes differ between runs.
train, test, user_index = train_test_split(likes, 5, fraction=0.2)

In [17]:
print('test:\n',repr(test))
print('train:\n',repr(train))

test:
 <15274x25655 sparse matrix of type '<class 'numpy.float64'>'
	with 15270 stored elements in Compressed Sparse Row format>
train:
 <15274x25655 sparse matrix of type '<class 'numpy.float64'>'
	with 532207 stored elements in Compressed Sparse Row format>


## Using the `implicit` package
How do we use it correctly?

In [18]:
train

<15274x25655 sparse matrix of type '<class 'numpy.float64'>'
	with 532207 stored elements in Compressed Sparse Row format>

In [19]:
import implicit

# initialize a model
# (no hyper-parameters are passed, so the library defaults apply)
model = implicit.als.AlternatingLeastSquares()

# train the model on a sparse matrix of item/user/confidence weights
# `train` is users x items (15274 x 25655 above), hence the transpose.
model.fit(train.T)

In [20]:
# Recommendations for the user in row 0, with `train` passed as the
# users x items matrix. The result displayed below is a list of
# (item index, score) pairs.
# NOTE(review): presumably `train` is used to skip items the user already
# liked - confirm against the implicit documentation.
recommendations = model.recommend(0, train)

# find related items
# related = model.similar_items(itemid)

In [21]:
recommendations

[(4, 0.26038048368182715),
 (28, 0.20362918647437764),
 (6, 0.17393755345022127),
 (23, 0.15773390750492552),
 (31, 0.14353614149243618),
 (74, 0.14055459096520817),
 (34, 0.13993634998822405),
 (97, 0.12577477254291131),
 (76, 0.11455228786978935),
 (79, 0.10900009081245543)]

In [22]:
model.similar_items(0)

[(0, 0.99999999999999967),
 (28, 0.74844906145914203),
 (15855, 0.68259952386664269),
 (170, 0.62208315065138031),
 (20259, 0.61608284512726341),
 (17377, 0.56136352092980135),
 (45, 0.54759123923997621),
 (19522, 0.54071373220662455),
 (1212, 0.53081024063120963),
 (19400, 0.50551012842925658)]

## Cross-Validation

Grid search over the hyper-parameters:

1. `num_factors`: the number of latent factors
2. `regularization`: scale of the regularizer for both user and item factors
3. `alpha`: confidence scaling term
4. `iteration`: number of iterations

In [23]:
model.item_factors.shape

(25655, 100)

In [24]:
model.user_factors.shape

(15274, 100)

In [25]:
from sklearn.metrics import mean_squared_error
def calculate_mse(model, ratings, user_index=None):
    """
    Mean squared error between predicted scores and `ratings`.

    Predictions are the dot product of `model.user_factors` and
    `model.item_factors`; every user-item cell (including zeros in
    `ratings`) contributes to the error.

    Params
    ------
    model : object
        Must expose `user_factors` (users x factors) and `item_factors`
        (items x factors) arrays.
    ratings : scipy.sparse.csr_matrix
        Users x items interaction matrix to compare against.
    user_index : sequence of int, optional
        Rows (users) to evaluate. If None or empty, all users are used.

    Returns
    -------
    float
        The mean squared error over the evaluated cells.
    """
    if user_index is not None and len(user_index) > 0:
        # Score only the requested users so the full users x items
        # prediction matrix is never materialised.
        rows = np.asarray(user_index)
        preds = model.user_factors[rows].dot(model.item_factors.T)
        truth = ratings[rows, :].toarray()
    else:
        preds = model.user_factors.dot(model.item_factors.T)
        truth = ratings.toarray()

    # Same quantity sklearn's mean_squared_error computes on the raveled arrays.
    return np.mean((truth - preds) ** 2)

In [26]:
def precision_at_k(model, ratings, k=5, user_index=None):
    """
    Mean precision@k of the model's top-k items against `ratings`.

    Params
    ------
    model : object
        Must expose `user_factors` (users x factors) and `item_factors`
        (items x factors) arrays.
    ratings : scipy.sparse matrix
        Users x items interactions; a user's stored column indices are
        treated as that user's relevant items.
    k : int
        Number of top-scoring items to check per user.
    user_index : sequence of int, optional
        Users to evaluate. If None or empty, all users are used.

    Returns
    -------
    float
        Average over users of (hits among the top k) / k.
    """
    if user_index is None or len(user_index) == 0:
        user_index = range(ratings.shape[0])
    ratings = ratings.tocsr()
    item_factors_t = model.item_factors.T
    precisions = []
    for user in user_index:
        # Score one user at a time so the dense users x items prediction
        # matrix never has to be held in memory.
        scores = model.user_factors[user].dot(item_factors_t)
        top_k = np.argsort(-scores)[:k]
        labels = ratings.getrow(user).indices
        precision = float(len(set(top_k) & set(labels))) / float(k)
        precisions.append(precision)
    return np.mean(precisions)

In [27]:
def print_log(row, header=False, spacing=12):
    """
    Print one row of an ASCII table.

    Params
    ------
    row : sequence
        Cell values. Floats are shown with 5 decimals; everything else is
        shown as-is, centred in the cell.
    header : bool
        If True, print a top rule, the row, and a '=' rule underneath.
        Otherwise print the row followed by a '-' rule.
    spacing : int
        Width of each column between the '+' corner characters.
    """
    width = spacing - 2
    cells = []
    for r in row:
        if isinstance(r, (float, np.floating)):
            # Covers numpy float scalars that are not `float` subclasses.
            cells.append('| {0:^{1}.5f} '.format(float(r), width))
        elif isinstance(r, np.integer):
            cells.append('| {0:^{1}} '.format(int(r), width))
        elif isinstance(r, (str, int)):
            cells.append('| {0:^{1}} '.format(r, width))
        else:
            # Any other type still gets a cell so the columns stay aligned.
            cells.append('| {0:^{1}} '.format(str(r), width))

    top = '+{}'.format('-' * spacing) * len(row) + '+'
    middle = ''.join(cells) + '|'
    bottom = '+{}'.format('=' * spacing) * len(row) + '+'

    if header:
        print(top)
        print(middle)
        print(bottom)
    else:
        print(middle)
        print(top)

In [28]:
def learning_curve(model, train, test, epochs, k=5, user_index=None):
    """
    Train `model` in stages and record precision@k and MSE after each stage.

    Params
    ------
    model : object
        Model with an `iterations` attribute and a `fit` method; it is
        trained in place.
    train, test : scipy.sparse matrix
        Users x items interaction matrices.
    epochs : iterable of int
        Increasing cumulative iteration counts at which to evaluate.
    k : int
        Cut-off passed to precision_at_k.
    user_index : sequence of int, optional
        Users to evaluate on. If None or empty, all users are used.

    Returns
    -------
    tuple
        (model, train_precision, train_mse, test_precision, test_mse), where
        each metric is a list with one entry per element of `epochs`.
    """
    if user_index is None or len(user_index) == 0:
        user_index = list(range(train.shape[0]))
    prev_epoch = 0
    train_precision = []
    train_mse = []
    test_precision = []
    test_mse = []

    headers = ['epochs', 'p@k train', 'p@k test',
               'mse train', 'mse test']
    print_log(headers, header=True)

    # The model is fitted on the item x user layout elsewhere in this notebook
    # (`model.fit(train.T)`), while `train` is users x items; fitting on the
    # untransposed matrix swaps the roles of user and item factors.
    item_user_train = train.T

    for epoch in epochs:
        # Only run the iterations added since the previous checkpoint.
        # NOTE(review): this relies on repeated `fit` calls continuing from the
        # current factors rather than re-initialising them - confirm for the
        # installed version of the model library.
        model.iterations = epoch - prev_epoch
        if not hasattr(model, 'user_vectors'):
            model.fit(item_user_train)
        else:
            model.fit_partial(train)
        train_mse.append(calculate_mse(model, train, user_index))
        train_precision.append(precision_at_k(model, train, k, user_index))
        test_mse.append(calculate_mse(model, test, user_index))
        test_precision.append(precision_at_k(model, test, k, user_index))
        row = [epoch, train_precision[-1], test_precision[-1],
               train_mse[-1], test_mse[-1]]
        print_log(row)
        prev_epoch = epoch
    return model, train_precision, train_mse, test_precision, test_mse

In [29]:
def grid_search_learning_curve(base_model, train, test, param_grid,
                               user_index=None, patk=5, epochs=range(2, 40, 2)):
    """
    Run learning_curve once for every combination of values in `param_grid`.

    "Inspired" (stolen) from sklearn gridsearch
    https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/model_selection/_search.py

    Params
    ------
    base_model : object
        Template model; a deep copy is made for each parameter combination.
    train, test : scipy.sparse matrix
        Users x items interaction matrices passed on to learning_curve.
    param_grid : dict
        Maps an attribute name to the list of values to try. Each value is
        applied to the copied model with setattr.
    user_index : sequence of int, optional
        Users to evaluate on (passed on to learning_curve).
    patk : int
        Cut-off k for precision@k.
    epochs : iterable of int
        Cumulative iteration counts at which learning_curve evaluates.

    Returns
    -------
    list of dict
        One entry per combination, with keys 'params', 'patk' and 'mse';
        the latter two hold 'train' and 'test' lists.
    """
    import warnings

    curves = []
    # An empty grid yields exactly one run with the base model's settings.
    keys, values = zip(*param_grid.items()) if param_grid else ((), ())
    for combination in itertools.product(*values):
        params = dict(zip(keys, combination))
        # NOTE(review): a deep copy also copies any factors `base_model` has
        # already learned - confirm each run really starts from scratch.
        this_model = copy.deepcopy(base_model)
        for name, value in params.items():
            if not hasattr(this_model, name):
                # setattr would silently create a new attribute the model may
                # never read, making the whole grid dimension a no-op.
                warnings.warn('{} has no attribute {!r}; setting it may have '
                              'no effect'.format(type(this_model).__name__, name))
            setattr(this_model, name, value)

        print(' | '.join('{}: {}'.format(name, value)
                         for (name, value) in params.items()))
        _, train_patk, train_mse, test_patk, test_mse = learning_curve(
            this_model, train, test, epochs, k=patk, user_index=user_index)
        curves.append({'params': params,
                       'patk': {'train': train_patk, 'test': test_patk},
                       'mse': {'train': train_mse, 'test': test_mse}})
    return curves

Run the grid search (warning: VERY SLOW; it can take several days).

In [33]:
# Hyper-parameter values to try; every combination is evaluated
# (3 * 6 * 6 = 108 runs).
# NOTE(review): grid_search_learning_curve applies these names with setattr,
# which silently creates attributes the model does not know. The factor
# matrices displayed earlier have 100 columns although `num_factors` was never
# set - confirm that `num_factors` and `alpha` are names the model reads.
param_grid = {'num_factors': [20, 40, 80],
              'regularization': [0.0, 1e-5, 1e-3, 1e-1, 1e1, 1e2],
              'alpha': [1, 10, 50, 100, 500, 1000]}

In [31]:
## !export OPENBLAS_NUM_THREADS=1 ## windows

In [34]:
# NOTE(review): `model` was already fitted in an earlier cell and is deep-copied
# for each parameter combination - confirm every copy starts untrained.
# The run shown below was interrupted manually after the first checkpoint.
curves = grid_search_learning_curve(model, train, test,
                                    param_grid,
                                    user_index=user_index,
                                    patk=5)

regularization: 0.0 | num_factors: 20 | alpha: 1
+------------+------------+------------+------------+------------+
|   epochs   | p@k train  |  p@k test  | mse train  |  mse test  |
|     2      |  0.00380   |  0.00059   |  0.25246   |  0.25079   |
+------------+------------+------------+------------+------------+


KeyboardInterrupt: 

In [92]:
# NOTE(review): stale cell (execution count 92). `base_model` is not defined in
# any cell visible here, and the recorded AttributeError refers to
# `predict_for_customers`, which now only appears in commented-out lines.
curves = grid_search_learning_curve(base_model, train, test,
                                    param_grid,
                                    user_index=user_index,
                                    patk=5)

regularization: 0.0 | num_factors: 10 | alpha: 1
+------------+------------+------------+------------+------------+
|   epochs   | p@k train  |  p@k test  | mse train  |  mse test  |


AttributeError: 'AlternatingLeastSquares' object has no attribute 'predict_for_customers'

In [34]:
# NOTE(review): this binds the `implicit.als` module itself, not a model
# instance (compare `implicit.als.AlternatingLeastSquares()` used earlier);
# the statement looks unfinished.
model_base = implicit.als