In [None]:
!pip install lightfm
#!pip install pandas==1.1.0 as pd

Collecting lightfm
[?25l  Downloading https://files.pythonhosted.org/packages/e9/8e/5485ac5a8616abe1c673d1e033e2f232b4319ab95424b42499fabff2257f/lightfm-1.15.tar.gz (302kB)
[K     |█                               | 10kB 17.7MB/s eta 0:00:01[K     |██▏                             | 20kB 2.2MB/s eta 0:00:01[K     |███▎                            | 30kB 2.8MB/s eta 0:00:01[K     |████▍                           | 40kB 3.1MB/s eta 0:00:01[K     |█████▍                          | 51kB 2.5MB/s eta 0:00:01[K     |██████▌                         | 61kB 2.8MB/s eta 0:00:01[K     |███████▋                        | 71kB 3.1MB/s eta 0:00:01[K     |████████▊                       | 81kB 3.4MB/s eta 0:00:01[K     |█████████▊                      | 92kB 3.4MB/s eta 0:00:01[K     |██████████▉                     | 102kB 3.5MB/s eta 0:00:01[K     |████████████                    | 112kB 3.5MB/s eta 0:00:01[K     |█████████████                   | 122kB 3.5MB/s eta 0:00:01[K  

In [None]:
# Mount Google Drive at /content/drive so the pickled datasets below can be read.
# force_remount=True is passed, so the mount is redone even if one already exists.
# This import only exists inside Google Colab.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Necessary imports
import pandas as pd
import numpy as np
import pickle
from scipy.sparse import csr_matrix
from lightfm import LightFM
from lightfm.evaluation import auc_score
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import recall_at_k
from lightfm.evaluation import reciprocal_rank
import itertools


In [None]:
# Load the data

# Read the pickled dataset bundle into `data`. Later cells index it by key
# ('train', 'test', 'item_features', 'user_dict', ...).
# The `with` block closes the file even when unpickling fails.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
with open('/content/drive/My Drive/Master/Data/1Mdatasets.p', 'rb') as infile:
    data = pickle.load(infile)

In [None]:
# Display the repr of data['train'] (for a sparse matrix: shape, dtype, stored elements).
data['train']

<69877x3527 sparse matrix of type '<class 'numpy.int64'>'
	with 1689459 stored elements in COOrdinate format>

In [None]:
# Collect the (row, col) coordinates of every entry of data['train'] equal to 1
# as an (N, 2) integer array.
# The coordinates are read from the sparse structure directly; densifying the
# 69877 x 3527 int64 matrix first would allocate roughly 2 GB.
# The CSR round-trip merges duplicate entries (as densifying would) and lists
# the coordinates row by row.
train_coo = data['train'].tocsr().tocoo()
is_positive = train_coo.data == 1
positives = np.column_stack((train_coo.row[is_positive], train_coo.col[is_positive]))

In [None]:
# Pick the row indices of `positives` to drop: exactly half of them.
# replace=False guarantees distinct indices. Sampling with replacement (as
# integers() does) yields duplicates, so fewer than half the rows get removed.
# The generator is seeded so the split is reproducible across kernel restarts.
rand = np.random.default_rng(42)
todrop = rand.choice(len(positives),
                     size=len(positives) // 2,
                     replace=False)

In [None]:
# Remove the rows selected in `todrop` from `positives`.
# NOTE: this overwrites `positives`, so re-running the cell removes rows again.
positives = np.delete(positives, todrop, axis=0)

In [None]:
# Rebuild a binary interaction matrix from the remaining (row, col) pairs.
# The shape comes from data['train'] instead of a hard-coded (69877, 3527).
# The matrix is assembled sparse from the coordinates, which avoids a dense
# ~2 GB allocation and a Python-level loop over every pair.
pairs = np.asarray(positives, dtype=int).reshape(-1, 2)
new_train = csr_matrix(
    (np.ones(len(pairs), dtype=int), (pairs[:, 0], pairs[:, 1])),
    shape=data['train'].shape,
)
# Repeated pairs are summed by the constructor; reset to 1 to keep the matrix binary.
new_train.data[:] = 1

In [None]:
# Convert new_train to Compressed Sparse Row format (rebinds the same name).
new_train = csr_matrix(new_train)

In [None]:
# Display the repr of the reduced training matrix (shape, dtype, stored elements).
new_train

<69877x3527 sparse matrix of type '<class 'numpy.longlong'>'
	with 1024442 stored elements in Compressed Sparse Row format>

In [None]:
# Dataset dimensions: entries in the user and item dictionaries, and the
# number of columns of the item-feature matrix.
num_users = len(data['user_dict'])
num_items = len(data['item_dict'])
num_labels = data['item_features'].shape[1]
print(f'Users: {num_users}',
      f'Items: {num_items}',
      f'Labels: {num_labels}',
      sep='\n')

Users: 6040
Items: 1514
Labels: 939


In [None]:
# Methods for finding the optimal hyperparameters
# @source: https://stackoverflow.com/questions/49896816/how-do-i-optimize-the-hyperparameters-of-lightfm

def sample_hyperparameters():
    """
    Endlessly generate random hyperparameter dictionaries.

    Every dictionary carries the keys "no_components", "learning_schedule",
    "loss", "learning_rate", "item_alpha", "user_alpha", "max_sampled" and
    "num_epochs". All values are drawn from numpy's global random state, so
    calling np.random.seed beforehand makes the sequence repeatable.
    """
    schedules = ["adagrad", "adadelta"]
    losses = ["bpr", "warp", "warp-kos"]

    while True:
        # The draws happen in key order; keep that order so a seeded run
        # produces the same candidates.
        candidate = {}
        candidate["no_components"] = np.random.randint(16, 64)
        candidate["learning_schedule"] = np.random.choice(schedules)
        candidate["loss"] = np.random.choice(losses)
        candidate["learning_rate"] = np.random.exponential(0.05)
        candidate["item_alpha"] = np.random.exponential(1e-8)
        candidate["user_alpha"] = np.random.exponential(1e-8)
        candidate["max_sampled"] = np.random.randint(5, 15)
        candidate["num_epochs"] = np.random.randint(5, 50)
        yield candidate


def random_search(train, test, item_features, num_samples=10, num_threads=1):
    """
    Fit and score LightFM models for randomly sampled hyperparameters.

    Parameters
    ----------

    train: np.float32 coo_matrix of shape [n_users, n_items]
        Training data.
    test: np.float32 coo_matrix of shape [n_users, n_items]
        Test data.
    item_features:
        Item feature matrix handed to both ``fit`` and ``reciprocal_rank``.
    num_samples: int, optional
        Number of hyperparameter choices to evaluate.
    num_threads: int, optional
        Thread count handed to both ``fit`` and ``reciprocal_rank``.


    Returns
    -------

    generator of (score, hyperparameter dict, fitted model), where score is
    the mean of ``reciprocal_rank`` on ``test`` with ``train`` passed as
    ``train_interactions``.

    """

    candidates = itertools.islice(sample_hyperparameters(), num_samples)

    for hyperparams in candidates:
        # "num_epochs" is passed to fit(), not to the LightFM constructor.
        model_kwargs = {name: value
                        for name, value in hyperparams.items()
                        if name != "num_epochs"}

        model = LightFM(**model_kwargs)
        model.fit(train,
                  item_features=item_features,
                  epochs=hyperparams["num_epochs"],
                  num_threads=num_threads)

        per_user_rank = reciprocal_rank(model,
                                        test,
                                        train_interactions=train,
                                        item_features=item_features,
                                        num_threads=num_threads)

        yield (per_user_rank.mean(), hyperparams, model)

In [None]:
# Short aliases for the splits and the default item features stored in `data`.
train, test = data['train'], data['test']
train_neg, test_neg = data['train_neg'], data['test_neg']
item_features = data['item_features']

# Collects the result dict of every evaluate() run, keyed by experiment name.
evaluation_results = {}

In [None]:
# Find the optimal hyperparameters
# Run the random search and keep the candidate whose score (first tuple
# element) is highest.
(score, hyperparams, model) = max(
    random_search(train, test, item_features, num_threads=2),
    key=lambda candidate: candidate[0],
)

print("Best score {} at {}".format(score, hyperparams))

Best score 0.8614028096199036 at {'no_components': 32, 'learning_schedule': 'adadelta', 'loss': 'warp', 'learning_rate': 0.027724575479789855, 'item_alpha': 3.0564472985493036e-09, 'user_alpha': 2.8090807112176496e-09, 'max_sampled': 13, 'num_epochs': 11}


In [None]:
# Find the optimal hyperparameters
# This repeats the random search with the same call as the cell above and
# rebinds `score`, `hyperparams` and `model` to the best candidate of this run.
# The search is randomised, so each run can pick a different winner.
(score, hyperparams, model) = max(random_search(train, 
                                                test, 
                                                item_features, 
                                                num_threads=2), 
                                  key=lambda x: x[0])

print("Best score {} at {}".format(score, hyperparams))

Best score 0.33556807041168213 at {'no_components': 34, 'learning_schedule': 'adadelta', 'loss': 'warp', 'learning_rate': 0.07488944297869128, 'item_alpha': 9.243578003830079e-09, 'user_alpha': 7.313654576476887e-09, 'max_sampled': 9, 'num_epochs': 44}


In [None]:
def evaluate(loss, train, test, item_features=None):
  # Define the hyperparameters
  NUM_THREADS = 2
  NUM_COMPONENTS = 21
  LEARNING_RATE = 0.06
  ITEM_ALPHA = 5.9e-08
  USER_ALPHA = 10e-09
  MAX_SAMPLED = 7
  NUM_EPOCHS = 25

  # Define the model
  model = LightFM(loss=loss,
                  learning_schedule='adagrad',
                  learning_rate=LEARNING_RATE,
                  user_alpha=USER_ALPHA,
                  item_alpha=ITEM_ALPHA,
                  max_sampled=MAX_SAMPLED,
                  no_components=NUM_COMPONENTS)
  # Fit the model.
  %time model = model.fit(train, item_features=item_features, epochs=NUM_EPOCHS, num_threads=NUM_THREADS)

  # Calculate AUC and P@K for the training set
  train_auc = auc_score(model,
                        train,
                        item_features=item_features,
                        num_threads=NUM_THREADS).mean()

  train_precision = precision_at_k(model, 
                                  train,
                                  item_features=item_features, 
                                  num_threads=NUM_THREADS).mean()

  train_recall = recall_at_k(model,
                        train,
                        item_features=item_features,
                        num_threads=NUM_THREADS).mean()

  train_reciprocal = reciprocal_rank(model, 
                                  train,
                                  item_features=item_features, 
                                  num_threads=NUM_THREADS).mean()
  # Calculate AUC and P@K for the test set
  test_auc = auc_score(model,
                      test,
                      train_interactions=train, 
                      item_features=item_features,
                      num_threads=NUM_THREADS).mean()

  test_precision = precision_at_k(model, 
                                  test,
                                  train_interactions=train,
                                  item_features=item_features, 
                                  k=5,
                                  num_threads=NUM_THREADS).mean()

  test_recall = recall_at_k(model,
                      test,
                      train_interactions=train, 
                      item_features=item_features,
                      k=5,
                      num_threads=NUM_THREADS).mean()

  test_reciprocal = reciprocal_rank(model, 
                                  test,
                                  train_interactions=train, 
                                  item_features=item_features, 
                                  num_threads=NUM_THREADS).mean()
  results = dict()
  results['model'] = model
  results['train_auc'] = train_auc
  results['train_precision'] = train_precision
  results['train_recall'] = train_recall
  results['train_reciprocal'] = train_reciprocal
  results['test_auc'] = test_auc
  results['test_precision'] = test_precision
  results['test_recall'] = test_recall
  results['test_reciprocal'] = test_reciprocal

  return results


In [None]:

# Evaluate WARP, BPR and logistic models with data['item_features'] as item features.
# WARP and BPR are fitted on train/test; the logistic run uses train_neg/test_neg.
# Only the first call is wrapped in %time (evaluate() also times each fit itself).
%time evaluation_results['label_freq_warp'] = evaluate('warp', train, test, item_features=data['item_features'])
evaluation_results['label_freq_bpr'] = evaluate('bpr', train, test, item_features=data['item_features'])
evaluation_results['label_freq_logistic'] = evaluate('logistic', train_neg, test_neg, item_features=data['item_features'])

CPU times: user 2min 37s, sys: 47.2 ms, total: 2min 37s
Wall time: 1min 20s
CPU times: user 4min 29s, sys: 84.8 ms, total: 4min 29s
Wall time: 2min 16s
CPU times: user 3min 29s, sys: 62.4 ms, total: 3min 29s
Wall time: 1min 50s
CPU times: user 3min 8s, sys: 48.9 ms, total: 3min 8s
Wall time: 1min 35s


In [None]:
# Same three runs (WARP, BPR, logistic) with data['conf_features'] as item features.
# The logistic run uses train_neg/test_neg; only the first call is wrapped in %time.
%time evaluation_results['label_conf_warp'] = evaluate('warp', train, test, item_features=data['conf_features'])
evaluation_results['label_conf_bpr'] = evaluate('bpr', train, test, item_features=data['conf_features'])
evaluation_results['label_conf_logistic'] = evaluate('logistic', train_neg, test_neg, item_features=data['conf_features'])

CPU times: user 2min 42s, sys: 37.9 ms, total: 2min 42s
Wall time: 1min 22s
CPU times: user 4min 34s, sys: 60.9 ms, total: 4min 34s
Wall time: 2min 18s
CPU times: user 3min 30s, sys: 68.7 ms, total: 3min 30s
Wall time: 1min 46s
CPU times: user 3min 7s, sys: 39.9 ms, total: 3min 7s
Wall time: 1min 39s


In [None]:
# Same three runs (WARP, BPR, logistic) with data['genre_features'] as item features.
# The logistic run uses train_neg/test_neg; only the first call is wrapped in %time.
%time evaluation_results['genre_warp'] = evaluate('warp', train, test, item_features=data['genre_features'])
evaluation_results['genre_bpr'] = evaluate('bpr', train, test, item_features=data['genre_features'])
evaluation_results['genre_logistic'] = evaluate('logistic', train_neg, test_neg, item_features= data['genre_features'])

CPU times: user 15.2 s, sys: 11 ms, total: 15.2 s
Wall time: 7.78 s
CPU times: user 29.1 s, sys: 18 ms, total: 29.1 s
Wall time: 14.8 s
CPU times: user 16.6 s, sys: 14 ms, total: 16.7 s
Wall time: 8.47 s
CPU times: user 18 s, sys: 11 ms, total: 18 s
Wall time: 9.11 s


In [None]:
# Same three runs (WARP, BPR, logistic) with data['tags'] as item features.
# NOTE(review): the tag statistics in the "Data info" cell are commented out,
# so confirm that this dataset bundle actually contains a 'tags' entry.
evaluation_results['tags_warp'] = evaluate('warp', train, test, item_features=data['tags'])
evaluation_results['tags_bpr'] = evaluate('bpr', train, test, item_features=data['tags'])
evaluation_results['tags_logistic'] = evaluate('logistic', train_neg, test_neg, item_features=data['tags'])

CPU times: user 15.1 s, sys: 0 ns, total: 15.1 s
Wall time: 7.75 s
CPU times: user 22.4 s, sys: 0 ns, total: 22.4 s
Wall time: 11.4 s
CPU times: user 22.5 s, sys: 0 ns, total: 22.5 s
Wall time: 11.4 s


In [None]:
%time evaluation_results['cf_warp'] = evaluate('warp', train, test)
evaluation_results['cf_bpr'] = evaluate('bpr', train, test)
evaluation_results['cf_logistic'] = evaluate('logistic', train, test)


CPU times: user 10.7 s, sys: 7.98 ms, total: 10.7 s
Wall time: 5.45 s
CPU times: user 21.7 s, sys: 13 ms, total: 21.7 s
Wall time: 11.1 s
CPU times: user 12.6 s, sys: 14 ms, total: 12.6 s
Wall time: 6.47 s
CPU times: user 7.67 s, sys: 3.99 ms, total: 7.67 s
Wall time: 3.89 s


In [None]:
# Persist the results collected so far. The `with` block closes (and thereby
# flushes) the file, which a bare open() inside the call never did explicitly.
# NOTE(review): the file name says "10M" while the data was loaded from
# 1Mdatasets.p; confirm this is the intended target.
with open('/content/drive/My Drive/Master/Data/10MColdUsersEvaluation.p', 'wb') as outfile:
    pickle.dump(evaluation_results, outfile)

In [None]:
# Same three runs (WARP, BPR, logistic) with data['subs_features'] as item features.
# All three calls are wrapped in %time. The recorded wall times for this cell
# are between roughly 35 minutes and one hour per timing line.
%time evaluation_results['subs_warp'] = evaluate('warp', train, test, item_features=data['subs_features'])
%time evaluation_results['subs_bpr'] = evaluate('bpr', train, test, item_features=data['subs_features'])
%time evaluation_results['subs_logistic'] = evaluate('logistic', train_neg, test_neg, item_features=data['subs_features'])

CPU times: user 1h 15min 43s, sys: 1.19 s, total: 1h 15min 44s
Wall time: 38min 27s
CPU times: user 1h 54min 15s, sys: 1.84 s, total: 1h 54min 17s
Wall time: 58min 2s
CPU times: user 1h 20min 37s, sys: 1.24 s, total: 1h 20min 39s
Wall time: 40min 53s
CPU times: user 1h 59min 9s, sys: 1.85 s, total: 1h 59min 10s
Wall time: 1h 23s
CPU times: user 1h 10min 16s, sys: 1.03 s, total: 1h 10min 17s
Wall time: 35min 39s
CPU times: user 1h 50min 42s, sys: 1.7 s, total: 1h 50min 43s
Wall time: 56min 20s


In [None]:
# Persist all evaluation results (including the fitted models they contain).
# The `with` block closes the file once the dump is complete.
with open('/content/drive/My Drive/Master/Data/1Mevaluation.p', 'wb') as outfile:
    pickle.dump(evaluation_results, outfile)

In [None]:
# Reload the saved evaluation results; this replaces the in-memory dict.
# The `with` block closes the file even when unpickling fails.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
with open('/content/drive/My Drive/Master/Data/1Mevaluation.p', 'rb') as infile:
    evaluation_results = pickle.load(infile)

In [None]:
# Display the whole results dict (one entry per experiment, each holding the model and its metrics).
evaluation_results



{'cf_bpr': {'model': <lightfm.lightfm.LightFM at 0x7f938469a5f8>,
  'test_auc': 0.8283109,
  'test_precision': 0.15252402,
  'test_recall': 0.10703395440290878,
  'test_reciprocal': 0.3504381,
  'train_auc': 0.92955387,
  'train_precision': 0.5139497,
  'train_recall': 0.27121806253925584,
  'train_reciprocal': 0.88829994},
 'cf_logistic': {'model': <lightfm.lightfm.LightFM at 0x7f937cb0c0b8>,
  'test_auc': 0.8657818,
  'test_precision': 0.12908296,
  'test_recall': 0.08271237881203101,
  'test_reciprocal': 0.30233666,
  'train_auc': 0.86526835,
  'train_precision': 0.2755964,
  'train_recall': 0.11429135599780017,
  'train_reciprocal': 0.5482946},
 'cf_warp': {'model': <lightfm.lightfm.LightFM at 0x7f937ca6c780>,
  'test_auc': 0.920886,
  'test_precision': 0.20887338,
  'test_recall': 0.14438073156552675,
  'test_reciprocal': 0.42816287,
  'train_auc': 0.9640199,
  'train_precision': 0.49073893,
  'train_recall': 0.26120070915685,
  'train_reciprocal': 0.7548461},
 'genre_bpr': {'mode

In [None]:
# Display the results dict again; the output reflects whatever `evaluation_results` held when this cell ran.
evaluation_results

{'cf_bpr': {'model': <lightfm.lightfm.LightFM at 0x7f7c386cb978>,
  'test_auc': 0.85921216,
  'test_precision': 0.12343063,
  'test_recall': 0.13169008184960684,
  'test_reciprocal': 0.30568025,
  'train_auc': 0.9627521,
  'train_precision': 0.4591892,
  'train_recall': 0.4009652344081713,
  'train_reciprocal': 0.86753994},
 'cf_logistic': {'model': <lightfm.lightfm.LightFM at 0x7f7c386cbe80>,
  'test_auc': 0.9382832,
  'test_precision': 0.10103702,
  'test_recall': 0.10801101037189173,
  'test_reciprocal': 0.25803545,
  'train_auc': 0.9392178,
  'train_precision': 0.21941386,
  'train_recall': 0.1526927937370082,
  'train_reciprocal': 0.45815992},
 'cf_warp': {'model': <lightfm.lightfm.LightFM at 0x7f7c386cb438>,
  'test_auc': 0.96734726,
  'test_precision': 0.1903408,
  'test_recall': 0.1941912592143658,
  'test_reciprocal': 0.406352,
  'train_auc': 0.9868507,
  'train_precision': 0.41289216,
  'train_recall': 0.3520498837347658,
  'train_reciprocal': 0.6985445},
 'genre_bpr': {'mode

In [None]:
# Define the hyperparameters
NUM_THREADS = 2
NUM_COMPONENTS = 21
LEARNING_RATE = 0.06
ITEM_ALPHA = 5.9e-08
USER_ALPHA = 10e-09
MAX_SAMPLED = 7
NUM_EPOCHS = 25


# Define the model
model = LightFM(loss='warp',
                learning_schedule='adagrad',
                learning_rate=LEARNING_RATE,
                user_alpha=USER_ALPHA,
                item_alpha=ITEM_ALPHA,
                max_sampled=MAX_SAMPLED,
                no_components=NUM_COMPONENTS)

# Fit the model.
%time model = model.fit(train,item_features=item_features,epochs=NUM_EPOCHS,num_threads=NUM_THREADS)


CPU times: user 2min 53s, sys: 60.1 ms, total: 2min 53s
Wall time: 1min 27s


In [None]:
# Module-level alias of data['user_dict']; the helper functions below read and mutate it in place.
user_dict = data['user_dict']
def new_user():
  """Register a new user and return an empty profile for them.

  The new id is one more than the largest key of the module-level
  ``user_dict``. The id is added to ``user_dict`` via ``update_userdict``.

  Returns
  -------
  dict with the keys ``'userId'`` (the new id) and ``'interactions'`` (an
  empty list).

  Raises
  ------
  ValueError if ``user_dict`` is empty (``max`` of an empty sequence).
  """
  # Fixed: this used to read the undefined name `user_d` (NameError).
  userId = max(user_dict.keys())+1
  update_userdict(userId)

  return {'userId': userId,
          'interactions': []}

def update_userdict(newId):
  """Add ``newId`` to the module-level ``user_dict``.

  The stored value is one more than the value held by the largest existing
  key (presumably the next matrix row index; confirm against how user_dict
  is built).
  """
  largest_id = max(user_dict)
  next_value = user_dict[largest_id] + 1
  user_dict[newId] = next_value

def add_interaction(movieId):
  """Placeholder for recording an interaction with ``movieId``.

  The function was left without a body, which is a SyntaxError and stops the
  whole cell from running. Until it is implemented it fails loudly.
  """
  raise NotImplementedError("add_interaction is not implemented yet")

In [None]:
class Recommender:
  """Bundle a model with the data needed to serve recommendations.

  Attributes
  ----------
  model: the recommender model object.
  interactions: the interaction data the model works with.
  item_features: item feature data used alongside the model.
  user_dict: dict keyed by user id. ``new_user`` gives a new id the value of
      the largest existing id plus one (presumably a row index; confirm).
  item_dict: dict describing the items.
  """

  def __init__(self, model, interactions, item_features, user_dict, item_dict):
    self.model = model
    self.interactions = interactions
    self.item_features = item_features
    self.user_dict = user_dict
    self.item_dict = item_dict

  def get_model(self):
    """Return the current model."""
    return self.model

  def set_model(self, model):
    """Replace the current model."""
    self.model = model

  def get_user_dict(self):
    """Return the user dictionary (the same object, not a copy)."""
    return self.user_dict

  def set_user_dict(self, user_dict):
    """Replace the user dictionary."""
    self.user_dict = user_dict

  def new_user(self):
    """Register a new user and return an empty profile for them.

    The new id is the largest existing id plus one, and its dictionary value
    is the value of that largest id plus one. The user dictionary is updated
    in place.

    Returns a dict with ``'userId'`` and an empty ``'interactions'`` list.
    Raises ValueError if the user dictionary is empty.
    """
    known_users = self.get_user_dict()
    # Look up the largest id before inserting, so both increments are
    # relative to the previously largest id.
    last_user_id = max(known_users.keys())
    userId = last_user_id + 1
    known_users[userId] = known_users[last_user_id] + 1
    self.set_user_dict(known_users)

    return {'userId': userId,
            'interactions': []}

  def retrain_add_user(self, user_obj):
    """Placeholder: retrain the model so it covers ``user_obj``.

    The body was left unfinished (``model = ``), which was a SyntaxError.
    """
    raise NotImplementedError("retrain_add_user is not implemented yet")

  def recommend_items(self, userId):
    """Placeholder: return recommendations for ``userId`` (was left without a body)."""
    raise NotImplementedError("recommend_items is not implemented yet")



In [None]:
# Display `user_obj`.
# NOTE(review): no visible cell assigns a global `user_obj` (it only exists as a
# local inside new_user()), so on a fresh kernel this may raise NameError. Confirm.
user_obj

{'id': 6041, 'interactions': []}

In [None]:
# Register one more user by hand: the new id is the largest existing id plus
# one, and its value is the value of that largest id plus one (the same rule
# update_userdict() applies).
# Fixed: `user_d` was undefined, and the "+1" sat inside the subscript, which
# looked up a key that does not exist yet.
# NOTE: re-running this cell adds yet another user.
userId = max(user_dict.keys())+1
user_dict[userId] = user_dict[userId - 1] + 1

In [None]:
# One all-zero interaction row for a user without any interactions.
# The width is taken from `train` (the matrix this row is appended to later)
# instead of the hard-coded 1514.
dummy_user = np.zeros(shape=(1, train.shape[1]), dtype=np.int64)

# NOTE(review): this rebinds `new_user`, shadowing the new_user() function
# defined earlier in the notebook. Later cells read this name, so it is kept.
new_user=csr_matrix(dummy_user)

In [None]:
 # Display the dummy user row as a dense array.
 np.array(new_user.todense())

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# Dense 2-D ndarray copy of the sparse dummy-user row.
a = new_user.toarray()

In [None]:
# Display the training interactions as a dense array (this materialises the full matrix).
np.array(train.todense())

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [None]:

# Stack the dummy user row(s) in `a` underneath the dense training matrix and
# convert the result back to CSR.
new_interactions = csr_matrix(np.concatenate((train.toarray(), a), axis=0))
new_interactions

<6042x1514 sparse matrix of type '<class 'numpy.float64'>'
	with 218012 stored elements in Compressed Sparse Row format>

In [None]:
# Retrain so the model also covers the appended dummy user.
# The previous call, fit_partial() on the single dummy row without
# item_features, raised ValueError: the matrix dimensions no longer matched
# the embeddings learned in the fit above. fit() starts from scratch on the
# enlarged interaction matrix, with the same item features as the original fit.
# NOTE(review): this replaces the model trained above; confirm a full refit is intended.
model = model.fit(new_interactions, item_features=item_features, epochs=NUM_EPOCHS, num_threads=NUM_THREADS)

ValueError: ignored

In [None]:
# Persist the current `model`. The `with` block closes the file once the dump is complete.
with open('/content/drive/My Drive/Master/Data/hybrid_label_model.p', 'wb') as outfile:
    pickle.dump(model, outfile)

In [None]:
'''

Sample recommendations

'''

In [None]:
# Integer arrays holding the keys of the item and user dictionaries.
items = np.array(list(data['item_dict']), dtype=int)

users = np.array(list(data['user_dict']), dtype=int)

In [None]:
# Rebind `items` to the item_dict values as a string array (this replaces the
# array of keys built in the previous cell) and display it.
items = np.array(list(data['item_dict'].values()), dtype=str)
items

array(['Mary Reilly (1996)', 'Vampire in Brooklyn (1995)',
       'Beautiful Girls (1996)', ..., 'Meet the Parents (2000)',
       'Tigerland (2000)', 'Contender, The (2000)'], dtype='<U79')

In [None]:
# Sanity check: the number of item titles, the item-feature matrix, the
# training matrix and the model's user embeddings should have matching dimensions.
# Fixed: `items_np` is never assigned in this notebook (it only appears in
# commented-out code); the item array is called `items`.
print(items.shape[0])
print(item_features.shape)
print(train.shape)
print(model.user_embeddings.shape)

3254
(3254, 985)
(610, 3254)
(610, 21)


In [None]:
def sample_recommendation(model, items, data, user_ids):
    """Print known positives and top recommendations for some users.

    Parameters
    ----------
    model: object with a ``predict(user_id, item_ids, item_features=...)``
        method returning one score per item.
    items: array indexable by item index (e.g. item titles).
    data: dict whose ``'train'`` entry is a sparse interaction matrix.
    user_ids: iterable of row indices into ``data['train']``.

    For each user, at most five known positives (columns stored in the
    user's training row) and the five highest-scoring items are printed.
    Items the user already interacted with are not removed from the
    recommendations.

    NOTE(review): scoring uses the module-level ``item_features``, not a
    parameter, so it must match the features the model was fitted with.
    """
    # Both are loop-invariant: convert to CSR and build the item id range once.
    train_csr = data['train'].tocsr()
    item_ids = np.arange(train_csr.shape[1])

    for user_id in user_ids:
        known_positives = items[train_csr[user_id].indices]

        scores = model.predict(user_id, item_ids, item_features=item_features)
        # Negate so argsort yields the highest score first
        top_items = items[np.argsort(-scores)]

        print("User %s" % user_id)
        print("     Known positives:")

        for x in known_positives[:5]:
            print("        %s" % x)

        print("     Recommended:")

        for x in top_items[:5]:
            print("        %s" % x)
# Rebind `model` to the fitted model stored under 'label_freq_warp' and print
# sample recommendations for ten hard-coded user indices.
model = evaluation_results['label_freq_warp']['model']
sample_recommendation(model, items, data, [1, 2, 3, 4, 5, 6, 249, 414, 599, 448])

User 1
     Known positives:
        Desperado (1995)
        Like Water for Chocolate (Como agua para chocolate) (1992)
        Maverick (1994)
        Fugitive, The (1993)
        Terminator 2: Judgment Day (1991)
     Recommended:
        Saving Private Ryan (1998)
        Legends of the Fall (1994)
        Thelma & Louise (1991)
        Jerry Maguire (1996)
        Patriot, The (2000)
User 2
     Known positives:
        Dances with Wolves (1990)
        Rock, The (1996)
        Fish Called Wanda, A (1988)
        Monty Python and the Holy Grail (1974)
        Star Wars: Episode V - The Empire Strikes Back (1980)
     Recommended:
        Three Amigos! (1986)
        Butch Cassidy and the Sundance Kid (1969)
        Crocodile Dundee (1986)
        Young Guns (1988)
        Back to the Future (1985)
User 3
     Known positives:
        E.T. the Extra-Terrestrial (1982)
        Jaws (1975)
        Saving Private Ryan (1998)
        Run Lola Run (Lola rennt) (1998)
        Thelma & Lo

In [None]:
# String array of the keys of data['full_vocab'], in dictionary order.
tag_labels = np.array(list(data['full_vocab']), dtype=str)
def get_similar_tags(model, tag_id, n_similar=3):
    """Return the indices of the embeddings most similar to ``tag_id``.

    Similarity is the cosine of the angle between rows of
    ``model.item_embeddings``.

    Parameters
    ----------
    model: object exposing a 2-D ``item_embeddings`` array.
    tag_id: row index of the query embedding.
    n_similar: number of neighbours to return (default 3).

    Returns
    -------
    Array of up to ``n_similar`` row indices, most similar first. The query
    row itself is never part of the result.
    """
    embeddings = np.asarray(model.item_embeddings)

    # Normalize the vectors to unit length. All-zero rows keep a divisor of 1,
    # so they stay zero instead of turning into NaN.
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    unit_embeddings = embeddings / norms

    similarity = np.dot(unit_embeddings, unit_embeddings[tag_id])
    ranked = np.argsort(-similarity)

    # Drop the query explicitly. It is not guaranteed to be ranked first,
    # e.g. when another row has an identical embedding.
    ranked = ranked[ranked != tag_id]

    return ranked[:n_similar]


# Print the nearest embedding neighbours for a few hand-picked labels.
for tag in ('mobile_home', 'ballplayer', 'snorkel', 'comic_book'):
    # Position of the label within tag_labels (ValueError if it is missing)
    tag_id = list(tag_labels).index(tag)
    print('Most similar tags for %s: %s'
          % (tag_labels[tag_id], tag_labels[get_similar_tags(model, tag_id)]))

Most similar tags for mobile_home: ['macaw' 'lens_cap' 'hen-of-the-woods']
Most similar tags for ballplayer: ['motor_scooter' 'steam_locomotive' 'stingray']
Most similar tags for snorkel: ['Yorkshire_terrier' 'common_iguana' 'rain_barrel']
Most similar tags for comic_book: ['shower_curtain' 'barbershop' 'hummingbird']
