# First version of recommendation model

### Links

Tutorial: [link](https://www.stepbystepdatascience.com/hybrid-recommender-lightfm-python)

Dataset: [instacart-market-basket-analysis](https://www.kaggle.com/datasets/psparks/instacart-market-basket-analysis)

### Prerequisites

Download the dataset and place its files in the folder: ../data/instacart-market-basket-analysis

In [None]:
!pip install --upgrade pip setuptools wheel
!pip install numpy pandas scikit-learn scipy unidecode optuna plotly nbformat pickle
!pip install --no-use-pep517 lightfm # https://github.com/lyst/lightfm/issues/687#issuecomment-1523956355

: 

In [None]:
import scipy
from os import path
import numpy as np
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, recall_at_k
from lightfm.cross_validation import random_train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from unidecode import unidecode # to deal with accents
import pickle

import sklearn

sklearn.show_versions()


System:
    python: 3.11.6 (main, Nov  2 2023, 04:39:40) [Clang 14.0.0 (clang-1400.0.29.202)]
executable: /Users/alv.popov/prj/grifon/recommendation/learning/venv/bin/python3.11
   machine: macOS-14.1.2-arm64-arm-64bit

Python dependencies:
      sklearn: 1.4.1.post1
          pip: 24.0
   setuptools: 69.2.0
        numpy: 1.26.4
        scipy: 1.12.0
       Cython: None
       pandas: 2.2.1
   matplotlib: 3.8.3
       joblib: 1.3.2
threadpoolctl: 3.4.0

Built with OpenMP: True

threadpoolctl info:
       user_api: blas
   internal_api: openblas
    num_threads: 10
         prefix: libopenblas
       filepath: /Users/alv.popov/prj/grifon/recommendation/learning/venv/lib/python3.11/site-packages/numpy/.dylibs/libopenblas64_.0.dylib
        version: 0.3.23.dev
threading_layer: pthreads
   architecture: armv8

       user_api: blas
   internal_api: openblas
    num_threads: 10
         prefix: libopenblas
       filepath: /Users/alv.popov/prj/grifon/recommendation/learning/venv/lib/pytho

In [None]:
data_path = '../data'
dataset_path = path.join(data_path, 'instacart-market-basket-analysis')

# Read the four order/lookup tables from the dataset folder, in the same order as they are named
orders, products, aisles, departments = (
    pd.read_csv(path.join(dataset_path, csv_name))
    for csv_name in ('orders.csv', 'products.csv', 'aisles.csv', 'departments.csv')
)

# The order lines come in two files (prior + train); stack them into a single frame
order_products = pd.concat([
    pd.read_csv(path.join(dataset_path, csv_name))
    for csv_name in ('order_products__prior.csv', 'order_products__train.csv')
])

# Check the chronology of the data
orders.loc[orders["user_id"] == 1].sort_values("order_number")

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0
5,3367565,1,prior,6,2,7,19.0
6,550135,1,prior,7,1,9,20.0
7,3108588,1,prior,8,1,14,14.0
8,2295261,1,prior,9,1,16,0.0
9,2550362,1,prior,10,4,8,30.0


In [None]:
# Run every product and aisle name through unidecode to remove accents
products['product_name'] = pd.Series(list(map(unidecode, products['product_name'])))
aisles['aisle'] = pd.Series(list(map(unidecode, aisles['aisle'])))

# Get the orders + items for each data set
def get_orders(subset):
    """Return one row per purchased product for the orders of one eval set.

    ``subset`` is compared against ``orders["eval_set"]``. The matching
    orders are inner-joined with the order lines and then with the product,
    aisle and department tables; the ``aisle_id`` and ``department_id`` join
    keys are dropped from the result.
    """
    in_subset = orders["eval_set"] == subset
    return (
        orders.loc[in_subset, ["user_id", "order_id"]]
        .merge(order_products[["order_id", "product_id"]], how='inner', on="order_id")
        .merge(products, how='inner', on="product_id")
        .merge(aisles, how='inner', on="aisle_id")
        .merge(departments, how='inner', on="department_id")
        .drop(columns=["aisle_id", "department_id"])
    )
# The "prior" eval set is used as the training period, the "train" eval set as the test period
train_orders, test_orders = (get_orders(eval_set) for eval_set in ("prior", "train"))

# List of all products
products = products.merge(aisles, how='inner', on="aisle_id").drop_duplicates()

# Which products are bought the most?
train_orders["product_name"].value_counts().head(10)

product_name
Banana                    472565
Bag of Organic Bananas    379450
Organic Strawberries      264683
Organic Baby Spinach      241921
Organic Hass Avocado      213584
Organic Avocado           176815
Large Lemon               152657
Strawberries              142951
Limes                     140627
Organic Whole Milk        137905
Name: count, dtype: int64

In [None]:
# Get all products purchased by each user
def create_train_data(dataset, max_weight=5):
    """Aggregate order lines into one weighted row per (user, product) pair.

    Args:
        dataset: DataFrame with at least ``user_id`` and ``product_name``
            columns, one row per purchased item.
        max_weight: Upper bound applied to the purchase count. Defaults to 5,
            the cap that was previously hard-coded.

    Returns:
        DataFrame with columns ``user_id``, ``product_name`` and ``weight``,
        where ``weight`` is the number of rows for that pair, capped at
        ``max_weight``.
    """
    pairs = dataset[["user_id", "product_name"]]

    # Count how often each user bought each product
    counts = pairs.groupby(["user_id", "product_name"], as_index=False).size()

    # Cap the count so that very frequent repurchases do not dominate the weights
    counts["weight"] = np.where(counts["size"] >= max_weight, max_weight, counts["size"])
    return counts[["user_id", "product_name", "weight"]]
# Build the weighted (user, product) training pairs from the training-period orders
train = create_train_data(train_orders)

# Create our test set
def create_test_data(test, train):
    """Return the distinct (user, product) pairs of ``test`` that are usable for evaluation.

    A pair is kept only when its ``user_id`` and its ``product_name`` both
    occur somewhere in ``train``.
    """
    pairs = test[["user_id", "product_name"]].drop_duplicates()
    known_users = train["user_id"].drop_duplicates()
    known_items = train["product_name"].drop_duplicates()

    # Merging on the shared column drops users, then items, that are absent from training
    for known in (known_users, known_items):
        pairs = pairs.merge(known)
    return pairs
# Distinct test-period (user, product) pairs restricted to users and products seen in training
test = create_test_data(test_orders, train)

# Create a test set that excludes repurchases
def create_new_only_test_data(test, train):
    """Return the rows of ``test`` whose (user, product) pair does not occur in ``train``.

    ``train`` is left-joined onto ``test``; rows that found no match have a
    missing ``weight`` and are the ones kept. The ``weight`` column is removed
    from the result.
    """
    keys = ['user_id', 'product_name']
    joined = test.merge(train, how='left', on=keys)
    not_bought_before = joined["weight"].isna()
    return joined.loc[not_bought_before].drop(columns=["weight"])
# Test pairs that the user had not already bought in the training data
test_new = create_new_only_test_data(test, train)

# Unique user IDs present in the training data
train_users = train["user_id"].unique()

# Unique product names present in the training data (product_name serves as the item ID)
train_items = train["product_name"].unique()

In [None]:
# Create user, item and feature mappings: (user id map, user feature map, item id map, item feature map)
dataset = Dataset()  # LightFM helper object that holds the ID mappings
# Register every training user and item so that each one gets an internal index
dataset.fit(users=train_users, items=train_items)
len(dataset.mapping())  # we always get 4x mappings out

4

In [None]:
# Only the user-id map (position 0) and the item-id map (position 2) are needed for now;
# the feature maps are used later on
user_mappings, item_mappings = (dataset.mapping()[position] for position in (0, 2))

len(user_mappings), len(item_mappings)

(206209, 49669)

In [None]:
# Have a look at the mappings: each entry is a (key, value) pair of user_mappings
list(user_mappings.items())[:5] # first 5 mappings

[(1, 0), (2, 1), (3, 2), (4, 3), (5, 4)]

In [None]:
# Invert both maps so that their values can be translated back to the original keys
inv_user_mappings = dict(zip(user_mappings.values(), user_mappings.keys()))
inv_item_mappings = dict(zip(item_mappings.values(), item_mappings.keys()))
list(inv_item_mappings.items())[:5]

[(0, '0% Greek Strained Yogurt'),
 (1, 'Aged White Cheddar Popcorn'),
 (2, 'Bag of Organic Bananas'),
 (3, 'Bartlett Pears'),
 (4, 'Cinnamon Toast Crunch')]

In [None]:
# Build the interaction and weight matrices from the (user, item, weight) rows of train
train_interactions, train_weights = dataset.build_interactions(
    train[['user_id', 'product_name', 'weight']].to_numpy()
)
train_interactions, train_weights

(<206209x49669 sparse matrix of type '<class 'numpy.int32'>'
 	with 13307839 stored elements in COOrdinate format>,
 <206209x49669 sparse matrix of type '<class 'numpy.float32'>'
 	with 13307839 stored elements in COOrdinate format>)

In [None]:
# Have a look at the matrices
# NOTE(review): todense() materialises every cell of both matrices; at the 206209x49669 shape
# reported for train_interactions that is roughly 10^10 entries each - confirm this fits in memory.
train_interactions.todense(), train_weights.todense() # weights and interactions are the same if we just use 1s

(matrix([[1, 1, 1, ..., 0, 0, 0],
         [0, 0, 1, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         ...,
         [0, 0, 1, ..., 0, 0, 0],
         [0, 0, 1, ..., 0, 0, 0],
         [0, 0, 1, ..., 0, 0, 0]], dtype=int32),
 matrix([[1., 2., 2., ..., 0., 0., 0.],
         [0., 0., 1., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 5., ..., 0., 0., 0.],
         [0., 0., 5., ..., 0., 0., 0.],
         [0., 0., 5., ..., 0., 0., 0.]], dtype=float32))

In [None]:
# Create Test set - notice that LightFM automatically makes it the same size as Train to preserve integer mappings
test_interactions, test_weights = dataset.build_interactions(
    test[['user_id', 'product_name']].to_numpy()
)
test_interactions, test_weights

# Create a new-products-purchased-only Test set
test_new_interactions, test_new_weights = dataset.build_interactions(
    test_new[['user_id', 'product_name']].to_numpy()
)
test_new_interactions, test_new_weights

(<206209x49669 sparse matrix of type '<class 'numpy.int32'>'
 	with 555776 stored elements in COOrdinate format>,
 <206209x49669 sparse matrix of type '<class 'numpy.float32'>'
 	with 555776 stored elements in COOrdinate format>)

In [None]:
# Baseline model: WARP loss, 10 latent components, no regularisation
model = LightFM(
    no_components=10,  # the dimensionality of the feature latent embeddings
    learning_schedule='adagrad',  # type of optimiser to use
    loss='warp',  # loss type
    learning_rate=0.05,  # set the initial learning rate
    item_alpha=0.0,  # L2 penalty on item features
    user_alpha=0.0,  # L2 penalty on users features
    max_sampled=10,  # maximum number of negative samples used during WARP fitting
    random_state=123,
)

# Fit on the training interactions only (no sample weights at this stage)
model.fit(train_interactions, epochs=2, verbose=True)

# Measure how well it did in the Test period
for metric in (precision_at_k, recall_at_k):
    # Get the precision and recall for Train and Test
    for data, name in ((train_interactions, "Train"), (test_interactions, "Test")):
        print(f"{name} {metric.__name__}: %.2f" % metric(model, data, k=10).mean())

    # What about for just new-to-user purchases?
    print(f"Test new {metric.__name__}: %.2f" % metric(
        model,
        test_new_interactions,
        train_interactions=train_interactions,  # suppress previously bought prods from being recommended
        k=10,
    ).mean())

Epoch: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [03:56<00:00, 11.81s/it]


<lightfm.lightfm.LightFM at 0x16db33490>

In [None]:
# Create all user and item matrix to get predictions for it
n_users, n_items = train_interactions.shape

# Force lightFM to create predictions for all users and all items.
# Each user index is repeated n_items times and the full item range is tiled once per user,
# so after the reshape below row u holds the scores of user u.
# np.repeat / np.tile build the index vectors directly instead of concatenating a Python list
# of n_users temporary arrays, and they also work when the matrix has no users.
# NOTE(review): both index vectors and the score vector hold n_users * n_items entries - confirm memory.
scoring_user_ids = np.repeat(np.arange(n_users), n_items)
scoring_item_ids = np.tile(np.arange(n_items), n_users)
scores = model.predict(user_ids=scoring_user_ids, item_ids=scoring_item_ids)
scores = scores.reshape(-1, n_items)  # get 1 row per user
recommendations = pd.DataFrame(scores)
recommendations.shape

# Have a look at the predicted scores for the first 5 users and first 5 items
recommendations.iloc[:5, :5]

Unnamed: 0,0,1,2,3,4
0,-0.466873,1.109881,-2.672024,1.289624,-2.252262
1,-2.466969,-0.612912,1.708214,2.160152,-0.74513
2,-1.199134,-0.412123,-1.790381,2.719636,-1.153012
3,-1.354126,-1.335109,0.527995,2.578909,-0.627769
4,-1.447633,0.308471,-0.503776,0.762689,-1.477878


In [None]:
# Load latent representations to try computing predictions manually
item_biases, item_embeddings = model.get_item_representations()
user_biases, user_embeddings = model.get_user_representations()

# score[u, i] = user_u . item_i + item_bias_i + user_bias_u
# (item biases broadcast across rows, user biases across columns)
manual_scores = (user_embeddings @ item_embeddings.T + item_biases[np.newaxis, :]) + user_biases[:, np.newaxis]
manual_scores.shape

# They match apart from some tiny rounding!
np.allclose(manual_scores, scores, rtol=0, atol=1e-5)

True

In [None]:
# Top 10 predictions for every user (column indices of the k highest scores per row)
k = 10
top_10 = np.argsort(-scores, axis=1)[:, :k]

# Get the previous purchases for every user
previous = np.array(train_interactions.todense())

# Get the previous purchases and the top predictions for user 206114
user = user_mappings.get(206114)

print(
    "Previous purchases:",
    *(inv_item_mappings.get(item_idx) for item_idx in np.arange(previous.shape[1])[previous[user] > 0]),
    sep="\n",
)
# Pair every recommended product name with its rank (0 = best)
print(
    "Top 10 recommendations:",
    *zip((inv_item_mappings.get(item_idx) for item_idx in top_10[user]), range(k)),
    sep="\n",
)

Previous purchases:
Cabernet Sauvignon
Petite Sirah
Merlot
Malbec
Essential Red
Sauvignon Blanc, California, 2011
Natural White Organic Wine
Pinot Grigio, California, 2011
Top 10 recommendations:
('Petite Sirah', 0)
('Malbec', 1)
('Pinot Noir', 2)
('Chardonnay', 3)
('Sauvignon Blanc', 4)
('Pinot Noir California', 5)
('Merlot', 6)
('Pinot Grigio', 7)
('Old Vine Zinfandel', 8)
('Sauvignon Blanc, California, 2011', 9)


In [None]:
# Score on the embedding dot product alone, i.e. leaving out the user and item bias terms
without_biases = model.user_embeddings @ model.item_embeddings.T
without_biases

top_10_without_biases = np.argsort(-without_biases, axis=1)[:, :k]
# Pair every recommended product name with its rank (0 = best)
print(
    "Top 10 less popular recommendations:",
    *zip((inv_item_mappings.get(item_idx) for item_idx in top_10_without_biases[user]), range(k)),
    sep="\n",
)

Top 10 less popular recommendations:
('Petite Sirah', 0)
('Organic Zero Sulfites Red Wine', 1)
('Organic Mendocino Cabernet Sauvignon', 2)
('Chenin Blanc', 3)
('Pinot Noir, California 2010', 4)
('Pinot Grigio, California, 2011', 5)
('Sauvignon Blanc, California, 2011', 6)
('Pinot Noir California', 7)
('Natural White Organic Wine', 8)
('Gruner Veltliner', 9)


In [None]:
# Can also set model biases to be 0 with
model.item_biases *= 0.0  # and then can use predict() as normal

# Measure how well it did in the Test period
for metric in (precision_at_k, recall_at_k):
    # Get the precision and recall for Train and Test
    for data, name in ((train_interactions, "Train"), (test_interactions, "Test ")):
        print(f"{name} {metric.__name__}: %.2f" % metric(model, data, k=10).mean())

    # What about for just new-to-user purchases?
    print(f"Test new {metric.__name__}: %.3f" % metric(
        model,
        test_new_interactions,
        train_interactions=train_interactions,  # suppress previously bought prods from being recommended
        k=10,
    ).mean())

Train precision_at_k: 0.14
Test  precision_at_k: 0.05
Test new precision_at_k: 0.011
Train recall_at_k: 0.58
Test  recall_at_k: 0.38
Test new recall_at_k: 0.072


In [None]:
# Find similar items

def get_similar(model):
    """Return the item-to-item cosine-similarity table of a fitted model.

    The item embeddings are read from ``model.get_item_representations()``.
    Rows and columns of the returned DataFrame are labelled with the keys of
    the notebook-level ``item_mappings``, so a column can be selected by
    item key, e.g. ``get_similar(model)['Banana']``.
    """
    # Only the item embeddings are needed; the item biases are discarded
    _, item_embeddings = model.get_item_representations()

    labels = list(item_mappings.keys())
    item_to_item = pd.DataFrame(cosine_similarity(item_embeddings), index=labels, columns=labels)

    # Return the table explicitly; a bare trailing expression would make the function return None
    return item_to_item

get_similar(model)['Banana'].sort_values(ascending=False)[:5]

In [None]:
import optuna

# Define our hyperparameter search space
def objective(trial):
    """Optuna objective: fit LightFM on a 75% split and return the mean validation precision@10."""
    # Use LightFM's inbuilt train-test split function to create train and validation subsets
    fit_split, val_split = random_train_test_split(train_interactions, test_percentage=0.25, random_state=42)

    # Sample the model hyperparameters (suggestion order is kept as-is)
    model_kwargs = dict(
        no_components=trial.suggest_int("no_components", 5, 64),
        learning_schedule=trial.suggest_categorical("learning_schedule", ["adagrad", "adadelta"]),
        loss=trial.suggest_categorical("loss", ["bpr", "warp", "warp-kos"]),
        learning_rate=trial.suggest_float("learning_rate", 0.001, 1),
        item_alpha=trial.suggest_float("item_alpha", 1e-10, 1e-06, log=True),
        user_alpha=trial.suggest_float("user_alpha", 1e-10, 1e-06, log=True),
        max_sampled=trial.suggest_int("max_sampled", 5, 15),
    )
    # epochs is tuned too, but it is an argument of fit() rather than of the constructor
    n_epochs = trial.suggest_int("epochs", 20, 50)

    candidate = LightFM(random_state=123, **model_kwargs)
    candidate.fit(fit_split, epochs=n_epochs, verbose=True)

    # Score on the held-out split, passing the fit split as train_interactions
    return precision_at_k(candidate, val_split, train_interactions=fit_split, k=10).mean()

# Define the study
study = optuna.create_study(direction="maximize")

# Add in our original hyperparameter values as a starting point for Optuna
study.enqueue_trial(
    params={
        "no_components": 10,
        "learning_schedule": 'adagrad',
        "loss": 'warp',
        "learning_rate": 0.05,
        "item_alpha": 1e-10,
        "user_alpha": 1e-10,
        "max_sampled": 10,
        "epochs": 20,
    }
)

# Run the optimisation
study.optimize(objective, n_trials=50)

# Save the best parameters
best_params = study.best_params
for k, v in best_params.items():
    print(k, ":", v)

# Which parameters were the most important?
optuna.importance.get_param_importances(study)

In [None]:
# epochs is an argument of fit(), not of LightFM(), so take it out of best_params and keep it separately
num_epochs = best_params.pop('epochs')

# Train with the best parameters
model = LightFM(random_state=123, **best_params)
model.fit(train_interactions, epochs=num_epochs, verbose=True)

# Measure how well it did in the Test period
for metric in (precision_at_k, recall_at_k):
    # Get the precision and recall for Train and Test
    for data, name in ((train_interactions, "Train"), (test_interactions, "Test ")):
        print(f"{name} {metric.__name__}: %.2f" % metric(model, data, k=10).mean())

    # What about for just new-to-user purchases?
    print(f"Test new {metric.__name__}: %.3f" % metric(
        model,
        test_new_interactions,
        train_interactions=train_interactions,  # suppress previously bought prods from being recommended
        k=10,
    ).mean())

In [None]:
# Five highest cosine-similarity entries in the 'Banana' column for the retrained model
get_similar(model)['Banana'].sort_values(ascending=False)[:5]

In [None]:
def objective(trial):
    """Optuna objective that also tunes whether sample weights are used.

    Fits LightFM on a 75% split of ``train_interactions`` and returns the mean
    precision@10 on the remaining 25%. The ``sample_weight`` option is
    suggested as the string ``"None"`` or ``"train_weights"``.
    """
    # Split interactions and weights with the same seed so both are shuffled and cut identically:
    # LightFM requires sample_weight to have the same entries, in the same order, as the
    # interactions passed to fit(), so the full train_weights matrix cannot be used with the split.
    train, val = random_train_test_split(train_interactions, test_percentage=0.25, random_state=42)
    train_split_weights, _ = random_train_test_split(train_weights, test_percentage=0.25, random_state=42)

    param = {
        'no_components': trial.suggest_int("no_components", 5, 64),
        "learning_schedule": trial.suggest_categorical("learning_schedule", ["adagrad", "adadelta"]),
        "loss":  trial.suggest_categorical("loss", ["warp"]),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 1),
        "item_alpha": trial.suggest_float("item_alpha", 1e-10, 1e-06, log=True),
        "user_alpha": trial.suggest_float("user_alpha", 1e-10, 1e-06, log=True),
        "max_sampled": trial.suggest_int("max_sampled", 5, 15),
    }
    epochs = trial.suggest_int("epochs", 20, 50)
    sample_weights = trial.suggest_categorical("sample_weight", ["None", "train_weights"]) # add weights as a parameter

    # Resolve the suggested option name with an explicit lookup instead of eval()
    weight_options = {"None": None, "train_weights": train_split_weights}

    model = LightFM(**param, random_state=123)
    model.fit(train,
              sample_weight=weight_options[sample_weights],
              epochs=epochs,
              verbose=True)

    val_precision = precision_at_k(model,
                                   val,
                                   train_interactions=train,
                                   k=10).mean()

    return val_precision

study = optuna.create_study(direction="maximize")

# Add in our previous best hyperparameter values as a starting point for Optuna
best_params.update({
    "epochs": num_epochs,  # manually add epochs
    "sample_weight": "None",  # add in the fact the previous models didn't use weights
    "loss": "warp",  # can't use kos with weights so switch it to warp
})
study.enqueue_trial(best_params)

study.optimize(objective, n_trials=20)

best_params = study.best_params
for k, v in best_params.items():
    print(k, ":", v)

In [None]:
# epochs and sample_weight are arguments of fit(), not of LightFM(), so pull them out of best_params
num_epochs = best_params['epochs']
sample_weights = best_params['sample_weight']

del best_params['epochs']
del best_params['sample_weight']

# Train with the best parameters
model = LightFM(**best_params, random_state=123)

# Resolve the tuned option name ("None" or "train_weights") with an explicit lookup instead of eval()
model.fit(train_interactions,
          sample_weight={"None": None, "train_weights": train_weights}[sample_weights],
          epochs=num_epochs,
          verbose=True)

# Measure how well it did in the Test period
for metric in [precision_at_k, recall_at_k]:
    # Get the precision and recall for Train and Test
    for data, name in [(train_interactions, "Train"), (test_interactions, "Test ")]:
        print(f"{name} {metric.__name__}: %.2f" %
              metric(model,
                     data,
                     k=10).mean())

    # What about for just new-to-user purchases?
    print(f"Test new {metric.__name__}: %.3f" %
          metric(model,
                 test_new_interactions,
                 train_interactions=train_interactions, # suppress previously bought prods from being recommended
                 k=10).mean())

In [None]:
# pickle writes bytes, so the file has to be opened in binary mode ('wb'); text mode raises TypeError
with open('../artifacts/v0.0.1', 'wb') as file:
    pickle.dump(model, file, protocol=pickle.HIGHEST_PROTOCOL)