[Implicit library](https://github.com/benfred/implicit)

In [1]:
# Install third-party dependencies into the running kernel's environment.
# implicit is capped below 0.5 because this notebook uses the pre-0.5 API:
# the model is fitted on an item x user matrix, and recommend() / similar_items()
# results are indexed as rows of (item_id, score).
%pip install -q "implicit<0.5" umap-learn


[?25l[K     |▎                               | 10kB 26.6MB/s eta 0:00:01[K     |▋                               | 20kB 34.1MB/s eta 0:00:01[K     |▉                               | 30kB 25.2MB/s eta 0:00:01[K     |█▏                              | 40kB 28.7MB/s eta 0:00:01[K     |█▌                              | 51kB 24.4MB/s eta 0:00:01[K     |█▊                              | 61kB 26.9MB/s eta 0:00:01[K     |██                              | 71kB 19.7MB/s eta 0:00:01[K     |██▍                             | 81kB 20.8MB/s eta 0:00:01[K     |██▋                             | 92kB 19.3MB/s eta 0:00:01[K     |███                             | 102kB 19.7MB/s eta 0:00:01[K     |███▎                            | 112kB 19.7MB/s eta 0:00:01[K     |███▌                            | 122kB 19.7MB/s eta 0:00:01[K     |███▉                            | 133kB 19.7MB/s eta 0:00:01[K     |████▏                           | 143kB 19.7MB/s eta 0:00:01[K     |████▍        

In [2]:
import os

# OpenBLAS reads OPENBLAS_NUM_THREADS when the library is first loaded, so the
# variable must be set BEFORE numpy / scipy / implicit are imported. Setting it
# after those imports leaves the BLAS thread pool unchanged.
os.environ["OPENBLAS_NUM_THREADS"] = "1"

import pickle
import sys

import implicit
from implicit.evaluation import train_test_split, precision_at_k, mean_average_precision_at_k
from implicit.nearest_neighbours import bm25_weight
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from umap import UMAP
from scipy.sparse import csr_matrix
from sklearn.decomposition import PCA

In [3]:
#from google.colab import drive
#drive.mount('/content/drive')

# Get and load Data Set
The original data set is provided by the Kaggle competition [Instacart Market Basket Analysis](https://www.kaggle.com/c/instacart-market-basket-analysis/overview).


In [4]:
# Download and unpack the Instacart archive into the working directory.
# The URL is quoted so the shell never treats the `?` in the query string as a
# glob wildcard.
!wget -q -O instacart.zip "https://www.dropbox.com/s/p2w4hql4qjsm0a2/instacart.zip?dl=1"
!unzip -o instacart.zip

Archive:  instacart.zip
  inflating: aisles.csv              
  inflating: departments.csv         
  inflating: order_products__prior.csv  
  inflating: order_products__train.csv  
  inflating: orders.csv              
  inflating: products.csv            
  inflating: sample_submission.csv   


In [5]:
# Directory that holds the unzipped Instacart CSV files.
dataset_path = "./"


def _csv_path(file_name):
    """Return the location of `file_name` inside `dataset_path`."""
    return os.path.join(dataset_path, file_name)


# One path per CSV extracted from the archive.
orders_path = _csv_path("orders.csv")
orders_product_prior_path = _csv_path("order_products__prior.csv")
orders_product_train_path = _csv_path("order_products__train.csv")
product_path = _csv_path("products.csv")
aisles_path = _csv_path("aisles.csv")
departments_path = _csv_path("departments.csv")


In [6]:
# Read the four raw tables used by the rest of the notebook, in the same order
# as before: orders, products, prior order lines, train order lines.
orders, products, orders_product_prior, orders_product_train = (
    pd.read_csv(csv_file)
    for csv_file in (
        orders_path,
        product_path,
        orders_product_prior_path,
        orders_product_train_path,
    )
)

In [7]:
products

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13
...,...,...,...,...
49683,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5
49684,49685,En Croute Roast Hazelnut Cranberry,42,1
49685,49686,Artisan Baguette,112,3
49686,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8


In [8]:
# Index products by product_id so later cells can look rows up with
# products.loc[<product_id>]. The guard plus rebinding (instead of inplace=True)
# keeps the cell safe to re-run: a second set_index('product_id') would raise
# KeyError because product_id is no longer a column once it is the index.
if products.index.name != 'product_id':
    products = products.set_index('product_id')

# Clean and transform data set

## Products

In [9]:
products.head()

Unnamed: 0_level_0,product_name,aisle_id,department_id
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Chocolate Sandwich Cookies,61,19
2,All-Seasons Salt,104,13
3,Robust Golden Unsweetened Oolong Tea,94,7
4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
5,Green Chile Anytime Sauce,5,13


## Orders


In [10]:
orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [11]:
orders.shape

(3421083, 7)

In [12]:
orders.isna().sum()

order_id                       0
user_id                        0
eval_set                       0
order_number                   0
order_dow                      0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

In [13]:
orders["eval_set"].value_counts()

prior    3214874
train     131209
test       75000
Name: eval_set, dtype: int64

In [14]:
#orders.fillna(0,inplace=True)

In [15]:
orders_prior = orders[orders["eval_set"] == "prior"]

In [16]:
orders_train = orders[orders["eval_set"] == "train"]

In [17]:
# Count the distinct users present in each evaluation split.
n_prior_users = orders_prior["user_id"].unique().size
n_train_users = orders_train["user_id"].unique().size

print(f'Number of unique user in prior: {n_prior_users}')
print(f'Number of unique user in train: {n_train_users}')

Number of unique user in prior: 206209
Number of unique user in train: 131209


## Order/Product

In [18]:
orders_product_prior.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


# Feature Engineering
Get full list of user_id / product_id

In [19]:
def _user_product_pairs(order_products):
    """Map every order line to the user who placed the order.

    Joins the (order_id, product_id) columns of `order_products` with the
    (user_id, order_id) columns of the `orders` table on `order_id`, then
    drops `order_id`, leaving one (product_id, user_id) row per order line.
    """
    order_lines = order_products[['order_id', 'product_id']]
    order_owner = orders[['user_id', 'order_id']]
    joined = order_lines.merge(order_owner, on='order_id')
    return joined.drop(columns=['order_id'])


user_product_prior = _user_product_pairs(orders_product_prior)

user_product_train = _user_product_pairs(orders_product_train)

In [20]:
user_product_prior.shape

(32434489, 2)

In [21]:
user_product_train.shape

(1384617, 2)

# Modeling

In [22]:
class DataLoader:
    """Build a sparse user x product interaction matrix from parallel id sequences.

    `fit` learns the matrix dimensions from the ids it is given; `transform`
    turns any (users, products) pair of equal-length id sequences into a CSR
    matrix of that fixed shape, so matrices built from different splits are
    aligned with each other.
    """

    def __init__(self):
        # `users` and `products` are never populated by this class; they are
        # kept so the public attributes stay as they were.
        self.users = None
        self.products = None
        # (n_rows, n_cols) of the matrices produced by transform(); set by fit().
        self.shape = None

    def _sparse_matrices(self, users, products):
        """Return a CSR matrix with `alpha` added at (user, product) for each pair.

        scipy sums duplicate coordinates, so a cell ends up holding the number
        of times that (user, product) pair occurs in the input.
        """
        alpha = 1
        row_index = np.asarray(users)
        col_index = np.asarray(products)
        # dtype=int matches what the former list of Python ints was converted to.
        values = np.full(len(row_index), alpha, dtype=int)

        return csr_matrix((values, (row_index, col_index)), shape=self.shape)

    def fit(self, users, products):
        """Record the matrix shape as (max user id + 1, max product id + 1).

        The shape is derived from the `users` / `products` arguments themselves,
        not from any notebook-level variable, so the loader can be fitted on any
        data set.

        Raises:
            ValueError: if either sequence is empty, since no shape can be inferred.
        """
        user_ids = np.asarray(users)
        product_ids = np.asarray(products)
        if user_ids.size == 0 or product_ids.size == 0:
            raise ValueError("cannot infer the matrix shape from empty users/products")

        self.shape = (int(user_ids.max()) + 1, int(product_ids.max()) + 1)
        return self

    def transform(self, users, products):
        """Build the interaction matrix for `users` / `products` using the fitted shape.

        If fit() has not been called, `self.shape` is None and scipy infers the
        shape from the ids passed here.
        """
        return self._sparse_matrices(users, products)

    def fit_transform(self, users, products):
        """Fit on `users` / `products`, then return their interaction matrix."""
        return self.fit(users, products).transform(users, products)

    


class RandomModel:
    """Baseline recommender that ignores the user and samples products at random."""

    def __init__(self):
        # Candidate pool to sample from; filled in by train().
        self.products = None

    def train(self, products):
        """Store `products` as a numpy array of candidates and return the model."""
        candidate_pool = np.asarray(products)
        self.products = candidate_pool
        return self

    def recommend(self, n=10):
        """Return `n` distinct products drawn without replacement via numpy's global RNG."""
        return np.random.choice(self.products, replace=False, size=n)

## Random Model (base line)
El modelo más simple que podemos crear es simplemente recomendar productos al azar. Esto nos permite comparar con otros modelos y medir qué tanto mejores son sus resultados en alguna métrica en particular, para poder dimensionar la mejora que aportan estos algoritmos.

In [23]:
random_model = RandomModel().train(products.index.tolist())

In [24]:
random_model.recommend()

array([10115, 48440,  6384, 27111,  8062,  2624, 21140,  1280, 34308,
       32667])

## ALS model
Este modelo es una implementación de Matrix Factorization (MF) que usa Alternating Least Squares (ALS), en lugar de gradient descent, para encontrar un espacio latente de usuarios y productos.

Este modelo va a recomendar una lista de productos para un determinado usuario, pero no toma en cuenta el producto que el usuario está mirando. En este contexto es más útil para el home de una página: recomendar productos al usuario antes de que navegue por la web, al estilo del home de Netflix, Mercado Libre, eBay, etc.

Además, se pueden recomendar productos similares a uno en específico, que sería la sección "productos similares a este" que tienen los e-commerce.

A continuación se muestra un ejemplo de cada uno.



```
Implicit Alternating Least Squares

Recommender systems frequently use matrix factorization models to generate
personalized recommendations for users. These models have been found to work 
well on recommending items, and can be easily reused for calculating related 
artists.

Many of the MF models used in recommender systems assume explicit data, where 
the user has rated both things they like and dislike using something like a 5 
star rating scale. They typically work by treating the missing data as an 
unknown, and then minimizing the reconstruction error using SGD.

The data here is implicit though - we can assume that a user listening to an 
artist means they like it, but we don't have the corresponding signal that a 
user doesn't like an artist. Implicit data is usually more plentiful and easier 
to collect than explicit data - and even when you have the user give 5 star 
ratings the vast majority of those ratings are going to be positive only so you 
need to account for implicit behaviour anyways.

This means we can't just treat the missing data as unknowns, instead we have to 
treat a user not listening to an artist as being a signal that the user might 
not like that artist.
```
[reference](http://www.benfrederickson.com/matrix-factorization/)

In [25]:
prior_user_ids = user_product_prior['user_id']
prior_product_ids = user_product_prior['product_id']

# The matrix dimensions are learned from the prior interactions.
data_model = DataLoader().fit(users=prior_user_ids, products=prior_product_ids)

# Per-(user, product) occurrence counts, then BM25 weighting scaled by 5.
prior_counts = data_model.transform(prior_user_ids, prior_product_ids)
sparse_user_item_prior = (bm25_weight(prior_counts, B=0.9) * 5).tocsr()

# Transposed (item x user) copy of the same weighted matrix.
sparse_item_user_prior = sparse_user_item_prior.T.tocsr()


sparse_user_item_prior.shape

(206210, 49689)

Hice pruebas con diferentes valores de factors=[32, 64, 128]. Con los dos primeros, las recomendaciones tienden a ser mucho menos diversas y se concentran en los productos más comprados por los usuarios, por lo que se recomiendan productos de pocos department_id.

In [26]:
# Number of latent factors; every other ALS hyper-parameter (regularization,
# iterations, calculate_training_loss) is left at the library default.
N_FACTORS = 128

model_user_item = implicit.als.AlternatingLeastSquares(factors=N_FACTORS)

# The model is trained on the item x user matrix.
model_user_item.fit(sparse_item_user_prior)

# TODO: hold-out evaluation is not wired up yet. The intended flow is
# train_test_split(..., train_percentage=0.8) followed by precision_at_k and
# mean_average_precision_at_k with K=10 on the resulting train/test matrices.

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




# Evaluation

In [27]:
users = user_product_train["user_id"].unique()

## Random Model

In [28]:
#random_rec = [random_model.recommend() for k in users]

## ALS Model

In [29]:
#sparse_user_item_train = data_model.transform(user_product_train['user_id'], user_product_train['product_id'])

In [30]:
user_id = 12222

La siguiente lista muestra los productos más consumidos por este usuario.

In [31]:
# Purchase frequency of each product for the selected user, 20 most frequent first.
is_selected_user = user_product_prior['user_id'] == user_id
user_prod_freq = (
    user_product_prior.loc[is_selected_user, 'product_id']
    .value_counts()
    .head(20)
)

products.loc[user_prod_freq.index.to_list(), :]

Unnamed: 0_level_0,product_name,aisle_id,department_id
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1112,Peach Sparkling Energy Water,64,7
14947,Pure Sparkling Water,115,7
26738,Original Cold Brew Coffee With Milk,26,7
13646,Lemon Hummus,67,20
26165,Electrolyte Enhanced Water,115,7
46969,Organic Bosc Pear,24,4
4920,Seedless Red Grapes,123,4
13113,Organic Apple Juice Boxes,98,7
30391,Organic Cucumber,83,4
32227,House Blend Whole Bean Coffee,26,7


Esta es la recomendación de productos para un usuario en particular. Lo interesante de esta recomendación es que recomienda productos de los 2 department_id más frecuentes del usuario.

In [32]:
# Top-20 recommendations for the selected user.
recommendation = model_user_item.recommend(user_id, sparse_user_item_prior, N=20)

# The first element of every recommendation entry is used as the product id.
recommended_ids = [entry[0] for entry in recommendation]
products.loc[recommended_ids, :]

Unnamed: 0_level_0,product_name,aisle_id,department_id
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
36772,Bunny Pasta with Yummy Cheese Macaroni & Cheese,4,9
40910,Root Beer,77,7
42110,Organic Macaroni Shells & Real Aged Cheddar,4,9
8277,Apple Honeycrisp Organic,24,4
23044,Cream Top Smooth & Creamy Vanilla Yogurt,120,16
45066,Honeycrisp Apple,24,4
34262,Hint Of Sea Salt Almond Nut Thins,78,19
42265,Organic Baby Carrots,123,4
28204,Organic Fuji Apple,24,4
44471,Free & Clear Unscented Baby Wipes,82,18


### Similar items

In [33]:
item_id = 49686

# similar_items() output is cast to an integer array; column 0 is then used as
# the product id for the lookup.
similar_entries = model_user_item.similar_items(item_id)
similar_product = np.asarray(similar_entries, dtype=int)

products.loc[similar_product[:, 0], :]

Unnamed: 0_level_0,product_name,aisle_id,department_id
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
49686,Artisan Baguette,112,3
35154,Dried Northwest Mix Mushrooms,63,9
35479,Oregon Strawberry Preserves,88,13
2931,Sushi Chef Panko Bread Flakes,66,6
1135,Yams Cut Sweet Potatoes In Orange Pineapple Sauce,81,15
44046,Sweet Potato Kale Quinoa Salad Cup,100,21
3660,Tostones Plantain Patties,116,1
27351,Bowfin Caviar,95,15
28043,Leek & Onion Mediterranean Crackers,78,19
47001,Herbal Tea Wild Berries,94,7


In [34]:
# Product whose neighbours are shown below. Other ids tried at this point during
# exploration were 18351, 12252 and 45541; they were assigned and immediately
# overwritten, so only the value that was actually used is kept.
item_id = 25005
similar_product = np.asarray(model_user_item.similar_items(item_id), dtype=int)

products.loc[similar_product[:, 0], :]

Unnamed: 0_level_0,product_name,aisle_id,department_id
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
25005,Organic Green Lentils,63,9
38650,Organic Red Lentils,63,9
26242,Organic Black Lentils,63,9
18479,Organic Low Sodium Vegetable Broth,69,15
42768,Organic Garbanzo Beans,59,15
16145,Organic Cannellini Beans,59,15
23165,Organic Leek,83,4
48357,Green Split Peas,63,9
28785,Organic Tahini,72,13
34126,Organic Italian Parsley Bunch,16,4


### Hit Rate
[recmetrics](https://github.com/statisticianinstilettos/recmetrics/blob/master/recmetrics/metrics.py)

In [35]:
#user_recommendations = [model_user_item.recommend(user_id, sparse_user_item_prior, N=10) for user_id in users]
#user_recommendations = np.asarray(user_recommendations)[:,:,0].astype(int)

#user_item_group = user_product_train.groupby("user_id")["product_id"].apply(lambda x: x.tolist())

#hits = [len(set(user_recommendations[idx, :]).intersection(user_item_group[user])) for idx, user in enumerate(users)]
#hits = np.asarray(hits, dtype=int)

#display(np.mean(hits>0))

#pd.Series(hits, index=users).value_counts()



In [36]:
#score, top, user_weight = model_user_item.explain(user_id, sparse_user_item_prior, 44632)

# Visualization

In [50]:
# UMAP's layout is stochastic; a fixed random_state makes the projection
# repeatable for a given set of item factors.
# NOTE(review): a fixed seed makes UMAP run without parallelism, so this cell
# may take longer than before.
UMAP_SEED = 42

reducer = UMAP(n_neighbors=10, metric="cosine", n_components=3, random_state=UMAP_SEED)

# Row 0 of item_factors is skipped. Presumably rows are indexed by product_id
# (the interaction matrix uses product_id as its column index and product ids
# start at 1), so row 0 matches no product. Verify if the id scheme changes.
embedding = reducer.fit_transform(model_user_item.item_factors[1:, :])

In [54]:
# 3-D scatter of the UMAP item embedding: one marker per product, coloured by
# department and labelled with the product name on hover.
item_coords = reducer.embedding_

marker_style = {
    'size': 3,
    'color': products['department_id'],
    'opacity': 1,
    'line_width': 0.05,
}

product_scatter = go.Scatter3d(
    x=item_coords[:, 0],
    y=item_coords[:, 1],
    z=item_coords[:, 2],
    hovertext=products['product_name'],
    mode='markers',
    marker=marker_style,
)

fig = go.Figure(data=[product_scatter])
# tight layout
fig.update_layout(
    margin={'l': 50, 'r': 50, 'b': 50, 't': 50},
    width=1500,
    height=700,
    template='plotly_white',
)
fig.show()