# Purchase-based Recommendation System
## Based on Moorissa Tjokro tutorial
### Coded by Rebeca Bivar - DB: Armazem Paraíba

### Imports and reading file

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import turicreate as tc
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [23]:
#READING A FILE WITH ONLY CLIENTS, PRODUCTS BOUGHT AND QTDE
buyers = pd.read_csv('data_final.csv', sep=';')
buyers.head()


Unnamed: 0,COD_CLIENTE,COD_PRODUTO,NOME_PRODUTO,QUANTIDADE,CANAL
0,5190001,25172,SMARTPHONE LG K9 TV LM-X210BMW PRETO,1,VENDAWEB
1,37578201,25367,TABLET NB729 MINI MS40G BRANCO,1,VENDAWEB
2,93168801,25354,"TELEVISOR SMART 32"" UN32J4290 SAMSUNG",1,VENDAWEB
3,62982901,26584,SMARTPHONE MOTOROLA MOTO G8 PLUS 64GB CEREJA,1,VENDAWEB
4,54509401,21647,DVD D-15 KARAOKE MONDIAL,1,VENDAWEB


##  Data preparation
### Creating dummy table to check if the client has bought a product or not

In [3]:
def create_data_dummy(db):
    data_dummy = db.copy()
    data_dummy['purchase_dummy'] = 1
    return data_dummy

data_dummy = create_data_dummy(buyers)
#data_dummy


### Normalizing purchase frequency of each item across users 

In [4]:
#Dummy for marking whether a customer bought that item or not

df_matrix = pd.pivot_table(buyers, values = 'QUANTIDADE', index = 'COD_CLIENTE', columns = 'COD_PRODUTO')


df_matrix_norm = (df_matrix-df_matrix.min())/(df_matrix.max()-df_matrix.min())

# create a table for input to the modeling  
data_input = df_matrix_norm.reset_index()
data_input.index.names = ['FREQ_COMPRAS']
data_norm = pd.melt(data_input, id_vars=['COD_CLIENTE'], 
                    value_name='FREQ_COMPRAS')

#print(data_norm.shape)
#data_norm.head()

In [5]:
#Just cleaning useless values 
data_norm = data_norm.dropna()
#data_norm

### Split data into trainning and testing (80/20)

In [7]:

# Returns train and test datasets as scalable dfs
def split_data(data):
    train, test = train_test_split(data, test_size = .2)
    train_data = tc.SFrame(train)
    test_data = tc.SFrame(test)
    return train_data, test_data

# Now actually splitting purchase_counts, purchase_dummy and purchase_counts_norm
train_data, test_data = split_data(buyers)
train_data_dummy, test_data_dummy = split_data(data_dummy)
train_data_norm, test_data_norm = split_data(data_norm)
print("Train/Test Split done!")

Train/Test Split done!


## Baseline model to compare and evaluate models


In [8]:
# variables to define field names: 
user_id = 'COD_CLIENTE'
item_id = 'COD_PRODUTO'
item_name = 'NOME_PRODUTO'
users_to_recommend = list(buyers[user_id])
n_recommendation = 10 # itens to recommend
n_display = 30 # display the first few rows in an output dataset

# Function for all models using turicreate
def model(train_data, name, user_id, item_id, target, 
          users_to_recommend, n_rec, n_display):
    if name == 'popularity':
        model = tc.popularity_recommender.create(train_data, 
                                                    user_id = user_id, 
                                                    item_id = item_id,
                                                    target = target)
    elif name == 'cosine':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='cosine')
    elif name == 'pearson':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='pearson')
        
    recom = model.recommend(users=users_to_recommend, k=n_rec)
    recom.print_rows(n_display)
    return model

### Popularity Model
   Takes the most popular items for recommendation, which are the products with the highest number of sells across customers.

In [None]:
name = 'popularity'
target = 'QUANTIDADE'
popularity = model(train_data, name, user_id, item_id, target, 
                   users_to_recommend, n_recommendation, n_display)
popularity

In [None]:
name = 'popularity'
target = 'purchase_dummy'
pop_dummy = model(train_data_dummy, name, user_id, item_id, target, 
                   users_to_recommend, n_recommendation, n_display)

### Using scaled purchase count 

In [None]:
name = 'popularity'
target = 'FREQ_COMPRAS'
pop_norm = model(train_data_norm, name, user_id, item_id, target, 
                 users_to_recommend, n_recommendation, n_display)

### Collaborative Filtering Model

   Recommends items based on how similar clients purchase items. Meaning: if customer 1 and customer 2 bought similar items, for example, 1 bought X, Y, Z and 2 bought X, Y, we would recommend an item Z to customer 2.
    
   - Lets say X and Y have been rated by costumers 1 and 2. 
   - We then create two item-vectors for both items, then we find the **cosine** or **pearson** distance between these vectors. If the **cosine** value is 1, means total similarity, if it is 0, means no similarity.
   - In this case, we will check the similarity between the target item and other items the customer already bought - using the client's purchase count to items already bought by him as weighing factor (some sort of simulated rating). 

### Using purchase count and purchase frequency
### Cosine

In [None]:
name = 'cosine'
target = 'QUANTIDADE'
cos = model(train_data, name, user_id, item_id, target, 
            users_to_recommend, n_recommendation, n_display)

In [None]:
name = 'cosine'
target = 'purchase_dummy'
cos_dummy = model(train_data_dummy, name, user_id, item_id, target,
                  users_to_recommend, n_recommendation, n_display)

In [None]:
name = 'cosine'
target = 'FREQ_COMPRAS'
cos_norm= model(train_data_norm, name, user_id, item_id, target, 
                   users_to_recommend, n_recommendation, n_display)


### Using purchase count and purchase frequency
### Pearson

In [None]:
# PURCHASE COUNT
name = 'pearson'
target = 'QUANTIDADE'
pear = model(train_data, name, user_id, item_id, target,
             users_to_recommend, n_recommendation, n_display)

In [None]:
# PURCHASE DUMMY
name = 'pearson'
target = 'purchase_dummy'
pear_dummy = model(train_data_dummy, name, user_id, item_id, target,
                   users_to_recommend, n_recommendation, n_display)

In [None]:
# PURCHASE FREQUENCY
name = 'pearson'
target = 'FREQ_COMPRAS'
pear_norm = model(train_data_norm, name, user_id, item_id, target,
                  users_to_recommend, n_recommendation, n_display)

## Model Evaluation 

### RMSE - Root Mean Squared Errors

   - Measures the error of predicted values
   - Lesser the RMSE values, better the recommendations
   
### Precision-Recall

   - Recall: Percentage of products that a customer buys that are actually recommended. 
   - Precision: How many itens the customer liked out of the recommended?
   - The idea is to optimze both recall and precision to be close as 1 as possible


In [None]:
# Variables for model evaluation

models_counts = [popularity, cos, pear]
models_dummy = [pop_dummy, cos_dummy, pear_dummy]
models_norm = [pop_norm, cos_norm, pear_norm]

names_counts = ['Popularity Model on Purchase Counts', 
                  'Cosine Similarity on Purchase Counts', 
                  'Pearson Similarity on Purchase Counts']
names_dummy = ['Popularity Model on Purchase Dummy', 
                 'Cosine Similarity on Purchase Dummy', 
                 'Pearson Similarity on Purchase Dummy']
names_norm = ['Popularity Model on Scaled Purchase Counts', 
                'Cosine Similarity on Scaled Purchase Counts', 
                'Pearson Similarity on Scaled Purchase Counts']


eval_counts = tc.recommender.util.compare_models(test_data, models_counts, 
                                                 model_names=names_counts)
eval_dummy = tc.recommender.util.compare_models(test_data_dummy, models_dummy,
                                                model_names=names_dummy)
eval_norm = tc.recommender.util.compare_models(test_data_norm, models_norm, 
                                               model_names=names_norm)

## Final Model

### Cosine Similarity - Purchase Dummy

In [13]:
# variables to define field names: 
user_id = 'COD_CLIENTE'
item_id = 'COD_PRODUTO'
item_name = 'NOME_PRODUTO'
users_to_recommend = list(buyers[user_id])
n_recommendation = 10 # itens to recommend


# Chosen model
def recom_model(train_data, user_id, item_id, 
          users_to_recommend, n_rec):
    model = tc.item_similarity_recommender.create(tc.SFrame(train_data),
                                                   user_id = user_id,
                                                   item_id = item_id,
                                                   target = 'FREQ_COMPRAS',
                                                   similarity_type='cosine')
    recom = model.recommend(users=users_to_recommend, k=n_rec)
    return recom


recom = recom_model(data_norm, user_id, item_id, users_to_recommend, n_recommendation)
#recom.print_rows(n_display)

In [14]:
df_rec = recom.to_dataframe()
print(df_rec.shape)
df_rec.head()

(2910860, 4)


Unnamed: 0,COD_CLIENTE,COD_PRODUTO,score,rank
0,5190001,18662,0.0,1
1,5190001,18247,0.0,2
2,5190001,17868,0.0,3
3,5190001,17591,0.0,4
4,5190001,17372,0.0,5


## Output Dataframe

    Contains all recommendations

In [24]:
def create_output(model, users_to_recommend, n_rec, print_csv=True):
    recomendation = model
    #recomendation = model.recommend(users=users_to_recommend, k=n_rec)
    df_rec = recomendation.to_dataframe()
    df_rec['recommendedProducts'] = df_rec.groupby([user_id])[item_id] \
        .transform(lambda x: '|'.join(x.astype(str)))
    df_output = df_rec[['COD_CLIENTE', 'recommendedProducts']].drop_duplicates() \
        .sort_values('COD_CLIENTE').set_index('COD_CLIENTE')
    if print_csv:
        df_output.to_csv('output.csv', sep=';')
        print("An output file can be found with name 'output.csv'")
    return df_output

In [25]:
op = create_output(recom, users_to_recommend, 10, print_csv=True )
print(op.shape)
op.head()

An output file can be found with name 'output.csv'
(217013, 1)


Unnamed: 0_level_0,recommendedProducts
COD_CLIENTE,Unnamed: 1_level_1
000P3D01,15558|26460|19729|26589|24526|24632|26588|1453...
000P3J01,18662|18247|17868|17591|17372|15713|15712|1555...
000P4C01,18662|18247|17868|17591|17372|15713|15712|1555...
000P4M01,18662|18247|17868|17591|17372|15713|15712|1555...
000P4X01,18662|18247|17868|17591|17372|15713|15712|1555...


## Customer Recommendation

### Given a 'Client Code', shows all suggested products

In [28]:
def customer_recomendation(customer_id):
    if customer_id not in op.index:
        print('Customer not found.')
        return customer_id
    return op.loc[customer_id]

In [30]:
customer_recomendation('000P3J01')

recommendedProducts    18662|18247|17868|17591|17372|15713|15712|1555...
Name: 000P3J01, dtype: object