In [1]:
import pandas as pd
import scipy.sparse as sparse
import numpy as np
from scipy.sparse.linalg import spsolve
import random
import time
import itertools
from scipy.stats import mstats

#!pip install implicit
from implicit.als import AlternatingLeastSquares
import implicit2
from ratings import ratings
from S3_helper import S3_helper

# Run Unit Tests

In [2]:
# Run the test_implicit2 unit tests in a subprocess; unittest's verbose report becomes the cell output.
!python -m unittest -v test_implicit2

test_rmse (test_implicit2.TestImplicit2) ... ok
test_train_test (test_implicit2.TestImplicit2) ... ok

----------------------------------------------------------------------
Ran 2 tests in 0.004s

OK


In [3]:
# Run the test_ratings unit tests in a subprocess; unittest's verbose report becomes the cell output.
!python -m unittest -v test_ratings

test_binary (test_ratings.TestRating) ... ok
test_lte (test_ratings.TestRating) ... ok
test_natLog (test_ratings.TestRating) ... ok
test_percStore (test_ratings.TestRating) ... ok
test_sparseMatrix (test_ratings.TestRating) ... ok
test_winsorize (test_ratings.TestRating) ... ok

----------------------------------------------------------------------
Ran 6 tests in 0.045s

OK


# Load Data

In [4]:
import os

# Credentials are read from the environment so they are never saved inside the
# notebook. Each value falls back to '' (the previous hard-coded value) when
# the variable is unset.
# AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY are the standard AWS variable names;
# S3_BUCKET is a name chosen for this notebook -- export it before launching
# Jupyter, or adjust it to match your deployment.
access = os.environ.get('AWS_ACCESS_KEY_ID', '')
secret = os.environ.get('AWS_SECRET_ACCESS_KEY', '')
bucket = os.environ.get('S3_BUCKET', '')

In [5]:
# Build the project's S3 helper from the access key, secret and bucket name; the CSV loads in this notebook go through it.
cb_s3 = S3_helper(access,secret,bucket)

In [7]:
# Fetch the product master from S3 and keep only rows whose stock type is 'S' or 'D'.
path = 'Rec_Eng/Product_info.csv'
prod_info = cb_s3.s3_CSVtoDF(path)
prod_info = prod_info.loc[prod_info['STOCK_TYPE_CD'].isin(['S', 'D'])]
# Distinct master SKU codes (stringified) and distinct master package codes.
master_sku = [str(sku) for sku in prod_info['MASTER_SKU_CD'].unique()]
master_pkg = list(prod_info['MASTER_PKG_CD'].unique())

In [10]:
# Preview the first rows of the filtered product table.
prod_info.head()

Unnamed: 0,ITEM_ID,ITEM_01_CD,ITEM_01_DSC,ITEM_02_CD,ITEM_02_DSC,ITEM_03_CD,ITEM_03_DSC,ITEM_TYPE_CD,ITEM_TYPE_DSC,BRAND_FAMILY_CD,...,STOCK_TYPE_CD,STOCK_TYPE_DSC,CONTAINER_ID_CD,CONTAINER_ID_DSC,COUNTRY_CD,COUNTRY_DSC,COMPONENT_SUB_TYPE_CD,COMPONENT_SUB_TYPE_DSC,UOM_PRIM_CD,UOM_PRIM_DSC
0,1454,10158,CORONITA 7OZ 6PK DEP BT,10158,DEP BT,10158,CORONITA 7OZ 6PK BT DEP,102,FINISHED GOODS - CASE,736,...,S,STOCK END-ITEM,100,BOTTLE,225,MEXICO,28,OTHER GLASS,CA,CASE
1,1457,10159,CORONA LT 6PK DEP BT,10159,DEP BT,10159,CORONA LIGHT 6PK BT DEP,102,FINISHED GOODS - CASE,736,...,S,STOCK END-ITEM,100,BOTTLE,225,MEXICO,28,OTHER GLASS,CA,CASE
2,1458,10160,CORONA LT 12PK DEP BT,10160,DEP BT,10160,CORONA LIGHT 12PK BT DEP,102,FINISHED GOODS - CASE,736,...,S,STOCK END-ITEM,100,BOTTLE,225,MEXICO,28,OTHER GLASS,CA,CASE
3,1459,10161,MODELO NEGRA 6PK DEP BT,10161,DEP BT,10161,NEGRA MODELO BEER 6PK BT DEP,102,FINISHED GOODS - CASE,737,...,S,STOCK END-ITEM,100,BOTTLE,225,MEXICO,28,OTHER GLASS,CA,CASE
4,1463,10163,MODELO ESP 6PK DEP CAN,10163,DEP CAN,10163,MODELO ESPECIAL 6PK CAN DEP,102,FINISHED GOODS - CASE,737,...,S,STOCK END-ITEM,101,CAN(S),225,MEXICO,259,CAN,CA,CASE


In [8]:
# Load the IL_OFF extract from S3 into `data` (reuses `path` for the new key).
path = 'Rec_Eng/IL_OFF.csv'
data = cb_s3.s3_CSVtoDF(path)

In [9]:
# Preview the first rows of the loaded extract (SKU code, store code, quantity).
data.head()

Unnamed: 0,MASTER_PKG_SKU_CD,TDLINX_STORE_CD,L90_TY_QTY
0,80014014,3941559,0.0
1,80013520,326339,19.5
2,80014007,1402671,1.3333
3,80013970,175965,0.0
4,80013460,770764,0.0


# Data Preprocessing

In [9]:
# Summary statistics of the raw quantity column, zeros and negative values included.
data['L90_TY_QTY'].describe()

count    617052.000000
mean          2.915046
std          26.884196
min         -10.000000
25%           0.000000
50%           0.000000
75%           0.000000
max        7254.000000
Name: L90_TY_QTY, dtype: float64

In [10]:
# Same summary, restricted to rows with a strictly positive quantity.
data.loc[data['L90_TY_QTY'] > 0, 'L90_TY_QTY'].describe()

count    78270.000000
mean        22.987211
std         72.364276
min          0.500000
25%          3.000000
50%          7.000000
75%         19.000000
max       7254.000000
Name: L90_TY_QTY, dtype: float64

In [11]:
# Wrap the loaded frame in the project's ratings helper, with L90_TY_QTY as the quantity column.
il_off_data = ratings(data,quantCol='L90_TY_QTY')

In [12]:
# (rows, columns) of the loaded frame.
data.shape

(617052, 3)

In [13]:
# Apply the "lte" quantity transform, then display the resulting quantity array.
# NOTE(review): presumably this is the step the helper logs as
# 'Values less than 0 converted to 0' -- confirm in the ratings module.
il_off_data.lte_quantTrans()
il_off_data.quantity

array([  0.,   0.,  18., ...,   0.,   0.,   0.], dtype=float32)

In [14]:
# Winsorize the quantities with the helper's default settings, then display the quantity array.
# NOTE(review): the helper's log reports limits 0.1, 0.1 applied to nonzeros --
# presumably the defaults of this call; confirm in the ratings module.
il_off_data.winsorize_quantTrans()
il_off_data.quantity

array([  0.,   0.,  18., ...,   0.,   0.,   0.], dtype=float32)

In [15]:
# Derive ratings from the quantities with the natural-log transform, then display the rating array.
# NOTE(review): the sample output maps quantity 18 to ~2.944, which matches ln(1 + 18);
# confirm the exact formula in the ratings module.
il_off_data.natLog_rateTrans()
il_off_data.rating

array([ 0.        ,  0.        ,  2.94443893, ...,  0.        ,
        0.        ,  0.        ], dtype=float32)

In [16]:
# Inspect the helper's record of the quantity column and the transformations applied to it so far.
il_off_data._quantity

{'Data Column': 'L90_TY_QTY',
 'Transformation': ['Values less than 0 converted to 0',
  'Winsorized nonzeros based on limits 0.1, 0.1']}

In [200]:
# Alternative rating transform, currently disabled.
# NOTE(review): any output still shown under this cell is presumably left over
# from an earlier run when these lines were active -- clear it or re-enable.
#il_off_data.percStore_transform()
#il_off_data.rating

array([ 0.        ,  0.        ,  0.28571429, ...,  0.        ,
        0.        ,  0.        ])

In [17]:
# Build the sparse matrix from the transformed data; the grid search and recommendation cells consume it.
il_sparse = il_off_data.sparse_matrix()

In [15]:
# Toy fixture for sanity-checking ratings.sparse_matrix(): two stores and three
# SKUs, where store A1234 has no row for SKU '9012'.
# The dict is named `toy_data` so it does not overwrite the `data` DataFrame
# loaded from S3; the cells that use `data` would otherwise break on re-run.
toy_data = {'TDLINX_STORE_CD':['A1234','A1234','B1234','B1234','B1234'],
            'MASTER_PKG_SKU_CD':['1234','5678','5678','1234','9012'],
            'L90_TY_QTY':[10,20,30,40,50]}
df = pd.DataFrame(toy_data)
rate = ratings(df)
sp_matrix = rate.sparse_matrix()

In [8]:

# Expected dense form of the toy matrix: rows are stores, columns are SKUs.
# Store B1234 bought 40 of '1234' and 30 of '5678' (see the fixture values).
'''
        '1234' '5678' '9012'
A1234     10     20     0
B1234     40     30     50

'''

array([[ 10.,  20.,   0.],
       [ 30.,  40.,  50.]], dtype=float32)

In [18]:
# Element-wise comparison of the densified toy matrix against the expected values; every entry should be True.
sp_matrix.toarray() == np.array([[ 10.,  20.,   0.],[ 40.,  30.,  50.]])

array([[ True,  True,  True],
       [ True,  True,  True]], dtype=bool)

In [17]:
# Number of entries where the densified toy matrix equals the expected values (6 when all agree).
(sp_matrix.toarray() == np.array([[10., 20., 0.], [40., 30., 50.]])).sum()

6

# Grid Search

In [17]:
# Hyper-parameter grid: 3 alphas x 4 factor counts x 2 regularization strengths.
# NOTE(review): no random seed is set in this notebook, so the reported RMSE may
# differ between runs -- confirm whether implicit2.grid_search seeds internally.
param = {
    'alpha': [1, 10, 100],
    'factors': [10, 20, 40, 80],
    'regularization': [0.001, 0.1],
}
d_test, opt_model, pred = implicit2.grid_search(
    il_sparse,
    param,
    itera=30,
    n_threads=3,
    verbose=False,
    test_p=0.3
)

Optimal Model: {'rmse_ave': 0.5836844473156008, 'alpha': 100, 'regularization': 0.001, 'prediction': array([[  4.11576328e-04,   6.46606654e-04,   9.21890419e-01, ...,
          6.56313362e-04,   8.15401190e-04,   5.29482179e-04],
       [ -5.53098788e-04,  -1.02991216e-03,   9.97110757e-01, ...,
         -1.24742220e-04,   1.99832624e-05,  -5.33905797e-04],
       [  6.28131475e-04,   3.77182098e-04,   9.59235813e-01, ...,
          7.71910403e-04,   5.51667716e-04,   8.87846846e-04],
       ..., 
       [ -7.73551483e-04,  -1.44087105e-03,   8.49740138e-01, ...,
         -1.26856375e-03,  -1.63889029e-03,  -2.43324055e-03],
       [  3.76353335e-03,   5.16063914e-03,   9.99758398e-01, ...,
         -5.45700515e-03,   1.32311616e-02,  -7.38652757e-03],
       [ -2.79404926e-04,   3.58238055e-04,   1.38091402e-01, ...,
          2.28527405e-04,   3.50972382e-04,   3.39045486e-04]]), 'factors': 80}


# Recommendations

In [28]:
# Store code to produce recommendations for.
tdlinx = '1402413'
# The trailing 10 is passed positionally, as in the helper's signature.
# NOTE(review): presumably the number of items to return -- confirm in implicit2.
# This call rebinds `df`, replacing the toy frame created earlier.
scores, prod_cd, prod_dsc, df = implicit2.predict_store(
    pred, il_sparse, il_off_data, prod_info, tdlinx, 10
)

In [29]:
# Display the frame returned by predict_store (actual vs. predicted score per product).
df

Unnamed: 0,MASTER_PKG_SKU_CD,MASTER_SKU_DSC,ACTUAL,PREDICT
29,80013971,PACIFICO 6PK BT,1.39,1.00
55,80014011,VICTORIA 12PK BT,2.77,1.00
13,80013461,CORONA FAMILIAR 32OZ BT,2.34,1.00
24,80013518,MODELO ESP 12PK CAN,3.91,1.00
2,80013435,CORONA EX 12PK BT,3.00,1.00
27,80013968,CORONITA 7OZ 6PK BT,1.10,0.99
33,80013978,MODELO ESP 6PK BT,1.79,0.90
23,80013517,MODELO ESP 12PK BT,3.58,0.90
32,80013977,MODELO NEGRA 6PK BT,0.69,0.89
19,80013478,CORONA LT 6PK BT,1.39,0.86


In [30]:
# Display the product descriptions returned by predict_store.
prod_dsc

['CORONA LT 12PK BT',
 'MODELO ESP 6PK CAN',
 'CORONA EX 24PK BT',
 'MODELO ESP 18PK CAN',
 'CORONITA 7OZ 24PK BT',
 'CORONA EX 12PK CAN',
 'VICTORIA 32OZ BT',
 'MODELO ESP 24OZ BT',
 'MODELO ESP 24PK BT',
 'CORONA EX 24OZ CAN LSE']