In [1]:
import pandas as pd
import scipy.sparse as sparse
import numpy as np
from scipy.sparse.linalg import spsolve
import random
import time
import itertools
from scipy.stats import mstats

In [2]:
%%capture
!pip install implicit

In [3]:
%%capture
!pip install boto3

In [4]:
from implicit.als import AlternatingLeastSquares

In [5]:
import implicit2
from ratings import ratings
from S3_helper import S3_helper

# Run Unit Tests

In [6]:
!python -m unittest -v test_implicit2

test_rmse (test_implicit2.TestImplicit2) ... ok
test_train_test (test_implicit2.TestImplicit2) ... ok

----------------------------------------------------------------------
Ran 2 tests in 0.002s

OK


In [7]:
!python -m unittest -v test_ratings

test_binary (test_ratings.TestRating) ... ok
test_lte (test_ratings.TestRating) ... ok
test_natLog (test_ratings.TestRating) ... ok
test_percStore (test_ratings.TestRating) ... ok
test_sparseMatrix (test_ratings.TestRating) ... ok
test_winsorize (test_ratings.TestRating) ... ok

----------------------------------------------------------------------
Ran 6 tests in 0.044s

OK


# Load Data

In [8]:
import os
# S3 credentials are read from environment variables rather than being hardcoded;
# os.environ[...] raises KeyError if either variable is not set.
access = os.environ['S3_ACCESS_JON']
secret = os.environ['S3_SECRET_JON']
# Name of the S3 bucket handed to S3_helper in the next cell.
bucket = 's3-ds-work-test'

In [9]:
# Build the project's S3 helper from the credentials and bucket defined above;
# it is used in the following cells to download the input CSV files.
cb_s3 = S3_helper(access,secret,bucket)

In [10]:
# Download the product master file from S3 into the working directory.
key = 'Rec_Eng/Product_info.csv'
path = 'Product_info.csv'
cb_s3.pull_file_from_s3(key, path)

# Load the file and keep only rows whose stock type is 'S' or 'D'.
prod_info = pd.read_csv(path).loc[
    lambda frame: frame['STOCK_TYPE_CD'].isin(['S','D'])
]

# Distinct master SKU codes (as strings) and master package codes of the retained products.
master_sku = [str(sku) for sku in prod_info.MASTER_SKU_CD.unique()]
master_pkg = list(prod_info.MASTER_PKG_CD.unique())

Grabbed Rec_Eng/Product_info.csv from S3. Local file Product_info.csv is now available.


In [11]:
# Preview the first rows of the filtered product table.
prod_info.head()

Unnamed: 0,ITEM_ID,ITEM_01_CD,ITEM_01_DSC,ITEM_02_CD,ITEM_02_DSC,ITEM_03_CD,ITEM_03_DSC,ITEM_TYPE_CD,ITEM_TYPE_DSC,BRAND_FAMILY_CD,...,STOCK_TYPE_CD,STOCK_TYPE_DSC,CONTAINER_ID_CD,CONTAINER_ID_DSC,COUNTRY_CD,COUNTRY_DSC,COMPONENT_SUB_TYPE_CD,COMPONENT_SUB_TYPE_DSC,UOM_PRIM_CD,UOM_PRIM_DSC
0,1454,10158,CORONITA 7OZ 6PK DEP BT,10158,DEP BT,10158,CORONITA 7OZ 6PK BT DEP,102,FINISHED GOODS - CASE,736,...,S,STOCK END-ITEM,100,BOTTLE,225,MEXICO,28,OTHER GLASS,CA,CASE
1,1457,10159,CORONA LT 6PK DEP BT,10159,DEP BT,10159,CORONA LIGHT 6PK BT DEP,102,FINISHED GOODS - CASE,736,...,S,STOCK END-ITEM,100,BOTTLE,225,MEXICO,28,OTHER GLASS,CA,CASE
2,1458,10160,CORONA LT 12PK DEP BT,10160,DEP BT,10160,CORONA LIGHT 12PK BT DEP,102,FINISHED GOODS - CASE,736,...,S,STOCK END-ITEM,100,BOTTLE,225,MEXICO,28,OTHER GLASS,CA,CASE
3,1459,10161,MODELO NEGRA 6PK DEP BT,10161,DEP BT,10161,NEGRA MODELO BEER 6PK BT DEP,102,FINISHED GOODS - CASE,737,...,S,STOCK END-ITEM,100,BOTTLE,225,MEXICO,28,OTHER GLASS,CA,CASE
4,1463,10163,MODELO ESP 6PK DEP CAN,10163,DEP CAN,10163,MODELO ESPECIAL 6PK CAN DEP,102,FINISHED GOODS - CASE,737,...,S,STOCK END-ITEM,101,CAN(S),225,MEXICO,259,CAN,CA,CASE


In [12]:
# Download the IL_OFF extract from S3 into the working directory and load it.
# Note: `key` and `path` were already used for the product file and are overwritten here.
key = 'Rec_Eng/IL_OFF.csv'
path = 'IL_OFF.csv'
cb_s3.pull_file_from_s3(key, path)
data = pd.read_csv(path)

Grabbed Rec_Eng/IL_OFF.csv from S3. Local file IL_OFF.csv is now available.


In [13]:
# Preview the first rows; the output shows a product code, a store code and the
# L90_TY_QTY quantity column.
data.head()

Unnamed: 0,MASTER_PKG_SKU_CD,TDLINX_STORE_CD,L90_TY_QTY
0,80014014,3941559,0.0
1,80013520,326339,19.5
2,80014007,1402671,1.3333
3,80013970,175965,0.0
4,80013460,770764,0.0


# Data Preprocessing

In [14]:
# Summary statistics of the raw quantity column. The saved output shows a negative
# minimum and a 75th percentile of 0, i.e. most rows carry no quantity.
data['L90_TY_QTY'].describe()

count    617052.000000
mean          2.915046
std          26.884196
min         -10.000000
25%           0.000000
50%           0.000000
75%           0.000000
max        7254.000000
Name: L90_TY_QTY, dtype: float64

In [15]:
# Same summary, restricted to rows with a strictly positive quantity
# (row mask and column selected in a single .loc call).
data.loc[data['L90_TY_QTY'] > 0, 'L90_TY_QTY'].describe()

count    78270.000000
mean        22.987211
std         72.364276
min          0.500000
25%          3.000000
50%          7.000000
75%         19.000000
max       7254.000000
Name: L90_TY_QTY, dtype: float64

In [16]:
# Wrap the raw frame in the project's `ratings` helper, with L90_TY_QTY as the quantity column.
il_off_data = ratings(data,quantCol='L90_TY_QTY')

In [17]:
# Shape of the raw input frame as (rows, columns).
data.shape

(617052, 3)

In [18]:
# First quantity transformation. The transformation log shown by `il_off_data._quantity`
# further down records it as 'Values less than 0 converted to 0'.
# NOTE(review): the result is not assigned, so this presumably mutates il_off_data in place —
# re-running the cell may re-apply the transformation; confirm in the ratings module.
il_off_data.lte_quantTrans()
# Display the quantity array after the transformation.
il_off_data.quantity

array([  0.,   0.,  18., ...,   0.,   0.,   0.], dtype=float32)

In [19]:
# Second quantity transformation. The transformation log shown by `il_off_data._quantity`
# further down records it as 'Winsorized nonzeros based on limits 0.1, 0.1'.
# NOTE(review): called without arguments, so the limits presumably come from defaults in the
# ratings module — confirm.
il_off_data.winsorize_quantTrans()
# Display the quantity array after winsorizing.
il_off_data.quantity

array([  0.,   0.,  18., ...,   0.,   0.,   0.], dtype=float32)

In [20]:
# Derive ratings from the transformed quantities with the natural-log rating transform.
# NOTE(review): the displayed values look consistent with ln(1 + quantity)
# (quantity 18 -> rating 2.944) — confirm against the ratings module.
il_off_data.natLog_rateTrans()
# Display the resulting rating array.
il_off_data.rating

array([ 0.        ,  0.        ,  2.94443893, ...,  0.        ,
        0.        ,  0.        ], dtype=float32)

In [21]:
# Inspect the record of the source column and the quantity transformations applied so far
# (underscore-prefixed attribute of the ratings object).
il_off_data._quantity

{'Data Column': 'L90_TY_QTY',
 'Transformation': ['Values less than 0 converted to 0',
  'Winsorized nonzeros based on limits 0.1, 0.1']}

In [22]:
# Disabled alternative rating transform, kept for reference only; this cell currently does nothing.
#il_off_data.percStore_transform()
#il_off_data.rating

In [23]:
# Build the sparse matrix that is passed to grid search and prediction below.
# NOTE(review): the matrix orientation (which axis is stores vs. products) is not visible
# from this notebook — confirm in ratings.sparse_matrix.
il_sparse = il_off_data.sparse_matrix()

# Grid Search

In [24]:
#Setting environment variable due to warning in Implicit package
# NOTE(review): this runs after numpy and implicit were already imported in earlier cells.
# BLAS libraries typically read this variable when they are loaded, so setting it here may
# only silence the warning without actually limiting threads — consider moving it ahead of
# the imports; confirm.
os.environ['OPENBLAS_NUM_THREADS'] = '1'

In [25]:
# Hyper-parameter grid for implicit2.grid_search. The full grid and the 30-iteration call are
# kept commented out; the active lines run a reduced grid (only alpha varies) with itera=1.
#param = {'alpha':[1,10,100],'factors': [10,20,40,80],'regularization': [0.001,0.1]}
param = {'alpha':[1,10,100],'factors': [10],'regularization': [0.1]}
#d_test,opt_model,pred = implicit2.grid_search(il_sparse, param,itera=30,n_threads = 3,verbose = False, test_p=0.3)
# NOTE(review): no random seed is set before this call; if grid_search splits train/test
# randomly (test_p=0.3) or the model initialises randomly, results will differ between
# runs — confirm and seed if so.
d_test,opt_model,pred = implicit2.grid_search(il_sparse, param,itera=1,n_threads = 3,verbose = False, test_p=0.3)

Optimal Model: {'rmse_ave': 0.5942137279472921, 'alpha': 10, 'regularization': 0.1, 'prediction': array([[  3.27079439e-10,  -1.05773364e-10,   1.01925687e+00, ...,
         -3.15305398e-10,   1.30366940e-10,   9.40614233e-10],
       [  4.63549131e-10,  -5.94973020e-10,   1.03170577e+00, ...,
          1.65741733e-10,   4.53241511e-10,   1.45193111e-09],
       [ -7.51582917e-10,   1.16953410e-10,   9.83939712e-01, ...,
         -6.32204412e-10,  -1.32797414e-10,  -2.57587955e-09],
       ..., 
       [ -1.74020062e-09,   3.09078862e-10,   1.00809279e+00, ...,
         -2.00917870e-09,  -3.32747824e-10,  -6.13023697e-09],
       [ -4.39028844e-10,   8.70858774e-12,   9.70815510e-01, ...,
          3.77676353e-11,  -1.63139360e-10,  -1.47899576e-09],
       [ -1.96544038e-20,   3.66125996e-21,   4.52424953e-12, ...,
         -2.81256599e-20,  -5.43522924e-21,  -7.20382761e-20]]), 'factors': 10}


# Recommendations

In [26]:
# Store to produce recommendations for.
# NOTE(review): presumably matched against TDLINX_STORE_CD in the input data — confirm.
tdlinx = '1402413'
# Returns scores, product codes, product descriptions and a comparison frame. The trailing 10
# is presumably the number of items requested (each saved output below has 10 entries).
scores,prod_cd,prod_dsc,df = implicit2.predict_store(pred, il_sparse,il_off_data, prod_info, tdlinx,10)

In [27]:
# Comparison frame returned by predict_store; the saved output lists ACTUAL and PREDICT
# values per product.
df

Unnamed: 0,MASTER_PKG_SKU_CD,MASTER_SKU_DSC,ACTUAL,PREDICT
40,80013986,MODELO NEGRA 12PK BT,2.20,1.32
55,80014011,VICTORIA 12PK BT,2.77,1.19
30,80013972,PACIFICO 12PK BT,2.08,1.18
32,80013977,MODELO NEGRA 6PK BT,0.69,1.16
54,80014010,VICTORIA 6PK BT,0.69,1.07
33,80013978,MODELO ESP 6PK BT,1.79,1.03
5,80013444,CORONA EX 24OZ BT,1.39,1.02
23,80013517,MODELO ESP 12PK BT,3.58,1.01
13,80013461,CORONA FAMILIAR 32OZ BT,2.34,1.01
24,80013518,MODELO ESP 12PK CAN,3.91,1.00


In [30]:
# Product descriptions returned by predict_store.
# NOTE(review): the saved output contains a nan entry, and the execution counter jumps from
# 27 to 30 at this cell, so the output may not come from the same run as the cells above —
# restart the kernel and run all cells to confirm.
prod_dsc

['MODELO ESP 24OZ BT',
 'VICTORIA 32OZ BT',
 'CORONA LT 12PK BT',
 'PACIFICO BALLENA 32OZ BT',
 'CORONITA 7OZ 24PK BT',
 'CORONA EX 24PK BT',
 'MODELO ESP 24PK BT',
 'VICTORIA 12PK CAN',
 nan,
 'CORONA EX 24OZ CAN LSE']

In [31]:
# Product codes returned by predict_store.
# NOTE(review): presumably in the same order as prod_dsc — confirm in implicit2.predict_store.
prod_cd

[80013993,
 80056172,
 80013464,
 80014007,
 80014002,
 80013447,
 80014026,
 80024732,
 80032234,
 80014014]