In [1]:
import pandas as pd
import scipy.sparse as sparse
import numpy as np
from scipy.sparse.linalg import spsolve
from pandas.api.types import CategoricalDtype
import random
import implicit
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import normalize

from implicit.als import AlternatingLeastSquares
from implicit.bpr import BayesianPersonalizedRanking
from implicit.datasets.movielens import get_movielens
from implicit.lmf import LogisticMatrixFactorization
from implicit.nearest_neighbours import (BM25Recommender, CosineRecommender,
                                         TFIDFRecommender, bm25_weight,tfidf_weight)
import tqdm
import codecs
from tabulate import tabulate

In [2]:
retail_data = pd.read_json('data/cus_user_quality_date_0716.json') # This may take a couple minutes
# df2 = pd.read_csv('data/product_brand_type_color.csv') # This may take a couple minutes

In [3]:
retail_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120442 entries, 0 to 120441
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   CUSTOMER_DIM_KEY   120442 non-null  int64  
 1   PRODUCT_DIM_KEY    120442 non-null  int64  
 2   QUANTITY           120441 non-null  float64
 3   PRODUCT_NAME       120435 non-null  object 
 4   COLOR              120442 non-null  object 
 5   PRODUCT_TYPE       120442 non-null  object 
 6   CATEGORY           120442 non-null  object 
 7   BRAND              120442 non-null  object 
 8   RESOLUTION_STATUS  120442 non-null  object 
 9   ORDER_DATE_KEY     120442 non-null  int64  
dtypes: float64(1), int64(3), object(6)
memory usage: 9.2+ MB


In [4]:
cleaned_retail = retail_data.loc[pd.isnull(retail_data.PRODUCT_NAME) == False]
cleaned_retail = cleaned_retail[cleaned_retail['PRODUCT_DIM_KEY'].isin(cleaned_retail[cleaned_retail['ORDER_DATE_KEY'] > 17533].PRODUCT_DIM_KEY)]
cleaned_retail['QUANTITY'] = cleaned_retail['QUANTITY'].fillna(0.0)
cleaned_retail = cleaned_retail[cleaned_retail.RESOLUTION_STATUS.isin(['COMPLETED', 'PENDING'])]
print('Duplicated rows: ' + str(cleaned_retail.duplicated().sum()))
cleaned_retail.drop_duplicates(inplace=True)

Duplicated rows: 280


In [5]:
cleaned_retail.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51810 entries, 0 to 120440
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   CUSTOMER_DIM_KEY   51810 non-null  int64  
 1   PRODUCT_DIM_KEY    51810 non-null  int64  
 2   QUANTITY           51810 non-null  float64
 3   PRODUCT_NAME       51810 non-null  object 
 4   COLOR              51810 non-null  object 
 5   PRODUCT_TYPE       51810 non-null  object 
 6   CATEGORY           51810 non-null  object 
 7   BRAND              51810 non-null  object 
 8   RESOLUTION_STATUS  51810 non-null  object 
 9   ORDER_DATE_KEY     51810 non-null  int64  
dtypes: float64(1), int64(3), object(6)
memory usage: 4.3+ MB


In [6]:
def create_matrix(cleaned_retail, row_feature, col_feature):
    cleaned_retail[row_feature] = cleaned_retail[row_feature].astype(int) # Convert to int for customer ID
    cleaned_retail = cleaned_retail[[col_feature, 'QUANTITY', row_feature]] # Get rid of unnecessary info
    grouped_cleaned = cleaned_retail.groupby([row_feature, col_feature]).sum().reset_index() # Group together
    grouped_cleaned.QUANTITY.loc[grouped_cleaned.QUANTITY == 0] = 1 # Replace a sum of zero purchases with a one to
    # indicate purchased
    grouped_purchased = grouped_cleaned.query('QUANTITY > 0') # Only get customers where purchase totals were positive
    
    customers = list(np.sort(grouped_purchased[row_feature].unique())) # Get our unique customers
    products = list(np.sort(grouped_purchased[col_feature].unique())) # Get our unique products that were purchased
    quantity = list(grouped_purchased.QUANTITY) # All of our purchases
    #print(products)
    rows = grouped_purchased[row_feature].astype('category', CategoricalDtype(categories = customers)).cat.codes 
    # Get the associated row indices
    cols = grouped_purchased[col_feature].astype('category', CategoricalDtype(categories = products)).cat.codes 
    # Get the associated column indices
    purchases_sparse = sparse.csr_matrix((quantity, (rows, cols)), shape=(len(customers), len(products)))
    return purchases_sparse, customers, products

In [7]:
purchases_sparse, products, customers = create_matrix(cleaned_retail, 'PRODUCT_DIM_KEY', 'CUSTOMER_DIM_KEY')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [42]:
cleaned_retail['CUSTOMER_DIM_KEY'] = cleaned_retail['CUSTOMER_DIM_KEY'].astype(int) # Convert to int for customer ID
cleaned_retail = cleaned_retail[['CUSTOMER_DIM_KEY', 'QUANTITY', 'PRODUCT_DIM_KEY']] # Get rid of unnecessary info
grouped_cleaned = cleaned_retail.groupby(['PRODUCT_DIM_KEY', 'CUSTOMER_DIM_KEY']).sum().reset_index() # Group together
grouped_cleaned.QUANTITY.loc[grouped_cleaned.QUANTITY == 0] = 1 # Replace a sum of zero purchases with a one to
# indicate purchased
grouped_purchased = grouped_cleaned.query('QUANTITY > 0') # Only get customers where purchase totals were positive
    
customers = list(np.sort(grouped_purchased['CUSTOMER_DIM_KEY'].unique())) # Get our unique customers
products = list(np.sort(grouped_purchased['PRODUCT_DIM_KEY'].unique())) # Get our unique products that were purchased
quantity = list(grouped_purchased.QUANTITY) # All of our purchases
#print(products)
rows = grouped_purchased['CUSTOMER_DIM_KEY'].astype('category', CategoricalDtype(categories = customers)).cat.codes 
# Get the associated row indices
cols = grouped_purchased['PRODUCT_DIM_KEY'].astype('category', CategoricalDtype(categories = products)).cat.codes 
# Get the associated column indices

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [37]:
purchases_sparse = sparse.csr_matrix((quantity, (rows, cols)), shape=(len(customers), len(products)))

In [43]:
customers_arr = np.array(customers) # Array of customer IDs from the ratings matrix
products_arr = np.array(products) # Array of product IDs from the ratings matrix

In [51]:
products_arr

array([ 98053,  98125,  98181, ..., 606460, 606598, 608932], dtype=int64)

In [50]:
pop_items = np.array(purchases_sparse.sum(axis = 0)).reshape(-1) # Get sum of item iteractions to find most popular
pop_items

array([ 1.,  1., 16., ...,  1.,  2.,  1.])

In [61]:
cleaned_retail.loc[cleaned_retail['PRODUCT_DIM_KEY']==98181]

Unnamed: 0,CUSTOMER_DIM_KEY,QUANTITY,PRODUCT_DIM_KEY
21104,50251,2.0,98181
29365,47059,12.0,98181
100070,74134,1.0,98181
119869,83076,1.0,98181


In [41]:
item_id = 98053
purchased_items = cleaned_retail.loc[cleaned_retail.PRODUCT_DIM_KEY.isin([item_id])]
item_ind = np.where(products_arr == item_id)
print(item_ind)

(array([1], dtype=int64),)


In [1]:
cleaned_retail

NameError: name 'cleaned_retail' is not defined

In [27]:
len(pop_items)

10

In [None]:
def get_sim_item(item_id):
    purchased_items = item_lookup.loc[item_lookup.PRODUCT_DIM_KEY.isin([item_id])]
    item_ind = np.where(products_arr == item_id)
    print(purchased_items)
    arr = model.similar_items(item_ind[0][0], N=20)
    rec_keys = [] # start empty list to store items
    descriptions = []
    for index in arr:
        key = products_arr[index[0]]
        rec_keys.append(key)
        descriptions.append(item_lookup.PRODUCT_NAME.loc[item_lookup.PRODUCT_DIM_KEY == key].iloc[0])
        
    scores = [item[1] for item in arr]
    final_frame = pd.DataFrame({'PRODUCT_DIM_KEY': rec_keys, 'PRODUCT_NAME': descriptions, 'SCORE': scores}) # Create a dataframe 
    final_frame[['PRODUCT_DIM_KEY', 'PRODUCT_NAME','SCORE']]
    return final_frame[['PRODUCT_DIM_KEY', 'PRODUCT_NAME','SCORE']] # Switch order of columns around