In [1]:
pip install pandas


Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np


# Location of the merged Ta Feng transaction log.
# NOTE(review): this is an absolute path on the author's machine; edit it here
# when running the notebook anywhere else.
DATA_PATH = '/Users/deepak/Desktop/study material/mtech:sem1/sem 2/RecSyst/ta_feng_all_months_merged.csv'
SAMPLE_SIZE = 50000  # number of transaction rows drawn for the experiments
SAMPLE_SEED = 42     # fixed seed so the same rows are drawn on every run

df = pd.read_csv(DATA_PATH)

# Cap the request at the frame length: DataFrame.sample raises ValueError when
# n is larger than the number of rows and sampling is without replacement.
df_sample = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SAMPLE_SEED)

In [3]:
# Show the first rows of the full (unsampled) frame to inspect its columns.
df.head()

Unnamed: 0,TRANSACTION_DT,CUSTOMER_ID,AGE_GROUP,PIN_CODE,PRODUCT_SUBCLASS,PRODUCT_ID,AMOUNT,ASSET,SALES_PRICE
0,11/1/2000,1104905,45-49,115,110411,4710199010372,2,24,30
1,11/1/2000,418683,45-49,115,120107,4710857472535,1,48,46
2,11/1/2000,1057331,35-39,115,100407,4710043654103,2,142,166
3,11/1/2000,1849332,45-49,Others,120108,4710126092129,1,32,38
4,11/1/2000,1981995,50-54,115,100205,4710176021445,1,14,18


In [4]:

# Build the cleaned sample in one chained pass:
#   1. parse TRANSACTION_DT into datetimes,
#   2. order rows by customer, then by date,
#   3. keep rows with a positive AMOUNT and only the four columns used later,
#   4. drop rows with missing values and renumber the index from 0.
df_sample = (
    df_sample
    .assign(TRANSACTION_DT=lambda frame: pd.to_datetime(frame['TRANSACTION_DT']))
    .sort_values(by=['CUSTOMER_ID', 'TRANSACTION_DT'])
    .loc[
        lambda frame: frame['AMOUNT'] > 0,
        ['TRANSACTION_DT', 'CUSTOMER_ID', 'PRODUCT_ID', 'AMOUNT'],
    ]
    .dropna()
    .reset_index(drop=True)
)

df_sample.head()


Unnamed: 0,TRANSACTION_DT,CUSTOMER_ID,PRODUCT_ID,AMOUNT
0,2000-11-27,1113,4710905340113,1
1,2001-02-04,1250,4718585391203,2
2,2000-12-04,1359,5010415080073,2
3,2001-01-24,1823,78698703015,1
4,2000-12-02,2189,4711713370378,1


In [5]:

# Map every raw customer / product id to a dense 0-based index, numbered in
# order of first appearance in the (sorted) sample.
customer_id_mapping = {
    raw_id: position
    for position, raw_id in enumerate(df_sample['CUSTOMER_ID'].unique())
}
product_id_mapping = {
    raw_id: position
    for position, raw_id in enumerate(df_sample['PRODUCT_ID'].unique())
}

# Attach the dense indices as two new columns.
df_sample = df_sample.assign(
    CUSTOMER_INDEX=df_sample['CUSTOMER_ID'].map(customer_id_mapping),
    PRODUCT_INDEX=df_sample['PRODUCT_ID'].map(product_id_mapping),
)

# One basket = all product indices a customer has on the same transaction date.
grouped = df_sample.groupby(['CUSTOMER_INDEX', 'TRANSACTION_DT'])
baskets = (
    grouped['PRODUCT_INDEX']
    .apply(list)
    .reset_index(name='BASKET')
)

print(baskets.head())


   CUSTOMER_INDEX TRANSACTION_DT  BASKET
0               0     2000-11-27     [0]
1               1     2001-02-04     [1]
2               2     2000-12-04     [2]
3               3     2001-01-24     [3]
4               4     2000-12-02  [4, 5]


In [6]:
import numpy as np

# Define the sigmoid function
def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)), elementwise.

    Parameters
    ----------
    x : scalar or numpy array
        Input value(s).

    Returns
    -------
    numpy scalar or array of the same shape as ``x`` with values in [0, 1].

    Notes
    -----
    Evaluating ``np.exp(-x)`` directly overflows (RuntimeWarning) once ``x``
    drops below roughly -709 in float64. ``np.logaddexp(0, -x)`` computes
    ``log(1 + exp(-x))`` without forming that intermediate, so very negative
    inputs return 0.0 with no warning.
    """
    return np.exp(-np.logaddexp(0.0, -x))

def bfm_score(user_index, item_index, basket_indices, user_factors, item_factors,
              global_bias, user_biases, item_biases, gammas):
    """
    Compute the score for a given user, target item, and basket of items using the BFM model.

    The pre-sigmoid score is

        global_bias + user_biases[u] + item_biases[i]
        + gamma1 * <user, target>
        + gamma2 * sum_b <target, basket_item_b>
        + gamma3 * sum_{a < b} <basket_item_a, basket_item_b>
        + gamma4 * sum_b <user, basket_item_b>

    Parameters
    ----------
    user_index : int
        Row of ``user_factors`` / entry of ``user_biases`` for the user.
    item_index : int
        Row of ``item_factors`` / entry of ``item_biases`` for the target item.
    basket_indices : sequence of int
        Item indices currently in the basket; may be empty.
    user_factors, item_factors : array-like
        Latent factor matrices, indexed by row (assumed 2-D, one row per
        user / item -- TODO confirm against the training code).
    global_bias : float
    user_biases, item_biases : indexable of float
    gammas : dict
        Weights under the keys 'gamma1' .. 'gamma4'.

    Returns
    -------
    float
        ``sigmoid(score)``.
    """
    item_matrix = np.asarray(item_factors)
    user_vec = user_factors[user_index]
    target_vec = item_matrix[item_index]

    # Rows of the items in the basket; shape (0, k) for an empty basket, whose
    # column sum is then a zero vector and every basket term evaluates to 0.
    basket_vecs = item_matrix[np.asarray(basket_indices, dtype=np.intp)]
    basket_sum = basket_vecs.sum(axis=0)

    user_item_interaction = np.dot(user_vec, target_vec)
    basket_target_interaction = np.dot(target_vec, basket_sum)
    user_basket_interaction = np.dot(user_vec, basket_sum)

    # Previously hard-coded to 0, which silently ignored any non-zero gamma3.
    # Sum over unordered pairs via the identity
    #   sum_{a<b} <v_a, v_b> = (||sum_a v_a||^2 - sum_a ||v_a||^2) / 2
    basket_basket_interaction = 0.5 * (
        np.dot(basket_sum, basket_sum) - np.sum(basket_vecs * basket_vecs)
    )

    score = global_bias + user_biases[user_index] + item_biases[item_index] + \
            gammas['gamma1'] * user_item_interaction + \
            gammas['gamma2'] * basket_target_interaction + \
            gammas['gamma3'] * basket_basket_interaction + \
            gammas['gamma4'] * user_basket_interaction

    return sigmoid(score)



In [7]:
from sklearn.model_selection import train_test_split

# Weights of the four BFM interaction terms; adjust to the model variant in use.
# With these values only the user-target and basket-target terms contribute.
gammas = dict(gamma1=1, gamma2=1, gamma3=0, gamma4=0)

# Hold out 20% of the baskets (fixed seed, so the split is repeatable).
train_baskets, test_baskets = train_test_split(
    baskets,
    test_size=0.2,
    random_state=42,
)

# Placeholder latent factors and biases: random draws standing in for trained
# parameters. Replace them with the actual trained values.
num_users = df_sample['CUSTOMER_ID'].nunique()
num_items = df_sample['PRODUCT_ID'].nunique()
num_factors = 8

# The draws below share one seeded global stream, so their order matters.
np.random.seed(42)
user_factors = np.random.normal(loc=0, scale=0.1, size=(num_users, num_factors))
item_factors = np.random.normal(loc=0, scale=0.1, size=(num_items, num_factors))
user_biases = np.random.normal(loc=0, scale=0.1, size=num_users)
item_biases = np.random.normal(loc=0, scale=0.1, size=num_items)
global_bias = np.random.normal()

In [8]:
# (rows, columns) of the train and test basket frames produced by the split.
train_baskets.shape, test_baskets.shape

((29139, 3), (7285, 3))

In [9]:
import numpy as np
import pandas as pd


def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)), elementwise.

    Accepts a scalar or a numpy array and returns a value (or array of the
    same shape) in [0, 1].

    ``np.exp(-x)`` overflows with a RuntimeWarning once ``x`` drops below
    roughly -709 in float64. ``np.logaddexp(0, -x)`` evaluates
    ``log(1 + exp(-x))`` without forming that intermediate, so very negative
    inputs return 0.0 with no warning.
    """
    return np.exp(-np.logaddexp(0.0, -x))


def bfm_score_updated(user_index, item_index, basket_indices, user_factors, item_factors,
                      global_bias, user_biases, item_biases, gammas):
    """Score a target item for a user given the items already in the basket.

    Adds the global, user and item biases to four gamma-weighted sums of
    latent-factor dot products (user-target, target-basket, basket-basket over
    unordered pairs, user-basket) and passes the total through ``sigmoid``.

    ``basket_indices`` must support iteration and slicing (e.g. a list);
    ``gammas`` is read under the keys 'gamma1' .. 'gamma4'.
    """
    user_vec = user_factors[user_index]
    target_vec = item_factors[item_index]

    user_target = np.dot(user_vec, target_vec)

    # Target item against each basket member.
    target_basket = 0
    for member in basket_indices:
        target_basket += np.dot(target_vec, item_factors[member])

    # Every unordered pair of basket members, each pair counted once.
    within_basket = 0
    for position, first in enumerate(basket_indices):
        for second in basket_indices[position + 1:]:
            within_basket += np.dot(item_factors[first], item_factors[second])

    # User against each basket member.
    user_basket = 0
    for member in basket_indices:
        user_basket += np.dot(user_vec, item_factors[member])

    # Accumulate left to right: biases first, then the weighted terms in
    # gamma1..gamma4 order.
    score = global_bias + user_biases[user_index] + item_biases[item_index]
    weighted_terms = (
        ('gamma1', user_target),
        ('gamma2', target_basket),
        ('gamma3', within_basket),
        ('gamma4', user_basket),
    )
    for key, term in weighted_terms:
        score = score + gammas[key] * term

    return sigmoid(score)


def _bfm_logits_all_items(user_idx, basket, user_factors, item_factors,
                          global_bias, user_biases, item_biases, gammas):
    """Pre-sigmoid BFM score of every item as the target, for one (user, basket)."""
    user_vec = user_factors[user_idx]
    # Shape (0, k) for an empty basket; its column sum is then a zero vector.
    basket_vecs = item_factors[np.asarray(basket, dtype=np.intp)]
    basket_sum = basket_vecs.sum(axis=0)

    # sum_{a<b} <v_a, v_b> = (||sum_a v_a||^2 - sum_a ||v_a||^2) / 2
    within_basket = 0.5 * (basket_sum @ basket_sum - np.sum(basket_vecs * basket_vecs))

    return (global_bias + user_biases[user_idx] + item_biases
            + gammas['gamma1'] * (item_factors @ user_vec)
            + gammas['gamma2'] * (item_factors @ basket_sum)
            + gammas['gamma3'] * within_basket
            + gammas['gamma4'] * (user_vec @ basket_sum))


def calculate_hlu(user_factors, item_factors, user_biases, item_biases, global_bias, test_baskets, gammas, beta=5, C=100):
    """Average half-life utility of the basket items over ``test_baskets``.

    For each row, every item is scored as a candidate target given the row's
    user and basket, and the items are ranked by descending score. Each item
    in the basket then contributes ``C * 0.5 ** ((rank - 1) / (beta - 1))``.
    The contributions are summed over all rows and divided by the number of rows.

    Scores for all items are computed with matrix products instead of one
    Python call per item. They are the pre-sigmoid BFM scores; the sigmoid is
    monotonic, so skipping it leaves the ranking unchanged.

    NOTE(review): the basket used as scoring context contains the very items
    whose rank is measured. Confirm this is the intended evaluation protocol.

    Parameters
    ----------
    user_factors, item_factors : array-like, one row per user / item
    user_biases, item_biases : array-like, one entry per user / item
    global_bias : float
    test_baskets : pandas.DataFrame
        Must provide the columns 'CUSTOMER_INDEX' and 'BASKET' (a list of
        item indices per row).
    gammas : dict
        Weights under the keys 'gamma1' .. 'gamma4'.
    beta : number, default 5
        Half-life parameter; the contribution halves every ``beta - 1`` ranks.
    C : number, default 100
        Scale factor applied to every contribution.

    Returns
    -------
    float
        The averaged utility, or 0.0 when ``test_baskets`` has no rows.

    Raises
    ------
    ZeroDivisionError
        If ``beta == 1`` (the decay exponent would divide by zero).
    """
    if beta == 1:
        raise ZeroDivisionError("beta must differ from 1: the exponent divides by (beta - 1)")

    num_baskets = test_baskets.shape[0]
    if num_baskets == 0:
        # Nothing to evaluate; avoid dividing by zero below.
        return 0.0

    user_factors = np.asarray(user_factors)
    item_factors = np.asarray(item_factors)
    user_biases = np.asarray(user_biases)
    item_biases = np.asarray(item_biases)

    num_items = item_factors.shape[0]
    positions = np.arange(1, num_items + 1)  # rank 1 = highest score

    hlu_sum = 0.0
    for user_idx, basket in zip(test_baskets['CUSTOMER_INDEX'], test_baskets['BASKET']):
        logits = _bfm_logits_all_items(user_idx, basket, user_factors, item_factors,
                                       global_bias, user_biases, item_biases, gammas)

        sorted_indices = np.argsort(logits)[::-1]  # Descending order

        # Invert the permutation: ranks[item] is the 1-based rank of that item.
        ranks = np.empty(num_items, dtype=np.intp)
        ranks[sorted_indices] = positions

        basket_ranks = ranks[np.asarray(basket, dtype=np.intp)]
        hlu_sum += C * float(np.sum(0.5 ** ((basket_ranks - 1) / (beta - 1))))

    return hlu_sum / num_baskets

# Score the held-out baskets with the default beta and C.
# NOTE(review): the factors and biases passed here were drawn with
# np.random.normal in an earlier cell and no training step is visible in this
# notebook, so the printed value reflects random parameters, not a fitted model.
hlu_score = calculate_hlu(user_factors, item_factors, user_biases, item_biases, global_bias, test_baskets, gammas)
print(f'HLU: {hlu_score}')


HLU: 0.914109646982235
