In [1]:
import os
import tqdm
import seaborn as sns

import pandas as pd
import numpy as np
from lightfm import LightFM
from lightfm.data import Dataset

# Import LightFM's evaluation metrics
from lightfm.evaluation import precision_at_k

%matplotlib inline
SEED = 42
np.random.seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)



In [2]:
K = 50 #recall number
EPOCHS = 10

# model learning rate
LEARNING_RATE = 0.15
# no of latent factors
NO_COMPONENTS = 20

# no of threads to fit model
NO_THREADS = 32
# regularisation for both user and item features
ITEM_ALPHA=1e-6
USER_ALPHA=1e-6

In [3]:
main_dir = ".dataset"
user = pd.read_csv(main_dir+"/user.csv")
item = pd.read_csv(main_dir+"/item.csv", dtype={'article_id': int})
train = pd.read_csv(main_dir+'/transaction.csv', usecols=['t_dat','customer_id','item_id'] ,dtype={'article_id': int}, parse_dates=['t_dat'])

In [4]:
train['article_id'] = train['item_id']

In [5]:
train.head(2)

Unnamed: 0,t_dat,customer_id,item_id,article_id
0,2019-09-01,000f7535bdc611ad136a9f04746d6b1431f50a7f60fbbe...,727880001,727880001
1,2019-09-01,000f7535bdc611ad136a9f04746d6b1431f50a7f60fbbe...,767869001,767869001


In [5]:
item.head(2)

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.


In [6]:
user.head(2)

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,,,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,,,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...


In [5]:
train['t_dat'] = pd.to_datetime(train['t_dat'])
last_date = train['t_dat'].max()
first_date = train['t_dat'].min()
print(first_date,last_date)

2019-09-01 00:00:00 2020-09-22 00:00:00


In [6]:
# calculate week number
train['week'] = (last_date - pd.to_datetime(train['t_dat'])).dt.days // 7

In [7]:
# set item features
def create_item_features(row):
    features = [
        f"product_code:{row['product_code']}",
        f"section_no:{row['section_no']}",
        f"colour_group_code:{row['colour_group_code']}",
        f"perceived_colour_value_id:{row['perceived_colour_value_id']}",
        f"index_code:{row['index_code']}",
        f"product_type_no:{row['product_type_no']}",
        f"department_no:{row['department_no']}",
        f"garment_group_no:{row['garment_group_no']}",
        f"graphical_appearance_no:{row['graphical_appearance_no']}",
    ]
    return features


item['features'] = item.apply(create_item_features, axis=1)


In [8]:
all_features = set()
item['features'].apply(lambda x: all_features.update(x))

0        None
1        None
2        None
3        None
4        None
         ... 
70936    None
70937    None
70938    None
70939    None
70940    None
Name: features, Length: 70941, dtype: object

In [9]:
# set item features
def create_user_features(row):
    features = [
        f"FN:{row['FN']}",
        f"Active:{row['Active']}",
        f"club_member_status:{row['club_member_status']}",
        f"fashion_news_frequency:{row['fashion_news_frequency']}",
        f"age:{row['age']}",
        f"postal_code:{row['postal_code']}",
      
    ]
    return features

user['features'] = user.apply(create_user_features, axis=1)
all_user_features = set()
user['features'].apply(lambda x: all_user_features.update(x))


0         None
1         None
2         None
3         None
4         None
          ... 
594376    None
594377    None
594378    None
594379    None
594380    None
Name: features, Length: 594381, dtype: object

In [10]:
listBin = [-1, 19, 29, 39, 49, 59, 69, 119]
user['age'] = pd.cut(user['age'], listBin)

In [11]:
dataset = Dataset()
dataset.fit(users=user['customer_id'], 
            user_features=(x for x in all_user_features),
            items=item['article_id'],
            item_features=(x for x in all_features))
num_users, num_topics = dataset.interactions_shape()
print(f'Number of users: {num_users}, Number of topics: {num_topics}.')

Number of users: 594381, Number of topics: 70941.


In [12]:
#build item & user Features
item_features_data = ((row['article_id'], row['features']) for index, row in item.iterrows())
item_features = dataset.build_item_features(item_features_data)
user_features_data = ((row['customer_id'], row['features']) for index, row in user.iterrows())
user_features = dataset.build_user_features(user_features_data)

In [None]:
from tqdm.notebook import tqdm
tqdm.pandas()

def generate_predictions(dataset, model, val_set, K, batch_size=100):
    """
    Optimized function to generate top K predictions for all users in the validation set.

    Args:
        dataset: The dataset object containing mappings.
        model: The trained recommendation model.
        val_set: The validation set containing customer IDs.
        K: The number of top items to predict.
        batch_size: The number of user IDs to process in each batch.

    Returns:
        DataFrame: The validation set with an additional column for predictions.
    """

    uid_map, _, iid_map, _ = dataset.mapping()
    inv_uid_map = {v: k for k, v in uid_map.items()}
    inv_iid_map = {v: k for k, v in iid_map.items()}
    
    # Preparing item data
    all_item_ids_model = list(iid_map.values())
    item_ids = np.array(all_item_ids_model)
    
    predictions = []

    customer_ids = val_set['customer_id'].unique()
    num_batches = len(customer_ids) // batch_size + (1 if len(customer_ids) % batch_size != 0 else 0)
    
    for i in tqdm(range(num_batches), desc="Predicting"):
        start_index = i * batch_size
        end_index = start_index + batch_size
        batch_customer_ids = customer_ids[start_index:end_index]
        
        batch_user_ids_model = np.array([uid_map[cid] for cid in batch_customer_ids])
        
        # Repeating user IDs for each item
        user_ids = np.repeat(batch_user_ids_model, len(all_item_ids_model))
        item_ids_batch = np.tile(item_ids, len(batch_customer_ids))
        
        batch_predictions = model.predict(
            user_ids=user_ids,
            item_ids=item_ids_batch,
            user_features=user_features,
            item_features=item_features,
            num_threads=16
        )
        
        
        batch_predictions = batch_predictions.reshape(len(batch_customer_ids), len(all_item_ids_model))
        
        # Extracting top K items for each user in the batch
        for user_predictions in batch_predictions:
            top_k_item_indices = np.argsort(-user_predictions)[:K]
            top_k_item_ids_model = item_ids[top_k_item_indices]
            top_k_item_ids_original = [inv_iid_map[item_id] for item_id in top_k_item_ids_model]
            prediction_str = ' '.join(str(item_id) for item_id in top_k_item_ids_original)
            predictions.append(prediction_str)
    
    val_set['prediction'] = predictions
    return val_set



In [13]:
from joblib import dump, load

In [14]:
recall_weeks = [0,1,2,3,4]
week_nums = 23

In [15]:
#train_model
for week in recall_weeks:
    #For the training of the recall model, we can include this week's actual order data, 
    # because our goal is just to find similar items to recommend as candidates.
    train_set = train[(train['week']>=week) & (train['week']<=(week+week_nums))]
    train_set = train_set[['customer_id','article_id']]
    #However, the label is prepared for the ranking model, so it cannot include data from this week or in the future.
    label_set = train[train['week']==week]
    label_set_grouped = label_set.groupby('customer_id')['article_id'].agg(list).reset_index()
    label_set = label_set.drop('article_id', axis=1).merge(label_set_grouped, on='customer_id')
    label_set['label'] = label_set['article_id'].apply(lambda x:list(set(x)))
    label_set = label_set.drop(['item_id','t_dat','article_id'],axis=1).drop_duplicates(subset=['customer_id', 'week'])
    #start to train the recall model and predict the candidates.
    #build interactions for CF algorithm
    (interactions, weights) = dataset.build_interactions(train_set.values)
    #(val_interactions, val_weights) = dataset.build_interactions(val_set.iloc[:, 1:3].values)
    print(interactions.shape)
    model = LightFM(loss='warp', no_components=NO_COMPONENTS, 
                 learning_rate=LEARNING_RATE,                 
                 random_state=np.random.RandomState(SEED))
    model.fit(interactions=interactions, item_features=item_features,user_features=user_features,epochs=EPOCHS, verbose=1)  
    dump(model, f'uiCF_recall_{week}.joblib')

    
        
    

(594381, 70941)


Epoch: 100%|██████████| 10/10 [07:21<00:00, 44.14s/it]


(594381, 70941)


Epoch: 100%|██████████| 10/10 [07:28<00:00, 44.83s/it]


(594381, 70941)


Epoch: 100%|██████████| 10/10 [07:29<00:00, 44.92s/it]


(594381, 70941)


Epoch: 100%|██████████| 10/10 [06:52<00:00, 41.29s/it]


(594381, 70941)


Epoch: 100%|██████████| 10/10 [06:20<00:00, 38.05s/it]
