In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
from lightfm import LightFM 
from lightfm.data import Dataset
from lightfm.evaluation import auc_score, precision_at_k
import time
from collections import defaultdict
import csv
import tqdm
import pickle
import random
random.seed(42)

In [2]:
# Load the purchase log. article_id is forced to str and t_dat is parsed as a datetime.
transactions = pd.read_csv(
    './data/transactions_train.csv',
    parse_dates=['t_dat'],
    dtype={'article_id': str},
)

In [3]:
# Customer metadata table, read with pandas' default dtype inference.
customers_csv = './data/customers.csv'
customers = pd.read_csv(customers_csv)

In [4]:
# Article metadata table; article_id is forced to str so it is not inferred as a number.
articles = pd.read_csv(
    './data/articles.csv',
    dtype={'article_id': str},
)

In [5]:
# Missing 'Active' values become 0. The result is assigned back to the frame instead of
# calling fillna(inplace=True) on the selected column: that chained in-place form raises
# a FutureWarning on pandas 2.x and no longer updates `customers` under Copy-on-Write.
customers['Active'] = customers['Active'].fillna(0)

In [6]:
# Missing 'FN' values become 0. Assigned back explicitly: fillna(inplace=True) on a
# selected column is a chained in-place update, which raises a FutureWarning on
# pandas 2.x and leaves `customers` unchanged under Copy-on-Write.
customers['FN'] = customers['FN'].fillna(0)

In [7]:
# Missing newsletter frequencies become the explicit 'NONE' category. Assigned back
# rather than filled in place on the selected column, because the chained in-place form
# raises a FutureWarning on pandas 2.x and is a no-op under Copy-on-Write.
customers['fashion_news_frequency'] = customers['fashion_news_frequency'].fillna('NONE')

In [8]:
# Fold the 'None' spelling into 'NONE' so both refer to one category. The replaced
# column is assigned back to the frame: replace(inplace=True) on an attribute-selected
# column is a chained in-place update that pandas' Copy-on-Write mode does not
# propagate to `customers`.
customers['fashion_news_frequency'] = customers['fashion_news_frequency'].replace(
    to_replace={'None': 'NONE'}
)

In [9]:
# Missing club statuses are filled with 'LEFT CLUB'. Assigned back explicitly instead of
# fillna(inplace=True) on an attribute-selected column, which is a chained in-place
# update (FutureWarning on pandas 2.x, no effect on `customers` under Copy-on-Write).
customers['club_member_status'] = customers['club_member_status'].fillna('LEFT CLUB')

In [10]:
def get_user_features(df, columns):
    """Collect the distinct ``"<column>:<value>"`` tokens found in ``df``.

    Args:
        df: DataFrame that contains every column named in ``columns``.
        columns: Iterable of column names to scan.

    Returns:
        A set with one ``f'{column}:{value}'`` string per distinct value of
        each listed column; an empty set when ``columns`` is empty.
    """
    return {
        f'{name}:{distinct_value}'
        for name in columns
        for distinct_value in df[name].unique()
    }

In [11]:
# Vocabulary of "<column>:<value>" tokens for the three customer columns used as user features.
user_feature_columns = ['fashion_news_frequency', 'club_member_status', 'Active']
user_features = get_user_features(customers, user_feature_columns)

In [12]:
def get_item_features(df, columns):
    """Return every distinct ``"<column>:<value>"`` token for the given columns.

    Args:
        df: DataFrame that contains every column named in ``columns``.
        columns: Iterable of column names to scan.

    Returns:
        A set of ``f'{column}:{value}'`` strings, one per distinct value of
        each listed column; an empty set when ``columns`` is empty.
    """
    tokens = set()
    for name in columns:
        tokens.update(f'{name}:{distinct_value}' for distinct_value in df[name].unique())
    return tokens

In [13]:
# Register tokens only for the article columns that are attached to items when the
# item-feature matrix is built. Passing every column of `articles` (including the
# article_id key itself) registers one token per distinct value that no item ever
# carries; each registered token presumably still costs an embedding row in the
# model — confirm against the LightFM Dataset documentation.
item_feature_columns = [
    'product_type_no',
    'product_group_name',
    'colour_group_code',
    'perceived_colour_value_id',
    'perceived_colour_master_id',
    'department_no',
    'index_code',
    'index_group_no',
    'section_no',
    'garment_group_no',
]
item_features = get_item_features(articles, item_feature_columns)

In [14]:
# Fit the LightFM Dataset on all customer ids, all article ids and both feature vocabularies.
dataset = Dataset()
dataset.fit(
    users=customers['customer_id'],
    items=articles['article_id'],
    item_features=item_features,
    user_features=user_features,
)

In [15]:
# Date-based split: 2020-08-15..2020-09-15 for training, 2020-09-16..2020-09-22 for validation
# (both bounds inclusive).
purchase_date = transactions.t_dat
in_train_window = (purchase_date >= '2020-8-15') & (purchase_date <= '2020-9-15')
in_val_window = (purchase_date >= '2020-9-16') & (purchase_date <= '2020-9-22')
train_set = transactions[in_train_window]
val_set = transactions[in_val_window]

In [16]:
# Build the (customer, article) interaction matrices for both splits.
# The two id columns are selected by name; the previous positional slice
# iloc[:, 1:3] silently depended on the column order of the transactions CSV.
pair_columns = ['customer_id', 'article_id']
(interactions, weights) = dataset.build_interactions(train_set[pair_columns].values)
(val_interactions, val_weights) = dataset.build_interactions(val_set[pair_columns].values)

In [17]:
# Attach the three "<column>:<value>" tokens to every customer.
# The needed columns are iterated directly with zip() instead of DataFrame.iterrows():
# iterrows() builds a full Series per row (slow on a large table) and upcasts each row
# to a common dtype, so a value could be formatted differently from the token that was
# registered from the column's own unique() values.
# NOTE: this rebinds `user_features` from the token vocabulary to the feature matrix
# that the later fit/evaluation cells consume.
user_features = dataset.build_user_features([
    (
        customer_id,
        [
            f'fashion_news_frequency:{news_frequency}',
            f'club_member_status:{club_status}',
            f'Active:{active}',
        ],
    )
    for customer_id, news_frequency, club_status, active in zip(
        customers['customer_id'],
        customers['fashion_news_frequency'],
        customers['club_member_status'],
        customers['Active'],
    )
])

In [18]:
# Attach one "<column>:<value>" token per listed attribute to every article.
# NOTE: this rebinds `item_features` from the token vocabulary to the feature matrix
# that the later fit/evaluation cells consume.
item_feature_columns = (
    'product_type_no',
    'product_group_name',
    'colour_group_code',
    'perceived_colour_value_id',
    'perceived_colour_master_id',
    'department_no',
    'index_code',
    'index_group_no',
    'section_no',
    'garment_group_no',
)
item_features = dataset.build_item_features([
    (article['article_id'], [f'{column}:{article[column]}' for column in item_feature_columns])
    for _, article in articles.iterrows()
])

In [19]:
# WARP-loss LightFM model driven by a RandomState seeded with 42.
seeded_rng = np.random.RandomState(42)
model = LightFM(loss='warp', random_state=seeded_rng)

In [20]:
# Override the hyper-parameters on the already-constructed model, then train it.
model.set_params(
    no_components=100,
    item_alpha=1e-6,
    user_alpha=1e-6,
    learning_rate=0.25,
)
model.fit(
    interactions=interactions,
    user_features=user_features,
    item_features=item_features,
    epochs=1000,
    verbose=1,
    num_threads=6,
)

# Mean of the per-user precision@12 values computed on the validation interactions.
precision_per_user = precision_at_k(
    model,
    val_interactions,
    user_features=user_features,
    item_features=item_features,
    k=12,
    num_threads=6,
)
val_precision = precision_per_user.mean()
print(model.get_params(), val_precision)

Epoch:   1%|██▋                                                                                                                                                                                                                                                                          | 10/1000 [00:56<1:32:56,  5.63s/it]


KeyboardInterrupt: 

In [None]:
# Persist the model object with the highest pickle protocol available.
with open('./models/lightfm_submission_9.pickle', 'wb') as model_file:
    pickle.dump(model, model_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# dataset.mapping() yields four mappings, unpacked here in order:
# user ids, user features, item ids, item features.
id_mappings = dataset.mapping()
uid_map, ufeature_map, iid_map, ifeature_map = id_mappings

In [None]:
# Reverse lookups: mapped value -> original key, for users and for items.
inv_uid_map = {internal_idx: raw_id for raw_id, internal_idx in uid_map.items()}
inv_iid_map = {internal_idx: raw_id for raw_id, internal_idx in iid_map.items()}

In [None]:
# Submission template; its customer_id column drives the prediction cells below.
sample_submission_csv = './data/sample_submission.csv'
sample_submission = pd.read_csv(sample_submission_csv)

In [None]:
# Translate every customer_id of the submission template through uid_map
# (raises KeyError for an id the mapping does not contain).
test_customer_ids = [uid_map[raw_id] for raw_id in sample_submission['customer_id']]

In [None]:
# Popularity fallback: the 12 articles bought by the largest number of distinct
# customers, joined into one space-separated string.
# article_id is deliberately left as the string it was loaded as. Casting the index to
# int would drop any leading zeros, so the fallback ids would no longer match the
# article ids from the CSVs (or the string ids produced via inv_iid_map).
top12 = (
    transactions.groupby('article_id')['customer_id']
    .nunique()
    .sort_values(ascending=False)
    .head(12)
    .index
)
top12 = ' '.join(map(str, top12))

In [None]:
# Set of every customer_id value in the customers table.
# NOTE(review): these are the raw ids from the CSV, not the values stored in uid_map,
# and they cover all customers rather than only those with training transactions —
# confirm this is what the membership test in the prediction loop expects.
train_user_ids = {customer_id for customer_id in customers['customer_id']}

In [None]:
# Produce one prediction string per customer of the submission template.
#
# Fixes relative to the previous version of this cell:
#   * The membership test compared internal integer indices with a set of raw string
#     ids, so it was always true and the top12 fallback branch could never run. The set
#     is now built here from internal indices of customers that have at least one row
#     in train_set: those customers get model scores, everyone else gets top12.
#     NOTE(review): this assumes "personalise for users seen in training, fall back to
#     popularity otherwise" was the intent — confirm.
#   * predict() now receives the same user/item feature matrices the model was fitted
#     with; without them the feature embeddings are ignored at scoring time.
#   * The item index array is loop-invariant and is built once, not once per customer.
item_idx = np.array(list(iid_map.values()), dtype=np.int32)
warm_user_idx = {uid_map[raw_id] for raw_id in train_set['customer_id'].unique()}

customer_ids = []
preds = []
for usr_ in tqdm.tqdm(test_customer_ids, total=len(test_customer_ids)):
    customer_ids.append(inv_uid_map[usr_])
    if usr_ not in warm_user_idx:
        # No training interactions for this customer: use the popularity fallback.
        preds.append(top12)
        continue
    # A scalar user id is scored against every entry of item_idx.
    scores = model.predict(
        usr_,
        item_idx,
        item_features=item_features,
        user_features=user_features,
        num_threads=6,
    )
    # Positions of the 12 highest scores, mapped back to raw article ids via item_idx.
    best_positions = np.argsort(-scores)[:12]
    preds.append(' '.join(inv_iid_map[item_idx[p]] for p in best_positions).strip())

In [None]:
# Turn both result lists into (n, 1) column arrays.
acustomer_ids = np.array(customer_ids)[:, np.newaxis]
apreds = np.array(preds)[:, np.newaxis]

In [None]:
# Place the two column arrays side by side and wrap them in a two-column frame.
submission_rows = np.concatenate((acustomer_ids, apreds), axis=1).reshape(-1, 2)
submission = pd.DataFrame(data=submission_rows, columns=['customer_id', 'prediction'])

In [None]:
# Write the submission file without the DataFrame index column.
submission_path = './submission/lightFM.csv'
submission.to_csv(submission_path, index=False)