In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Input folder of the H&M personalized-fashion-recommendations dataset and its images sub-folder.
data_dir = '../input/h-and-m-personalized-fashion-recommendations'
img_dir = '../input/h-and-m-personalized-fashion-recommendations/images'

In [None]:
# !wget https://github.com/explosion/spacy-models/releases/download/en_vectors_web_lg-2.1.0/en_vectors_web_lg-2.1.0.tar.gz -O en_vectors_web_lg-2.1.0.tar.gz
# !pip install en_vectors_web_lg-2.1.0.tar.gz
# import en_vectors_web_lg

In [None]:
import pandas as pd
import numpy as np
from collections import Counter
import os
import torch
import re
import time
import copy
import math
import pickle

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import warnings
warnings.filterwarnings('ignore')

In [None]:
def apk(actual, predicted, k=10):
    """Average precision at ``k`` for a single ranked prediction list.

    Only the first ``k`` entries of ``predicted`` are scored. An entry counts as
    a hit when it occurs in ``actual`` and has not already appeared earlier in
    the ranking. Returns 0.0 when ``actual`` is empty.
    """
    top_k = predicted[:k] if len(predicted) > k else predicted
    hits = 0.0
    precision_sum = 0.0
    for rank, candidate in enumerate(top_k, start=1):
        # Repeated predictions are only credited the first time they show up.
        if candidate in actual and candidate not in top_k[:rank - 1]:
            hits += 1.0
            precision_sum += hits / float(rank)
    if not actual:
        return 0.0
    return precision_sum / min(len(actual), k)

def mapk(actual, predicted, k=12):
    """Mean average precision at ``k`` over paired truth/prediction lists.

    Pairs whose ``actual`` entry is empty are ignored. When no pair remains
    (every ``actual`` is empty, or the inputs are empty) 0.0 is returned
    instead of the NaN that ``np.mean([])`` would produce.
    """
    scores = [apk(a, p, k) for a, p in zip(actual, predicted) if a]  # CHANGES: ignore null actual (variable=a)
    if not scores:
        return 0.0
    return np.mean(scores)
def calculate_mapk(df):
    """MAP@12 of a frame holding space-separated ids in 'valid_true' and 'prediction'."""
    truth_lists = df['valid_true'].map(lambda ids: ids.split())
    predicted_lists = df['prediction'].map(lambda ids: ids.split())
    return mapk(truth_lists, predicted_lists, k=12)

In [None]:
# Folder passed to get_glove_embedding(), which looks there for cached token/embedding files.
save_dir = '../input/embeddings'
def clean_text(w):
    """Lower-case ``w``, delete punctuation, and turn '-' and '/' into spaces."""
    lowered = w.lower()
    without_punct = re.sub(r"([.,'!?\"()*#:;])", '', lowered)
    return without_punct.replace('-', ' ').replace('/', ' ')

def get_glove_embedding(reviews, data_dir):
    """Return ``(token_to_ix, pretrained_emb)`` for the vocabulary found in ``reviews``.

    If both ``token_to_ix.pkl`` and ``train_glove.npy`` exist in ``data_dir`` they are
    loaded and returned unchanged. Otherwise the vocabulary is built from the cleaned,
    whitespace-split reviews, one vector per token is looked up with the
    ``en_vectors_web_lg`` model, and both artefacts are written to ``data_dir``.

    Index layout of a freshly built vocabulary: 0 = 'PAD' (all-zero vector),
    1 = 'UNK', real tokens from 2 upwards. This matches ``embed_text``, which pads
    with 0 and maps unknown words to 1; row ``i`` of the matrix is the vector of
    the token with index ``i``.

    Args:
        reviews: iterable of raw text strings.
        data_dir: folder holding (or receiving) the two cache files.

    Returns:
        Tuple of a token -> index dict and a 2-D numpy array of vectors.

    Raises:
        ImportError: if the cache is missing and ``en_vectors_web_lg`` is not installed.
    """
    token_file = os.path.join(data_dir, 'token_to_ix.pkl')
    glove_file = os.path.join(data_dir, 'train_glove.npy')
    if os.path.exists(glove_file) and os.path.exists(token_file):
        print("Loading saved embedding")
        # NOTE: unpickling can execute arbitrary code; only load cache files you created.
        with open(token_file, "rb") as fh:
            cached_tokens = pickle.load(fh)
        return cached_tokens, np.load(glove_file)

    # The notebook cell that installs/imports the vector model is commented out,
    # so import it here, where it is actually needed.
    try:
        import en_vectors_web_lg
    except ImportError as exc:
        raise ImportError(
            "en_vectors_web_lg is required to build the embedding cache; "
            "install it (see the commented setup cell) or provide the cached files in "
            + repr(data_dir)
        ) from exc

    spacy_tool = en_vectors_web_lg.load()
    unk_vector = spacy_tool('UNK').vector

    # Reserve 0 for padding and 1 for unknown words so no real token shares their ids.
    token_to_ix = {'PAD': 0, 'UNK': 1}
    pretrained_emb = [np.zeros_like(unk_vector), unk_vector]

    for review in reviews:
        for word in clean_text(review).split():
            if word not in token_to_ix:
                token_to_ix[word] = len(token_to_ix)
                pretrained_emb.append(spacy_tool(word).vector)

    pretrained_emb = np.array(pretrained_emb)
    np.save(glove_file, pretrained_emb)
    with open(token_file, "wb") as fh:
        pickle.dump(token_to_ix, fh)
    return token_to_ix, pretrained_emb

def embed_text(x, max_len, token2ix):
    """Encode text ``x`` as a fixed-length int64 vector of token indices.

    The text is cleaned with ``clean_text`` and split on whitespace; at most
    ``max_len`` tokens are kept. Tokens missing from ``token2ix`` map to 1 and
    unused trailing positions stay 0. ``max_len == 0`` yields an empty array
    instead of raising ``IndexError``.
    """
    ques_ix = np.zeros(max_len, np.int64)
    tokens = clean_text(x).split()[:max_len]
    for ix, word in enumerate(tokens):
        ques_ix[ix] = token2ix.get(word, 1)
    return ques_ix

def tokenize(reviews):
    """Build a token -> index vocabulary from ``reviews``.

    Indices 0, 1 and 2 are reserved for 'PAD', 'UNK' and 'SS'; every new cleaned
    token receives the next free index in order of first appearance.
    """
    vocab = {'PAD': 0, 'UNK': 1, 'SS': 2}
    for review in reviews:
        for token in clean_text(review).split():
            # setdefault keeps the id of tokens that were already seen.
            vocab.setdefault(token, len(vocab))
    return vocab

In [None]:
class Ranking(nn.Module):
    """Three-layer perceptron producing one non-negative score per feature vector.

    ``candidate_size`` is accepted by the constructor but not used by any layer.
    """

    def __init__(self, watch_time_feature_size, hidden_size, candidate_size):
        super().__init__()
        self.fc1 = nn.Linear(watch_time_feature_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, 1)

    def forward(self, src):
        """
        input is (batch_size, n_item, watch_time_feature_size), and output is (batch_size, n_item).
        """
        hidden = src
        # Every layer, including the last one, is followed by a ReLU.
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = F.relu(layer(hidden))
        return hidden.squeeze(-1)

In [None]:
def train_ranking(model, get_batch_iter, batch_size, epochs=5, lr=0.01, log_every=1000):
    """Train ``model`` with Adam and a summed MSE loss.

    Args:
        model: module called as ``model(mini_x)``; its output must match ``mini_label``.
        get_batch_iter: zero-argument callable returning a fresh iterable of
            ``(mini_x, mini_label)`` pairs; it is called once per epoch.
        batch_size: unused, kept for call compatibility.
        epochs: number of passes over the data.
        lr: Adam learning rate.
        log_every: print the mean batch loss every this many iterations.

    Returns:
        The trained ``model`` (same object that was passed in).
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.98), eps=1e-9)
    criterion = nn.MSELoss(reduction='sum')  # todo: use sigmoid cross entropy loss
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        for iter_, (mini_x, mini_label) in enumerate(get_batch_iter()):
            optimizer.zero_grad()
            out = model(mini_x)  # (batch_size, n_item)
            loss = criterion(out, mini_label)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            if (iter_ + 1) % log_every == 0:
                # Average over exactly the iterations accumulated since the last report.
                print(f'epoch: {epoch + 1}, iter: {iter_ + 1}, loss: {running_loss / log_every}')
                running_loss = 0.0
    return model

In [None]:
# tmp = pd.read_csv(os.path.join(data_dir,'transactions_train.csv'), nrows=10000)\
#             .sort_values(by = ['customer_id','article_id'], ascending=True)
# 
# Counter(tmp['article_id']).most_common()[:5]
# set(tmp[tmp['article_id']==685687004]['price']) # the same article_id can appear in transactions with different prices
# set(tmp[tmp['article_id']==685687004]['customer_id'])

### Load Data

In [None]:
import datetime
# Article metadata from the competition folder, ordered by article_id.
articles = pd.read_csv(os.path.join(data_dir,'articles.csv'))\
            .sort_values(by = ['article_id'], ascending=True)
# customers = pd.read_csv(os.path.join(data_dir,'customers.csv')) # , nrows=100000
# transactions = pd.read_csv(os.path.join(data_dir,'transactions_train.csv')) # , nrows=200000

# Pre-sampled customers/transactions are used instead of the full competition files.
customers = pd.read_csv('../input/sample-data/customers_sample.csv')
transactions = pd.read_csv('../input/sample-data/transaction_sample.csv')
# Vectorised parse of the 'YYYY-MM-DD' strings (replaces a per-row strptime call).
transactions['t_dat'] = pd.to_datetime(transactions['t_dat'], format='%Y-%m-%d')

In [None]:
# print(max(transactions['t_dat'])) # 2020-09-22 00:00:00
# transactions = 
# print(len(transactions[transactions['t_dat'] > pd.to_datetime('2020-08-22')])) #1155933
# Optional down-sampling switch (disabled while it is 0). When truthy, only
# transactions dated after 2020-08-22 are kept and the 2000 customers with the
# most remaining transactions are collected into keep_cids.
# NOTE: the following cell reassigns keep_cids unconditionally.
get_sample = 0
if get_sample:
    transactions = transactions[transactions['t_dat'] > pd.to_datetime('2020-08-22')]
    cnter = Counter(transactions['customer_id']).most_common()[:2000]
    keep_cids = [i[0] for i in cnter]
    print(keep_cids[:5], cnter[-1])

In [None]:
# Restrict all three frames to a consistent id universe:
# customers -> transactions of those customers -> articles bought in those transactions.
keep_cids = customers['customer_id'].unique()
# keep_cids = set(transactions['customer_id'].unique())
# keep_cids was just taken from customers, so this filter keeps every row.
customers = customers[customers['customer_id'].isin(keep_cids)]
customers = customers.sort_values(by = ['customer_id'], ascending=True)

# keep_cids = keep_cids.intersection(set(customers['customer_id'].unique()))
transactions = transactions[transactions['customer_id'].isin(keep_cids)]
transactions = transactions.sort_values(by = ['customer_id','article_id'], ascending=True)
print(len(keep_cids))

# Keep only articles that occur in the remaining transactions; the index is reset to 0..n-1.
keep_aid = list(transactions['article_id'].unique())
articles = articles[articles['article_id'].isin(keep_aid)].reset_index(drop=True)
len(keep_aid), len(articles)

In [None]:
# customers.to_csv('customers_large_sample.csv', index = False)
# transactions.to_csv('transactions_large_sample.csv', index = False)

In [None]:
len(customers), len(transactions), len(articles)

In [None]:
# Missing flag/category values become 0 before the categories are encoded as numbers.
_flag_cols = ['FN','Active','club_member_status','fashion_news_frequency']
customers[_flag_cols] = customers[_flag_cols].fillna(0)
print(
    'FN', customers['FN'].unique(),
    '\nclub_member_status', customers['club_member_status'].unique(),
    '\nage', customers['age'].unique(),
    '\nActive', customers['Active'].unique(),
    '\nfashion_news_frequency', customers['fashion_news_frequency'].unique(),
)

# Category label -> numeric code.
customers['club_member_status'] = customers['club_member_status'].replace(
    {'ACTIVE': 1, 'PRE-CREATE': 2, 'LEFT CLUB': 3})

customers['fashion_news_frequency'] = customers['fashion_news_frequency'].replace(
    {'NONE': 0, 'None': 0, 'Regularly': 1, 'Monthly': 2})

_numeric_cols = ['age'] + _flag_cols
customers[_numeric_cols] = customers[_numeric_cols].astype('float64')
customers.head()

In [None]:
# map_cid_to_aid = {cid:list(transactions[transactions['customer_id']==cid]['article_id']) for cid in keep_cids}
# transactions.groupby('customer_id')['article_id'].apply(list).reset_index()

In [None]:
# customers.head()
print(len(customers['customer_id'].unique()), len(customers)) #1371980
print(len(transactions))
transactions.head()

In [None]:
# Text columns concatenated (in this order) into one description string per article.
use_cols = ['prod_name', 'product_type_name', 'product_group_name', 'graphical_appearance_name',
           'colour_group_name', 'perceived_colour_value_name', 'perceived_colour_master_name',
           'department_name', 'index_name', 'index_group_name', 'section_name',
           'garment_group_name', 'detail_desc']

# articles = articles.dropna(subset=use_cols, how='any').reset_index()
# Truncate every column to its 90th-percentile word count; max_len is the sum of those caps.
# NOTE(review): max_len does not count the ' [SEP] ' separators appended below — confirm
# whether the resulting truncation in embed_text is intended.
max_len = 0
for c in use_cols:
    tokens = articles[c].map(lambda value: str(value).split())
    length = int(np.percentile(tokens.map(len).to_numpy(), 90))
    max_len += length
    articles[c] = tokens.map(lambda words: ' '.join(words[:length]))

# Positional row iteration: no dependence on the frame having a 0..n-1 index
# and no per-cell label lookups.
all_article_desp = [
    ''.join(f'{value} [SEP] ' for value in row)
    for row in articles[use_cols].itertuples(index=False, name=None)
]

In [None]:
# Vocabulary + vectors (cached under save_dir), then one index vector per article description.
token2ix, pretrained_emb = get_glove_embedding(all_article_desp, save_dir)
article_emb = np.array([embed_text(text, max_len, token2ix) for text in all_article_desp])

# Lookup tables keyed by article_id; rows of `articles`, `article_emb` and
# `all_article_desp` share the same positional order.
_article_ids = articles['article_id'].to_numpy()
aid2emb = dict(zip(_article_ids, article_emb))
aid2text = dict(zip(_article_ids, all_article_desp))
print(len(articles), len(articles['article_id'].unique()), len(aid2text))#, aid2emb[108775015].shape)
article_emb.shape, pretrained_emb.shape, len(token2ix) # vocab size

In [None]:

# previous model
class CandidateGeneration_basic(nn.Module):
    """Earlier candidate-generation network (labelled "previous model" in the notebook).

    NOTE(review): ``forward`` reads ``context_src``, which is neither a parameter nor
    assigned inside the method, so calling it raises ``NameError`` unless a global of
    that name exists. The commented-out signature suggests ``context_src`` used to be
    the first argument — confirm before reusing this class.
    """
    def __init__(self, embed_item_size, hidden_size):
        super(CandidateGeneration_basic, self).__init__()
        # Sized from the module-level token2ix; this layer is not referenced in forward().
        self.embedding = nn.Embedding(num_embeddings=len(token2ix),
                                      embedding_dim=300)
        # Lifts each scalar personal feature to an embed_item_size vector.
        self.personal_fc = nn.Linear(1, embed_item_size)
        self.fc1 = nn.Linear(embed_item_size, hidden_size)  # noqa: E226
        self.fc2 = nn.Linear(hidden_size, embed_item_size)
        self.fc3 = nn.Linear(embed_item_size, embed_item_size)
        self.dropout = nn.Dropout(0.5)

#     def forward(self, context_src, personal_src, item_src):
    def forward(self, min_input_ids, min_token_type_ids, min_attention_mask, personal_src, item_src):
        """Score every row of ``item_src`` per batch element; returns sigmoid outputs.

        The three ``min_*`` arguments are accepted but not used in this method.
        """
        
        personal_h = self.personal_fc(personal_src)  # (batch_size, n_personal, embed_item_size)
        # NOTE(review): context_src is undefined here (see class docstring).
        h = torch.cat((context_src, personal_h), 1)  # (batch_size, 1+n_personal, embed_item_size)
        
        h = F.relu(self.fc1(h))
        h = self.dropout(h)
        h = F.relu(self.fc2(h))
        h = self.dropout(h)
        personal_context = F.relu(self.fc3(h))  # (batch_size, 1+n_personal, embed_item_size)
        # personal_context is (batch_size, embed_item_size)'s shape average, take inner prod with item_src
#         print(personal_context)
        out = torch.matmul(personal_context.mean(axis=1), item_src.t())  # (batch_size, n_item) = (batch_size, embed_item_size) * (embed_item_size, n_item)  # noqa: E501
        sigmoid = nn.Sigmoid()
        out = sigmoid(out) # (batch_size, n_item)
        return out

## main -- candidate generation

In [None]:
# transactions.groupby(['customer_id', 'article_id']).count().reset_index()
# Dense index <-> id lookups, numbered in order of first appearance in `transactions`.
aid_to_article = dict(enumerate(transactions['article_id'].unique()))
article_to_aid = {article: idx for idx, article in aid_to_article.items()}
# cid_to_article
cid_to_customer = dict(enumerate(transactions['customer_id'].unique()))
customer_to_cid = {customer: idx for idx, customer in cid_to_customer.items()}

aid_cid_df = transactions[['customer_id','article_id']]
print(len(aid_to_article), len(cid_to_customer))

In [None]:
# tmp = transactions.groupby(['customer_id', 'article_id']).count().reset_index()[['customer_id', 'article_id', 'price']]
# # tmp = tmp.pivot(index='customer_id', columns='article_id', values='price')
# tmp

In [None]:
# Chronological 80/20 split: the oldest 80% of rows stay in `transactions`,
# the newest 20% become `transactions_val`.
# NOTE: this cell overwrites `transactions`, so running it twice shrinks the training set again.
transactions = transactions.sort_values(by=['t_dat']).reset_index(drop=True)
mindate = min(transactions['t_dat'])
transactions['timespan'] = transactions['t_dat'].apply(lambda day: (day - mindate).days)

_split_at = int(0.8 * len(transactions))
transactions_val = transactions[_split_at:].reset_index(drop=True)
transactions = transactions[:_split_at]
print(len(transactions), len(transactions_val))
print(len(customers), len(articles), len(articles['article_id'].unique()))
transactions.head()

In [None]:
list(aid2emb.keys())[0]

In [None]:
class BatchIterator:
    """Single-pass iterator yielding consecutive ``(x, y)`` slices of ``batch_size`` rows.

    The last batch may be shorter when ``len(y)`` is not a multiple of
    ``batch_size``. ``x`` and ``y`` only need to support ``len`` (``y``) and slicing.
    """

    def __init__(self, x, y, batch_size):
        if batch_size <= 0:
            # A non-positive step would never advance through the data.
            raise ValueError('batch_size must be a positive integer')
        self.batch_size = batch_size
        self.i = 0  # index of the next batch to return
        self.x = x
        self.y = y

    def __iter__(self):
        return self

    def __next__(self):
        start = self.i * self.batch_size
        # ">=" (not "==") so iteration also ends when len(y) is not a multiple of batch_size.
        if start >= len(self.y):
            raise StopIteration()
        stop = start + self.batch_size
        mini_x = self.x[start:stop]
        mini_y = self.y[start:stop]
        self.i += 1
        return mini_x, mini_y
class CandidateBatchIterator(BatchIterator):
    """Batch iterator that tokenizes the purchase texts once, up front.

    Every text in ``x2`` is encoded with the module-level ``tokenizer`` (padded and
    truncated to ``max_len``). Each step yields
    ``(x1 batch, y batch, input_ids, token_type_ids, attention_mask)``.
    """
    # personal, purchase, candidate_train_label, batch_size, max_len
    #    x1       x2              y
    def __init__(self, x1, x2, y, batch_size, max_len):
        encodings = [
            tokenizer.encode_plus(text=text, max_length=max_len, padding='max_length', truncation=True)
            for text in x2
        ]
        self.input_ids = torch.tensor([enc['input_ids'] for enc in encodings])
        self.token_type_ids = torch.tensor([enc['token_type_ids'] for enc in encodings])
        self.attention_mask = torch.tensor([enc['attention_mask'] for enc in encodings])

        self.batch_size = batch_size
        self.i = 0
        self.x1 = x1
        self.y = y
        self.max_len = max_len

    def __next__(self):
        start = self.i * self.batch_size
        if start >= len(self.y):
            raise StopIteration()
        window = slice(start, start + self.batch_size)
        batch = (
            self.x1[window],
            self.y[window],
            self.input_ids[window],
            self.token_type_ids[window],
            self.attention_mask[window],
        )
        self.i += 1
        return batch


In [None]:
def train_candidate_generation(model, get_batch_iter, item, batch_size, epochs=5, lr=0.00001, log_every=100):
    """Train the candidate-generation ``model`` with Adam and binary cross-entropy.

    Args:
        model: module called as ``model(input_ids, token_type_ids, attention_mask,
            personal, item)``; its output is compared with the batch label by ``nn.BCELoss``.
        get_batch_iter: zero-argument callable returning a fresh iterable of
            ``(personal, label, input_ids, token_type_ids, attention_mask)`` tuples;
            it is called once per epoch.
        item: item matrix forwarded unchanged to the model on every step.
        batch_size: unused, kept for call compatibility.
        epochs: number of passes over the data.
        lr: Adam learning rate.
        log_every: print the mean batch loss every this many iterations.

    Returns:
        The trained ``model`` (same object that was passed in).
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criteon = nn.BCELoss()
    model.train()
    for epoch in range(epochs):
        print('epoch', epoch)
        running_loss = 0.0
        for iter_, (mini_personal, mini_label, min_input_ids, min_token_type_ids, min_attention_mask)\
          in enumerate(get_batch_iter()):
            out = model(min_input_ids, min_token_type_ids, min_attention_mask, mini_personal, item)
            loss = criteon(out, mini_label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            if (iter_ + 1) % log_every == 0:
                # Average over exactly the iterations accumulated since the last report.
                print(f'epoch: {epoch + 1}, iter: {iter_ + 1}, loss: {running_loss / log_every}')
                running_loss = 0.0
        print('epoch ends')
    return model

In [None]:
bert_path = 'bert-base-cased'
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig, BertModel
class CandidateGeneration(nn.Module):
    """Candidate-generation network combining a BERT text encoding with personal features.

    The pooled BERT output of the purchase text is projected to ``embed_item_size``
    and stacked with the per-feature projections of the personal attributes. The
    stack goes through a small MLP, is averaged, and is scored against every row of
    ``item_src`` with a dot product followed by a sigmoid.
    """

    def __init__(self, embed_item_size, hidden_size):
        super(CandidateGeneration, self).__init__()
        # Lifts each scalar personal feature to an embed_item_size vector.
        self.personal_fc = nn.Linear(1, embed_item_size)
        self.fc1 = nn.Linear(embed_item_size, hidden_size)  # noqa: E226
        self.fc2 = nn.Linear(hidden_size, embed_item_size)
        self.fc3 = nn.Linear(embed_item_size, embed_item_size)
        self.dropout = nn.Dropout(0.5)

        # Pre-trained encoder named by the module-level bert_path.
        self.config = BertConfig.from_pretrained(bert_path)
        self.bert = BertModel.from_pretrained(bert_path)
        for param in self.bert.parameters():
            param.requires_grad = True  # BERT is fine-tuned together with the head
        self.fc = nn.Linear(self.config.hidden_size, embed_item_size)

    def forward(self, input_ids, token_type_ids, attention_mask, personal_src, item_src):
        """Return purchase probabilities of shape (batch_size, n_item)."""
        personal_h = self.personal_fc(personal_src)  # (batch_size, n_personal, embed_item_size)

        # Keyword arguments are required here: BertModel.forward takes attention_mask
        # before token_type_ids, so passing them positionally in this method's
        # parameter order would swap the two tensors.
        output = self.bert(input_ids=input_ids,
                           attention_mask=attention_mask,
                           token_type_ids=token_type_ids)
        context_src = self.fc(output[1])  # output[1] is the pooled output
        context_src = context_src.reshape((personal_h.shape[0], 1, -1))
        h = torch.cat((context_src, personal_h), 1)  # (batch_size, 1+n_personal, embed_item_size)

        h = F.relu(self.fc1(h))
        h = self.dropout(h)
        h = F.relu(self.fc2(h))
        h = self.dropout(h)
        personal_context = F.relu(self.fc3(h))  # (batch_size, 1+n_personal, embed_item_size)
        # Average over the stacked rows, then take the inner product with every item row:
        # (batch_size, embed_item_size) x (embed_item_size, n_item) -> (batch_size, n_item)
        out = torch.matmul(personal_context.mean(axis=1), item_src.t())
        return torch.sigmoid(out)
# cmodel = CandidateGeneration(embed_item_size, candidate_hidden_size)
# model = train_candidate_generation(cmodel, cbatch_iter, item, batch_size)

In [None]:
# n_item = article_emb.shape[0]  # articles used in training
# n_user = len(transactions['customer_id'].unique())
n_item, n_user = len(aid_to_article), len(cid_to_customer)
batch_size = 4  # split n_user using batch_size
LEARNING_RATE = 0.0005
## Generate article emb from prod_name,  detail_desc, ...

# emb of articles. shape is (n_item, embed_item_size).
# Row j must describe article aid_to_article[j], because label columns and predicted
# indices are expressed in that numbering. `article_emb` follows the row order of
# `articles` (sorted by article_id), so the rows are re-gathered through aid2emb.
item = torch.tensor(np.stack([aid2emb[aid_to_article[idx]] for idx in range(n_item)])).float()
item = F.normalize(item)
embed_item_size = item.shape[1]  # emb dim
candidate_hidden_size = 64
candidate_size = 12  # number of recommend articles

def get_dataset(customers, transactions):
    """Build model inputs and labels for the given customers and transactions.

    Relies on the module-level ``aid2text``, ``article_to_aid``, ``customer_to_cid``,
    ``n_user`` and ``n_item``.

    Returns:
        personal: float tensor of the columns age/FN/Active/club_member_status,
            reshaped to (n_customers, n_features, 1), NaN replaced by 0, then
            passed through ``F.normalize``.
        purchase: list with one string per customer_id group — the concatenated
            ``aid2text`` descriptions of the articles in that group ('' if none).
        candidate_train_label: (n_user, n_item) float tensor holding 1 where the
            customer bought the article.
        purchase_true: DataFrame with 'customer_id' and the list of 'article_id's.
        max_len: token length used for the purchase text (fixed at 512).
    """
    personal = np.array(customers[['age','FN','Active','club_member_status']])  # (n_customers, n_features)

    # Left join keeps customers without any transaction (their article_id is NaN).
    transactions = customers[['customer_id']].merge(transactions, on='customer_id', how='left')

    purchase_true = transactions.groupby('customer_id')['article_id'].apply(list).reset_index()

    # One text per customer; ids without a description (including NaN) are skipped.
    purchase = [
        ''.join(aid2text[aid] for aid in bought if aid in aid2text)
        for bought in purchase_true['article_id']
    ]
    max_len = 512

    # Multi-hot purchase labels.
    candidate_train_label = torch.zeros((n_user, n_item))
    for customer, article in zip(transactions['customer_id'], transactions['article_id']):
        aid = article_to_aid.get(article)
        if aid is None:  # unknown article or NaN from the left join
            continue
        candidate_train_label[customer_to_cid[customer], aid] = 1
    print(candidate_train_label.shape, torch.sum(candidate_train_label))  # (n_customers, n_item)

    personal = torch.tensor(personal.reshape(personal.shape[0], personal.shape[1], -1)).float()
    personal = torch.nan_to_num(personal)
    personal = F.normalize(personal)
    return personal, purchase, candidate_train_label, purchase_true, max_len
    
# NOTE: this rebinds the notebook-level `max_len` to the purchase-text token length.
personal, purchase, candidate_train_label, purchase_true, max_len = get_dataset(customers, transactions)
personal_val, purchase_val, candidate_val_label, purchase_true_val, max_len_val = get_dataset(customers, transactions_val)
# (n_customers, n_features)

# Factories: each call returns a fresh single-pass iterator (texts are tokenized on creation).
# NOTE(review): the validation text is built from the validation purchases themselves —
# confirm this is intended and not label leakage.
cbatch_iter = lambda: CandidateBatchIterator(personal, purchase, candidate_train_label, batch_size, max_len)  # noqa: E731
cbatch_iter_val = lambda: CandidateBatchIterator(personal_val, purchase_val, candidate_val_label, batch_size, max_len_val)  # noqa: E731
print('personal',personal.shape, 'purchase',len(purchase), 'candidate_train_label',candidate_train_label.shape, batch_size)
print('personal_val',personal_val.shape, 'purchase_val',len(purchase_val), 'candidate_val_label',candidate_val_label.shape, batch_size)
# print(personal.dtype, purchase.dtype, candidate_train_label.dtype, batch_size)
print('item',item.dtype, item.shape) # (n_item, embed_item_size).[xxx, 63]

In [None]:
# # model
import warnings
warnings.filterwarnings('ignore')
# CandidateBatchIterator reads the module-level `tokenizer`, so it has to exist
# before cbatch_iter() is called inside train_candidate_generation.
tokenizer = BertTokenizer.from_pretrained(bert_path)
cmodel = CandidateGeneration(embed_item_size, candidate_hidden_size)
# train_candidate_generation returns the model it was given, so `model` and `cmodel` are the same object.
model = train_candidate_generation(cmodel, cbatch_iter, item, batch_size)

In [None]:
purchase_true_val.head()

In [None]:
def evaluate_candidate_generation(model, get_batch_iter, item, batch_size, purchase_true_val, k=12):
    """Predict the top-``k`` articles per customer and attach them to the truth frame.

    Args:
        model: module called as ``model(input_ids, token_type_ids, attention_mask,
            personal, item)`` returning a (batch, n_item) score tensor.
        get_batch_iter: zero-argument callable returning an iterable of
            ``(personal, label, input_ids, token_type_ids, attention_mask)`` tuples.
        item: item matrix forwarded unchanged to the model.
        batch_size: unused, kept for call compatibility.
        purchase_true_val: DataFrame with an 'article_id' column of id lists, one
            row per predicted customer in iteration order. It is modified in place.
        k: number of articles to recommend (capped at the number of scored items).

    Returns:
        ``purchase_true_val`` with two added string columns: 'valid_true' (the bought
        ids) and 'prediction' (the recommended ids, best first), both space-separated.
        Score column ``j`` is translated through the module-level ``aid_to_article``.
    """
    def get_sring_truth(x):
        # Customers without purchases carry a single NaN after the left join.
        if str(x[0]) == 'nan': return ''
        return ' '.join(str(int(article)) for article in x)

    model.eval()
    prediction = []
    with torch.no_grad():  # inference only: no autograd graph needed
        for iter_, (mini_personal, mini_label, min_input_ids, min_token_type_ids, min_attention_mask)\
              in enumerate(get_batch_iter()):
            out = model(min_input_ids, min_token_type_ids, min_attention_mask, mini_personal, item)
            # topk returns indices sorted by descending score, which MAP@k needs;
            # the cap avoids an error when fewer than k items exist.
            top = min(k, out.shape[1])
            ranked = torch.topk(out, top, dim=1).indices.tolist()
            for row in ranked:
                prediction.append(' '.join(str(aid_to_article[idx]) for idx in row))
            if (iter_ + 1) % 1000 == 0:
                print(f'iter: {iter_ + 1}, predictions: {len(prediction)}')
    print('len(prediction)',len(prediction))

    res = purchase_true_val
    res['valid_true'] = res['article_id'].apply(get_sring_truth)
    res['prediction'] = prediction
    return res
# Run the trained model over the validation iterator; val_res is purchase_true_val with the added columns.
val_res = evaluate_candidate_generation(model, cbatch_iter_val, item, batch_size, purchase_true_val)

In [None]:
# MAP@12 over the validation customers (rows with an empty 'valid_true' are ignored by mapk).
calculate_mapk(val_res)

In [None]:
# torch.save(model.state_dict(), os.path.join('_dir,name'))

In [None]:
# Inspect the first 100 rows of the sample submission: the printed types and the
# split() call show how the 'customer_id' and 'prediction' columns are stored.
sample_submission = pd.read_csv(os.path.join(data_dir,'sample_submission.csv'), nrows = 100)
# sample_submission.head()
print(type(sample_submission['customer_id'][0]), sample_submission['customer_id'][0])
print(type(sample_submission['prediction'][0]), sample_submission['prediction'][0])
print(sample_submission['prediction'][0].split())
sample_submission.head()

In [None]:

#     candidate_train_label = np.array(transactions.groupby(['customer_id', 'article_id']).count().reset_index()\
#         .pivot(index='customer_id', columns='article_id', values='price').fillna(0))

# n_item = article_emb.shape[0]  # articles used in training
# n_user = len(transactions['customer_id'].unique())
# batch_size = 4  # split n_user using batch_size
# ## Generate article emb from prod_name, product_type_name, product_group_name, 
# # graphical_appearance_name, colour_group_name, perceived_colour_value_name, 
# # perceived_colour_master_name, department_name, index_name, garment_group_name,
# # detail_desc
# embed_item_size = 64
# # item = torch.tensor(get_item_vector(n_item))  # emb of articles. shape is (n_item, embed_item_size).
# item = torch.rand((n_item, embed_item_size))
# # print('emb shape (n_item, embed_item_size):', article_emb.shape)
# candidate_hidden_size = 64  
# candidate_size = 12  # number of recommend articles

# ages = torch.randint(0, 100, (n_user, 1, 1), dtype=torch.float)  # (n_user, 1, 1)
# gender = torch.randint(0, 2, (n_user, 1, 1), dtype=torch.float)  # (n_user, 1, 1)
# personal = torch.cat((ages, gender), 1)  # (n_user, n_personal)  # [[age, sex], [age, sex], ...]
# watches = torch.randn(n_user, 1, embed_item_size)  # Assumed to be the average of the feature vectors of all watched videos, i.e. equivalent to performing the 3 lines below.

# avg_fill = np.array([np.array(i) for i in aid2emb.values()]).mean(0)
# purchase = np.array(transactions.groupby('customer_id')['article_id'].apply(list).reset_index()['article_id'])
# # print(len(purchase)) # 104 customers
# purchase = np.array([np.mean([aid2emb[i[j]] if i[j] in aid2emb else avg_fill for j in range(len(i))], axis=0) for i in purchase])
# # print(purchase.shape) # (104, 64)

# candidate_train_label = torch.randint(0, 10, (n_user, n_item), dtype=torch.float)

# cbatch_iter = lambda: CandidateBatchIterator(personal, watches, candidate_train_label, batch_size)  # noqa: E731
# print(personal.shape, watches.shape, candidate_train_label.shape, batch_size)
# # model
# cmodel = CandidateGeneration(embed_item_size, candidate_hidden_size)
# train_candidate_generation(cmodel, cbatch_iter, item, batch_size)

# item.shape

### ranking

In [None]:
# ranking_hidden_size = 248
# purchase_price_feature_size = 1
# candidate_size = 12
# ranking_train_label = torch.tensor(candidate_train_label.reshape(-1)).float()

# print('ranking_train_label',ranking_train_label.shape)

# purchase_price_vector = torch.tensor(
#     np.array(transactions.groupby(['customer_id', 'article_id']).mean().reset_index()\
#         .pivot(index='customer_id', columns='article_id', values='price').fillna(0)).reshape(-1,1)).float()
# print('purchase_price_vector',purchase_price_vector.shape)

# rbatch_iter = lambda: BatchIterator(purchase_price_vector, ranking_train_label, batch_size)  # noqa: E731
# # model
# rmodel = Ranking(purchase_price_feature_size, ranking_hidden_size, candidate_size)
# # train_ranking(rmodel, rbatch_iter, batch_size)

### Prev

In [None]:
# watch_time_feature_size = 124
# ranking_hidden_size = 248
# candidate_size = 10
# # data
# # Assumed to be the average of the feature vectors of the articles viewed among the candidate articles.
# watch_video_vector = torch.randint(0, 10, (n_user * n_item, 1, embed_item_size))  
# # print('n_user', n_user, 'n_item',n_item, watch_video_vector.shape)
# # # Feature vector of target article to predict browsing time
# target_video_vector = torch.randint(0, 10, (n_user * n_item, 1, embed_item_size))  
# video_vector = torch.cat((watch_video_vector, target_video_vector), 1)  # (n_user*n_item, 2, embed_item_size)
# real_impression_matrix = torch.randint(3, 9, (n_user, n_item), dtype=torch.float)  # (n_user, n_item)
# real_watch_time_matrix = torch.empty(n_user, n_item).uniform_(0, 10)  # (n_user, n_item)
# # print('aaa',F.softmax(real_watch_time_matrix / real_impression_matrix, dim=-1).shape)
# ranking_train_label = F.softmax(real_watch_time_matrix / real_impression_matrix, dim=-1).reshape(-1)  # (n_user*n_item)
# print('ranking_train_label',ranking_train_label.shape)

# watch_time_vector = torch.rand((n_user*n_item, watch_time_feature_size))
# print('watch_time_vector',watch_time_vector.shape)

# rbatch_iter = lambda: BatchIterator(watch_time_vector, ranking_train_label, batch_size)  # noqa: E731
# # model
# rmodel = Ranking(watch_time_feature_size, ranking_hidden_size, candidate_size)
# # train_ranking(rmodel, rbatch_iter, batch_size)