In [1]:
import pandas as pd
import pickle
import numpy as np
import time
import torch
from tqdm import tqdm

import matplotlib.pyplot as plt
from matplotlib import font_manager
%matplotlib inline

In [2]:
data_path = 'mathorcup_recom_listwise/data/'

In [3]:
with open(data_path + 'userID2idx.pickle', 'rb') as handle:
    userID2idx = pickle.load(handle)
with open(data_path + 'contentTC2ID.pickle', 'rb') as handle:
    contentTC2ID = pickle.load(handle)
with open(data_path + 'contentID2idx.pickle', 'rb') as handle:
    contentID2idx = pickle.load(handle)
with open(data_path + 'idx2key.pickle', 'rb') as handle:
    idx2key = pickle.load(handle)
with open(data_path + 'contentID2idx_1on1.pickle', 'rb') as handle:
    contentID2idx_1on1 = pickle.load(handle)
with open(data_path + 'contentIdx2ID_1on1.pickle', 'rb') as handle:
    contentIdx2ID_1on1 = pickle.load(handle)
contentScores = torch.tensor(np.load(data_path + 'contentScores.npy'))

In [4]:
def time_distance(src, tgt, denominator=12+1e-6):
    """
    :return: range in [0, 12 / denominator]
    """
    a, b = (src, tgt) if src < tgt else (tgt, src)  # a < b
    return min(b-a, a + 24 - b) / denominator

In [5]:
def MinMaxScaler(data, default=0.1):
    m, M = min(data), max(data)
    if m != M:
        return [(_ - m) / (M - m) for _ in data]
    else:
        return [default for _ in data]

In [6]:
def do(userID):
    idx = userID2idx[userID]
    aaa, bb, cc = idx[0:3], idx[3:5], idx[5:7]
    df = pd.read_csv('data_splitted_by_user_id/{}/{}/{}.csv'.format(aaa, bb, cc), dtype=str)

    clickeds = np.zeros(len(contentTC2ID))
    durations = np.zeros(len(contentTC2ID))
    recommeds = np.zeros(len(contentTC2ID))
    have_recommed_IDs = []
    
    for index, row in df.iterrows():
        multiplier = 1 - time_distance(int(row.time), 20)
        for icd in row.sequence.split(';'):
            i, c, d = icd.split(':')
            have_recommed_IDs.append(i)
            contentIdx = contentID2idx[i]
            if eval(c):
                clickeds[contentIdx] += multiplier
                durations[contentIdx] += multiplier * eval(d)
            recommeds[contentIdx] += multiplier
    have_recommed_IDs = set(have_recommed_IDs)
    #print(have_recommed_IDs)
    avg_clicked_rate = np.divide(clickeds, recommeds, out=np.zeros_like(clickeds), where=recommeds!=0)
    avg_clicked_duration = np.divide(durations, clickeds, out=np.zeros_like(durations), where=clickeds!=0)
    xticklabels = [idx2key[_] for _ in np.arange(len(contentTC2ID))[np.where(avg_clicked_rate != 0)]]
    #print(avg_clicked_rate[avg_clicked_rate > 0], avg_clicked_duration[avg_clicked_duration > 0], xticklabels)
    score1 = MinMaxScaler(avg_clicked_rate[np.where(avg_clicked_rate != 0)])
    score2 = MinMaxScaler(avg_clicked_duration[np.where(avg_clicked_rate != 0)])
    score = torch.tensor([x + y for x, y in zip(score1, score2)])
    values, indices = torch.topk(score, k=min(5, len(score)))
    
    contentTCs = [xticklabels[_] for _ in indices]
    counts = torch.ceil(values * 10 / torch.sum(values)).type(torch.int).tolist()
    k = 0
    terminal_k = 10
    result = []
    while True:
        for contentTC, count in zip(contentTCs, counts):
            # do not recommend same content
            target_content_IDs = set(contentTC2ID[contentTC])
            #print(contentTC, 'target counts:', len(target_content_IDs), '→ ', end='')
            target_content_IDs = target_content_IDs - have_recommed_IDs
            #print(len(target_content_IDs))

            # map to index (range in 2510703)
            target_content_Idxes = [contentID2idx_1on1[_] for _ in target_content_IDs]

            # scores
            target_scores = contentScores[target_content_Idxes]
            top_values, top_indices = torch.topk(target_scores, min(count, len(target_scores)))
            for i, value in enumerate(top_values):
                indice = top_indices[i]
                target_ID = contentIdx2ID_1on1[target_content_Idxes[indice]]
                result.append(target_ID)
                have_recommed_IDs.add(target_ID)
                k += 1
                if k == terminal_k:
                    break
            else:
                continue
            break
        else:
            continue
        return result

In [7]:
recomm_result_df = pd.read_csv(data_path + 'recomm_result.csv', dtype=str, header=None, names=['userID', 'contentID'])

In [8]:
candidateIDs = recomm_result_df['userID']
candidateIDs

0       2263674210
1       2407104270
2       1850876254
3       2208092618
4       1642551254
           ...    
4995    1167815814
4996    2452835534
4997    1517277558
4998    2223184164
4999    1423562960
Name: userID, Length: 5000, dtype: object

In [9]:
userID2idx['1424832354']

'0699427'

In [10]:
print(candidateIDs[2922])
sequence = do(candidateIDs[2922])
';'.join(sequence)

1424832354


'133685078880;133675383025;133678877302;133677361798;133670767481;133672860139;133682790306;133672736156;133645134415;133463032961'

In [16]:
result = []
for i, id in tqdm(enumerate(candidateIDs)):
    result.append(';'.join(do(id)))
    if i == 5:
        break
print(result)

5it [00:00, 15.48it/s]

['506452865;507146538;507040429;506448411;507802432;505966037;509131249;507457950;506466776;507667879', '133675654658;133657843524;133682981897;133682817032;133677387027;133684483942;133674211918;133673025741;133671667695;133682133146', '133675493354;133682019937;133684077954;507760297;508112085;509016369;507245921;506841346;506585559;507903233', '133673251676;133666957709;133678083250;133668617622;133678922321;133675348328;509152890;506929325;133682016014;133684645963', '133686242032;133663752853;133663752851;133680924496;133682743655;133684776889;133677424853;133677424856;133672814290;133665825681', '133671447633;133682877386;133682743655;133664200927;133679383008;133679017560;133667961560;133655999000;133666860087;508363359']





In [12]:
result_df = pd.DataFrame(
    {'candidateIDs': candidateIDs, 'sequence': result}
)
result_df

NameError: name 'result' is not defined

In [57]:
result_df.to_csv(data_path + 'recomm_result_df' + '.csv', index=False)