In [None]:
import time
import lightgbm as lgb
import Mmetrics
import os
import numpy as np
from read_data import read_pkl,lambdarank,evaluate_train,evaluate

data_path = '_data/MSLR-WEB30K/Fold1/'
dataset = read_pkl(data_path + 'binarized.pkl.npz')


def _bucketize(column, thresholds):
    """Label each entry of `column` with how many ascending thresholds it exceeds.

    The result has the same shape and dtype as `column` (np.zeros_like);
    entries not exceeding any threshold stay 0.
    """
    labels = np.zeros_like(column)
    for label, threshold in enumerate(thresholds, start=1):
        labels[column > threshold] = float(label)
    return labels


# Per-document group ids for each split, derived from raw feature columns.
# The column indices (132, 129) and thresholds are the notebook's own choices;
# the dict keys name the features the author associates with those columns.
groups = {'tr': {}, 'te': {}, 'va': {}}
for group_name, column_idx, thresholds in (
    ('QualityScore2', 132, (10,)),
    ('PageRank', 129, (1000, 10000)),
):
    for split in ('tr', 'te', 'va'):
        features = getattr(dataset, split + 'fm')
        groups[split][group_name] = _bucketize(features[:, column_idx], thresholds)

In [None]:
# Rebind `dataset` to the normalized variant of the same fold. `groups` was
# already computed from the un-normalized feature columns in the previous cell,
# so this cell must run after that one (the thresholds there are compared
# against the values in 'binarized.pkl.npz', not the ones loaded here).
dataset = read_pkl(data_path + 'binarized.normalized.pkl.npz')

In [None]:
from Mmetrics import *

import LTR
import datautil
import torch

# Two hidden layers of 256 units on top of the training feature width.
# NOTE(review): no random seed is set before fitting, so results will
# presumably differ between runs -- confirm whether LTR seeds internally.
input_dim = dataset.trfm.shape[1]
ltrmodel = LTR.MSE_model(
    layers=[input_dim, 256, 256, 1],
    lr=0.001,
    optimizer=torch.optim.Adam,
    dropout=0.1,
)
ltrmodel.fit(dataset, epochs=10, batch_size=100, verbose=True)

# Score the test split; `y_pred` is reused by the cells that build the frames.
y_pred = ltrmodel.predict(dataset.tefm, dataset.tedlr)

# Per-query document counts from the cumulative offsets in `tedlr`.
test_query_sizes = np.diff(dataset.tedlr)
# LTRMetrics is presumably provided by the star import from Mmetrics.
LTRMetrics(dataset.telv, test_query_sizes, y_pred).NDCG(10)

In [None]:
import pandas as pd
import pickle

# One query id per test document: query i is repeated once for each of its
# documents, using the cumulative offsets in `tedlr`.
test_qids = np.repeat(np.arange(dataset.tedlr.shape[0]-1), np.diff(dataset.tedlr))

# Write one pickled frame per grouping. After the loop `df` is the PageRank
# frame, because it is the last one built.
for group_name, out_path in (('QualityScore2', 'MSLR_qs.df'), ('PageRank', 'MSLR_pr.df')):
    df = pd.DataFrame({
        'lv': dataset.telv,
        'g': groups['te'][group_name],
        'qid': test_qids,
        'y_pred': y_pred,
    })
    with open(out_path, 'wb') as f:
        pickle.dump(df, f)

In [None]:
def subsample(ds, percent):
    """Draw a random subset of queries from `ds` and return it as a dict.

    Parameters
    ----------
    ds : object with array attributes `lv`, `y_pred`, `g`, `qid` (one entry
        per document) and `dlr` (cumulative per-query document offsets, so
        query i spans ``dlr[i]:dlr[i+1]``).
    percent : float
        Fraction of the queries to draw; the sample size is
        ``int(n_queries * percent)``.

    Returns
    -------
    dict with keys 'lv', 'y_pred', 'qid', 'g' (concatenated per-document
    arrays of the drawn queries, in draw order) and 'dlr' (offsets rebuilt
    for the sampled data, starting at 0).

    Notes
    -----
    Queries are drawn with ``np.random.choice`` using its default
    ``replace=True``, so the same query can appear more than once. The global
    NumPy RNG is used; seed it beforehand for reproducible samples.

    If the sample size rounds down to zero, empty arrays (with the dtypes of
    the inputs) and ``dlr == [0]`` are returned instead of raising from
    ``np.concatenate`` on an empty list.
    """
    fields = ('lv', 'y_pred', 'qid', 'g')
    n_queries = ds.dlr.shape[0] - 1
    subsample_size = int(n_queries * percent)

    if subsample_size == 0:
        # Nothing to draw: skip the RNG call and return an empty dataset.
        sds = {k: getattr(ds, k)[:0] for k in fields}
        sds['dlr'] = np.array([0])
        return sds

    qids = np.random.choice(np.arange(n_queries), subsample_size)

    chunks = {k: [] for k in fields}
    dlr = [0]
    for qid in qids:
        s, e = ds.dlr[qid:qid+2]
        for k in fields:
            chunks[k].append(getattr(ds, k)[s:e])
        dlr.append(dlr[-1] + e - s)

    sds = {k: np.concatenate(chunks[k]) for k in fields}
    sds['dlr'] = np.array(dlr)
    return sds


def df2ds(df_path):
    """Load a pickled DataFrame and expose its columns as attributes of a class.

    Parameters
    ----------
    df_path : str
        Path of a pickle file containing a pandas DataFrame with at least a
        'qid' column. Rows of the same query are expected to be contiguous.

    Returns
    -------
    A class named 'ltr' whose attributes are the DataFrame columns converted
    to NumPy arrays, plus `dlr`: integer offsets such that query i occupies
    rows ``dlr[i]:dlr[i+1]``.

    Notes
    -----
    A query boundary is placed wherever 'qid' changes between consecutive
    rows. Query ids therefore do not need to be consecutive integers; gaps
    (or any other change of value) still start a new query. An empty frame
    yields ``dlr == [0]`` (zero queries).
    """
    # NOTE(security): pickle.load can execute arbitrary code; only use this
    # on files produced by this notebook or another trusted source.
    with open(df_path, 'rb') as f:
        df = pickle.load(f)
    ds = {k: np.array(v) for k, v in df.to_dict(orient='list').items()}

    qid = ds['qid']
    n_rows = qid.shape[0]
    # Row indices at which a new query starts (excluding row 0).
    starts = np.where(np.diff(qid) != 0)[0] + 1
    # Close the last query only if there is at least one row.
    end = np.array([n_rows]) if n_rows else np.array([], dtype=int)
    ds['dlr'] = np.concatenate([np.zeros(1), starts, end]).astype(int)
    return type('ltr', (object,), ds)


def dict2ds(df_path):
    """Load a pickled dict and return a class named 'ltr' with its items as attributes.

    `df_path` is the path of a pickle file whose payload is a dict mapping
    attribute names to values (e.g. the dicts written after purify/subsample).
    """
    with open(df_path, 'rb') as handle:
        fields = pickle.load(handle)
    # Build a throwaway class so callers can use attribute access (ds.lv, ds.dlr, ...).
    holder = type('ltr', (object,), fields)
    return holder

In [None]:
def purify(ds):
    """Filter queries and trim long result lists, returning a dict dataset.

    `ds` exposes per-document arrays `lv` (labels), `g` (group ids), `qid`,
    `y_pred` (scores) and `dlr` (cumulative per-query offsets).

    A query is dropped when
      * none of its documents has label 4, or
      * for any group id present anywhere in `ds.g`, the labels of that
        group's documents within the query sum to less than 5.

    For a kept query with more than 20 documents only these are retained, in
    this order: the 20 highest-scored documents, then the highest-scored
    label-4 document, then for each group its highest-scored document with
    label > 0 (each only if not already selected). Queries with at most 20
    documents are kept unchanged.

    Returns a dict with keys 'lv', 'g', 'qid', 'y_pred' and 'dlr' (offsets
    rebuilt for the retained documents).
    """
    kept = {'lv': [], 'g': [], 'qid': [], 'y_pred': []}
    offsets = [0]
    group_ids = np.unique(ds.g)

    for query in range(ds.dlr.shape[0] - 1):
        start, stop = ds.dlr[query:query+2]
        labels = ds.lv[start:stop]
        members = ds.g[start:stop]

        # Require at least one top-label document in the query.
        if not np.any(labels == 4):
            continue
        # Require every group to carry a label sum of at least 5.
        if any(labels[members == gid].sum() < 5 for gid in group_ids):
            continue

        n_docs = stop - start
        if n_docs > 20:
            # Document positions ordered by descending predicted score.
            order = ds.y_pred[start:stop].argsort()[::-1]
            ranked_labels = labels[order]
            ranked_members = members[order]

            picked = list(order[:20])

            # Always keep the best-ranked label-4 document.
            best_top = order[np.where(ranked_labels == 4)[0][0]]
            if best_top not in picked:
                picked.append(best_top)

            # Always keep each group's best-ranked relevant document.
            for gid in group_ids:
                hits = np.where((ranked_members == gid) & (ranked_labels > 0))[0]
                if len(hits) > 0 and order[hits[0]] not in picked:
                    picked.append(order[hits[0]])
            picked = np.array(picked)
        else:
            picked = np.arange(n_docs)

        kept['lv'].append(labels[picked])
        kept['g'].append(members[picked])
        kept['qid'].append(ds.qid[start:stop][picked])
        kept['y_pred'].append(ds.y_pred[start:stop][picked])
        offsets.append(offsets[-1] + picked.shape[0])

    purified = {name: np.concatenate(parts) for name, parts in kept.items()}
    purified['dlr'] = np.array(offsets)
    return purified


# QualityScore2 pipeline, step 1: load the pickled test frame written earlier
# to 'MSLR_qs.df', filter/trim it with purify(), and persist the resulting
# dict to 'p_MSLR_qs.df' for the subsampling cell.
ds = df2ds('MSLR_qs.df')
purified = purify(ds)
            
with open('p_MSLR_qs.df', 'wb') as f:
    pickle.dump(purified, f)
            

In [None]:
# QualityScore2 pipeline, step 2: reload the purified dict, draw
# int(0.05 * n_queries) queries with subsample() (np.random.choice, global
# NumPy RNG, no seed set in this notebook), and persist the sample.
ds = dict2ds('p_MSLR_qs.df')
sds = subsample(ds,0.05)
# Shape of the offsets array, i.e. (number of sampled queries + 1,).
print(sds['dlr'].shape)
with open('s_MSLR_qs.df', 'wb') as f:
    pickle.dump(sds, f)

In [None]:
ds = dict2ds('p_MSLR_pr.df')
sds = subsample(ds,0.05)
print(sds['dlr'].shape)
with open('s_MSLR_pr.df', 'wb') as f:
    pickle.dump(sds, f)

In [None]:

ds = df2ds('MSLR_pr.df')
purified = purify(ds)
            
with open('p_MSLR_pr.df', 'wb') as f:
    pickle.dump(purified, f)
    
print(purified['dlr'].shape,purified['dlr'][-10:])

In [None]:
# Rebuild the QualityScore2 frame explicitly before writing it. When the
# notebook is run top to bottom, `df` is the PageRank frame at this point (it
# was last assigned when 'MSLR_pr.df' was written), so dumping `df` here would
# overwrite 'MSLR_qs.df' with PageRank group ids.
df_qs = pd.DataFrame({
    'lv': dataset.telv,
    'g': groups['te']['QualityScore2'],
    'qid': np.repeat(np.arange(dataset.tedlr.shape[0]-1), np.diff(dataset.tedlr)),
    'y_pred': y_pred,
})
with open('MSLR_qs.df', 'wb') as f:
    pickle.dump(df_qs, f)