Load data

In [None]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import pickle
from collections import defaultdict

from torch.utils.data import Dataset, DataLoader
from Mmetrics import *

import LTR
import datautil
import permutationgraph
import DTR
import EEL
import PPG
import PL

# Load the two datasets through the project's `datautil` helper. Each call
# returns a pair; only the first element is kept (presumably the TREC 2020 /
# 2019 collections referred to by the section headers below — confirm in datautil).
ds2020, _ = datautil.load_data(2020, verbose=True)
ds2019, _ = datautil.load_data(2019, verbose=True)

Fit an LTR model (MSE)

Set `y_pred2020` and `y_pred2019` using the trained models.

In [None]:
# Hyper-parameters shared by both MSE rankers; only the input width differs,
# and it is read from each dataset's training feature matrix.
mse_kwargs = dict(lr=0.001, optimizer=torch.optim.Adam, dropout=0.1)

# 2020: train, then score the `tefm` features (`tedlr` is passed through to predict).
ltrmodel = LTR.MSE_model(layers=[ds2020.trfm.shape[1], 256, 256, 1], **mse_kwargs)
ltrmodel.fit(ds2020, epochs=10, batch_size=100, verbose=True)
y_pred2020 = ltrmodel.predict(ds2020.tefm, ds2020.tedlr)

# 2019: same recipe. `ltrmodel` is rebound, so after this cell it refers to
# the 2019 model only.
ltrmodel = LTR.MSE_model(layers=[ds2019.trfm.shape[1], 256, 256, 1], **mse_kwargs)
ltrmodel.fit(ds2019, epochs=10, batch_size=100, verbose=True)
y_pred2019 = ltrmodel.predict(ds2019.tefm, ds2019.tedlr)

---

# Query Learner

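Test a learner on a single query. The active code uses `PL.Learner`; the `PPG.Learner` and `QueryLearner` variants are kept as commented-out alternatives.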

In [None]:
# Pick one query and slice its documents out of the flat 2020 arrays.
# Consecutive entries of `tedlr` are used as the [s, e) bounds of query `qid`.
qid = 6
sessions = 1
s, e = ds2020.tedlr[qid:qid+2]
y_pred = y_pred2020[s:e]
# Local document indices ordered by descending predicted score.
sorted_docs = y_pred.argsort()[::-1]
g = ds2020.teg[s:e]


# Alternative objective (DTR), kept for reference:
# objective_ins = DTR.DTR(y_pred = y_pred, g = g, dlr = None, exposure = np.array([1./np.log2(2+i) for i in range(1,1000)]), method='query_ratio')

# Rebinds y / g / sorted_docs and produces `dlr` for `sessions` copies of the
# query (presumably by replication — see EEL.copy_sessions to confirm).
y, g, sorted_docs, dlr = EEL.copy_sessions(y=y_pred, g=g, sorted=sorted_docs, sessions=sessions)

print(sorted_docs)
print(dlr)
# Exposure weight 1/log2(2+i) for i = 1..999.
objective_ins = EEL.EEL(y_pred = y, g = g, dlr = dlr, grade_levels=5, exposure = np.array([1./np.log2(2+i) for i in range(1,1000)]))

# `n`, `objective_ins`, `sorted_docs`, `dlr` and `g` are reused by the next cell.
n = y.shape[0]
# Alternative learners, kept for reference:
# learner = PPG.Learner(0.5 * np.triu(np.ones((n,n)), 1), samples_cnt=16, 
#                         objective_ins=objective_ins, sorted_docs=sorted_docs, dlr=dlr, intra=g, inter=np.repeat(dlr[:-1], np.diff(dlr)))
# learner = permutationgraph.QueryLearner(objective_ins, sorted_docs = sorted_docs, intra = g)
learner = PL.Learner(logits=y, samples_cnt=256, objective_ins=objective_ins)
# Positional arguments are (50, 0.1, True); other calls in this notebook pass
# them as (epochs, lr, verbose) — presumably the same order here.
learner.fit(50, 0.1, True)

In [None]:
# Same query as the previous cell (reuses its `n`, `objective_ins`,
# `sorted_docs`, `dlr`, `g`), but with PPG.Learner. The first argument is a
# strictly upper-triangular n x n matrix filled with 0.5; `inter` repeats each
# block's start offset once per document in that block.
learner = PPG.Learner(0.5 * np.triu(np.ones((n,n)), 1), samples_cnt=256, 
                        objective_ins=objective_ins, sorted_docs=sorted_docs, dlr=dlr, intra=g, inter=np.repeat(dlr[:-1], np.diff(dlr)))
learner.fit(50, 0.1, True)

Test the `QueryLearner` class on all queries, using the `learn_all_query` function.

In [None]:

def learn_all_query(y_pred, g, dlr, exposure, epochs, lr, learner_cls, objective, objective_args=None):
    """Fit one learner per query and collect the resulting rankings.

    Parameters
    ----------
    y_pred, g : flat arrays with one entry per document (scores and groups).
    dlr : 1-D array of offsets; query ``q`` occupies ``[dlr[q], dlr[q+1])``.
    exposure : array of per-rank exposure weights, forwarded to the objective.
    epochs, lr : forwarded to ``learner.fit``.
    learner_cls : callable invoked as
        ``learner_cls(objective_ins, sorted_docs=..., intra=...)``; the result
        must provide ``fit(epochs, lr, verbose=...)`` and ``sorted_docs``.
    objective : ``'DTR'`` or ``'EEL'``.
    objective_args : optional dict of extra keyword arguments for ``EEL.EEL``
        (ignored for ``'DTR'``). ``None`` means no extra arguments.

    Returns
    -------
    (y_rerank, sorted_docs, min_vals) : the first two are the per-query results
    concatenated in query order; ``min_vals`` holds, per query, the minimum of
    the values returned by ``learner.fit``.

    Raises
    ------
    ValueError
        If ``objective`` is not one of the supported names.
    """
    # Validate up front: previously an unknown objective either raised an
    # UnboundLocalError or silently reused the objective built for an earlier
    # query.
    if objective not in ('DTR', 'EEL'):
        raise ValueError(f"unknown objective {objective!r}; expected 'DTR' or 'EEL'")
    # The default of None used to crash on `**objective_args`.
    if objective_args is None:
        objective_args = {}

    y_rerank = []
    sorted_docs = []
    min_vals = []

    for qid in range(dlr.shape[0] - 1):
        s, e = dlr[qid:qid+2]
        if objective == 'DTR':
            objective_ins = DTR.DTR(y_pred = y_pred[s:e], g = g[s:e], dlr = None, exposure = exposure, method='query_ratio')
        else:
            # Each query is handed to EEL as a single block [0, e-s).
            objective_ins = EEL.EEL(y_pred = y_pred[s:e], g = g[s:e], dlr = np.array([0,e-s]), exposure = exposure, **objective_args)

        # Start from the ranking induced by the scores (descending).
        learner = learner_cls(objective_ins, sorted_docs = y_pred[s:e].argsort()[::-1], intra = g[s:e])
        vals = learner.fit(epochs, lr, verbose=False)

        # NOTE(review): `scores[learner.sorted_docs]` gives position k the
        # score n - sorted_docs[k]. That is a rank-based score per document
        # only if learner.sorted_docs maps document -> rank (or is its own
        # inverse); confirm against the learner implementation.
        scores = np.arange(len(learner.sorted_docs), 0, -1)
        y_rerank.append(scores[learner.sorted_docs])
        sorted_docs.append(learner.sorted_docs)

        min_vals.append(np.array(vals).min())

    return np.concatenate(y_rerank), np.concatenate(sorted_docs), np.array(min_vals)



**TREC 2020**

In [None]:
# Per-query learning on the 2020 data, starting from the LTR predictions.
# `objective` is a notebook-level global that the `test` helper defined further
# down also reads.
objective='EEL'
objective_args = {'grade_levels':2}

# Exposure 1/log2(2+i) for i = 1..L+1, where L is the largest query length.
exposure2020 = np.array([1./np.log2(2+i) for i in range(1,np.diff(ds2020.tedlr).max()+2)])
y_rerank2020, sorted2020, min_vals = learn_all_query(  y_pred2020, ds2020.teg, ds2020.tedlr, 
                                            exposure = exposure2020,
                                            epochs=20, lr=0.3, 
                                            learner_cls=permutationgraph.QueryLearner,
                                            objective=objective,
                                            objective_args = objective_args)
# Only queries whose minimum fit value is strictly positive are counted and
# averaged; if there are none, the mean is taken over an empty array (nan).
print(f'{len(min_vals[min_vals>0])} valid queries. --> average: {min_vals[min_vals>0].mean()}')

# Evaluate with the metric that matches the objective; the EEL evaluator is
# built on the true labels (`telv`) and scores the learned orderings.
if objective == 'DTR':
    print(DTR.ndcg_dtr(exposure2020, ds2020.telv, y_rerank2020, ds2020.tedlr, ds2020.teg, ds2020.query_seq))
elif objective == 'EEL':
    eel = EEL.EEL(y_pred=ds2020.telv, g=ds2020.teg, dlr=ds2020.tedlr, exposure=exposure2020, grade_levels=2)
    print(eel.eval(sorted2020))

Working with true labels instead of LTR output:

In [None]:
# Same pipeline as the LTR-based run, except that the learner is seeded with
# the true labels (`ds2020.telv`) and trained for 5 epochs.
# This cell rebinds `exposure2020`, `y_rerank2020`, `sorted2020` and `min_vals`.
objective='EEL'
objective_args = {'grade_levels':2}

# Exposure 1/log2(2+i) for i = 1..L+1, where L is the largest query length.
exposure2020 = np.array([1./np.log2(2+i) for i in range(1,np.diff(ds2020.tedlr).max()+2)])
y_rerank2020, sorted2020, min_vals = learn_all_query(  ds2020.telv, ds2020.teg, ds2020.tedlr, 
                                            exposure = exposure2020,
                                            epochs=5, lr=0.3, 
                                            learner_cls=permutationgraph.QueryLearner,
                                            objective=objective,
                                            objective_args = objective_args)
# Only queries whose minimum fit value is strictly positive are counted.
print(f'{len(min_vals[min_vals>0])} valid queries. --> average: {min_vals[min_vals>0].mean()}')

if objective == 'DTR':
    print(DTR.ndcg_dtr(exposure2020, ds2020.telv, y_rerank2020, ds2020.tedlr, ds2020.teg, ds2020.query_seq))
elif objective == 'EEL':
    eel = EEL.EEL(y_pred=ds2020.telv, g=ds2020.teg, dlr=ds2020.tedlr, exposure=exposure2020, grade_levels=2)
    print(eel.eval(sorted2020))

In [None]:
def test(epochs, lr):
    """Run per-query learning on the 2020 true labels and return the EEL score.

    The learner is `permutationgraph.QueryLearner`, trained for `epochs`
    epochs with learning rate `lr`. The objective name is read from the
    notebook-level global `objective`; the evaluation is always EEL with two
    grade levels.
    """
    longest_query = np.diff(ds2020.tedlr).max()
    exposure = np.array([1./np.log2(2+i) for i in range(1, longest_query+2)])

    # Only the learned orderings are needed for evaluation.
    _, orderings, _ = learn_all_query(
        ds2020.telv, ds2020.teg, ds2020.tedlr,
        exposure=exposure,
        epochs=epochs, lr=lr,
        learner_cls=permutationgraph.QueryLearner,
        objective=objective,
        objective_args={'grade_levels': 2},
    )

    evaluator = EEL.EEL(y_pred=ds2020.telv, g=ds2020.teg, dlr=ds2020.tedlr,
                        exposure=exposure, grade_levels=2)
    return evaluator.eval(orderings)

# Grid over (epochs, lr): 8 repetitions per setting, reporting mean and std
# of the EEL score returned by `test`.
for epochs in [2,5,10,20,40,100]:
    print(epochs)
    for lr in [0,0.05,0.1,0.3]:
        eel = []
        for i in range(8):
            # Use the swept `epochs`; the call was hard-coded to 5, which made
            # every row of the outer loop repeat the same experiment.
            eel.append(test(epochs,lr))
        eel = np.array(eel)
        print([lr, eel.mean(), eel.std()])

In [None]:

# Learning-rate sweep at a fixed 5 epochs: 8 repetitions per setting,
# reporting [lr, mean, std] of the EEL score returned by `test`.
for lr in [0,0.1,0.3,0.5]:
    eel = []
    for i in range(8):
        eel.append(test(5,lr))
    eel = np.array(eel)
    print([lr, eel.mean(), eel.std()])

**TREC 2019**

In [None]:
# Per-query learning on the 2019 data, evaluated with the DTR metric.
# Exposure 1/log2(2+i) for i = 1..L+1, where L is the largest query length.
exposure2019 = np.array([1./np.log2(2+i) for i in range(1,np.diff(ds2019.tedlr).max()+2)])
# `learn_all_query` requires an `objective` and returns three arrays; this
# call previously omitted the argument and unpacked only two values, so it
# could not run. 'DTR' is used because the result is scored with
# DTR.ndcg_dtr below.
y_rerank2019, sorted2019, min_vals = learn_all_query(  y_pred2019, ds2019.teg, ds2019.tedlr,
                                            exposure=exposure2019,
                                            epochs=50, lr=0.3,
                                            learner_cls=permutationgraph.QueryLearner,
                                            objective='DTR')
print(f'{len(min_vals[min_vals>0])} valid queries.')
DTR.ndcg_dtr(exposure2019, ds2019.telv, y_rerank2019, ds2019.tedlr, ds2019.teg, ds2019.query_seq)

Some checks for weirdness!

In [None]:
# Reference points for the re-ranked result: the same metric on the negated
# re-ranked scores, on the raw LTR predictions, and on the negated predictions.
print(DTR.ndcg_dtr(exposure2019, ds2019.telv, -y_rerank2019, ds2019.tedlr, ds2019.teg, ds2019.query_seq))
print(DTR.ndcg_dtr(exposure2019, ds2019.telv, y_pred2019, ds2019.tedlr, ds2019.teg, ds2019.query_seq))
print(DTR.ndcg_dtr(exposure2019, ds2019.telv, -y_pred2019, ds2019.tedlr, ds2019.teg, ds2019.query_seq))

---


# Batch Learner

Test `BatchLearner` for a set of queries.

In [None]:
def select_queries(y, g, dlr, qids):
    """Extract the given queries and pack them into contiguous arrays.

    `dlr` holds offsets: query q spans [dlr[q], dlr[q+1]) in `y` and `g`.
    Returns (ys, gs, dlrs): the selected slices of `y` and `g` concatenated
    in the order of `qids`, and the offsets of the packed queries starting
    at 0.
    """
    spans = [dlr[qid:qid+2] for qid in qids]
    picked_y = [y[s:e] for s, e in spans]
    picked_g = [g[s:e] for s, e in spans]
    # Leading 0 so the cumulative sum yields len(qids) + 1 offsets.
    lengths = [0] + [e - s for s, e in spans]
    return np.concatenate(picked_y), np.concatenate(picked_g), np.cumsum(lengths)

# Two hand-picked queries, packed into one batch. Note that `y_pred` here is
# built from the true labels (`ds2020.telv`), not from the LTR output.
qids = [197,  64]
y_pred, gs, dlrs = select_queries(ds2020.telv, ds2020.teg, ds2020.tedlr, qids)
objective_ins = DTR.DTR(y_pred = y_pred, g = gs, dlr = dlrs, exposure = np.array([1./np.log2(2+i) for i in range(1,1000)]), method='batch_ratio')

# Initial ordering: within each packed query, local indices by descending score.
ss = []
for qid in range(dlrs.shape[0] - 1):
    s,e = dlrs[qid:qid+2]
    ss.append(y_pred[s:e].argsort()[::-1])
sorted_docs = np.concatenate(ss)
# One label per document: the start offset of the query it belongs to.
batch_numbers = np.repeat(dlrs[:-1], np.diff(dlrs))
learner = permutationgraph.BatchLearner(objective_ins=objective_ins, sorted_docs=sorted_docs, intra=gs, inter=batch_numbers)
learner.fit(50, 0.3, True)

Test the `BatchLearner` class on all queries, using the `learn_all_batch` function.

In [None]:

def get_group_counts(g, dlr):
    """Count, for every group value, how many documents each query contains.

    Returns (groups, gcnt) where `groups` is `np.unique(g)` and `gcnt[i]` is
    an array with one count per query for `groups[i]`. Query q spans
    [dlr[q], dlr[q+1]).
    """
    groups = np.unique(g)
    spans = [dlr[qid:qid+2] for qid in range(dlr.shape[0] - 1)]
    gcnt = [
        np.array([len(np.where(g[s:e] == group)[0]) for s, e in spans])
        for group in groups
    ]
    return groups, gcnt


def update_y(y, sorted_docs, dlr, qids):
    """Overwrite, in place, the scores in `y` for the queries listed in `qids`.

    `sorted_docs` is the concatenation of one block per query, in the order
    of `qids`. For a query of length n the written values are
    `np.arange(n, 0, -1)[block]`. Returns None.

    NOTE(review): this yields a rank-based score per document only if each
    block maps document -> rank (or is its own inverse); confirm what the
    learners store in `sorted_docs`.
    """
    offset = 0
    for qid in qids:
        start, stop = dlr[qid:qid+2]
        length = stop - start
        block = sorted_docs[offset:offset+length]
        descending = np.arange(length, 0, -1)
        y[start:stop] = descending[block]
        offset += length
def learn_all_batch(y_pred, g, dlr, exposure, epochs, lr, learner_cls):
    """Re-rank all queries by repeatedly fitting a learner on small batches.

    Parameters
    ----------
    y_pred, g : flat arrays with one entry per document (scores and groups).
    dlr : 1-D array of offsets; query ``q`` occupies ``[dlr[q], dlr[q+1])``.
    exposure : per-rank exposure weights, forwarded to ``DTR.DTR``.
    epochs, lr : forwarded to ``learner.fit``.
    learner_cls : callable invoked with keyword arguments ``objective_ins``,
        ``sorted_docs``, ``intra`` and ``inter``; the result must provide
        ``fit(epochs, lr, verbose=...)`` and ``sorted_docs``.

    Returns
    -------
    (y_rerank, min_vals) : the rank-based scores after all batch updates, and
    per batch the minimum of the values returned by ``learner.fit``.
    """
    y_rerank = []
    min_vals = []

    # Build the batch table: column i lists the query indices sorted by how
    # many documents of group i they contain (ascending); row `bid` is one
    # batch, i.e. the bid-th query of every group's ordering.
    groups, gcnt = get_group_counts(g, dlr)
    for i, _ in enumerate(groups):
        gcnt[i] = gcnt[i].argsort()[:, None]
    gcnt = np.concatenate(gcnt, axis=1)

    # Replace the raw scores by rank-based scores n..1 within each query.
    for qid in range(dlr.shape[0] - 1):
        s, e = dlr[qid:qid+2]

        sorted_docs = y_pred[s:e].argsort()[::-1]

        scores = np.arange(len(sorted_docs), 0, -1)
        # sorted_docs[k] is the document at rank k, so that document must
        # receive scores[k]. The previous `scores[sorted_docs]` applied the
        # permutation in the wrong direction and did not preserve the order
        # of `y_pred` (e.g. [0.1, 0.5, 0.3] became [2, 1, 3]).
        ranked = np.empty_like(scores)
        ranked[sorted_docs] = scores
        y_rerank.append(ranked)
    y_rerank = np.concatenate(y_rerank)

    for bid in range(dlr.shape[0] - 1):
        # NOTE(review): nothing prevents the same query index from appearing
        # in more than one column of this row; it would then be packed twice.
        qids = gcnt[bid, :]

        ys, gs, dlrs = select_queries(y_rerank, g, dlr, qids)

        objective_ins = DTR.DTR(y_pred = ys, g = gs, dlr = dlrs, exposure = exposure, method='batch_ratio')

        # Initial ordering: within each packed query, local indices by
        # descending current score.
        ss = []
        for qid in range(dlrs.shape[0] - 1):
            s,e = dlrs[qid:qid+2]
            ss.append(ys[s:e].argsort()[::-1])
        sorted_docs = np.concatenate(ss)
        # One label per document: the start offset of its packed query.
        batch_numbers = np.repeat(dlrs[:-1], np.diff(dlrs))
        learner = learner_cls(objective_ins=objective_ins, sorted_docs=sorted_docs, intra=gs, inter=batch_numbers)

        vals = learner.fit(epochs, lr, verbose=False)

        # Write the learned ordering back so that later batches start from it.
        # NOTE(review): update_y converts with `scores[sorted_docs]`; whether
        # that is correct depends on what learner.sorted_docs holds — confirm
        # against the learner implementation.
        update_y(y_rerank, learner.sorted_docs, dlr, qids)

        vals = np.array(vals)
        min_vals.append(vals.min())

    return y_rerank, np.array(min_vals)
   

**TREC 2020**

In [None]:

# Batch learning on the 2020 data, starting from the LTR predictions.
# Exposure 1/log2(2+i) for i = 1..L+1, where L is the largest query length.
exposure2020 = np.array([1./np.log2(2+i) for i in range(1,np.diff(ds2020.tedlr).max()+2)])
y_rerank2020, min_vals = learn_all_batch(  y_pred2020, ds2020.teg, ds2020.tedlr, 
                                            exposure = exposure2020,
                                            epochs=200, lr=0.3, 
                                            learner_cls=permutationgraph.BatchLearner)
# `min_vals` has one entry per batch here, although the message says "queries".
print(f'{len(min_vals[min_vals>0])} valid queries. --> average: {min_vals[min_vals>0].mean()}')

# Last expression of the cell, so the metric is shown as the cell output.
DTR.ndcg_dtr(exposure2020, ds2020.telv, y_rerank2020, ds2020.tedlr, ds2020.teg, ds2020.query_seq)


Testing with true labels:

In [None]:

# Same batch run, but seeded with the true labels (`ds2020.telv`) instead of
# the LTR predictions. Rebinds `exposure2020`, `y_rerank2020` and `min_vals`.
exposure2020 = np.array([1./np.log2(2+i) for i in range(1,np.diff(ds2020.tedlr).max()+2)])
y_rerank2020, min_vals = learn_all_batch(  ds2020.telv, ds2020.teg, ds2020.tedlr, 
                                            exposure = exposure2020,
                                            epochs=200, lr=0.3, 
                                            learner_cls=permutationgraph.BatchLearner)
# `min_vals` has one entry per batch here, although the message says "queries".
print(f'{len(min_vals[min_vals>0])} valid queries. --> average: {min_vals[min_vals>0].mean()}')

# Last expression of the cell, so the metric is shown as the cell output.
DTR.ndcg_dtr(exposure2020, ds2020.telv, y_rerank2020, ds2020.tedlr, ds2020.teg, ds2020.query_seq)

---

# Nonrelevant Tests

In [None]:
# One row per 2019 test document: its query id (each `teqid` entry repeated
# once per document of that query), group, true label and LTR prediction.
# `df` is not referenced again in the visible part of the notebook.
df = pd.DataFrame({'qid':list(np.repeat(ds2019.teqid, np.diff(ds2019.tedlr))), 'group':list(ds2019.teg), 'label':list(ds2019.telv), 'pred':list(y_pred2019)})

In [None]:
# A 4-document learner with dummy inputs, used only to inspect `permute()`.
# NOTE(review): the positional arguments differ from the keyword-style calls
# used elsewhere in this notebook — confirm against QueryLearner's signature.
n = 4
learner = permutationgraph.QueryLearner(np.zeros(4), np.arange(n), None, sorted_docs = None)
# Force every entry of the learner's probability matrix to 0.5.
learner.probs_mat = 0.5 * np.ones([learner.n, learner.n])

def per2int(docs):
    """Encode a sequence of digits as one base-10 integer, first entry most significant.

    For example, [2, 0, 1, 3] becomes 2013. Used as a hashable key for a
    permutation.
    """
    length = len(docs)
    place_values = np.array([10**(length-1-position) for position in range(length)])
    weighted = place_values * docs
    return weighted.sum()
# Empirical distribution of the permutations produced by `learner.permute()`.
# Each draw adds 2**(n+1) / iters, so `freq` holds relative frequencies
# scaled by 2**(n+1) (32 for n = 4).
freq = defaultdict(lambda:0)
iters = 50000
for _ in range(iters):
    # Only the first returned value (the permutation) is used.
    docs, crap = learner.permute()
    freq[per2int(docs)] += 2.**(n+1)/iters
# (encoded permutation, scaled frequency) pairs, least frequent first;
# the bare `a` displays them as the cell output.
a = sorted(freq.items(),key=lambda x: x[1])
a

In [None]:

def linspan(y_pred, levels):
    """Bucket the values of `y_pred` into `levels` equal-width bins.

    The range runs from max(min(y_pred) - 1e-10, 0) to max(y_pred) + 1e-10;
    the result is the (float) bin index of every value, computed with floor.
    """
    eps = 1e-10
    lower = y_pred.min() - eps
    upper = y_pred.max() + eps
    # The lower bound is never allowed below zero.
    if lower < 0:
        lower = 0
    width = (upper - lower) / levels
    return np.floor((y_pred - lower) / width)


def disc_target_exposure(y, exposure):
    """Average exposure per integer grade under a descending-by-grade ranking.

    `y` holds one grade per document. The first len(y) entries of `exposure`
    are paired with the grades sorted in descending order, and for every
    grade 0..int(max(y)) the mean of the paired exposure values is returned
    (nan for a grade that does not occur).
    """
    grades_desc = np.sort(y)[::-1]
    rank_exposure = exposure[:len(y)]
    grade_count = int(y.max()+1)
    return np.array([
        np.mean(rank_exposure[grades_desc == grade])
        for grade in range(grade_count)
    ])
# Demo on 20 random scores in [0, 1). No seed is set, so the output changes
# on every run. This rebinds the notebook-level `y_pred`.
y_pred = np.random.rand(20)


# Exposure 1/log2(2+i) for i = 1..1001.
exposure = np.array([1./np.log2(2+i) for i in range(1,1000+2)])

print(y_pred)
# a: bin index (0..4) of every score; b: mean exposure per bin.
a = linspan(y_pred, 5)
b = disc_target_exposure(linspan(y_pred, 5), exposure)
# Cell output: the bin of every document and the target exposure of that bin.
[a,b[a.astype(int)]]


In [None]:

def get_edges(args):
    """Return the number of inversions in `args`.

    An inversion is a pair of indices i < j with args[i] > args[j].
    """
    size = len(args)
    return sum(
        1
        for first in range(size)
        for second in range(first + 1, size)
        if args[first] > args[second]
    )

def sample(PPG):
    """Draw one array of positions (one per item) from a pairwise probability matrix.

    `PPG` is an n x n array of probabilities; the callers in this notebook
    pass a strictly upper-triangular matrix. Returns an array `positions`
    where `positions[i]` is the position assigned to item i.

    Note that the parameter name hides the imported `PPG` module inside this
    function.
    """
    n = PPG.shape[0]
    if n <= 1:
        return np.arange(n)
    # One independent Bernoulli draw per matrix entry.
    selected = np.random.binomial(1,PPG)
    # Item i starts at position i, moves +1 for every selected entry in its
    # row and -1 for every selected entry in its column.
    positions = np.arange(n) + selected.sum(1) - selected.sum(0)
    # print(positions)
    empty_positions = []
    for i in range(n):
        shared_i_s = np.where(positions == i)[0]
        if len(shared_i_s) <= 1:
            if len(shared_i_s) == 0:
                # Nobody landed here; remember the slot for the leftovers.
                empty_positions.append(i)
            continue
        # Several items collide on position i: keep one uniformly at random
        # and mark the others (-1) for re-placement.
        chosen_i = np.random.choice(shared_i_s)
        for j in shared_i_s:
            if j == chosen_i:
                continue
            positions[j] = -1
    remaining = np.where(positions == -1)[0]
    # print(remaining)
    if len(remaining) > 0:
        # Recurse on the sub-matrix of the displaced items and map the result
        # onto the empty slots (in increasing slot order).
        PPG2 = PPG[remaining,:][:,remaining]
        positions2 = sample(PPG2)
        positions[remaining] = np.array(empty_positions)[positions2]
    return positions


# Empirical check of `sample`: draw many permutations from a uniform
# pairwise-probability matrix and tabulate them by number of inversions.
n = 5
prob = 0.3
# Named `ppg_probs` rather than `PPG`: binding the global name `PPG` to an
# array replaced the imported `PPG` module, so any earlier cell that calls
# `PPG.Learner` failed when re-run after this one.
ppg_probs = prob * np.triu(np.ones((n,n)), 1)

from collections import defaultdict
from tqdm.notebook import trange

# freq[inversion count][str(permutation)] -> relative frequency.
freq = defaultdict(lambda:defaultdict(lambda:0))

iters = 100000
for i in trange(iters):
    positions = sample(ppg_probs)
    edges = get_edges(positions)
    freq[edges][str(positions)] += 1./iters



In [None]:
# For every inversion count, compare the observed per-permutation frequency
# with p = prob**edge * (1 - prob)**(n*(n-1)/2 - edge), i.e. the probability
# of one specific set of `edge` selected pairs out of n*(n-1)/2 independent
# Bernoulli(prob) draws.
for edge in freq:
    freq_ = freq[edge]

    vals = np.array(list(freq_.values()))
    p = (prob**edge)*((1.-prob)**((n*(n-1)/2)-edge))
    # [inversions, #distinct permutations, mean/p, mean, std, min, max, p]
    print([edge, len(vals), vals.mean()/p, vals.mean(), vals.std(), vals.min(), vals.max(), p])

In [None]:
# Permutations with exactly 7 inversions, least frequent first. Because `freq`
# is a defaultdict, this lookup creates an empty entry if the key is absent.
sorted(list(freq[7].items()), key=lambda x: x[1])

In [None]:
import math
def ratio(n):
    """Print [n, sqrt(2 * (n*(n-1)/2 - log2(n!)))].

    The first term is the number of unordered pairs among n items and the
    second is the number of bits needed to index one of the n! permutations.
    """
    pair_count = n*(n-1)/2.
    permutation_bits = np.log2(math.factorial(n))
    print([n, np.sqrt((pair_count - permutation_bits)*2)])

# Tabulate the quantity printed by `ratio` for n = 5..19.
for i in range(5,20):
    ratio(i)

In [None]:
def _neighbors(mat):
    neigh = {'upper':{}, 'lower':{}}
    n = mat.shape[0]
    for i in range(n):
        neigh['upper'][i] = []
        for j in range(i):
            if mat[j,i] == 1:
                neigh['upper'][i].append(j)
    for i in range(n):
        neigh['lower'][i] = []
        for j in range(i+1,n):
            if mat[i,j] == 1:
                neigh['lower'][i].append(j)
    return neigh

def _insert_to_down(merged, PPG, i_u, up):
    Nu = PPG.shape[0]
    # print('inserting index', i_u)
    # print('merged:', merged)
    # print('PPG:', PPG)

    if i_u < up.shape[0] - 1:
        after_ind = int(np.where(merged == up[i_u + 1])[0])
    else:
        after_ind = merged.shape[0]

    if after_ind == i_u + 1:
        # print('no space to move')
        return

    for i_d in range(i_u+1, after_ind):
        q_u, q_d = 0, 0
        
        for k in range(i_d+1, after_ind):
            q_d = q_d * (1. - PPG[merged[i_u]][merged[k]]) + PPG[merged[i_u]][merged[k]]

        for k in range(i_u):
            q_u = q_u * (1. - PPG[merged[k]][merged[i_d]]) + PPG[merged[k]][merged[i_d]]

        q = q_u + q_d - (q_u * q_d)
        q *= 1. - PPG[merged[i_u]][merged[i_d]]
        if np.random.binomial(1, PPG[merged[i_u]][merged[i_d]] / (1. - q)) == 0:
            break

    # print('q_u:', q_u, 'q_d:', q_d, 'q:', q, 'p:', PPG[i_u][i_d])
    if i_d > i_u + 1:
        shift = merged[i_u+1:i_d]
        merged_i_u = merged[i_u]
        merged[i_u:i_d-1] = shift
        merged[i_d-1] = merged_i_u

    
def get_permutation(selected):
    """Turn a 0/1 selection matrix into positions.

    Item i starts at position i, gains one for every selected entry in row i
    and loses one for every selected entry in column i.
    """
    start = np.arange(selected.shape[0])
    row_totals = selected.sum(1)
    col_totals = selected.sum(0)
    return start + row_totals - col_totals

def PPG_merge(up, down, PPG):
    """Merge two sampled halves into one order over all items.

    Parameters
    ----------
    up : order of the first half (values index the first len(up) items).
    down : order of the second half, numbered from 0; it is offset by
        len(up) so that its values index into `PPG`.
    PPG : pairwise probability matrix over all items.

    Returns a new array: `up` followed by the offset `down`, after every item
    of `up` (last one first) has been given the chance to move later via
    `_insert_to_down`.
    """
    Nu = up.shape[0]

    # Offset into a fresh array; the previous `down += Nu` modified the
    # caller's array as a side effect.
    merged = np.concatenate([up, down + Nu])

    # Process from the last item of `up` backwards; `_insert_to_down` bounds
    # each item by the current location of its successor up[i_u + 1].
    for i_u in reversed(range(Nu)):
        _insert_to_down(merged, PPG, i_u, up)
    return merged

def PPG_sample(PPG):
    """Sample an order over n items by divide and conquer.

    `PPG` is an n x n array of pairwise probabilities. The items are split
    into two halves, each half is sampled recursively, and the two results
    are combined with `PPG_merge`. Returns an integer array of length n.
    """
    n = PPG.shape[0]
    # Nothing to order for n <= 1. n == 0 previously fell through to the
    # recursive case and never terminated.
    if n <= 1:
        return np.arange(n)
    if n == 2:
        # A single Bernoulli draw decides whether the pair is swapped.
        if np.random.binomial(1,PPG[0,1]):
            return np.array([1,0])
        return np.array([0,1])

    mid = n // 2
    # The upper half is sampled first so the order of random draws is stable.
    up = PPG_sample(PPG[:mid,:][:,:mid])
    down = PPG_sample(PPG[mid:,:][:,mid:])
    return PPG_merge(up, down, PPG)



In [None]:

# Empirical check of `PPG_sample` with random (non-uniform) pairwise
# probabilities in [0, 0.5). No seed is set, so results vary between runs.
n = 5
prob = 0.5 * np.random.rand(n,n)
# Named `ppg_probs` rather than `PPG`: binding the global name `PPG` to an
# array replaced the imported `PPG` module, so any earlier cell that calls
# `PPG.Learner` failed when re-run after this one.
ppg_probs = prob * np.triu(np.ones((n,n)), 1)

from collections import defaultdict
from tqdm.notebook import trange


# Same definition as the `get_edges` given earlier in this notebook; it is
# repeated here so that the cell does not depend on that earlier cell.
def get_edges(args):
    """Return the number of pairs i < j with args[i] > args[j]."""
    edges = 0
    for i in range(len(args)):
        for j in range(i+1, len(args)):
            if args[i] > args[j]:
                edges += 1
    return edges

# freq[inversion count][str(permutation)] -> relative frequency.
freq = defaultdict(lambda:defaultdict(lambda:0))

iters = 1000
for i in trange(iters):
    positions = PPG_sample(ppg_probs)
    edges = get_edges(positions)
    freq[edges][str(positions)] += 1./iters
# print(positions)

In [None]:
# Per inversion count: number of distinct permutations seen, mean observed
# frequency relative to p, and spread (max / min) of the frequencies.
# NOTE(review): if `prob` is still the n x n array assigned in the preceding
# cell, then `p` and the printed 'ratio' are n x n arrays rather than scalars,
# and the uniform-probability formula for p no longer describes a single
# permutation's probability — confirm what this is meant to report.
for edge in freq:
    freq_ = freq[edge]

    vals = np.array(list(freq_.values()))
    p = (prob**edge)*((1.-prob)**((n*(n-1)/2)-edge))
    print('edges:', edge, 'count:', len(vals), 'ratio:', vals.mean()/p, 'max to min:', vals.max()/vals.min())

# Raw (permutation string, frequency) pairs for every inversion count.
for edge in freq:
    freq_ = list(freq[edge].items())
    print(freq_)

In [None]:
import numpy as np
import torch

# Scratch check of per-row indexing in torch: the result has
# out[i, j] = x[i, y[i, j]], i.e. every row of `x` reordered by the matching
# row of `y`. This rebinds the notebook-level names `x` and `y`.
x = np.array([[1,2,3],[4,5,6]])
y = np.array([[0,2,1],[2,1,0]])
x = torch.FloatTensor(x)
y = torch.LongTensor(y)
# Row indices [0,0,0,1,1,1] paired with the flattened column indices, then
# reshaped back to 2 x 3.
x[torch.arange(x.shape[0]).unsqueeze(1).repeat((1,3)).flatten(), y.flatten()].view(2,3)

In [None]:
from itertools import permutations
# All 24 orderings of (0, 1, 2, 3), displayed as the cell output.
list(permutations(range(4)))