In [1]:
import numpy as np
from xgboost import XGBRegressor
from sklearn.datasets import load_svmlight_file
from collections import defaultdict
from multiprocessing import Pool, Array
import time
import dill
import tqdm
from tqdm import tqdm_notebook as tqdm

In [6]:
!mkdir prediction_dumps

In [2]:
def idcg(rel):
    dcg = np.sum((2.0**(np.sort(rel)[::-1]) - 1.0) / (np.log(1.0 + np.arange(rel.shape[0])) + 1.0))
    if np.isclose(dcg, 0.0):
        return 1.0
    else:
        return 1.0 / dcg
def dcg(rel):
    return np.sum((2**rel - 1) / (np.log(1.0 + np.arange(rel.shape[0])) + 1.0))

In [3]:
X, y, query_ids = load_svmlight_file('train.txt', query_id=True)
X_test, y_test, query_ids_test = load_svmlight_file('test.txt', query_id=True)

In [12]:
train_queries = dict()

for doc_id, query_id in enumerate(query_ids):
    if query_id in train_queries:
      train_queries[query_id].append(doc_id)
    else:
      train_queries[query_id] = [doc_id]

In [13]:
def queries_bagging(new_ids):
  ids = dict()
  for k, i in enumerate(new_ids):
    if i in ids:
      ids[i].append(k)
    else:
      ids[i] = [k]
  bs_queries = defaultdict(list)
  bs_local = defaultdict(list)
  bs_global = defaultdict(list)
  for query_id in train_queries:
    counter=0
    for d in train_queries[query_id]:
      if d in ids:
        bs_queries[query_id].append(ids[d][0])
        bs_global[query_id].extend(ids[d])
        bs_local[query_id].extend([counter for i in range(len(ids[d]))])
        counter += 1
  return bs_queries, bs_local, bs_global

In [14]:
def utilities(y, bs_queries):
  query_P_ij = dict()
  logs_q = dict()
  idcgs = dict()
  a_powered = dict()
  for query_id in bs_queries:
    docs = np.array(bs_queries[query_id])
    a = y[docs]
    S_ij = np.zeros((a.shape[0], a.shape[0]))
    S_ij += a.reshape(-1, 1) > a
    S_ij -= a.reshape(-1, 1) < a
    P_ij = 0.5*(1 + S_ij)
    logs_ = 1.0/ (np.log(np.arange(a.shape[0]) + 1.0) + 1.0)
    logs_ = logs_.reshape(-1, 1) - logs_
    query_P_ij[query_id] = P_ij
    logs_q[query_id] = logs_
    idcgs[query_id] = idcg(a)
    a_powered[query_id] = 2**a
  return query_P_ij, logs_q, idcgs, a_powered

In [15]:
def get_absent(new_ids):
  s = set(new_ids)
  t = range(0, len(new_ids))
  absent_ids = list(filter(lambda x: x not in s, t))
  return absent_ids

In [16]:
def objective(y_true, y_pred):

    global progress_bar
    progress_bar.update(1)
    global_grad = np.zeros(y_true.shape[0])
    global_hess = np.zeros(y_true.shape[0])
    
    
    
    for query_id in bs_queries:
        docs = np.array(bs_queries[query_id])
        a = y_true[docs]
        h = y_pred[docs]
        
        argsorted = np.argsort(h)
        ts = a_powered[query_id][argsorted]
        tpq = ts.reshape(-1, 1) - ts
        logs = logs_q[query_id]
        deltas = tpq*logs
        idcg_cur = idcgs[query_id]
        deltas = np.abs(idcg_cur*deltas)
        P_ij = query_P_ij[query_id]
        h_ij = h.reshape(-1, 1) - h
        p_ij = -1/(1 + np.exp(h_ij))
        fpq = P_ij*deltas
        T = fpq*p_ij
        grad = np.sum(-T + np.transpose(T), axis=0)
        p_ij_ = np.exp(-h_ij)/ (1 + np.exp(-h_ij))**2
        T_ = fpq*p_ij_
        hess = np.sum(T_ + np.transpose(T_), axis=1) - 2*T_.diagonal()
        global_grad[bs_global[query_id]] = grad[bs_local[query_id]]
        global_hess[bs_global[query_id]] = hess[bs_local[query_id]]
    
    return global_grad, global_hess

In [17]:
def calc_full_ndcg(y_true, y_pred):
  ndcgs = []
  for query_id in train_queries:
    docs = train_queries[query_id]
    ndcg = idcg(y_true[docs])*dcg(y_true[np.argsort(y_pred[docs])])
    ndcgs.append(ndcg)
  return np.mean(ndcgs)

***84 XGBRegressor, max_depth=12, n_estimators=300, lr=0.1, public_score=0.75201, private_score=0.76041***

In [None]:
#y_validation = np.zeros_like(y)
for i in range(0, 84):
  new_ids = np.random.randint(low=0, high = X.shape[0], size=X.shape[0])
  bs_queries, bs_local, bs_global = queries_bagging(new_ids)
  query_P_ij, logs_q, idcgs, a_powered = utilities(y[new_ids], bs_queries)
  params = {'objective': objective, 'max_depth': 12, 'n_estimators': 300, 'tree_method':'gpu_hist', 'lr':0.1}
  model = XGBRegressor(**params)
  progress_bar = tqdm(np.arange(300))
  model.fit(X[new_ids], y[new_ids])
  #absent_ids = get_absent(new_ids)
  #val_pred = model.predict(X[absent_ids])
  #y_validation[absent_ids] += val_pred
  cur_pred = model.predict(X_test)
  np.savetxt('prediction_dumps/test_prediction' + str(i) + '.dump', cur_pred)
  #dill.dump(y_validation, open(path + '/bagging_dump_300/out_of_bag_validation/' + 'y_validation' + str(i) + '.dump', 'wb'))
  #dill.dump(model, open(path + '/bagging_dump_300/model_dumps/' + 'model' + str(i) + '.dump', 'wb'))
  #dill.dump(cur_pred, open(path + '/bagging_dump_300/prediction_dumps/' + 'test_prediction' + str(i) + '.dump', 'wb'))
  progress_bar.close()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))

In [5]:
y_pred = np.zeros_like(y_test)
for i in range(0, 84):
  cur_pred = np.loadtxt('prediction_dumps/test_prediction' + str(i) + '.dump')
  y_pred += cur_pred

_________________________________________________________________________________________________

In [6]:
test_queries = defaultdict(list)
for doc_id, query_id in enumerate(query_ids_test):
  test_queries[query_id].append(doc_id)

In [8]:
with open('submission_f.csv', 'w') as submission:
  string = "QueryId,DocumentId\n"
  for query_id in test_queries:
    docs = test_queries[query_id]
    y_pred_cur = y_pred[docs]
    sorted_ids = np.argsort(y_pred_cur)[::-1]
    sorted_docs = np.array(docs)[sorted_ids]
    for doc_id in sorted_docs:
      string += str(query_id) + ',' + str(doc_id + 1) + '\n'
  submission.write(string)