In [37]:
import numpy as np
import scipy as sp
from scipy.io import mmread
from scipy.stats import entropy
import xgboost as xgb
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise_distances, f1_score

In [38]:
x_labeled = mmread('sparse_docs_MatrixMarket_initial_batch.mtx')
x_labeled.shape

(2000, 11436)

In [39]:
x_unlabeled = mmread('sparse_docs_MatrixMarket_data_pool.mtx')
x_unlabeled.shape

(100000, 11436)

In [40]:
with open('sparse_docs_MatrixMarket_initial_batch_labels.txt') as file:
      y_test = file.readlines()

y = [row[:-1].split(',') for row in y_test]
y = [[ord(label)-65 for label in row] for row in y]

y_labeled = np.zeros((len(y), 11))
for i in range(len(y)):
    y_labeled[i, y[i]] = 1

In [41]:
multilabel_model = MultiOutputClassifier(xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    tree_method='hist',
    min_child_weight=1,
    eta=0.1,
    max_depth=3,
    reg_lambda=1,
    subsample=1,
    colsample_bytree=1,
    nrounds=150))

multilabel_model.fit(x_labeled, y_labeled)

MultiOutputClassifier(estimator=XGBClassifier(eta=0.1, eval_metric='auc',
                                              nrounds=150, tree_method='hist'))

In [42]:
pred_dist = multilabel_model.predict_proba(x_unlabeled)
pred_dist = np.array(pred_dist)
pred_dist = np.transpose(pred_dist, (1, 0, 2))
pred_dist = np.squeeze(pred_dist[..., 1])

labels = multilabel_model.predict(x_unlabeled)

target_dist = labels.mean(axis=0)
c = (pred_dist / target_dist + (1-pred_dist) / (1-target_dist))
pred_dist = pred_dist / target_dist
pred_dist = pred_dist / c

In [43]:
unc = []
for p in pred_dist:
    ent = -sum(np.log(p_) * p_ + np.log(1-p_) * (1-p_) for p_ in p)
    unc.append(ent)

unc = np.array(unc)
print(unc.mean())

5.998705512247281


In [44]:
top_50 = np.argpartition(unc, -50)[-50:]+1
top_200 = np.argpartition(unc, -200)[-200:]+1
top_500 = np.argpartition(unc, -500)[-500:]+1

In [45]:
print(top_50)

[95400  4076 94154 85860 78505 81925 76314 15823  9074 83000 90207 17296
 27000 86131  3655  1891  3765  4091 60353 68084 43945 10411  2042 64343
 17412 50835 53735 74023 95660  9401 58272 41515 33831 45174 40919 22207
 21768 47179 32841 55187 68591 19260 67724 47066 30628 46999 48265 15769
 70085 12610]


In [46]:
res = ''
for top_k in [top_50, top_200, top_500]:
    s = ','
    s=s.join(map(str, top_k))
    res += s+'\n'

with open("results.txt", "w") as file:
    file.write(res)

# Report

My final solution uses only scaled uncertanity (using entropy).

I tried also different methods, unfortunetly those methods didn't work well.

I used similarity and dissimilarity, but achieved worse results.

I also tried to use XGBooster with different parameters. I thought that maybe with some changed parameters models would better indicate the most uncertain examples, but results were worst.