# Testing on PISA dataset with randomized missing data

1. Importing package and dataset

In [18]:
!pip install iita_python
!git clone https://gist.github.com/717f0147675b0c8ed25e50d583c943bf.git

import numpy as np
import iita_python as iita
import iita_python.fit_metrics as iita_fm
from iita_python.utils import read_rp
from random import randint, choice, shuffle

fatal: destination path '717f0147675b0c8ed25e50d583c943bf' already exists and is not an empty directory.


2. Testing function

In [2]:
def test(metric, skips,
         data_path='./717f0147675b0c8ed25e50d583c943bf/pisa.csv',
         verbose=True):
  """Count how many responses can be blanked out before the best quasi-order changes.

  The response data is loaded with `read_rp`, every candidate quasi-order
  produced by `iita.ind_gen` is scored with `metric`, and the candidate with
  the smallest score on the complete data becomes the reference. Then `skips`
  randomly chosen responses at a time are replaced with NaN and the best
  candidate is recomputed and compared with the reference.

  Args:
    metric: callable `metric(dataset, quasi_order)`; the candidate with the
      smallest value wins (the first one on ties).
    skips: number of responses blanked out between two fits; must be >= 1.
    data_path: file handed to `read_rp`.
    verbose: if true, print the number of removed responses before every fit.

  Returns:
    The number of responses that were missing when the best quasi-order first
    differed from the reference. If it never differs, the number removed when
    the run stopped: either the cap of "all cells minus 10" was reached or no
    removable response was left.

  Raises:
    ValueError: if `skips` is smaller than 1 (the loop could never advance).
  """
  if skips < 1:
    raise ValueError('skips must be at least 1')

  data = read_rp(data_path)
  max_removed = data.shape[0] * data.shape[1] - 10
  correct_qo = None
  correct_count = 0

  while correct_count < max_removed:
    if verbose:
      print(correct_count)
    test_dataset = iita.Dataset(data)
    unfolded_ce = iita.unfold_examples(test_dataset.ce)
    qos = iita.ind_gen(unfolded_ce, test_dataset.items)

    # Strict `<` keeps the first candidate among equally good ones.
    best_qo_id = -1
    best_qo_diff = float('inf')
    for i, qo in enumerate(qos):
      qo_diff = metric(test_dataset, qo)
      if qo_diff < best_qo_diff:
        best_qo_diff = qo_diff
        best_qo_id = i

    # Sorted list of int pairs so that two edge lists compare by content.
    best_qo = sorted((int(a), int(b)) for a, b in qos[best_qo_id].get_edge_list())
    if correct_qo is None:
      correct_qo = best_qo
    if best_qo != correct_qo:
      return correct_count

    for _ in range(skips):
      # A response is removable when it is not missing yet and the remaining
      # values of its column do not sum to exactly 1 (rule kept as it was;
      # presumably it protects the last positive answer of an item - confirm).
      values = data.to_numpy(dtype=float)
      col_sums = np.nansum(values, axis=0)
      if not (~np.isnan(values) & (col_sums != 1)).any():
        # Nothing left to remove: stop instead of sampling forever.
        return correct_count

      # Rejection sampling: draw (item, subject) pairs until one is removable.
      # NOTE: `.loc` is label based; this relies on 0..n-1 row/column labels.
      while True:
        a = randint(0, test_dataset.items - 1)
        b = randint(0, test_dataset.subjects - 1)
        if not np.isnan(data.loc[b, a]) and col_sums[a] != 1:
          break
      data.loc[b, a] = np.nan
      correct_count += 1

  return correct_count

In [21]:
def iter_test(metric, skips, iters, testf, **kwargs):
  """Call `testf` `iters` times and gather the value of every call.

  Each round prints an `ITER <n>` banner, invokes
  `testf(metric, skips, **kwargs)` and prints what it returned.

  Args:
    metric: forwarded unchanged to `testf` as first argument.
    skips: forwarded unchanged to `testf` as second argument.
    iters: how many rounds to run.
    testf: the experiment function to repeat.
    **kwargs: extra keyword arguments forwarded to every `testf` call.

  Returns:
    A list with the return value of each round, in execution order.
  """
  def run_round(i):
    # One round: banner, experiment, echo of the outcome.
    print(f'ITER {i}')
    outcome = testf(metric, skips, **kwargs)
    print(outcome)
    return outcome

  return [run_round(i) for i in range(iters)]

3. Running the tests

In [None]:
# Baseline experiment: 100 runs of `test`, scoring candidates with
# `mini_iita_fit` and blanking out 5 responses between consecutive fits.
res = iter_test(iita_fm.mini_iita_fit, 5, 100, test)

4. Analyzing the tests

Running the tests is quite a long process, so I did it for 100 iterations and saved the results in the next cell

Skip the next cell if you run the tests on your own

In [4]:
# Saved raw results of the 100-run baseline experiment, one value per run.
# Every entry is a multiple of 5 because responses were removed 5 at a time.
res = [
  265, 610, 445, 410, 490, 365, 590, 645, 495, 270,
  595, 275, 330, 580, 215, 585, 220, 605, 300, 480,
  560, 590, 480, 210, 205, 605, 555, 475, 480, 565,
  625, 385, 490, 555, 600, 305, 375, 280, 495, 335,
  445, 495, 555, 480, 780, 445, 265, 195, 545, 510,
  390, 345, 420, 335, 200, 445, 510, 470, 350, 285,
  610, 315, 545, 85, 550, 640, 555, 160, 470, 470,
  490, 505, 465, 555, 520, 80, 415, 380, 455, 240,
  670, 490, 220, 595, 115, 125, 410, 295, 155, 265,
  605, 460, 565, 495, 145, 250, 345, 510, 450, 485
]

In [5]:
# Load the response data; only its size (rows * columns) is used here.
data = read_rp('./717f0147675b0c8ed25e50d583c943bf/pisa.csv')
# Convert every raw count into a fraction of all cells, rounded to 3 decimals.
# NOTE: `res` is overwritten with its own normalised version, so running this
# cell a second time divides again - re-run the cell that defines `res` first.
res = (np.array(res) / (data.shape[0] * data.shape[1])).round(3)

4.1. Average

In [6]:
# Average of the per-run fractions computed in the previous cell.
np.mean(res).round(3)

np.float64(0.25)

5. Testing function for biased item choice

In [23]:
def test_biased_items(metric, skips, bias,
                      data_path='./717f0147675b0c8ed25e50d583c943bf/pisa.csv',
                      verbose=True):
  """Like the uniform removal test, but items lose responses with unequal odds.

  The response data is loaded with `read_rp`, the candidate quasi-order with
  the smallest `metric` value on the complete data becomes the reference, and
  then `skips` responses at a time are replaced with NaN. The item of every
  removed response is drawn from a weighted pool; the subject is uniform.

  Args:
    metric: callable `metric(dataset, quasi_order)`; the candidate with the
      smallest value wins (the first one on ties).
    skips: number of responses blanked out between two fits; must be >= 1.
    bias: integer weights, at least one per item. The items are shuffled
      first and `bias[i]` goes to the i-th item of that random order, so the
      item that receives a given weight changes from call to call.
    data_path: file handed to `read_rp`.
    verbose: if true, print the number of removed responses before every fit.

  Returns:
    The number of responses that were missing when the best quasi-order first
    differed from the reference. If it never differs, the number removed when
    the run stopped: either the cap of "all cells minus 10" was reached or no
    removable response was left among the weighted items.

  Raises:
    ValueError: if `skips` is smaller than 1, or if no item ends up with a
      positive weight.
  """
  if skips < 1:
    raise ValueError('skips must be at least 1')

  data = read_rp(data_path)
  max_removed = data.shape[0] * data.shape[1] - 10
  correct_qo = None
  correct_count = 0

  # Weighted pool: every item appears `bias[i]` times, so `choice` picks it
  # with probability proportional to its weight.
  items = list(range(data.shape[1]))
  shuffle(items)
  choicePool = []
  for i, item in enumerate(items):
    choicePool.extend([item] * bias[i])
  if not choicePool:
    raise ValueError('bias must give at least one item a positive weight')
  pool_items = sorted(set(choicePool))

  while correct_count < max_removed:
    if verbose:
      print(correct_count)
    test_dataset = iita.Dataset(data)
    unfolded_ce = iita.unfold_examples(test_dataset.ce)
    qos = iita.ind_gen(unfolded_ce, test_dataset.items)

    # Strict `<` keeps the first candidate among equally good ones.
    best_qo_id = -1
    best_qo_diff = float('inf')
    for i, qo in enumerate(qos):
      qo_diff = metric(test_dataset, qo)
      if qo_diff < best_qo_diff:
        best_qo_diff = qo_diff
        best_qo_id = i

    # Sorted list of int pairs so that two edge lists compare by content.
    best_qo = sorted((int(a), int(b)) for a, b in qos[best_qo_id].get_edge_list())
    if correct_qo is None:
      correct_qo = best_qo
    if best_qo != correct_qo:
      return correct_count

    for _ in range(skips):
      # A response is removable when it is not missing yet and the remaining
      # values of its column do not sum to exactly 1 (rule kept as it was;
      # presumably it protects the last positive answer of an item - confirm).
      values = data.to_numpy(dtype=float)
      col_sums = np.nansum(values, axis=0)
      removable = ~np.isnan(values) & (col_sums != 1)
      if not removable[:, pool_items].any():
        # Items with zero weight are never drawn, so only pool items count.
        # Nothing left to remove: stop instead of sampling forever.
        return correct_count

      # Rejection sampling: draw (item, subject) pairs until one is removable.
      # NOTE: `.loc` is label based; this relies on 0..n-1 row/column labels.
      while True:
        a = choice(choicePool)
        b = randint(0, test_dataset.subjects - 1)
        if not np.isnan(data.loc[b, a]) and col_sums[a] != 1:
          break
      data.loc[b, a] = np.nan
      correct_count += 1

  return correct_count

In [25]:
# Biased experiment: 40 runs of `test_biased_items`, 5 removals per step.
# `bias` lists five weights; the function assigns them to a shuffled item
# order, so which item gets which weight differs from run to run.
res_biased = iter_test(iita_fm.mini_iita_fit, 5, 40, test_biased_items, bias=[15, 11, 5, 3, 2])

ITER 0
0
5
10
15
20
25
30
35
40
45
50
55
60
65
70
75
80
85
90
95
100
105
110
115
120
125
130
135
140
145
150
155
160
165
170
175
180
185
190
195
200
205
210
215
220
225
230
235
240
245
250
250
ITER 1
0
5
10
15
20
25
30
35
40
45
50
55
60
65
70
75
80
85
90
95
100
105
110
115
120
125
130
135
140
140
ITER 2
0
5
10
15
20
25
30
35
40
45
50
55
60
65
70
75
80
85
90
95
100
105
110
115
120
125
130
135
140
145
150
155
160
165
170
175
180
185
190
195
200
205
205
ITER 3
0
5
10
15
20
25
30
35
40
45
50
55
60
65
70
75
80
85
90
95
100
105
110
115
120
125
130
135
140
145
150
155
160
165
170
175
180
185
185
ITER 4
0
5
10
15
20
25
30
35
40
45
50
55
60
65
70
75
80
85
90
95
100
105
110
115
120
125
130
135
140
145
150
155
160
165
170
175
180
185
190
195
200
205
210
215
220
220
ITER 5
0
5
10
15
20
25
30
35
40
45
50
55
60
65
70
75
80
85
90
95
100
105
110
115
120
125
130
135
140
145
150
155
160
165
170
175
180
185
190
195
200
205
210
215
220
225
230
235
240
245
250
255
260
265
270
275
280
285
290
295
300
300
IT

In [29]:
# Convert raw counts into fractions of all cells, rounded to 3 decimals.
# Relies on `data` loaded in the earlier analysis cell and overwrites
# `res_biased`, so run it exactly once per set of raw results.
res_biased = (np.array(res_biased) / (data.shape[0] * data.shape[1])).round(3)

In [31]:
# Show the normalised per-run fractions.
res_biased

array([0.147, 0.082, 0.121, 0.109, 0.129, 0.176, 0.182, 0.097, 0.094,
       0.047, 0.132, 0.126, 0.118, 0.103, 0.147, 0.15 , 0.15 , 0.118,
       0.106, 0.085, 0.1  , 0.126, 0.082, 0.291, 0.079, 0.062, 0.106,
       0.129, 0.291, 0.094, 0.1  , 0.165, 0.174, 0.221, 0.079, 0.218,
       0.103, 0.047, 0.074, 0.082])

Running the tests is quite a long process, so I did it for 40 iterations and saved the (already normalized) results in the next cell

Skip the next cell if you run the tests on your own

In [None]:
# Saved results of the 40-run biased experiment. These values are already
# normalised fractions (same numbers as the displayed output), so they must
# not be passed through the normalisation cell again.
res_biased = np.array([0.147, 0.082, 0.121, 0.109, 0.129, 0.176, 0.182, 0.097, 0.094,
       0.047, 0.132, 0.126, 0.118, 0.103, 0.147, 0.15 , 0.15 , 0.118,
       0.106, 0.085, 0.1  , 0.126, 0.082, 0.291, 0.079, 0.062, 0.106,
       0.129, 0.291, 0.094, 0.1  , 0.165, 0.174, 0.221, 0.079, 0.218,
       0.103, 0.047, 0.074, 0.082])

In [30]:
# Average of the per-run fractions of the biased experiment.
np.mean(res_biased).round(3)

np.float64(0.126)