In [2]:
import pandas as pd
import numpy as np
import json
import os
import json
import torch
import wandb
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib notebook

# Q1: What are the top instances for each top performing model?

MIL model output is of format

```
{
    'predictions': (bag_probs, instance_probs per bag, key instance ids),
    'label_ids': bag labels,
    'metrics': {}
}
```

In [4]:
model_dir = f"{os.environ['MINERVA_HOME']}/models/ratio_0.6"
with open(f"{model_dir}/test_predict.bin", "rb") as f:
    predictions = torch.load(f)

In [14]:
data_dir = f"{os.environ['MINERVA_HOME']}/data/premade_mil/minimum_10"
df = pd.read_json(f"{data_dir}/test.jsonl", lines=True)


Re-structure data for easier analysis

In [17]:
data = []

def temp(row):
    for i in row["instances"]:
        data.append([
            row["bag_id"],
            row["mil_bag_prob"],
            row["label"],
            i["id_str"],
            i["tweet_text"],
            i["instance_score"]
        ])

df["mil_bag_prob"] = predictions["predictions"][0]
df.apply(temp, axis="columns")
instance_df = pd.DataFrame(data, columns=["bag_id", "mil_score", "bag_label", "instance_id", "text", "instance_score"])


In [21]:
# Dictionary lookup of instance ID -> predicted instance score
instance_score_lookup = {}
for instance_id, score in zip(predictions["predictions"][2].flatten(), predictions["predictions"][1].flatten()):
    instance_score_lookup[instance_id] = score

In [24]:
instance_df["mil_instance_score"] = instance_df.instance_id.map(instance_score_lookup.get)

In [29]:
instance_df

Unnamed: 0,bag_id,mil_score,bag_label,instance_id,text,instance_score,mil_instance_score
0,2018_02_21_PAK,0.992647,1,966154472480169984,@HNajibullah your father had a similar dream o...,0.999410,
1,2018_02_21_PAK,0.992647,1,966250698957578240,@HarounRashid2 Only in Pakistan,0.000238,
2,2018_02_21_PAK,0.992647,1,966320622606409728,@_GhulamMustafa_ Shame on u gen,0.000305,
3,2018_02_21_PAK,0.992647,1,966220320003719168,Guys our very own cute celeb @Alizayhere (16.3...,0.000195,
4,2018_02_21_PAK,0.992647,1,966178811736940544,Hareem Farooq to host PSL opening ceremony,0.000248,
...,...,...,...,...,...,...,...
10057412,2019_09_23_GIN,0.049270,0,1176115722310713344,"current weather in Conakry: broken clouds, 26°...",0.000221,
10057413,2019_09_23_GIN,0.049270,0,1176121916798709760,@chvmxd Givenchy encore plus,0.000238,
10057414,2019_09_23_GIN,0.049270,0,1176154761843941376,@YaPasRienLa09 Je follow back,0.000219,
10057415,2019_09_23_GIN,0.049270,0,1176164905852030976,This is perhaps what those pple are there for ...,0.000200,


In [30]:
instance_score_lookup

{9.663356e+17: 0.9993049,
 9.6639245e+17: 0.95887744,
 9.662934e+17: 0.94343364,
 9.66343e+17: 0.99985933,
 9.663609e+17: 0.9960224,
 9.662203e+17: 0.998259,
 9.6634345e+17: 0.99982375,
 9.661345e+17: 0.95825344,
 9.66352e+17: 9.5328876e-05,
 9.6637176e+17: 9.712828e-05,
 9.663368e+17: 0.9723394,
 9.6626346e+17: 0.000106373576,
 9.663351e+17: 0.9969125,
 9.661562e+17: 9.805629e-05,
 9.662401e+17: 0.9998845,
 9.663288e+17: 0.9992384,
 9.663451e+17: 0.99985874,
 9.661727e+17: 0.99956495,
 9.662296e+17: 0.99986744,
 9.661643e+17: 0.9997527,
 9.662672e+17: 0.9631273,
 9.661545e+17: 0.00031393592,
 9.6634964e+17: 0.99931586,
 9.662249e+17: 0.00014308293,
 9.663207e+17: 0.99927276,
 9.6631356e+17: 0.99950945,
 9.662238e+17: 0.00012378808,
 9.662333e+17: 9.4646195e-05,
 9.6637067e+17: 9.4699455e-05,
 9.6632476e+17: 0.00010490738,
 9.663137e+17: 0.9994573,
 9.661885e+17: 0.9727077,
 9.663079e+17: 0.00014084655,
 9.663975e+17: 0.99912924,
 9.6623556e+17: 0.0001605452,
 9.663251e+17: 0.99983907,

In [10]:
predictions["predictions"][0].shape

(26779, 1)

In [11]:
predictions["predictions"][1].shape

(26779, 1, 100)

In [20]:
predictions["predictions"][2].flatten().shape

(2677900,)

In [13]:
predictions

{'predictions': (array([[0.9926466 ],
         [0.17986313],
         [0.5337394 ],
         ...,
         [0.14011621],
         [0.4593775 ],
         [0.04926971]], dtype=float32),
  array([[[9.9930489e-01, 9.5887744e-01, 9.4343364e-01, ...,
           9.8543453e-01, 9.0661860e-01, 9.8090345e-01]],
  
         [[9.4736133e-05, 1.0114046e-04, 1.1708670e-04, ...,
           9.9982446e-01, 9.9986756e-01, 1.1303458e-04]],
  
         [[1.4735418e-04, 9.9887604e-01, 1.0171700e-04, ...,
           9.7990394e-01, 9.7990394e-01, 9.7990394e-01]],
  
         ...,
  
         [[1.3474507e-04, 1.7170198e-04, 1.6581712e-04, ...,
           9.7990394e-01, 9.7990394e-01, 9.7990394e-01]],
  
         [[5.4752443e-04, 1.0158371e-04, 9.5860880e-05, ...,
           1.0221765e-04, 6.7605871e-01, 1.4699718e-04]],
  
         [[4.1774384e-04, 2.4149953e-04, 1.4828783e-01, ...,
           9.7990394e-01, 9.7990394e-01, 9.7990394e-01]]], dtype=float32),
  array([[[ 9.66335618e+17,  9.66392449e+17,  9.66293

In [23]:
len(instance_score_lookup)

1023041

In [32]:
int(9.6632936e+17)

966329360000000000

In [33]:
966329360000000000 in instance_score_lookup

False

In [35]:
np.set_printoptions(precision=100)
predictions

{'predictions': (array([[0.9926466  ],
         [0.17986313 ],
         [0.5337394  ],
         ...,
         [0.14011621 ],
         [0.4593775  ],
         [0.049269706]], dtype=float32),
  array([[[9.9930489e-01, 9.5887744e-01, 9.4343364e-01, ...,
           9.8543453e-01, 9.0661860e-01, 9.8090345e-01]],
  
         [[9.4736133e-05, 1.0114046e-04, 1.1708670e-04, ...,
           9.9982446e-01, 9.9986756e-01, 1.1303458e-04]],
  
         [[1.4735418e-04, 9.9887604e-01, 1.0171700e-04, ...,
           9.7990394e-01, 9.7990394e-01, 9.7990394e-01]],
  
         ...,
  
         [[1.3474507e-04, 1.7170198e-04, 1.6581712e-04, ...,
           9.7990394e-01, 9.7990394e-01, 9.7990394e-01]],
  
         [[5.4752443e-04, 1.0158371e-04, 9.5860880e-05, ...,
           1.0221765e-04, 6.7605871e-01, 1.4699718e-04]],
  
         [[4.1774384e-04, 2.4149953e-04, 1.4828783e-01, ...,
           9.7990394e-01, 9.7990394e-01, 9.7990394e-01]]], dtype=float32),
  array([[[ 9.66335618e+17,  9.66392449e+17,  9