# Baseline models

In [28]:
import numpy as np
import random
from collections import Counter

In [81]:
# Directory the pickled input files are read from.
# NOTE(review): absolute, machine-specific path. The trailing slash matters because
# file names are appended to it with plain string concatenation.
path = '/home/user/venvs/doctorAI/MIMICIII_data/'
# Directory the generated file is written to (same concatenation convention).
output_path = '/home/user/venvs/drAI+/drAIplus/models/doctorAI_py3/'

In [3]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
# The context managers guarantee both file handles are closed once loading is done.
with open(path+'visit_complete', 'rb') as visit_handle:
    # Later cells iterate this as patients -> visits -> codes.
    visit_file = pickle.load(visit_handle)
with open(path+'label_complete', 'rb') as label_handle:
    # Later cells iterate this as patients -> visits (each visit a collection of labels).
    label_file = pickle.load(label_handle)

### Frequency baselines. 

We compare our algorithms against simple baselines that are based on experts’ intuition about the dynamics of events in clinical settings. 

* The first baseline uses a patient’s medical codes in the last visit as the prediction for the current visit. This baseline is competitive when the status of a patient with a chronic condition stabilizes over time. 
* We enhanced this baseline using the top-k most frequent labels observed in visits prior to the current visit. In the experiments we observe that the baseline of top-k most frequent labels is quite competitive.

top-k recall = $\frac{\text{# of true positives in the top k predictions}}{\text{# of true positives}}$


Recall multilabel (https://stackoverflow.com/questions/9004172/precision-recall-for-multiclass-multilabel-classification):

The ratio of how many of the actual labels were predicted. The numerator counts how many labels the predicted vector has in common with the ground truth; dividing by the number of actual labels gives the fraction of the actual labels that were predicted.

In [4]:
def multi_label_recall(Y_true, Y_pred):
    """Return the mean per-patient multi-label recall.

    For each (truth, prediction) pair the recall is the number of distinct
    labels present in both, divided by the number of distinct true labels.

    Args:
        Y_true: iterable of label collections, one per patient (ground truth).
        Y_pred: iterable of label collections, one per patient (predictions).
            Paired with Y_true via zip, so extra entries on either side are ignored.

    Returns:
        The average of the per-patient recalls, computed with np.mean.

    Raises:
        ZeroDivisionError: if any ground-truth collection is empty.
    """
    per_patient = []
    for truth, guess in zip(Y_true, Y_pred):
        wanted = set(truth)
        # labels that had to be guessed and actually were
        hits = wanted & set(guess)
        per_patient.append(len(hits) * 1. / len(wanted))
    return np.mean(per_patient)

In [5]:
def recallTop(y_true, y_pred, rank=[10, 20, 30]):
    """Return the mean top-k recall for each cutoff in ``rank``.

    Args:
        y_true: sequence of true code collections, indexed in parallel with y_pred.
        y_pred: sequence of ranked predictions (best first), one per patient.
        rank: cutoffs k; only the first k predictions count for each cutoff.

    Returns:
        A list with one value per cutoff: the recall at that cutoff averaged
        over all patients in y_pred.

    Raises:
        ZeroDivisionError: if a patient's true code collection is empty.
        IndexError: if y_true has fewer entries than y_pred.
    """
    per_patient = []
    for idx, ranked in enumerate(y_pred):
        truth = set(y_true[idx])
        # one recall value per cutoff, using only the first k ranked predictions
        per_patient.append(
            [len(truth & set(ranked[:k])) * 1.0 / len(truth) for k in rank]
        )
    return np.array(per_patient).mean(axis=0).tolist()

#### Last visit is current visit

In [6]:
# Baseline: use the codes of the previous visit as the prediction for each
# patient's final visit.
y_true = []
y_pred = []

for patient in label_file:
    n_visits = len(patient)
    current_visit = patient[n_visits-1]
    # With a single visit, index n_visits-2 is -1 and wraps around to the current
    # visit itself, which would score a perfect recall. There is no history to
    # predict from in that case, so predict nothing. An entry is still appended
    # so y_true / y_pred stay aligned with label_file.
    last_visit = patient[n_visits-2] if n_visits >= 2 else []
    y_true.append(current_visit)
    y_pred.append(last_visit)

In [7]:
# Recall of the "last visit is current visit" baseline.
multi_label_recall(Y_true=y_true, Y_pred=y_pred)

0.44887523562649606

#### Most frequent

In [8]:
#top k most frequent labels in prior visits

In [9]:
# For every patient, rank the codes seen in all visits before the last one,
# most frequent first.
y_pred = []
for patient in label_file:
    # Count each code once up front instead of re-flattening and re-counting the
    # whole history inside the sort key for every distinct code.
    code_counts = Counter(a for b in patient[:-1] for a in b)
    # Sort by (count, code) descending: most frequent first, ties broken by the
    # larger code first.
    tops = sorted(code_counts, key=lambda y: (code_counts[y], y), reverse=True)
    y_pred.append(tops)

In [10]:
# Top-10 / top-20 / top-30 recall of the most-frequent-codes baseline.
recallTop(y_true, y_pred, rank=[10, 20, 30])

[0.3828614784432503, 0.4732631519257384, 0.4909308495553468]

### Random models
Let's see how doctorAI performs on random patients.

In [67]:
# Flatten the patients into a single list of visits, then display the largest
# code id that appears in any visit.
pippo = [visit for patient_visits in visit_file for visit in patient_visits]
max(code for visit in pippo for code in visit)

4879

In [55]:
# Goal: generate synthetic patients that are at least roughly plausible:
#  - a realistic number of visits per patient (sampled from the population with replacement)
#  - a realistic number of codes per visit

# observed number of visits per patient
n_patients_visit = [len(patient) for patient in visit_file]
# observed number of codes per visit, over every visit of every patient.
# The for-clauses must run outer-to-inner (patient first, then visit). With the
# clauses reversed, the comprehension iterated over whatever `patient` an earlier
# cell's loop had left behind, not over the visits in visit_file.
n_codes_in_visit = [len(visit) for patient in visit_file for visit in patient]

In [75]:
# Generate patients with a realistic number of visits and codes per visit, but
# with the codes themselves drawn uniformly at random from 0 .. N_CODES-1.

n_random_patients = 8000
N_CODES = 4880  # code ids 0..4879; 4879 is the largest id displayed earlier in the notebook
RANDOM_SEED = 0  # fixed so that re-running the notebook regenerates the same patients

# Dedicated generator: reproducible without touching the global `random` state.
fake_rng = random.Random(RANDOM_SEED)
# Built once; it does not change between visits.
code_vocabulary = np.arange(0, N_CODES)

random_patients = []
for i in range(n_random_patients):
    random_patient = []
    # draw a realistic number of visits from the observed distribution
    random_visit_len = fake_rng.choices(n_patients_visit, k=1)[0]
    random_visits = []
    for j in range(random_visit_len):
        # draw a realistic number of codes for this visit
        random_codes_len = fake_rng.choices(n_codes_in_visit, k=1)[0]
        # NOTE(review): sampling is with replacement, so a visit may repeat a code.
        random_visits.append(fake_rng.choices(code_vocabulary, k=random_codes_len))
    # NOTE(review): this wraps the visits in an extra list, so each patient is
    # [[visit, visit, ...]] — one nesting level deeper than the way visit_file is
    # iterated elsewhere in this notebook. Kept as-is; confirm the consumer of
    # the pickled file expects this shape.
    random_patient.append(random_visits)

    random_patients.append(random_patient)

In [82]:
# Persist the synthetic patients. The context manager flushes and closes the
# file, so the data is fully on disk before anything else reads it.
with open(output_path+'fake_visit.test', 'wb') as out_handle:
    pickle.dump(random_patients, out_handle, protocol=-1)