# Protocol adherence by red flag detection

*Notebook with the Python code to set up and run the experiment.*

***Setup***

Python packages:

In [2]:
import pickle
import random
import time

import spacy
import nltk
import Levenshtein

import numpy as np
import pandas as pd

from nltk.stem import RSLPStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

from imblearn.over_sampling import SMOTE, ADASYN

import torch

from statisticalframework import *

NLTK and spaCy resources:

First, run this in a *terminal* to install the spaCy Portuguese model:
```
python -m spacy download pt_core_news_lg
```

In [3]:
# Download every NLTK resource this cell depends on. nltk.download() is a
# no-op when the package is already up to date, so re-running is cheap.
nltk.download('punkt')
nltk.download('rslp')
# The stop-word corpus must be fetched explicitly; without it the lookup
# below raises LookupError on a machine that has never downloaded it.
nltk.download('stopwords')

# Portuguese stop-word list.
stopwords = nltk.corpus.stopwords.words("portuguese")

[nltk_data] Downloading package punkt to /Users/drt67700/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package rslp to /Users/drt67700/nltk_data...
[nltk_data]   Package rslp is already up-to-date!


**Instantiation of the analytical pipeline**

In [4]:
# Constructor arguments for the analytical pipeline, kept in one place so
# the parameter spreadsheet and the rule parameter are easy to spot.
statframe_config = {
    'parameters_filepath': './data/parametros_redflag_com_covid19.xlsx',
    'rule_parameter': 'REDFLAG',
}
statframe = StatisticalFramework(**statframe_config)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
No sentence-transformers model found with name /Users/drt67700/.cache/torch/sentence_transformers/neuralmind_bert-large-portuguese-cased. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at /Users/drt67700/.cache/torch/sentence_transformers/neuralmind_bert-large-portuguese-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.

**Data**

Main dataset:

In [5]:
# TODO: if possible, refresh the data (larger and more recent extract)

dataset_path = 'data/dataset_clean_20221104.csv'
df = pd.read_csv(dataset_path)

In [6]:
# (rows, columns) of the dataset as loaded, before any filtering
df.shape

(14051, 16)

Excluding rows whose `clinical_impression` is too short:

In [9]:
min_words = 6  # word-count threshold for the "clinical_impression" field; the filter below keeps rows with strictly MORE words than this

In [10]:
# Boolean mask: True where the whitespace-split impression has more than
# `min_words` tokens.
word_counts = df['clinical_impression'].map(lambda text: len(text.split()))
idx = word_counts > min_words

In [11]:
# Keep only the rows selected by the mask, then renumber them from 0.
kept_rows = df.loc[idx]
df = kept_rows.reset_index(drop=True)

In [12]:
# Expose the current row position as an explicit `index` column.
# Guarded so the cell is idempotent: the unguarded in-place call inserted a
# `level_0` column on a second run and raised ValueError on a third.
# NOTE(review): assumes the loaded CSV has no column already named `index`.
if 'index' not in df.columns:
    df = df.reset_index()

In [13]:
# dataset size after the short-text filter
df.shape  # recorded run: (10174, 17) - the 16 loaded columns plus the `index` column created by reset_index()

(10174, 17)

Examples and counterexamples:

In [11]:
# Examples: rows whose lower-cased CID description contains "dengue".
is_dengue = df['CID_description'].str.lower().str.contains("dengue")
df_examples = df[is_dengue]

In [12]:
# TODO: write an "official" procedure for building the counterexamples

# Counterexamples: every row whose lower-cased CID description does NOT
# contain "dengue".
not_dengue = ~df['CID_description'].str.lower().str.contains("dengue")
df_counterexamples = df[not_dengue]

In [13]:
# (rows, columns) of the examples and of the counterexamples
df_examples.shape, df_counterexamples.shape

((318, 17), (9856, 17))

In [14]:
# run only if necessary

# df_examples[['CID', 'clinical_impression']].to_excel('df_examples_20230503.xlsx')

In [15]:
# run only if necessary

# df_counterexamples[['CID', 'clinical_impression']].to_excel('df_counterexamples_20230503.xlsx')

Additional (associated) red flag data (for the `examples` dataset):

*a) inspection*

In [16]:
text_flag = "com sinais de alarme"

# Show every lower-cased example impression that contains the red-flag
# phrase (plain substring test, not a regex).
impressions_lower = df_examples['clinical_impression'].str.lower()
impressions_lower[impressions_lower.map(lambda text: text_flag in text)].tolist()

['com sinais de alarme. = suspeita de dengue ?? / chicungunha ?? ',
 'com sinais de alarme - suspeita de dengue.',
 'com sinais de alarme - prostração (já excluído diagnóstico de covid por meio de pcr negativo?)',
 'com sinais de alarme: suspeita de dengue com febre persistente há > 10 dias e alteração no hepatograma',
 'com sinais de alarme - investigação diagnóstica  para diferencial dengue x covid',
 'com sinais de alarme - investigação de suspeita de dengue',
 'com sinais de alarme - piora laboratorial + manutenção dos sintomas',
 'com sinais de alarme - suspeita de dengue e necessidade de investigação diagnóstica.',
 'com sinais de alarme - dengue com sintomas que limitam a avaliação por telemedicina ',
 'com sinais de alarme (hipotensão postural) -> dengue grupo c',
 'suspeita de dengue com sinais de alarme (hipotensão postural e petéquias)',
 'com sinais de alarme = plaquetopenia (69.000) + portadora de comorbidades (dm)',
 'com sinais de alarme: d6, com prostração e sonolência 

*b) loading:*

In [19]:
# this data was lost, so the load below is disabled

# df_associated = pd.read_csv('./data/associated_redflag_data.csv')

Making it available for algorithm calibration:

In [18]:
# this data was lost, so the calibration step below is disabled

# statframe.add_data(
#     contents = df_associated.terms.str.lower().to_list(),
#     protocol = 'DENGUE'
# )

*c) removing data to be used in algorithm calibration:*

In [29]:
# Drop the examples whose lower-cased impression contains the red-flag
# phrase. A boolean mask with a literal match (regex=False) replaces the
# f-string-built query expression, which would break or change meaning if
# `text_flag` ever contained a double quote or a regex metacharacter.
has_flag = (
    df_examples['clinical_impression']
    .str.lower()
    .str.contains(text_flag, regex=False)
)
df_examples = df_examples[~has_flag]

Experiment dataset:

In [30]:
# Experiment set: the examples stacked on top of the counterexamples,
# restricted to the two columns the scorers read, with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which emits a FutureWarning and was
# removed in pandas 2.0.
experiment_columns = ['CID', 'clinical_impression']
df_experiment = pd.concat(
    [df_examples[experiment_columns], df_counterexamples[experiment_columns]],
    ignore_index=True,
)

  df_experiment = df_examples[['CID', 'clinical_impression']].append(


In [31]:
# Preview the combined experiment frame
df_experiment

Unnamed: 0,CID,clinical_impression
0,A90,Paciente com quadro importante de dor no corpo...
1,A90,Paciente com quadro suspeito de dengue. Necess...
2,A90,Quadro clinico sugestivo de arbovirose (Dengue...
3,A90,Dengue? Chikungunya? Paciente refere quadro de...
4,A90,Quadro clínico sugestivo de arbovirose: Dengue...
...,...,...
10117,R51,Paciente com crise de enxaqueca sem melhora co...
10118,N30,sinal de alerta: indicação de exame complementar
10119,H00,Hórdeolo sem melhora com compressa. Necessidad...
10120,J06,Necessita de avaliação presencial para exame f...


**Experiment**

In [32]:
# Accumulates one (algorithm name, elapsed time) tuple per scorer run.
algorithms_time = []

*a) `simple_scorer` algorithm:*

In [33]:
# Time the `simple_scorer` pass (method=0) over every experiment row.
# perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()

df_experiment['simple_scorer'] = df_experiment.apply(
    lambda row: statframe.get_adherence(
        x=row['clinical_impression'],
        cid=row['CID'],
        method=0,
    ),
    axis=1,  # call the lambda once per row
)

end_time = time.perf_counter()

In [34]:
# Elapsed time, in seconds, of the simple_scorer run timed in the previous cell.
simplescorer_time = end_time - start_time

In [35]:
# Record the timing. Re-running this cell appends a duplicate entry.
algorithms_time.append(('simple_scorer', simplescorer_time))

*b) `levenshtein_scorer` algorithm:*

In [36]:
# Time the `levenshtein_scorer` pass (method=1) over every experiment row.
# perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()

df_experiment['levenshtein_scorer'] = df_experiment.apply(
    lambda row: statframe.get_adherence(
        x=row['clinical_impression'],
        cid=row['CID'],
        method=1,
    ),
    axis=1,  # call the lambda once per row
)

end_time = time.perf_counter()

In [37]:
# Elapsed time, in seconds, of the levenshtein_scorer run timed in the previous cell.
levenshteinscorer_time = end_time - start_time

In [38]:
# Record the timing. Re-running this cell appends a duplicate entry.
algorithms_time.append(('levenshtein_scorer', levenshteinscorer_time))

*c) `jaccard_scorer` algorithm:*

In [39]:
# Time the `jaccard_scorer` pass (method=2) over every experiment row.
# perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()

df_experiment['jaccard_scorer'] = df_experiment.apply(
    lambda row: statframe.get_adherence(
        x=row['clinical_impression'],
        cid=row['CID'],
        method=2,
    ),
    axis=1,  # call the lambda once per row
)

end_time = time.perf_counter()

In [40]:
# Elapsed time, in seconds, of the jaccard_scorer run timed in the previous cell.
jaccardscorer_time = end_time - start_time

In [41]:
# Record the timing. Re-running this cell appends a duplicate entry.
algorithms_time.append(('jaccard_scorer', jaccardscorer_time))

*d) `bow_scorer` algorithm:*

In [42]:
# Time the `bow_scorer` pass (method=3) over every experiment row.
# perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()

df_experiment['bow_scorer'] = df_experiment.apply(
    lambda row: statframe.get_adherence(
        x=row['clinical_impression'],
        cid=row['CID'],
        method=3,
    ),
    axis=1,  # call the lambda once per row
)

end_time = time.perf_counter()

In [43]:
# Elapsed time, in seconds, of the bow_scorer run timed in the previous cell.
bowscorer_time = end_time - start_time

In [44]:
# Record the timing. Re-running this cell appends a duplicate entry.
algorithms_time.append(('bow_scorer', bowscorer_time))

*e) `bow_scorer` algorithm:*

In [45]:
# TODO

*f) `sentence_transformer_scorer` algorithm:*

In [46]:
# TODO

**Persistence of experiment results**

In [47]:
# Persist the experiment rows together with every scorer column.
# With the default index=True the row index is written as an unnamed first column.
# NOTE: the ./results directory must already exist; to_csv does not create it.
df_experiment.to_csv('./results/df_experiment_with_results_20231024.csv')

In [48]:
# Persist the collected (algorithm name, elapsed time) tuples next to the
# scores. `pickle` is imported in the notebook's top import cell.
with open('./results/algorithms_time_20231024.pkl', 'wb') as handler:
    pickle.dump(algorithms_time, handler)