# Loading and Processing the CMED data

In [11]:
import etl_functions as etl

# Load the CMED price table from CSV; preprocess=True makes the loader run its
# preprocessing pass (it prints "Preprocessing CMED" while doing so).
df_cmed = etl.load_cmed(
    path="data/cmed/cmed_clean_03_2024.csv",
    preprocess=True,
)

Preprocessing CMED


29866it [00:14, 2103.97it/s]


In [12]:
# Display the CMED table returned by etl.load_cmed.
df_cmed

Unnamed: 0,principio_ativo,codigo ggrem,registro,ean_1,ean_2,ean_3,produto,apresentacao
0,dexametasona clotrimazol,538912020009303,1705600230032,7891106000956,-,-,BAYCUTEN N,10 mg g 0 443 mg g crem derm ct bg al x 40 g
1,abatacepte,505107701157215,1018003900019,7896016806469,-,-,ORENCIA,250 mg po liof sol inj ct 1 fa ser descartavel
2,abatacepte,505113030019605,1018003900027,7896016807442,-,-,ORENCIA,125 mg ml sol inj sc ct ser preenc
3,abatacepte,505113100020505,1018003900078,7896016808197,-,-,ORENCIA,125 mg ml sol inj sc ct 4 ser preenc vd trans ...
4,abciximabe,514517110034217,1123634150015,7896212452453,-,-,REOPRO,2 mg ml sol inj ct fa vd inc x 5 ml
...,...,...,...,...,...,...,...,...
29861,magnesio simeticona al,508011804138416,1004306960107,7891317469610,7891317020118,-,SIMECO PLUS,120 mg ml 60 mg ml 7 mg ml sus or ct fr vd amb...
29862,zinco,533507401164427,1039201400023,7898049793914,-,-,VITAGLÓS,5500 ui g 990 ui g 150 mg g pom ct bg al x 45 g
29863,zinco,533507402160425,1039201400031,7898049791309,-,-,VITAGLÓS,5500 ui g 990 ui g 150 mg g pom cx 50 bg al x ...
29864,zinco retinol colecalciferol,528526201160419,1256801730019,7898148295500,-,-,PRATIGLÓS,5000 ui g 900 ui g 150 mg g pom cx 50 bg al x ...


In [13]:
# Build the grouped CMED table from df_cmed via etl.grouped_cmed
# (the grouping logic lives in etl_functions and is not shown in this notebook).
grouped_cmed = etl.grouped_cmed(df_cmed)

Creation of the grouped-cmed DataFrame


100%|██████████| 2140/2140 [00:04<00:00, 527.17it/s]


Dealing with duplicated pharmaceutical ingredients


100%|██████████| 57/57 [00:00<00:00, 2323.56it/s]


In [14]:
import ir_med

# Extract two word vocabularies from the CMED table; both are passed to
# ir_med.sep_desc further down.
# NOTE(review): "ai" presumably means active-ingredient words and "pr"
# presentation words -- confirm against ir_med.extract_cmed_words.
cmed_ai_words, cmed_pr_words = ir_med.extract_cmed_words(df_cmed)

29866it [00:01, 28996.68it/s]


In [15]:
# Display the grouped CMED table built by etl.grouped_cmed.
grouped_cmed

Unnamed: 0,key,key_sorted,indexes
0,abacavir,abacavir,"[27317, 27318]"
1,abatacepte,abatacepte,"[1, 2, 3]"
2,abciximabe,abciximabe,"[4, 5]"
3,abemaciclibe,abemaciclibe,"[6, 7, 8, 9, 10, 11, 12, 13]"
4,abiraterona,abiraterona,"[121, 122, 123, 124, 125, 126, 127, 128, 129, ..."
...,...,...,...
2067,zoledronico,zoledronico,"[29700, 29701, 29702, 29703, 29704, 29705, 297..."
2068,zolmitriptana,zolmitriptana,"[29378, 29379]"
2069,zolpidem,zolpidem,"[16912, 16913, 16914, 16915, 16916, 16917, 169..."
2070,zopiclona,zopiclona,[29380]


In [16]:
# Inspect the first vocabulary returned by ir_med.extract_cmed_words.
cmed_ai_words

array(['abacavir', 'abatacepte', 'abciximabe', ..., 'zopiclona', 'zoster',
       'zuclopentixol'], dtype='<U30')

In [17]:
# Inspect the second vocabulary returned by ir_med.extract_cmed_words.
cmed_pr_words

array(['0', '00', '000', ..., 'xamp', 'xpe', 'xpect'], dtype='<U16')

# Load the list of medicines extracted from a public notice

In [18]:
# Load the item list of the public notice from CSV and preprocess it.
# NOTE(review): desc_column / und_column are presumably the positions of the
# description and unit columns in the file -- confirm in etl_functions.load_notice.
df_notice = etl.load_notice(
    'data/notices/1099_2023_25.csv',
    drop_columns=['item', 'valor_total'],
    desc_column=0,
    und_column=1,
    sep=',',
    preprocess=True,
)

Pré-processamento do edital


199it [00:00, 2213.90it/s]


In [19]:
# Display the notice table as loaded, before the matching loop fills 'cmed_indexes'.
df_notice

Unnamed: 0,desc,und,quant,valor_unit,original_desc,cmed_indexes
0,aciclovir 200 mg,com 200 mg,145020,0.28,Aciclovir (200mg),
1,aciclovir 50 mg g 5,crem derm bg 10 g,24996,3.11,Aciclovir 50 mg/g(5%),
2,acido acetilsalicilico 100 mg,com,6000000,0.05,Ácido Acetilsalicílico100 mg,
3,acido folico 15 mg,com,3000,2.09,Ácido folínico 15 mg,
4,acido valproico 50 mg ml,xpe fr 100 ml,12000,5.45,Ácido valpróico 50mg/mL,
...,...,...,...,...,...,...
194,valproato de sodio 250 mg,com,120000,0.52,Valproato de Sódio250 mg,
195,valproato de sodio 500 mg,com,1200000,1.52,Valproato de Sódio500 mg,
196,valproato de sodio 50 mg ml,xpe,12000,3.52,Valproato de Sódio50 mg/ml,
197,varfarina 5 mg,com,24000,0.13,Varfarina 5 mg,


# Ir-med execution

In [20]:
from tqdm import tqdm

# When True, every entry of the metadata dict returned by ir_med.predict is
# also written into df_notice (one column per metadata key).
SAVE_PROCESS_METADATA = False

# iterrows() is a generator with no length, so pass total= explicitly to let
# tqdm show a percentage and an ETA instead of a bare iteration counter.
for idx_X, row_X in tqdm(df_notice.iterrows(), total=len(df_notice)):

    # Split the notice description into its two parts using the CMED vocabularies.
    desc_ai, desc_pr = ir_med.sep_desc(row_X['desc'], cmed_ai_words, cmed_pr_words)

    # Classification step: returns the matched CMED row indexes for this item
    # plus a dict of process metadata.
    matched_indexes, process_metadata = ir_med.predict(
        df_cmed, grouped_cmed, desc_ai, desc_pr, row_X['und']
    )
    df_notice.at[idx_X, 'cmed_indexes'] = matched_indexes

    if SAVE_PROCESS_METADATA:
        for key, value in process_metadata.items():
            df_notice.at[idx_X, key] = value

0it [00:00, ?it/s]

199it [00:10, 18.21it/s]


In [21]:
# Display the notice table again, now that the loop above has written 'cmed_indexes'.
df_notice

Unnamed: 0,desc,und,quant,valor_unit,original_desc,cmed_indexes
0,aciclovir 200 mg,com 200 mg,145020,0.28,Aciclovir (200mg),"[532, 533, 535, 537, 540, 542, 544, 547, 549, ..."
1,aciclovir 50 mg g 5,crem derm bg 10 g,24996,3.11,Aciclovir 50 mg/g(5%),"[534, 539, 548, 550, 556, 561, 563, 565, 566, ..."
2,acido acetilsalicilico 100 mg,com,6000000,0.05,Ácido Acetilsalicílico100 mg,"[665, 666, 667, 668, 669, 670, 671, 672, 29384..."
3,acido folico 15 mg,com,3000,2.09,Ácido folínico 15 mg,[29551]
4,acido valproico 50 mg ml,xpe fr 100 ml,12000,5.45,Ácido valpróico 50mg/mL,[29691]
...,...,...,...,...,...,...
194,valproato de sodio 250 mg,com,120000,0.52,Valproato de Sódio250 mg,"[13411, 13412, 13415, 13418, 13419, 13420, 134..."
195,valproato de sodio 500 mg,com,1200000,1.52,Valproato de Sódio500 mg,"[13413, 13414, 13416, 13417, 13422, 13423, 134..."
196,valproato de sodio 50 mg ml,xpe,12000,3.52,Valproato de Sódio50 mg/ml,"[28973, 28974, 28977, 28980, 28981, 28983, 289..."
197,varfarina 5 mg,com,24000,0.13,Varfarina 5 mg,"[29214, 29215, 29216, 29217, 29218, 29219, 292..."


# Verification of results

## Match PR

In [22]:
# NOTE(review): `df_X` is not defined by any cell in this notebook, so this
# cell raises NameError on a fresh kernel (see the recorded output). The cells
# below expect it to carry the columns 'desc', 'und', 'medicamentos',
# 'string_pr', 'pr_encontrado' and 'string_apr'; it presumably came from an
# earlier version of the matching step -- define it explicitly before this
# section (TODO confirm its source).
df_X

NameError: name 'df_X' is not defined

In [None]:
from nltk.tokenize import word_tokenize

def _contains_all_tokens(needles, haystack):
    """Return True when every word token of `needles` is a word token of `haystack`.

    Both arguments are plain strings. Membership is tested against the token
    set of `haystack`, not against the raw string, so a token only counts when
    it appears as a whole word ("ferro" does not match inside "ferroso").
    """
    haystack_tokens = set(word_tokenize(haystack))
    return all(tok in haystack_tokens for tok in word_tokenize(needles))


right_pr = []

for _, row in tqdm(df_X.iterrows(), total=len(df_X)):

    # Every word of 'pr_encontrado' must appear in 'desc'.
    found_in_desc = _contains_all_tokens(row['pr_encontrado'], row['desc'])

    # Every word of 'string_pr' must appear in 'pr_encontrado'.
    query_in_found = _contains_all_tokens(row['string_pr'], row['pr_encontrado'])

    # Both directions have to hold for the match to count as correct.
    right_pr.append(found_in_desc and query_in_found)

77it [00:00, 2589.72it/s]


In [None]:
# Threshold: how many tokens of 'string_apr' + 'und' may be absent from the
# tokens shared by all matched CMED presentations before the item is rejected.
MAX_MISSING_TOKENS = 1

right_apr = []     # per-item verdict for the presentation match
coms_toks = []     # per-item common tokens, joined with spaces
coms_string = []   # per-item matched presentations, one per line

for _, row in tqdm(df_X.iterrows(), total=len(df_X)):

    # Presentation strings of every CMED row matched for this item
    # ('medicamentos' holds positional row indexes into df_cmed).
    presentations = df_cmed.iloc[row['medicamentos']]['apresentacao'].tolist()

    ### Collect the tokens common to ALL matched presentations.
    # Start from the first presentation and keep only the tokens that also
    # occur in each subsequent one; an item with no matches has no common tokens.
    common_tokens = word_tokenize(presentations[0]) if presentations else []
    for presentation in presentations[1:]:
        presentation_tokens = set(word_tokenize(presentation))
        common_tokens = [tok for tok in common_tokens if tok in presentation_tokens]

    ### Count the tokens of 'string_apr' and 'und' that are NOT shared by
    ### every matched CMED entry.
    query_tokens = word_tokenize(row['string_apr']) + word_tokenize(row['und'])
    missing = sum(tok not in common_tokens for tok in query_tokens)

    # Tolerate up to MAX_MISSING_TOKENS misses.
    right_apr.append(missing <= MAX_MISSING_TOKENS)
    coms_toks.append(" ".join(common_tokens))
    coms_string.append("\n".join(presentations))


0it [00:00, ?it/s]

77it [00:00, 379.06it/s]


In [None]:
# Attach the per-item verification results to df_X, one new column per list
# (dict order fixes the resulting column order).
verification_columns = {
    'common_tokens_apr': coms_toks,
    'common_tokens_apr_string': coms_string,
    'right_pr': right_pr,
    'right_apr': right_apr,
}
for column_name, column_values in verification_columns.items():
    df_X[column_name] = column_values

In [None]:
# Display df_X including the verification columns assigned above.
df_X

Unnamed: 0,desc,und,quant,valor_unit,desc_orig,medicamentos,quant_matched,quant_grupo,perc_red_conj,string_pr,pr_encontrado,string_apr,common_tokens_apr,common_tokens_apr_string,right_pr,right_apr
0,acido ascorbico concentracao dosagem 200 mg ml...,fr 20 ml,198500,"R$ 1,34",ÁCIDO ASCÓRBICO concentração/dosagem 200 mg/mL...,"[29442, 29444, 29452, 29462, 29468, 29471, 294...",8.0,63.0,0.873016,ascorbico,ascorbico,200 mg ml sol or got,200 mg ml sol or fr got x 20 ml,200 mg ml sol or ct fr plas got x 20 ml\n200 m...,True,True
1,acido folico concentracao dosagem 0 2 mg ml fo...,fr 30 ml,213400,"R$ 4,30","ÁCIDO FÓLICO concentração/dosagem 0,2 mg/mL,...","[29550, 29553]",2.0,33.0,0.939394,folico,folico,0 2 mg ml sol or,0 2 mg ml sol or fr plas amb x 30 ml,0 2 mg ml sol or ct fr got plas amb x 30 ml\n0...,True,True
2,agua para injecao,amp 10 ml,2311400,"R$ 0,28",ÁGUA PARA INJEÇÃO,"[29752, 29826]",2.0,39.0,0.948718,agua injecao,agua injecao,,sol inj cx amp x 10 ml,sol inj cx 100 amp poliet x 10 ml\nsol inj cx ...,True,True
3,agu hipodermica material aco inoxidavel silico...,cx com 100 unidades,135900,"R$ 9,44","AGULHA HIPODÉRMICA, MATERIAL:AÇO INOXIDÁVEL SI...","[2178, 2179, 2182, 2183, 2184]",5.0,9.0,0.444444,x,axetil cefuroxima,agu hipodermica aco 21 g x 1 tipo conector em ...,mg com rev ct bl al al x,250 mg com rev ct bl al al x 10\n250 mg com re...,False,False
4,alcool etilico hidratado 70 deggl liq,fr com 1000 ml,200654,"R$ 5,39","ÁLCOOL ETÍLICO, HIDRATADO, 70%(70°GL), LÍQUIDO",[29831],1.0,1.0,0.000000,alcool,alcool polivinilico fenilefrina,70 liq,1 2 mg ml 14 0 mg ml sol oft ct fr plas opc go...,1 2 mg ml 14 0 mg ml sol oft ct fr plas opc go...,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,sulfato ferroso concentracao dosagem 40 mg de ...,com,6586400,"R$ 0,03",SULFATO FERROSO concentração/dosagem 40 mg ...,"[27751, 27753, 27754, 27757, 27758, 27767, 27768]",7.0,23.0,0.695652,ferroso ferro ii,ferroso,40 mg ii com or,40 mg com rev ct bl al plas x,40 mg com rev ct bl al plas pvc trans x 50\n40...,False,False
73,sulfato ferroso concentracao dosagem 5 mg ml f...,fr 100 ml,86420,"R$ 2,68","SULFATO FERROSO concentração/dosagem 5 mg/mL, ...","[27744, 27749, 27762, 27763, 27765, 27766]",6.0,23.0,0.739130,ferroso,ferroso,5 mg ml xpe or,mg ml xpe fr amb x 100 ml,10 mg ml xpe ct fr vd amb x 100 ml\n25 mg ml x...,True,False
74,timolol maleato concentracao dosagem 5 mg ml f...,fr 5 ml,23610,"R$ 2,83",TIMOLOL – MALEATO concentração/dosagem 5mg/mL...,"[20365, 20419, 20420, 20421, 20422, 20423, 204...",18.0,25.0,0.280000,timolol,timolol,5 mg ml,5 mg ml oft fr got plas x 5 ml,2 5 mg ml sol oft ct fr got plas opc x 5 ml\n5...,True,True
75,valproato de sodio ou acido valproico concentr...,cap ou com,2472100,"R$ 0,32",VALPROATO DE SÓDIO ou ÁCIDO VALPRÓICO conce...,"[29698, 29699]",2.0,8.0,0.750000,sodio valproico,valproico sodio,ou 288 mg a 250 cap com or,250 mg cap mole ct fr vd amb x,250 mg cap mole ct fr vd amb x 25\n250 mg cap ...,True,False


In [None]:
import pandas as pd

# An item is a hit only when both the 'right_pr' and 'right_apr' checks passed.
# dtype=bool keeps the element-wise AND well-defined even for empty lists.
index = (pd.Series(right_pr, dtype=bool) & pd.Series(right_apr, dtype=bool)).tolist()

In [None]:
# Store the combined right_pr AND right_apr flag as the 'Hit' column.
df_X['Hit'] = index

In [None]:
from pathlib import Path

# Columns of the exported report, in output order.
report_columns = [
    'Hit', 'quant_matched', 'quant_grupo', 'perc_red_conj',
    'desc_orig', 'desc',
    'string_pr', 'pr_encontrado', 'right_pr',
    'und', 'string_apr', 'common_tokens_apr', 'common_tokens_apr_string', 'right_apr',
]

output_path = Path('results/resultados_classificador.csv')
# to_csv does not create missing directories; make sure the target folder exists.
output_path.parent.mkdir(parents=True, exist_ok=True)

# Selecting report_columns already leaves 'medicamentos' out, so no explicit
# drop is needed. Semicolon separator and decimal comma match the original export.
df_X[report_columns].to_csv(output_path, sep=';', decimal=',', index=False)