In [1]:
import sys
sys.path.append('..')

## Exemplo de uso da lib Extractor

In [2]:
from libs.extractor import Extractor as xtc

descricao = 'HYNALGIN DIPIRONA 500MG/ML INJETÁVEL 5ML C/100'
xtc.extract(descricao)

('HYNALGIN DIPIRONA', '500MG/ML', None, '5ML')

## Config

### Imports

In [3]:
import pandas as pd
import numpy as np
import re
import nltk
#nltk.download('stopwords')
#nltk.download('punkt')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

from libs.extractor import Extractor as xtc

### Stopwords

In [4]:
# loading new_stopwords
with open('custom_stopwords.txt') as f:
    data = f.read()
new_stopwords = data.split()

In [5]:
# updating stopwords with new_stopwords
stopwords = set(stopwords.words('portuguese') + list(punctuation))
print(len(stopwords))

custom_stopwords = stopwords.copy()
custom_stopwords.update(new_stopwords)
print(len(custom_stopwords))

236
372


### Functions

In [6]:
def clean_desc(words, desc):
    subs = [sub.strip() for sub in re.split('‹|–|::|-|\|', desc)]  # split in substrings
    new_desc = ''
    
    for sub in subs:
        for word in words:
            if word in sub.split():
                new_desc += ' {}'.format(sub)
                break
                
    return new_desc.strip()  # return without extra initial space


def get_words(text):
    return [w for w in word_tokenize(text) if w.lower() not in custom_stopwords]


def remove_blank(terms):
    return [re.sub('\s', '', t) if t is not None else t for t in terms]

In [7]:
# clean_desc Example
desc = 'GLICOSE 5% CX C/40 FRASCOS X 250ML'
deriv = 'SORO DE GLICOSE 5% - EUROFARMA | DENTAL CREMER PRODUTOS'

print(desc)
print(clean_desc(get_words(desc), deriv))

GLICOSE 5% CX C/40 FRASCOS X 250ML
SORO DE GLICOSE 5%


In [8]:
# Extractor example
produto = 'ESPIRONOLACTONA 100MG CX C/30 CP'
principio_ativo, concentracao, forma_farmaceutica, quantidade = xtc.extract(produto)

print(' ---- '.join([principio_ativo, concentracao, forma_farmaceutica, quantidade]))

ESPIRONOLACTONA ---- 100MG ---- COM ---- 30


### Path

In [9]:
data_path = '../datasets/medicamentos/augmented/'
data_file = 'medicamentos_aumentado.csv'

## Parse

### Removendo ";" que não representa o separador do CSV

In [10]:
def custom_replace(desc):
    start = desc.find(';') + 1
    end = desc.rfind(';')
    return desc[:start] + desc[start:end].replace(';', ',') + desc[end:]

src = '{}{}'.format(data_path, data_file)
target = '{}{}_mod.csv'.format(data_path, data_file[:-4])

buff = 300
content = ''

with open(src, 'r') as fs:
    with open(target, 'w') as ft:
        ft.write(fs.readline())  # header
        lines = fs.readlines(buff)
        while lines:
            for line in lines:
                content += custom_replace(line)
            ft.write(content)
            content = ''
            lines = fs.readlines(buff)

## Pré-processamento

#### Loading

In [11]:
# header => ['cod', 'descricao', 'ean']
data_file = 'medicamentos_aumentado_mod.csv'
src = '{}{}'.format(data_path, data_file)

df = pd.read_csv(src, dtype={0:int, 1:str, 2:str}, sep=';')
df.shape

(340946, 3)

#### Removendo substrings com base nas palavras da descrição original

#### Removendo registros com base nos termos principais

In [12]:
idxs = list()
removed = list()

# iterate over dataframe
for index, row in df.iterrows():
    if row['cod'] == 1:
        original_words = get_words(row['descricao'])  # get non stopwords
        _, conc, _, qtd = xtc.extract(row['descricao'])  # get principal terms
        conc, qtd = remove_blank([conc, qtd])
        master_idx = index
        continue
        
    new_desc = clean_desc(original_words, row['descricao'])  # remove irrelevant substrings
    if not new_desc:
        idxs.append(index)  # storage index for future drop
        removed.append([master_idx, index])
        continue
    
    _, new_conc, _, new_qtd = xtc.extract(new_desc)  # get principal terms
    new_conc, new_qtd = remove_blank([new_conc, new_qtd])
    
    if conc == new_conc and qtd == new_qtd:
        df.at[index,'descricao'] = new_desc  # replace descricao with cleaned new_desc
    else:
        idxs.append(index)  # storage index for future drop
        removed.append([master_idx, index])

### Removed

In [13]:
df_removed = pd.DataFrame(removed, columns=['master_idx', 'removed_idx'])
df_removed.head()

Unnamed: 0,master_idx,removed_idx
0,0,1
1,0,3
2,0,4
3,28,29
4,28,30


In [14]:
df_grouped = df_removed.groupby('master_idx')['removed_idx'].apply(list).reset_index()
df_grouped.head()

Unnamed: 0,master_idx,removed_idx
0,0,"[1, 3, 4]"
1,28,"[29, 30, 31]"
2,32,"[33, 34, 35, 36, 37, 38]"
3,39,"[40, 41, 42, 43, 44, 45, 46, 47, 48, 49]"
4,59,"[60, 62, 63, 64, 65, 66]"


In [15]:
pd.set_option('display.max_colwidth', -1)

master, removed = df_grouped.loc[4000].values
indexes = [master] + removed
df.loc[indexes][['cod', 'descricao']]

  pd.set_option('display.max_colwidth', -1)


Unnamed: 0,cod,descricao
31969,1,ISOSSORBIDA 5MG - COMPLT : 0L2883 VL: 31-03-2020
31970,3,ISORDIL SL 5MG CAIXA COM 30 COMPRIMIDOS SUBLINGUAIS | FARMÁCIA
31971,3,ISORDIL SUBLINGUAL 5MG 30 COMPRIMIDOS - ULTRAFARMA
31972,3,ISOSSORBIDA 5MG C/30 COMPRIMIDOS (ISORDIL) - EMS | HOSPITALAR
31973,3,MONOCORDIL 5MG 30 CP SUBLING - PANVEL FARMÁCIAS


In [16]:
# DEBUG

descricao = 'CLORIDRATO DE MEMANTINA 10MG C\60 C1'
derivado = 'CLORIDRATO DE MEMANTINA 10MG 60 COMPRIMIDOS EUROFARMA | DROGA'

original_words = get_words(descricao)  # get non stopwords
_, conc, _, qtd = xtc.extract(descricao)  # get principal terms
conc, qtd = remove_blank([conc, qtd])
print(original_words)
print(conc, qtd, '\n')

new_desc = clean_desc(original_words, derivado)  # remove irrelevant substrings
_, new_conc, _, new_qtd = xtc.extract(new_desc)  # get principal terms
new_conc, new_qtd = remove_blank([new_conc, new_qtd])
print(new_desc)
print(new_conc, new_qtd, '\n')
    
if conc == new_conc and qtd == new_qtd:
    print('igual')

['CLORIDRATO', 'MEMANTINA', '10MG', 'C0', 'C1']
10MG None 

CLORIDRATO DE MEMANTINA 10MG 60 COMPRIMIDOS EUROFARMA
10MG 60 



### Removing

In [17]:
df.drop(idxs, inplace=True)
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)
df.shape

(124210, 3)

#### Removendo registros  com os termos
* BULA
* PREÇO
* BARATO

In [18]:
cond1 = df['cod'] == 1
# or (
cond2 = df['cod'] == 3
# and
cond3 = ~df['descricao'].str.contains('BULA|PREÇO|BARATO', regex=True)
# )

df = df[cond1 | (cond2 & cond3)]
df.shape

(120233, 3)

#### Removendo os termos <font color='red'>(devem ser removidos nesta ordem)</font>
* EM MERCADO LIVRE BRASIL
* EM MERCADO LIVRE
* NO MERCADO LIVRE BRASIL
* NO MERCADO LIVRE
* COMPRAR EM ILHA DENTAL
* COMPRAR EM AGROFORTE
* COMPRAR EM FARMA PRATA
* ONDE COMPRAR (apenas no início do registro)
* COMPRAR
* ENCONTRE (apenas no início do registro)

#### Após as remoções acima, remover os registros com os termos
* ENCONTRE

In [19]:
pattern1 = r'(?i)(em|no)?\s+mercado\s+livre\s*(brasil)?'
pattern2 = r'(?i)comprar em (ilha dental|agroforte|farma prata)'
pattern3 = r'(?i)(onde)?\s*comprar'
pattern4 = r'(?i)^encontre'

replaces = {pattern1: '', 
            pattern2: '', 
            pattern3: '', 
            pattern4: '',}

df.replace(replaces, regex=True, inplace=True)

In [20]:
cond1 = df['cod'] == 1
cond2 = ~df['descricao'].str.contains('ENCONTRE')
df = df[cond1 | cond2]
df.shape

(120224, 3)

In [21]:
pattern = r'(?i)mercado livre|comprar|encontre'
df[df['descricao'].str.contains(pattern)]

Unnamed: 0,cod,descricao,ean


In [22]:
pattern = r'(?i)oferta'
df[df['descricao'].str.contains(pattern)].shape

(1399, 3)

In [23]:
df.head()

Unnamed: 0,cod,descricao,ean
0,1,GEL P/ ECG 100GR,7898107240268
2,3,GEL ECG (ELETROCARDIOGRAMA) FRASCO 100G,7898107240268
5,3,GEL PARA ECG AZUL 100 GRAMAS CARBOGEL GEL CONDUTOR,7898107240268
6,3,GEL PARA ECG FRASCO 100GR GEL ECG CARBOGEL,7898107240268
7,3,GEL PARA ELETROCARDIOGRAMA (ECG) 100G,7898107240268


In [24]:
# after removing, strip again
df['descricao'] = df['descricao'].str.strip()

In [25]:
start = 5056
end = start+20
cod = df['cod'].tolist()[start:end]
desc = df['descricao'].tolist()[start:end]
lista = list(zip(cod,desc))
for cod, desc in lista:
    if cod == 3:
        tab = '\t'
        br = ''
    else:
        tab = ''
        br = '\n'
        original_words = [w for w in desc if len(w) > 2]
    print('{}{}{}'.format(br,tab,desc))


GARDENAL 50MG C/20CP##(6928)
	GARDENAL 50MG, CAIXA COM 20 COMPRIMIDOS
	GARDENAL 50MG 20 COMPRIMIDOS (C1)
	GARDENAL 50MG 20 COMPRIMIDOS SANOFI AVENTIS
	GARDENAL 50MG C/ 20 COMPRIMIDO
	GARDENAL 50MG 20CP

CLOR DE NORTRIPTILINA 25MG C/30CP (04775)

ESOMEPRAZOL 40MG C/28CP (15049)
	ESOMEPRAZOL 40MG 28 COMPRIMIDOS
	ESOMEPRAZOL 40MG 28 CP EMS GENÉRICO

ETNA 2.5MG C/20CP (2607)

TARTAR DE METOPROLOL 100MG C/30CP (01818)
	SUCCINATO DE METOPROLOL ASTRAZENECA 100MG 30 COMPRIMIDOS

OXCARBAZEPINA 600MG C/30CP(05367)
	OXCARBAZEPINA 600MG COM 30 COMPRIMIDOS GENÉRICO
	OXCARBAZEPINA 600MG 30 COMPRIMIDOS MEDLEY
	OXCARBAZEPINA 600MG 30 COMPRIMIDOS REVESTIDOS MEDLEY
	OXCARBAZEPINA 600MG 30 CP (C1)
	OXCARBAZEPINA RANBAXY 600MG, CAIXA COM 30 COMPRIMIDOS

OXCARBAZEPINA 300MG C/30CP(4132)


### Gravando em arquivo

In [26]:
data_file = 'medicamentos_aumentado_preproc.csv'
df.to_csv('{}{}'.format(data_path, data_file),
          sep=';',
          header=df.columns,
          index=False,
          encoding='utf-8')