In [1]:
import sys
sys.path.append('..')

## Exemplo de uso da lib Extractor

In [2]:
from libs.extractor import Extractor as xtc

# Sample description. extract() returns a 4-tuple (see the output below); a later
# cell of this notebook unpacks it as
# (principio_ativo, concentracao, forma_farmaceutica, quantidade).
descricao = 'OXALIPLATINA 50 MG PÓ LIOFILIZADO FR/AMP X 500 MG'
xtc.extract(descricao)

('OXALIPLATINA', '50 MG', 'AMP', '500 MG')

## Config

### Imports

In [3]:
import pandas as pd
import numpy as np
import re
import nltk
#nltk.download('stopwords')
#nltk.download('punkt')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

from libs.extractor import Extractor as xtc

pd.set_option('display.max_colwidth', None)

### Stopwords

In [4]:
# Load the project-specific stopwords: whitespace-separated tokens in a text file.
# The encoding is passed explicitly so accented words are decoded the same way on
# every platform instead of depending on the locale default.
# NOTE(review): assumes the file is UTF-8 (the encoding this notebook uses when
# writing its CSV output) — confirm.
with open('custom_stopwords.txt', encoding='utf-8') as f:
    data = f.read()
new_stopwords = data.split()

In [5]:
# Base stopword set: NLTK Portuguese stopwords plus the punctuation characters.
# The corpus is reached through `nltk.corpus.stopwords` instead of the imported
# name `stopwords`, because this cell rebinds `stopwords` to a set: with the
# imported name, running the cell a second time raised AttributeError
# ('set' object has no attribute 'words').
stopwords = set(nltk.corpus.stopwords.words('portuguese') + list(punctuation))
print(len(stopwords))

# Extended set: a copy of the base set plus the tokens in `new_stopwords`.
custom_stopwords = stopwords.copy()
custom_stopwords.update(new_stopwords)
print(len(custom_stopwords))

236
372


### Functions

In [6]:
def clean_desc(words, desc):
    """Keep only the segments of ``desc`` that share a token with ``words``.

    ``desc`` is split on the separators ``‹``, ``–``, ``::``, ``-`` and ``|``.
    Each segment is stripped and kept when at least one of its
    whitespace-delimited tokens equals (case-sensitively) an entry of ``words``.

    Args:
        words: iterable of reference tokens (strings).
        desc: description string to filter.

    Returns:
        The kept segments, in their original order, joined by single spaces;
        an empty string when no segment matches.
    """
    # Raw string: in a normal literal the '\|' escape is invalid and triggers a warning.
    segments = (segment.strip() for segment in re.split(r'‹|–|::|-|\|', desc))
    wanted = set(words)  # built once instead of re-splitting the segment per word
    kept = [segment for segment in segments if not wanted.isdisjoint(segment.split())]
    return ' '.join(kept)


def get_words(text):
    """Tokenize ``text`` and drop every token whose lowercase form is in ``custom_stopwords``.

    Tokens are returned with their original casing, in order of appearance.
    """
    kept = []
    for token in word_tokenize(text):
        if token.lower() in custom_stopwords:
            continue
        kept.append(token)
    return kept


def remove_separator(terms):
    """Remove whitespace and '/' characters from each term.

    Args:
        terms: iterable of strings; ``None`` entries are allowed.

    Returns:
        A list with the same length and order as ``terms``; ``None`` entries
        are passed through unchanged.
    """
    # Raw string: in a normal literal the '\s' escape is invalid and triggers a warning.
    return [None if term is None else re.sub(r'\s|/', '', term) for term in terms]

In [7]:
# clean_desc Example
# `desc` is the reference description; `deriv` is a title made of several
# segments separated by '-' and '|'.
desc = 'GLICOSE 5% CX C/40 FRASCOS X 250ML'
deriv = 'SORO DE GLICOSE 5% - EUROFARMA | DENTAL CREMER PRODUTOS'

print(desc)
# Only the segment of `deriv` that shares a token with `desc` is kept (see output below).
print(clean_desc(get_words(desc), deriv))

GLICOSE 5% CX C/40 FRASCOS X 250ML
SORO DE GLICOSE 5%


In [8]:
# Extractor example
produto = 'ESPIRONOLACTONA 100MG CX C/30 CP'
principio_ativo, concentracao, forma_farmaceutica, quantidade = xtc.extract(produto)

# print(..., sep=...) rather than str.join: an extracted field can be None (the
# debug cell further down prints a None quantity) and join() raises TypeError on None.
print(principio_ativo, concentracao, forma_farmaceutica, quantidade, sep=' ---- ')

ESPIRONOLACTONA ---- 100MG ---- COM ---- 30


### Path

In [9]:
# Directory of the augmented datasets; it ends with '/' because the file names
# below are appended to it by plain string formatting.
data_path = '../datasets/anvisa/augmented/'
data_prod = 'anvisa_produto_aumentado.csv'  # product file
data_pa = 'anvisa_principio_ativo_aumentado.csv'  # active-ingredient file

## Parse

### Removendo ";" que não representa o separador do CSV

In [10]:
def custom_replace(desc):
    """Replace with ',' every ';' that lies strictly between the first and last ';'.

    The first and the last ';' of ``desc`` are kept, so a line keeps exactly two
    separators (three fields) however many ';' its middle part contained.

    Args:
        desc: one line of text.

    Returns:
        The line with inner ';' turned into ','. A line with zero or one ';' is
        returned unchanged.
    """
    first = desc.find(';')
    last = desc.rfind(';')
    if first == last:
        # Zero or one ';': there is nothing in between. The original slicing
        # duplicated a lone separator ('a;b' -> 'a;;b').
        return desc
    inner_start = first + 1
    return desc[:inner_start] + desc[inner_start:last].replace(';', ',') + desc[last:]


def fix_csv(src, target, encoding='utf-8'):
    """Copy ``src`` to ``target``, passing every data line through ``custom_replace``.

    The first line (header) is copied unchanged. The remaining lines are
    streamed one at a time, so the file is never held in memory as a whole.

    Args:
        src: path of the CSV file to read.
        target: path of the CSV file to write (overwritten if it exists).
        encoding: text encoding used for both files. It is explicit so the
            result does not depend on the platform locale; the default matches
            the UTF-8 default of ``pd.read_csv``, which reads the output.
    """
    with open(src, 'r', encoding=encoding) as fs, open(target, 'w', encoding=encoding) as ft:
        ft.write(fs.readline())  # header
        for line in fs:
            ft.write(custom_replace(line))
                

# Write a '<name>_mod.csv' copy next to each input file: produto first, then principio_ativo.
for csv_name in (data_prod, data_pa):
    src_file = '{}{}'.format(data_path, csv_name)
    target_file = '{}{}_mod.csv'.format(data_path, csv_name[:-4])  # [:-4] drops '.csv'
    fix_csv(src_file, target_file)

## Pré-processamento PRODUTO

### Loading

In [11]:
# header => ['cod', 'descricao', 'ean']
# Reads the '_mod' file written by fix_csv above.
data_file = 'anvisa_produto_aumentado_mod.csv'
src = '{}{}'.format(data_path, data_file)

# dtype is keyed by column position: cod as int, descricao and ean as str.
df = pd.read_csv(src, dtype={0:int, 1:str, 2:str}, sep=';')
df.shape

(232770, 3)

### Removendo substrings com base nas palavras da descrição original

### Removendo registros com base nos termos principais

In [12]:
idxs = []     # index labels of the rows to drop afterwards
removed = []  # [master_idx, removed_idx] pairs, kept for inspection

# A row with cod == 2 sets the reference ("master") context; every following
# row is compared against the most recent master.
for index, row in df.iterrows():
    current_desc = row['descricao']

    if row['cod'] == 2:
        original_words = get_words(current_desc)  # non-stopword tokens of the master
        _, conc, _, qtd = xtc.extract(current_desc)  # principal terms of the master
        conc, qtd = remove_separator([conc, qtd])
        master_idx = index
        continue

    # Keep only the substrings that share a word with the master description.
    new_desc = clean_desc(original_words, current_desc)
    same_terms = False
    if new_desc:
        _, new_conc, _, new_qtd = xtc.extract(new_desc)
        new_conc, new_qtd = remove_separator([new_conc, new_qtd])
        same_terms = (conc == new_conc and qtd == new_qtd)

    if same_terms:
        df.at[index, 'descricao'] = new_desc  # store the cleaned description
    else:
        # Nothing left after cleaning, or concentration/quantity differ from the master.
        idxs.append(index)
        removed.append([master_idx, index])

### Removed

In [13]:
# One row per dropped record: the index of its master and its own index.
df_removed = pd.DataFrame(removed, columns=['master_idx', 'removed_idx'])
df_removed.head()

Unnamed: 0,master_idx,removed_idx
0,0,1
1,0,2
2,0,3
3,0,4
4,0,5


In [14]:
# Collapse to one row per master, holding the list of indexes removed under it.
df_grouped = df_removed.groupby('master_idx')['removed_idx'].apply(list).reset_index()
df_grouped.head()

Unnamed: 0,master_idx,removed_idx
0,0,"[1, 2, 3, 4, 5]"
1,6,"[7, 8, 9, 10, 11]"
2,12,"[13, 14]"
3,15,"[16, 17, 18, 19]"
4,20,[21]


In [15]:
# Inspect one group: 3002 is a hard-coded row label of df_grouped.
# NOTE: this rebinds `removed` (until now the list of [master_idx, removed_idx]
# pairs) to the list of removed indexes of this single group.
master, removed = df_grouped.loc[3002].values
indexes = [master] + removed
df.loc[indexes][['cod', 'descricao']]

Unnamed: 0,cod,descricao
14829,2,NIMESULIDA 100 MG COM CT BL AL PLAS INC X 12
14833,4,NIMESULIDA 100MG GENÉRICO CIMED 12 COMPRIMIDOS - DROGARIA SAO
14834,4,"NIMESULIDA GEOLAB 100MG, CAIXA COM 12 COMPRIMIDOS | CR PRO"
14835,4,PREÇOS DE NIMESULIDA - MINHA VIDA
14836,4,REGISTRO ANVISA Nº 126750229 - NIMESULIDA - VÁLIDO - SMERP
14837,4,REGISTRO ANVISA Nº 1542302340028 - NIMESULIDA - VÁLIDO - SMERP


In [16]:
# DEBUG
# Replays the comparison of the filtering loop for a single master/derived pair.

descricao = 'NIMESULIDA 100 MG COM CT BL AL PLAS INC X 12'
derivado = 'NIMESULIDA GEOLAB 100MG, CAIXA COM 12 COMPRIMIDOS | CR PRO'

original_words = get_words(descricao)  # get non stopwords
_, conc, _, qtd = xtc.extract(descricao)  # get principal terms
conc, qtd = remove_separator([conc, qtd])
print(original_words)
print(conc, qtd, '\n')

new_desc = clean_desc(original_words, derivado)  # remove irrelevant substrings
_, new_conc, _, new_qtd = xtc.extract(new_desc)  # get principal terms
new_conc, new_qtd = remove_separator([new_conc, new_qtd])
print(new_desc)
print(new_conc, new_qtd, '\n')
    
# In the output below the master quantity is None while the derived one is 12,
# so the terms differ and 'igual' is not printed.
if conc == new_conc and qtd == new_qtd:
    print('igual')

['NIMESULIDA', '100', 'MG', 'CT', 'BL', 'AL', 'PLAS', 'INC', 'X', '12']
100MG None 

NIMESULIDA GEOLAB 100MG, CAIXA COM 12 COMPRIMIDOS
100MG 12 



### Removing

In [17]:
# Drop the rows flagged by the filtering loop, then rows with any missing value,
# then exact duplicate rows. All three calls modify df in place.
df.drop(idxs, inplace=True)
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)
df.shape

(91990, 3)

#### Removendo registros com os termos
* BULA
* PREÇO
* BARATO

In [18]:
# Rows with cod == 2 are always kept; rows with cod == 4 are kept only when
# their description contains none of the unwanted terms.
is_master = df['cod'] == 2
is_derived = df['cod'] == 4
has_unwanted_term = df['descricao'].str.contains('BULA|PREÇO|BARATO', regex=True)

df = df[is_master | (is_derived & ~has_unwanted_term)]
df.shape

(85445, 3)

#### Removendo os termos <font color='red'>(devem ser removidos nesta ordem)</font>
* EM MERCADO LIVRE BRASIL
* EM MERCADO LIVRE
* NO MERCADO LIVRE BRASIL
* NO MERCADO LIVRE
* COMPRAR EM ILHA DENTAL
* COMPRAR EM AGROFORTE
* COMPRAR EM FARMA PRATA
* ONDE COMPRAR (apenas no início do registro)
* COMPRAR
* ENCONTRE (apenas no início do registro)

#### Após as remoções acima, remover os registros com os termos
* ENCONTRE

In [19]:
# Case-insensitive regexes for the marketplace/shopping boilerplate, listed in
# the removal order required by the heading above this cell.
# NOTE(review): relies on DataFrame.replace applying the dict entries in
# insertion order — confirm.
boilerplate_patterns = [
    r'(?i)(em|no)?\s+mercado\s+livre\s*(brasil)?',
    r'(?i)comprar em (ilha dental|agroforte|farma prata)',
    r'(?i)(onde)?\s*comprar',
    r'(?i)^encontre',
]

# Every pattern is replaced by the empty string.
replaces = dict.fromkeys(boilerplate_patterns, '')

df.replace(replaces, regex=True, inplace=True)

In [20]:
# Rows with cod == 2 are always kept; any other row is kept only when its
# description does not contain 'ENCONTRE'. This match is case-sensitive.
cond1 = df['cod'] == 2
cond2 = ~df['descricao'].str.contains('ENCONTRE')
df = df[cond1 | cond2]
df.shape

(85445, 3)

In [21]:
# Sanity check: list rows still containing any of the removed terms
# (case-insensitive). The output below is empty.
pattern = r'(?i)mercado livre|comprar|encontre'
df[df['descricao'].str.contains(pattern)]

Unnamed: 0,cod,descricao,ean


In [22]:
# Count the rows whose description still mentions 'oferta' (case-insensitive);
# these rows are only counted here, not removed.
pattern = r'(?i)oferta'
df[df['descricao'].str.contains(pattern)].shape

(179, 3)

In [23]:
# Preview the first rows after the cleaning steps above.
df.head()

Unnamed: 0,cod,descricao,ean
0,2,OSTENAN 10 MG COMP REV CX COM 30,7896226102634
6,2,OSTENAN 10 MG COMP REV CX COM 15,7896226102627
12,2,TEICOPLANINA 400 MG PÓ LIOFILIZADO SOL INJ CX C/ 1 FR/AMP X 3 M,7730711011233
15,2,AMBROTEN XPE ADU FR COM 100ML,7896226100449
20,2,OXALIPLATINA 50 MG PÓ LIOFILIZADO FR/AMP X 500 MG,7730711011493


In [24]:
# after removing, strip again
# (removing a term from the start or end of a description can leave whitespace at the edges)
df['descricao'] = df['descricao'].str.strip()

In [25]:
# Print a 20-row sample of the cleaned frame: rows with cod == 4 are
# tab-indented, every other row is preceded by a blank line.
start = 5000
end = start + 20
sample = df.iloc[start:end]

# The original loop also ran `original_words = [w for w in desc if len(w) > 2]`,
# which iterated the characters of a string (always producing []) and overwrote
# the global used by the filtering loops; that dead statement is removed here.
for cod, desc in zip(sample['cod'], sample['descricao']):
    if cod == 4:
        print('\t{}'.format(desc))
    else:
        print('\n{}'.format(desc))

	DICLOFENACO DIETILAMÔNIO 10MG/G 60G MEDLEY

EUPRESSIN 10 MG COM CT STR X 7

TRILEPTAL 60 MG/ML SUS OR CT FR VD AMB X 100 ML + 2 SER DOS
	TRILEPTAL 60MG/ML SUSPENSÃO ORAL FRASCO COM 100ML + 2 SERINGAS

TRILEPTAL 600 MG COM REV CT BL AL PLAS INC X 60

ZOMETA 4 MG SOL INJ CT  FA PLAS  INC X 5 ML

TRILEPTAL 600 MG COM REV CT BL AL PLAS INC X 10

VENORUTON 1000 MG COM EFERV CT 1 TB PLAS X 15
	VENORUTON 1000 MG COM EFERV CT 1 TB PLAS X 15

TRILEPTAL 300 MG COM REV CT BL AL PLAS INC X 10

TRILEPTAL 300 MG COM REV CT BL AL PLAS INC X 20

TONOPAN 0,5 MG + 40 MG + 125 MG DRG CT BL AL PLAS INC X 16

TOFRANIL 25 MG DRG CT BL AL PLAS INC X 20
	TOFRANIL 25 MG DRG CT BL AL PLAS INC X 20
	TOFRANIL 25 MG DRÁGEAS CT BL AL PLAS INC X 20

ZOLIBBS 0,8 MG/ML SOL INJ CT FA VD INC X 5 ML

RECONTER 10 MG COM REV CT BL AL PLAS INC X 30

RECONTER 10 MG COM REV CT BL AL PLAS INC X 10

RECONTER 20 MG COM REV CT BL AL PLAS INC X 30
	OXALATO DE ESCITALOPRAM 20 MG COM REV CT BL AL PLAS INC X 30


### Gravando em arquivo

In [26]:
# Write the pre-processed product frame as a ';'-separated UTF-8 CSV without the index.
# NOTE: `data_prod` is rebound to the output file name here, so re-running the
# fix_csv cell after this one would use this name as its input.
data_prod = 'anvisa_produto_aumentado_preproc.csv'
df.to_csv('{}{}'.format(data_path, data_prod),
          sep=';',
          header=df.columns,
          index=False,
          encoding='utf-8')

## Pré-processamento PRINCÍPIO ATIVO

### Loading

In [27]:
# header => ['cod', 'descricao', 'ean']
# Reads the '_mod' file written by fix_csv. `data_pa` and `df` are rebound here:
# from this cell on, `df` holds the active-ingredient data.
data_pa = 'anvisa_principio_ativo_aumentado_mod.csv'
src = '{}{}'.format(data_path, data_pa)

# dtype is keyed by column position: cod as int, descricao and ean as str.
df = pd.read_csv(src, dtype={0:int, 1:str, 2:str}, sep=';')
df.shape

(175341, 3)

### Removendo substrings com base nas palavras da descrição original

### Removendo registros com base nos termos principais

In [28]:
idxs = []     # index labels of the rows to drop afterwards
removed = []  # [master_idx, removed_idx] pairs, kept for inspection

# A row with cod == 2 sets the reference ("master") context; every following
# row is compared against the most recent master.
for index, row in df.iterrows():
    current_desc = row['descricao']

    if row['cod'] == 2:
        original_words = get_words(current_desc)  # non-stopword tokens of the master
        _, conc, _, qtd = xtc.extract(current_desc)  # principal terms of the master
        conc, qtd = remove_separator([conc, qtd])
        master_idx = index
        continue

    # Keep only the substrings that share a word with the master description.
    new_desc = clean_desc(original_words, current_desc)
    same_terms = False
    if new_desc:
        _, new_conc, _, new_qtd = xtc.extract(new_desc)
        new_conc, new_qtd = remove_separator([new_conc, new_qtd])
        same_terms = (conc == new_conc and qtd == new_qtd)

    if same_terms:
        df.at[index, 'descricao'] = new_desc  # store the cleaned description
    else:
        # Nothing left after cleaning, or concentration/quantity differ from the master.
        idxs.append(index)
        removed.append([master_idx, index])

### Removing

In [29]:
# Drop the rows flagged by the filtering loop, then rows with any missing value,
# then exact duplicate rows. All three calls modify df in place.
df.drop(idxs, inplace=True)
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)
df.shape

(80323, 3)

#### Removendo registros com os termos
* BULA
* PREÇO
* BARATO

In [30]:
# Rows with cod == 2 are always kept; rows with cod == 4 are kept only when
# their description contains none of the unwanted terms.
is_master = df['cod'] == 2
is_derived = df['cod'] == 4
has_unwanted_term = df['descricao'].str.contains('BULA|PREÇO|BARATO', regex=True)

df = df[is_master | (is_derived & ~has_unwanted_term)]
df.shape

(78012, 3)

#### Removendo os termos <font color='red'>(devem ser removidos nesta ordem)</font>
* EM MERCADO LIVRE BRASIL
* EM MERCADO LIVRE
* NO MERCADO LIVRE BRASIL
* NO MERCADO LIVRE
* COMPRAR EM ILHA DENTAL
* COMPRAR EM AGROFORTE
* COMPRAR EM FARMA PRATA
* ONDE COMPRAR (apenas no início do registro)
* COMPRAR
* ENCONTRE (apenas no início do registro)

#### Após as remoções acima, remover os registros com os termos
* ENCONTRE

In [31]:
# Case-insensitive regexes for the marketplace/shopping boilerplate, listed in
# the removal order required by the heading above this cell.
# NOTE(review): relies on DataFrame.replace applying the dict entries in
# insertion order — confirm.
boilerplate_patterns = [
    r'(?i)(em|no)?\s+mercado\s+livre\s*(brasil)?',
    r'(?i)comprar em (ilha dental|agroforte|farma prata)',
    r'(?i)(onde)?\s*comprar',
    r'(?i)^encontre',
]

# Every pattern is replaced by the empty string.
replaces = dict.fromkeys(boilerplate_patterns, '')

df.replace(replaces, regex=True, inplace=True)

In [32]:
# Rows with cod == 2 are always kept; any other row is kept only when its
# description does not contain 'ENCONTRE'. This match is case-sensitive.
cond1 = df['cod'] == 2
cond2 = ~df['descricao'].str.contains('ENCONTRE')
df = df[cond1 | cond2]
df.shape

(78012, 3)

In [33]:
# after removing, strip again
# (removing a term from the start or end of a description can leave whitespace at the edges)
df['descricao'] = df['descricao'].str.strip()

### Gravando em arquivo

In [34]:
# Write the pre-processed active-ingredient frame as a ';'-separated UTF-8 CSV
# without the index. `data_pa` is rebound to the output file name here.
data_pa = 'anvisa_principio_ativo_aumentado_preproc.csv'
df.to_csv('{}{}'.format(data_path, data_pa),
          sep=';',
          header=df.columns,
          index=False,
          encoding='utf-8')