### OCI Data Science - Useful Tips
<details>
<summary><font size="2">Check for Public Internet Access</font></summary>

```python
import requests
response = requests.get("https://oracle.com")
assert response.status_code==200, "Internet connection failed"
```
</details>
<details>
<summary><font size="2">Helpful Documentation </font></summary>
<ul><li><a href="https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm">Data Science Service Documentation</a></li>
<li><a href="https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html">ADS documentation</a></li>
</ul>
</details>
<details>
<summary><font size="2">Typical Cell Imports and Settings for ADS</font></summary>

```python
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

import ads
from ads.dataset.factory import DatasetFactory
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.evaluations.evaluator import ADSEvaluator
from ads.common.data import ADSData
from ads.explanations.explainer import ADSExplainer
from ads.explanations.mlx_global_explainer import MLXGlobalExplainer
from ads.explanations.mlx_local_explainer import MLXLocalExplainer
from ads.catalog.model import ModelCatalog
from ads.common.model_artifact import ModelArtifact
```
</details>
<details>
<summary><font size="2">Useful Environment Variables</font></summary>

```python
import os
print(os.environ["NB_SESSION_COMPARTMENT_OCID"])
print(os.environ["PROJECT_OCID"])
print(os.environ["USER_OCID"])
print(os.environ["TENANCY_OCID"])
print(os.environ["NB_REGION"])
```
</details>

In [1]:
import oci 
import os
import base64
import ocifs
import PIL.Image as Image
import io
import re
import numpy as np 
import pandas as pd

# Load the OCI SDK configuration from the file 'config' and build the
# AI Vision client from it.
config = oci.config.from_file('config')
ai_vision_client = oci.ai_vision.AIServiceVisionClient(config=config)

# The Language client is pinned to an explicit service endpoint
# (us-ashburn-1, per the URL) instead of the SDK default.
endpoint = "https://language.aiservice.us-ashburn-1.oci.oraclecloud.com"
ai_client = oci.ai_language.AIServiceLanguageClient(config, service_endpoint=endpoint)

# ocifs filesystem used to list and read the input images. It is created
# without arguments, so it presumably does not use `config` -- TODO confirm
# which credentials it picks up.
fs = ocifs.OCIFileSystem()
object_storage_client = oci.object_storage.ObjectStorageClient(config)

# All .jpg objects matched by this glob are the images to process
# (URI presumably follows the oci://<bucket>@<namespace>/ form -- verify).
img_list = fs.glob("oci://images_comprovantes_pagamento@id3kyspkytmr/*.jpg")

def valida_documento(ner_inference_result):
    """Check whether the detected entity types make up a complete receipt.

    A document is valid when its entity types include every member of at
    least one accepted set: ``Pagamento``/``Valor``/``Data`` or
    ``Pix``/``Valor``/``Data``.

    Args:
        ner_inference_result: Entity-detection response. Only
            ``.data.documents[0].entities[*].type`` is read.

    Returns:
        A ``(validade, diff)`` tuple. ``validade`` is ``True`` when one of
        the sets is fully present. ``diff`` is the ``np.setdiff1d`` array of
        required types that were not detected, measured against the set that
        matched, or against the ``Pagamento`` set when neither matched.
    """
    exigidos_pagamento = ['Pagamento', 'Valor', 'Data']
    exigidos_pix = ['Pix', 'Valor', 'Data']

    tipos_detectados = [
        entidade.type
        for entidade in ner_inference_result.data.documents[0].entities
    ]

    def _completo(exigidos):
        # True when no required type is absent from the detected ones.
        return not any(tipo not in tipos_detectados for tipo in exigidos)

    tem_pagamento = _completo(exigidos_pagamento)
    tem_pix = _completo(exigidos_pix)

    # The Pix set is the reference only when it matched and Pagamento did not.
    if tem_pix and not tem_pagamento:
        referencia = exigidos_pix
    else:
        referencia = exigidos_pagamento

    return (tem_pagamento or tem_pix), np.setdiff1d(referencia, tipos_detectados)

# Monetary amount shapes, tried in this order at the leftmost match position.
# The separators are escaped so that only a literal '.' or ',' counts as one;
# the last alternative accepts amounts written without any separator.
_PADRAO_VALOR = re.compile(
    r'[0-9]+,[0-9]+\.[0-9]+'     # 1,234.56
    r'|[0-9]+\.[0-9]+,[0-9]+'    # 1.234,56
    r'|[0-9]+\.[0-9]+'           # 1234.56 or 1.234
    r'|[0-9]+,[0-9]+'            # 1234,56
    r'|[0-9]+'                   # 1234
)


def formata_valor(valor):
    """Extract the first numeric amount found in *valor*.

    Args:
        valor: Text of a detected value entity, e.g. ``'R$ 4.500,00'``.

    Returns:
        The first substring that looks like an amount, with its separators
        kept as written (no numeric conversion is done), or ``'0'`` when
        *valor* is not a string or contains no digits.
    """
    # Non-string input (e.g. None) used to be absorbed by a bare ``except``;
    # keep the same '0' fallback, but explicitly.
    if not isinstance(valor, str):
        return '0'

    achado = _PADRAO_VALOR.search(valor)
    return achado.group(0) if achado else '0'

# File name of each image: the text after the last '/' of its path.
name_list = [caminho.rsplit('/', 1)[1] for caminho in img_list]
    
#code_list = []
#for i in range(0, len(text_list)):
    #codigo = text_list[i]
    #s = [float(s) for s in re.findall(r'-?\d+\.?\d*', codigo)]
    #code_list.append(int(s[1]))

In [2]:
# Empty results table; the processing loop adds one row per image with the
# file name, extracted amount, validity flag and missing entity types.
dataset = pd.DataFrame(columns=['Código', 'Valor (R$)', 'Validade', 'Informação Faltante'])

In [3]:
# Identifiers of the OCI resources used by the extraction loop below.
COMPARTMENT_OCID = "ocid1.compartment.oc1..aaaaaaaal63rmctoojg7q2pvdpeuqknebyaqg3h7gcci6whf74ht7tfapl4q"
NER_ENDPOINT_OCID = "ocid1.ailanguageendpoint.oc1.iad.amaaaaaatsbrckqamwnoz5zpy3h2feyt4rjtwm5btthfyqx2e6k4uone3zqq"

# Rows are collected here and added to `dataset` once the loop is done:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.
linhas = []

# Pairing names with paths up front means `codigo` is always the current
# sample when a failure is reported.
for codigo, file in zip(name_list, img_list):
    try:
        # Read the image and release the handle before any network call.
        with fs.open(file) as f:
            content = f.read()

        # The inline document is sent to AI Vision as base64 text.
        imagem_b64 = base64.b64encode(content).decode("utf-8")

        analyze_document_response = ai_vision_client.analyze_document(
            analyze_document_details=oci.ai_vision.models.AnalyzeDocumentDetails(
                compartment_id=COMPARTMENT_OCID,
                features=[
                    oci.ai_vision.models.DocumentTextDetectionFeature(
                        feature_type="TEXT_DETECTION")],
                language='ENG',
                document=oci.ai_vision.models.InlineDocumentDetails(
                    source="INLINE",
                    data=imagem_b64)))

        # Join the detected words with single spaces. Going through
        # str(list) and stripping quotes/brackets gave the same text for
        # ordinary words but mangled words containing quotes or brackets.
        texto_ocr = " ".join(
            word.text
            for page in analyze_document_response.data.pages
            for word in page.words)

        ner_text = oci.ai_language.models.BatchDetectLanguageEntitiesDetails(
            endpoint_id=NER_ENDPOINT_OCID,
            documents=[oci.ai_language.models.TextDocument(
                key="1",
                text=texto_ocr)])

        ner_inference_result = ai_client.batch_detect_language_entities(ner_text)

        validade, info_faltante = valida_documento(ner_inference_result)

        # Amounts of every 'Valor' entity, in detection order.
        valor = [
            formata_valor(entidade.text)
            for entidade in ner_inference_result.data.documents[0].entities
            if entidade.type == 'Valor'
        ]

        # valor[0] raises IndexError when no 'Valor' entity was detected;
        # as before, such a sample is reported as invalid and left out.
        linhas.append({'Código': codigo, 'Valor (R$)': valor[0],
                       'Validade': validade, 'Informação Faltante': info_faltante})

    except Exception as erro:
        # `Exception` rather than a bare except, so Ctrl-C still stops the
        # run; the cause is printed so failures can be diagnosed.
        print(f'Amostra inválida, código {codigo} ({erro!r})')

if linhas:
    novas_linhas = pd.DataFrame(linhas, columns=dataset.columns)
    if dataset.empty:
        dataset = novas_linhas
    else:
        dataset = pd.concat([dataset, novas_linhas], ignore_index=True)

Amostra inválida, código pagamento_106.jpg
Amostra inválida, código pagamento_111.jpg


In [4]:
# Bare expression: shows the resulting table as the notebook cell output.
dataset

Unnamed: 0,Código,Valor (R$),Validade,Informação Faltante
0,0001.jpg,4500,False,[Pagamento]
1,Comprovante_Light_Carioquinha_page-0001.jpg,"46.000,00",False,[Pagamento]
2,Comprovante_pagamento.jpg,"4.500,00",False,[Pagamento]
3,ComprovantedePix_page-0001.jpg,"23.483,49",True,[]
4,agendamento.jpg,9000,False,"[Data, Pagamento]"
...,...,...,...,...
157,pix_5.jpg,50000,True,[]
158,pix_6.jpg,40000,True,[]
159,pix_7.jpg,18935,True,[]
160,pix_8.jpg,80000,True,[]


In [5]:
# Save the table to a local CSV file, without the DataFrame index column.
dataset.to_csv('Comprovantes_Pagamento.csv', index=False)

In [8]:
# Upload the table as CSV to Object Storage.
# index=False gives the uploaded object the same layout as the local
# Comprovantes_Pagamento.csv export; with the index included, reading the
# object back produces a spurious "Unnamed: 0" column.
# The body is encoded to UTF-8 explicitly so the bytes sent for the accented
# column names do not depend on how the HTTP stack encodes a str body.
put_object_response = object_storage_client.put_object(
    namespace_name="id3kyspkytmr",
    bucket_name="Comprovantes_Pagamento_csv",
    object_name='Dataset_comprovantes_pagamento',
    put_object_body=dataset.to_csv(index=False).encode("utf-8"),
    content_type="text/csv")