### OCI Data Science - Useful Tips
<details>
<summary><font size="2">Check for Public Internet Access</font></summary>

```python
import requests
response = requests.get("https://oracle.com")
assert response.status_code==200, "Internet connection failed"
```
</details>
<details>
<summary><font size="2">Helpful Documentation</font></summary>
<ul><li><a href="https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm">Data Science Service Documentation</a></li>
<li><a href="https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html">ADS documentation</a></li>
</ul>
</details>
<details>
<summary><font size="2">Typical Cell Imports and Settings for ADS</font></summary>

```python
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

import ads
from ads.dataset.factory import DatasetFactory
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.evaluations.evaluator import ADSEvaluator
from ads.common.data import ADSData
from ads.explanations.explainer import ADSExplainer
from ads.explanations.mlx_global_explainer import MLXGlobalExplainer
from ads.explanations.mlx_local_explainer import MLXLocalExplainer
from ads.catalog.model import ModelCatalog
from ads.common.model_artifact import ModelArtifact
```
</details>
<details>
<summary><font size="2">Useful Environment Variables</font></summary>

```python
import os
print(os.environ["NB_SESSION_COMPARTMENT_OCID"])
print(os.environ["PROJECT_OCID"])
print(os.environ["USER_OCID"])
print(os.environ["TENANCY_OCID"])
print(os.environ["NB_REGION"])
```
</details>

In [10]:
import oci
import uuid
import json
import os
import base64

In [11]:
# Name of the profile section to read from the OCI configuration file.
CONFIG_PROFILE = "DEFAULT"

# Load the SDK configuration from the file named 'config' (a relative path,
# so it is resolved against the current working directory).
config = oci.config.from_file(file_location='config', profile_name=CONFIG_PROFILE)

# Compartment OCID of the notebook session; raises KeyError when the
# environment variable is not set.
compartment_id = os.environ["NB_SESSION_COMPARTMENT_OCID"]

In [12]:
def call_OCR(path, *, language="POR", namespace_name='<namespace>',
             bucket_name='<bucket_name>', prefix="<prefix_output>"):
    """Run text extraction on a local document and return the parsed result.

    The file at *path* is read, base64-encoded and submitted inline as a
    processor job through ``oci.ai_document``. The call blocks until the job
    reaches the SUCCEEDED lifecycle state, then downloads the result JSON
    that the job wrote to Object Storage and parses it.

    Relies on the module-level ``config`` (OCI SDK configuration) and
    ``compartment_id`` names.

    Args:
        path: Path of the document to process; opened in binary mode.
        language: Language code passed to ``GeneralProcessorConfig``.
        namespace_name: Object Storage namespace for the job output.
        bucket_name: Object Storage bucket for the job output.
        prefix: Object name prefix under which the job output is written.

    Returns:
        The decoded JSON result of the processor job, or ``[]`` when the
        downloaded object is not valid JSON.

    Raises:
        OSError: If *path* cannot be opened or read.
        Errors raised by the OCI SDK calls are propagated unchanged.
    """
    with open(path, 'rb') as document_file:
        # base64 output is plain ASCII, so it is decoded exactly once here.
        encoded_document = base64.b64encode(document_file.read()).decode('utf-8')

    aiservicedocument_client = oci.ai_document.AIServiceDocumentClientCompositeOperations(
        oci.ai_document.AIServiceDocumentClient(config=config))

    # Text extraction feature
    text_extraction_feature = oci.ai_document.models.DocumentTextExtractionFeature()

    # Setup the output location where processor job results will be created
    output_location = oci.ai_document.models.OutputLocation()
    output_location.namespace_name = namespace_name
    output_location.bucket_name = bucket_name
    output_location.prefix = prefix

    # Create a processor_job for text_extraction feature
    create_processor_job_details_text_extraction = oci.ai_document.models.CreateProcessorJobDetails(
        display_name=str(uuid.uuid4()),
        compartment_id=compartment_id,
        input_location=oci.ai_document.models.InlineDocumentContent(data=encoded_document),
        output_location=output_location,
        processor_config=oci.ai_document.models.GeneralProcessorConfig(
            language=language,
            features=[text_extraction_feature]))

    def create_processor_job_callback(times_called, response):
        # Intentionally a no-op: passed as the waiter's wait_callback.
        pass

    create_processor_response = aiservicedocument_client.create_processor_job_and_wait_for_state(
        create_processor_job_details=create_processor_job_details_text_extraction,
        wait_for_states=[oci.ai_document.models.ProcessorJob.LIFECYCLE_STATE_SUCCEEDED],
        waiter_kwargs={"wait_callback": create_processor_job_callback})

    processor_job: oci.ai_document.models.ProcessorJob = create_processor_response.data

    # The result object name is built from the output prefix and the job id.
    object_storage_client = oci.object_storage.ObjectStorageClient(config=config)
    get_object_response = object_storage_client.get_object(
        namespace_name=output_location.namespace_name,
        bucket_name=output_location.bucket_name,
        object_name="{}/{}/_/results/defaultObject.json".format(
            output_location.prefix, processor_job.id))

    raw_result = get_object_response.data.content.decode()

    # The original guarded only the `return`, so its fallback could never
    # trigger; the parse itself is what can fail.
    try:
        return json.loads(raw_result)
    except json.JSONDecodeError:
        return []

In [39]:
# Run OCR on the document; '<file_path>' is a placeholder to replace with the
# real document path. `res` holds whatever call_OCR returns.
res = call_OCR('<file_path>')

In [40]:
# Concatenate the text of every line on the first page of the OCR result,
# each followed by a space and a newline.
# NOTE(review): only res['pages'][0] is read; later pages, if any, are ignored.
extracted_text = ''.join(
    f"{line['text']} \n" for line in res['pages'][0]['lines']
)

In [21]:
# Question template interpolated into the prompt; replace the <campo_N>
# placeholders with the fields to be extracted from the document.
pergunta = '''Por favor retorne as seguintes informações em bullet points:
<campo_1>, <campo_2>, <campo_3>, ...
'''

In [44]:
# Build the final prompt by filling the OCR text (context) and the question
# into a fixed Portuguese instruction template.
prompt = '''
Você é um assistente que irá responder dúvidas pontuais sobre o texto extraído de um documento.
Por favor, responda sempre baseado apenas no contexto fornecido e em Português do Brasil pt-br.

Contexto: {extracted_text};
Pergunta: {pergunta};

Por favor responda com muita atenção, pois isto é muito importante para minha carreira!
'''.format(extracted_text=extracted_text, pergunta=pergunta)