# Unit test

In [None]:
import datetime
from dotenv import load_dotenv
load_dotenv(override=True)

from ai_ta_backend.utils.types import DocumentMetadata

# Hand-written sample values, one entry per DocumentMetadata keyword used below.
sample_metadata = {
    'authors': ['Author One', 'Author Two'],
    'journal_name': 'Journal Name',
    'publication_date': datetime.date.today(),
    'keywords': ['keyword1', 'keyword2'],
    'doi': '12345',
    'title': 'Title',
    'subtitle': 'Subtitle',
    'visible_urls': [],
    'field_of_science': 'Field of Science',
    'concise_summary': 'Summary',
    'specific_questions_document_can_answer': ['Question 1', 'Question 2'],

    # TODO: put all dynamic fields here, e.g. section titles.
    'additional_fields': {
        'additional_field1': 'Additional Value 1',  # arbitrary extra field
        'additional_field2': 'Additional Value 2',  # another arbitrary extra field
    },
}
doc = DocumentMetadata(**sample_metadata)

# Hand the sample document to the project's insert helper, then echo it as a plain dict.
from SQLite import insert_doc
insert_doc(doc, commit_on_change=True)

print(doc.dict())

In [None]:
# Names of every property declared in the DocumentMetadata JSON schema.
fields = [*DocumentMetadata.schema()["properties"]]
fields

# Marvin test

In [None]:
import datetime
from dotenv import load_dotenv
load_dotenv(override=True)

from ai_ta_backend.utils.types import DocumentMetadata


In [None]:
# Sample input for the extraction cells below: text copy/pasted from an arXiv
# HTML page (https://arxiv.org/html/2404.09995v1). The paste is kept verbatim,
# including page-rendering residue such as the LaTeX package warning and the
# bare affiliation-marker digits between author names.
raw_text = """License: CC BY-NC-ND 4.0
arXiv:2404.09995v1 [cs.CV] 15 Apr 2024
(eccv) Package eccv Warning: Package 'hyperref' is loaded with option 'pagebackref', which is *not* recommended for camera-ready version

1
Taming Latent Diffusion Model for Neural Radiance Field Inpainting
Chieh Hubert Lin
1122
Changil Kim
11
Jia-Bin Huang
1133
Qinbo Li
11
Chih Yao Ma
11
Johannes Kopf
11
Ming-Hsuan Yang
22
Hung-Yu Tseng
11
Abstract
Neural Radiance Field (NeRF) is a representation for 3D reconstruction from multi-view images. Despite some recent work showing preliminary success in editing a reconstructed NeRF with diffusion prior, they remain struggling to synthesize reasonable geometry in completely uncovered regions. One major reason is the high diversity of synthetic contents from the diffusion model, which hinders the radiance field from converging to a crisp and deterministic geometry. Moreover, applying latent diffusion models on real data often yields a textural shift incoherent to the image condition due to auto-encoding errors. These two problems are further reinforced with the use of pixel-distance losses. To address these issues, we propose tempering the diffusion model's stochasticity with per-scene customization and mitigating the textural shift with masked adversarial training. During the analyses, we also found the commonly used pixel and perceptual losses are harmful in the NeRF inpainting task. Through rigorous experiments, our framework yields state-of-the-art NeRF inpainting results on various real-world scenes.

Refer to caption
Figure 1:NeRF inpainting. Given a set of posed images associated with inpainting masks, the proposed framework estimates a NeRF that renders high-quality novel views, where the inpainting region is realistic and contains high-frequency details.
1Introduction
The recent advancements in neural radiance fields (NeRF) [24, 27, 3] have achieved high-quality 3D reconstruction and novel-view synthesis of scenes captured with a collection of images. The success intrigues an increasing attention on manipulating NeRFs such as 3D scene stylization [38, 8] and NeRF editing [13]. In this work, we focus on the NeRF inpainting problem. As shown in Figure 1, given a set of images of a scene with the inpainting masks, our goal is to estimate a completed NeRF that renders high-quality images at novel viewpoints. The NeRF inpainting task enables a variety of 3D content creation applications such as removing objects from a scene [26, 39], completing non-observed part of the scene, and hallucinating contents in the designated regions.
"""

In [None]:
import marvin
# Ask Marvin to extract DocumentMetadata-typed records from the pasted paper text.
# The next cell indexes the result with [0], so it is expected to be a sequence.
res = marvin.extract(raw_text, target=DocumentMetadata)
# Bare expression: shows the extraction result as the cell output.
res

In [None]:
# Keep only the first extracted record.
# NOTE(review): assumes `res` is a non-empty sequence; an empty result would fail here.
doc = res[0]
from SQLite import insert_doc
# Pass the extracted record to the project's insert helper (commit_on_change=True).
insert_doc(doc, commit_on_change=True)

# LLM Parsing with vLLM

In [None]:
### LLM PARSING
from openai import OpenAI # pip install openai>=1.0

# OpenAI SDK client pointed at a non-default base URL; the key only has to be
# a non-empty string for this endpoint.
client = OpenAI(
    base_url="https://api.ncsa.ai/llm/v1",  # 👈 the only change versus a stock OpenAI client
    api_key="irrelevant",
)

# The instruction is sent as part of the user turn (no separate system message),
# followed directly by the raw paper text.
categorization_prompt = "You are an expert at categorizing scientific papers. Please categorize the following paper.\n" + raw_text

# Author's note on the model: way better than mistral instruct v0.2, works great,
# as good as GPT-4 so far.
completion = client.chat.completions.create(
    model="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO",
    messages=[{"role": "user", "content": categorization_prompt}],
    temperature=0.2,
    stream=True,
    # Extra request-body field carrying the DocumentMetadata JSON schema;
    # presumably the server's guided-decoding option -- confirm against the server docs.
    extra_body={"guided_json": DocumentMetadata.schema()},
)

# ⚡️⚡️ streaming 
# Echo tokens as they arrive and accumulate them in `final`, so the complete
# response text is available to the parsing cell once the stream ends.
# `final` is grown incrementally so a partial response survives an interrupted run.
final = ""
for chunk in completion:
    # A streamed chunk can carry an empty `choices` list (for example a
    # usage-only chunk); skip it instead of failing with IndexError.
    if not chunk.choices:
        continue
    # `delta.content` may be None (e.g. role-only or final chunks): treat as "".
    piece = chunk.choices[0].delta.content or ""
    print(piece, end="")
    final += piece

# Non-streaming alternative (only valid when stream=False):
# print(completion.choices[0].message.content)

In [None]:
# Parse the accumulated response text (`final`, built by the streaming cell)
# into a DocumentMetadata instance.
doc = DocumentMetadata.parse_raw(final)
# Bare expression: shows the parsed record as a plain dict in the cell output.
doc.dict()

In [None]:
from SQLite import insert_doc
# Pass the LLM-parsed `doc` from the previous cell to the project's insert helper.
insert_doc(doc, commit_on_change=True)

In [None]:
import json
from pathlib import Path

# Load the JSON data
# NOTE(review): absolute, machine-specific path -- this cell only runs where
# that file exists. Consider deriving it from a configurable data directory.
filepath = Path("/Users/kastanday/code/ncsa/ai-ta/ai-experiments/s2orc-doc2json/output_dir/N18-3011.json")
with open(filepath, encoding="utf-8") as file:
    data = json.load(file)

# Collect the paragraphs of every major section, keyed by the leading
# component of `sec_num` (e.g. "3.2.1" -> "3"), in order of first appearance.
section_paragraphs = {}

# Iterate over each item in the "body_text" array
for item in data["pdf_parse"]["body_text"]:
    text = item['text']
    sec_num = item.get('sec_num')
    # Paragraphs without a string section number cannot be assigned to a
    # section; report and skip them explicitly rather than relying on a
    # caught exception for control flow.
    if not isinstance(sec_num, str):
        print(f"Skipping paragraph without a usable sec_num: {sec_num!r}")
        continue
    major_sec_num = sec_num.split('.')[0]  # Extract the major section number
    section_paragraphs.setdefault(major_sec_num, []).append(text)

# Initialize a dictionary to store the grouped text:
# one space-joined string per major section.
grouped_text = {
    major_sec_num: ' '.join(paragraphs)
    for major_sec_num, paragraphs in section_paragraphs.items()
}

# Create the output JSON object
output = [
    {'all_text': all_text, 'major_sec_num': sec_num}
    for sec_num, all_text in grouped_text.items()
]

# Save the output JSON object to a file
with open('output.json', 'w', encoding="utf-8") as file:
    json.dump(output, file, indent=2)

# Download PDFs

In [None]:
from concurrent.futures import ThreadPoolExecutor
import os
from minio import Minio

from dotenv import load_dotenv
load_dotenv(override=True)

# MinIO client for the endpoint named by the MINIO_URL environment variable.
# Note: this rebinds `client`, which an earlier cell used for the OpenAI client.
# The credential/TLS arguments below are commented out, so the client is built
# from the endpoint alone.
client = Minio(os.environ['MINIO_URL'],
    # access_key=os.environ['MINIO_ACCESS_KEY'],
    # secret_key=os.environ['MINIO_SECRET_KEY'],
    # secure=True
)

# Bucket read by download_object() and the download loop below.
bucket_name = 'science'

# List objects in the bucket (recursive=True, so nested prefixes are included).
# The download loop below iterates over this result once.
objects = client.list_objects(bucket_name, recursive=True)

# Define a function to download an object
def download_object(obj, index):
    """Fetch one listed object into the local ``science-pdfs/`` tree.

    Uses the notebook-level ``client`` and ``bucket_name``. Failures are
    reported with ``print`` rather than raised, so one bad object does not
    stop the remaining downloads.

    Args:
        obj: Listing entry; only its ``object_name`` attribute is read.
        index: Position of the entry in the listing (accepted but unused).
    """
    try:
        destination = 'science-pdfs/' + obj.object_name
        client.fget_object(bucket_name, obj.object_name, destination)
        print(f"✅ Downloaded {obj.object_name} from bucket {bucket_name}")
    except Exception as err:
        print(f"❌ {err}... Error downloading {obj.object_name} from bucket {bucket_name}")

# Use ThreadPoolExecutor to parallelize downloads (up to 10 at a time).
# Leaving the `with` block waits for every submitted download to finish.
with ThreadPoolExecutor(max_workers=10) as executor:
    for i, obj in enumerate(objects):
        # Cap the run: the loop stops once i reaches 501, so at most 501
        # objects (indices 0-500) are submitted.
        if i > 500:
            break
        # The returned future is discarded; download_object reports its own
        # failures via print.
        executor.submit(download_object, obj, i)

# Boto3 version of the same download

In [None]:
import boto3
from botocore.exceptions import NoCredentialsError, ClientError

# Configure the boto3 client
# This cell uses `os` without importing it, so it relies on an earlier cell's
# `import os`; the credentials are read from the environment.
s3_client = boto3.client(
    's3',
    endpoint_url="https://minio-api.kastan.ai",  # Hard-coded endpoint; MINIO_URL from the environment is not consulted here
    aws_access_key_id=os.environ['MINIO_ACCESS_KEY'],
    aws_secret_access_key=os.environ['MINIO_SECRET_KEY'],
    # region_name='us-east-1',  # Adjust the region if necessary
    # config=boto3.session.Config(signature_version='s3v4'),
    use_ssl=True  # Set to False if your MinIO isn't using SSL
)

def download_object_boto3(bucket_name, object_name, save_as):
    """Download one object through the notebook-level ``s3_client``.

    Args:
        bucket_name: Bucket that holds the object.
        object_name: Key of the object inside the bucket.
        save_as: Local file path to write the object to.

    Credential and client errors are reported with ``print`` rather than
    raised, so a failed object does not stop the caller's loop.
    """
    # boto3's download_file does not create missing directories, so make sure
    # the destination folder exists (keys may also contain '/' sub-paths).
    parent_dir = os.path.dirname(save_as)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    try:
        s3_client.download_file(bucket_name, object_name, save_as)
        print(f"✅ Downloaded {object_name} from bucket {bucket_name}")
    except (NoCredentialsError, ClientError) as e:
        print(f"❌ Error downloading {object_name} from bucket {bucket_name}: {e}")

# Example usage within your existing setup
bucket_name = 'science'
# The listing response omits the 'Contents' key when the bucket has no
# objects, so fall back to an empty list instead of raising KeyError.
# A single list_objects_v2 call returns one page of keys, which is enough
# for the small sample downloaded below.
objects = s3_client.list_objects_v2(Bucket=bucket_name).get('Contents', [])

# Download the first few objects for demonstration
for obj in objects[:10]:  # Limit to the first 10 objects for example
    file_path = 'science-pdfs/' + obj['Key']
    download_object_boto3(bucket_name, obj['Key'], file_path)
