In [10]:
import nest_asyncio
nest_asyncio.apply()
from pathlib import Path
from datetime import datetime
import re

import sys
sys.path.append("../")

from typing import List
from desci_sense.shared_functions.init import init_multi_chain_parser_config
from desci_sense.shared_functions.parsers.multi_chain_parser import MultiChainParser
from desci_sense.evaluation.utils import get_dataset, obj_to_json, obj_str_to_dict
from desci_sense.shared_functions.dataloaders import (
    scrape_post,
    convert_text_to_ref_post,
)
from desci_sense.shared_functions.configs import (
    OpenrouterAPIConfig,
    WandbConfig,
    LLMConfig,
    KeywordPParserChainConfig,
    RefTaggerChainConfig,
    TopicsPParserChainConfig,
    validate_env_var,
    MultiParserChainConfig,
    ParserChainType,
    PostProcessType,
) 

In [9]:
from desci_sense.shared_functions.interface import (
    RDFTriplet,
    isAConceptDefintion,
    KeywordConceptDefinition,
    ParserSupport,
    ParserResult,
    OntologyInterface,
    ZoteroItemTypeDefinition,
    )
from rdflib.namespace import RDF
from rdflib import URIRef, Literal, Graph

In [3]:
# Sample post text containing a single reference link (an arXiv abstract URL),
# used as the parser input in the cells below.
TEST_POST_TEXT_W_REF = """
I really liked this paper!
https://arxiv.org/abs/2402.04607
"""

In [4]:
# Build the multi-chain parser configuration for the chosen LLM.
multi_config = init_multi_chain_parser_config(llm_type='google/gemma-7b-it:free',
                                        post_process_type="combined")
# NOTE(review): post_process_type is already passed as the string "combined"
# above and is then overwritten with the enum member here. Presumably the init
# helper does not convert the string to PostProcessType - confirm; otherwise
# one of the two settings is redundant.
multi_config.post_process_type = PostProcessType.COMBINED
mcp = MultiChainParser(multi_config)
# Parse the sample post; `res` is read by the later cells in this notebook.
res = mcp.process_text(TEST_POST_TEXT_W_REF)

[32m2024-07-08 15:57:35.197[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36m__init__[0m:[36m76[0m - [1mInitializing MultiChainParser. PostProcessType=combined[0m
[32m2024-07-08 15:57:35.199[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36m__init__[0m:[36m83[0m - [1mInitializing post parsers...[0m
[32m2024-07-08 15:57:35.199[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.post_parser_chain[0m:[36m__init__[0m:[36m27[0m - [1mInitializing parser chain 'multi_refs_tagger' [0m
[32m2024-07-08 15:57:35.241[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.post_parser_chain[0m:[36m__init__[0m:[36m27[0m - [1mInitializing parser chain 'topics' [0m
[32m2024-07-08 15:57:35.268[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.post_parser_chain[0m:[36m__init__[0m:[36m27[0m - [1mInitializing parser chain 'keywords' [0m
[

In [5]:
# Display the reference URLs reported in the parser result for the sample post.
res.reference_urls

['https://arxiv.org/abs/2402.04607']

In [6]:
# Display the item types reported in the parser result
# (presumably one per reference URL, in the same order - verify against the parser).
res.item_types

['preprint']

In [12]:
def convert_item_types_to_rdf_triplets(item_types: List[str], reference_urls: List[str]) -> List[RDFTriplet]:
    """Build one RDF triplet per reference URL, tagging it with its item type.

    Args:
        item_types: Item type labels, aligned by position with ``reference_urls``.
        reference_urls: Reference URLs; each one becomes a triplet subject.

    Returns:
        A list of ``RDFTriplet`` in input order. Each triplet has
        ``URIRef(reference_url)`` as subject, ``URIRef(ZoteroItemTypeDefinition().uri)``
        as predicate and ``Literal(item_type)`` as object.

    Raises:
        AssertionError: If ``item_types`` and ``reference_urls`` differ in length.
    """
    # Validate the arguments themselves rather than the notebook-global `res`,
    # so the function works for any caller and on a fresh kernel.
    assert len(reference_urls) == len(item_types), (
        f"Expected one item type per reference URL, got "
        f"{len(reference_urls)} URLs and {len(item_types)} item types"
    )

    # The predicate is identical for every triplet, so build it once.
    item_type_predicate = URIRef(ZoteroItemTypeDefinition().uri)

    triplets = [
        RDFTriplet(
            subject=URIRef(ref_url),
            predicate=item_type_predicate,
            object=Literal(item_type),
        )
        for ref_url, item_type in zip(reference_urls, item_types)
    ]

    return triplets

In [13]:
# Convert the parsed item types and reference URLs into RDF triplets.
# NOTE(review): the recorded output below is a pydantic ValidationError on
# RDFTriplet.subject that reports input_type=str, although the function wraps
# each URL in URIRef. The output may be stale from an earlier version of the
# function - re-run on a fresh kernel to confirm.
convert_item_types_to_rdf_triplets(res.item_types, res.reference_urls)

ValidationError: 2 validation errors for RDFTriplet
subject.is-instance[Literal]
  Input should be an instance of Literal [type=is_instance_of, input_value='https://arxiv.org/abs/2402.04607', input_type=str]
    For further information visit https://errors.pydantic.dev/2.6/v/is_instance_of
subject.is-instance[URIRef]
  Input should be an instance of URIRef [type=is_instance_of, input_value='https://arxiv.org/abs/2402.04607', input_type=str]
    For further information visit https://errors.pydantic.dev/2.6/v/is_instance_of