In [1]:
# Patch asyncio so that nested event loops are permitted.
# NOTE(review): presumably needed because the parser drives async code while
# Jupyter's own event loop is already running — confirm.
import nest_asyncio
nest_asyncio.apply()

# Add the parent directory to the import search path so the `desci_sense`
# package imported below can be resolved.
# NOTE(review): assumes this notebook lives one level below the package root — confirm.
import sys
sys.path.append("../")

from desci_sense.shared_functions.init import init_multi_chain_parser_config
from desci_sense.shared_functions.parsers.multi_chain_parser import MultiChainParser

from desci_sense.shared_functions.dataloaders import (
    scrape_post,
    convert_text_to_ref_post,
)

In [8]:
# Build the multi-chain parser configuration: which LLM to query and which
# post-processing type to apply to the parser output.
config = init_multi_chain_parser_config(
    llm_type="google/gemma-7b-it",
    post_process_type="combined",
)

In [9]:
# Instantiate the parser from the configuration built in the previous cell.
multi_chain_parser = MultiChainParser(config)

[32m2024-04-16 10:20:55.534[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36m__init__[0m:[36m64[0m - [1mInitializing MultiChainParser. PostProcessType=combined[0m
[32m2024-04-16 10:20:55.537[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36m__init__[0m:[36m71[0m - [1mInitializing post parsers...[0m
[32m2024-04-16 10:20:55.538[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.post_parser_chain[0m:[36m__init__[0m:[36m24[0m - [1mInitializing parser chain 'refs_tagger' [0m
[32m2024-04-16 10:20:55.580[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.post_parser_chain[0m:[36m__init__[0m:[36m24[0m - [1mInitializing parser chain 'topics' [0m
[32m2024-04-16 10:20:55.609[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.post_parser_chain[0m:[36m__init__[0m:[36m24[0m - [1mInitializing parser chain 'keywords' [0m


In [10]:
# Sample raw-text posts: both cite the same arXiv paper, one approvingly and
# one critically. Each string starts and ends with a newline.
TEST_POST_TEXT_W_REF = (
    "\n"
    "I really liked this paper!\n"
    "https://arxiv.org/abs/2402.04607\n"
)

TEST_POST_2_TEXT_W_REF = (
    "\n"
    "I really disagree with this paper!\n"
    "https://arxiv.org/abs/2402.04607\n"
)

# Ordered collection of the sample posts used as parser input further down.
TEXT_POSTS = [
    TEST_POST_TEXT_W_REF,
    TEST_POST_2_TEXT_W_REF,
]

In [11]:
# Wrap every raw text sample as a RefPost, the input format the parser expects.
inputs = [
    convert_text_to_ref_post(raw_text)
    for raw_text in TEXT_POSTS
]

In [12]:
# Alternative input source: scrape existing posts from their URLs.
urls = [
    "https://mastodon.social/@psmaldino@qoto.org/111405098400404613",
    "https://mastodon.social/@UlrikeHahn@fediscience.org/111732713776994953",
    "https://mastodon.social/@ronent/111687038322549430",
]
# One scrape_post call per URL, keeping the same order as `urls`.
posts = list(map(scrape_post, urls))

In [13]:
# Batch processing: run the parser over every RefPost in `inputs` with a
# single call; the outcome for the whole batch is stored in `results`.
results = multi_chain_parser.batch_process_ref_posts(inputs)

[32m2024-04-16 10:21:13.659[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.web_extractors.citoid[0m:[36mfetch_citation[0m:[36m54[0m - [34m[1mfetching citoid data for: https://arxiv.org/abs/2402.04607[0m
[32m2024-04-16 10:21:16.077[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36mbatch_process_ref_posts[0m:[36m220[0m - [34m[1mProcessing 2 posts with parsers: ['refs_tagger', 'topics', 'keywords'][0m
[32m2024-04-16 10:21:16.078[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36mbatch_process_ref_posts[0m:[36m222[0m - [34m[1mInstantiating prompts...[0m
[32m2024-04-16 10:21:16.079[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36mbatch_process_ref_posts[0m:[36m232[0m - [34m[1mInvoking parallel chain...[0m
[32m2024-04-16 10:21:19.645[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.

In [14]:
# Single-post processing: parse only the first of the scraped `posts`.
result = multi_chain_parser.process_ref_post(posts[0])

[32m2024-04-16 10:21:28.353[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.web_extractors.citoid[0m:[36mfetch_citation[0m:[36m54[0m - [34m[1mfetching citoid data for: https://royalsocietypublishing.org/doi/10.1098/rstb.2022.0267[0m
[32m2024-04-16 10:21:29.502[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36mprocess_ref_post[0m:[36m173[0m - [34m[1mProcessing post with parsers: ['refs_tagger', 'topics', 'keywords'][0m
[32m2024-04-16 10:21:29.503[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36mprocess_ref_post[0m:[36m175[0m - [34m[1mInstantiating prompts...[0m
[32m2024-04-16 10:21:29.504[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36mprocess_ref_post[0m:[36m180[0m - [34m[1mInvoking parallel chain...[0m


In [16]:
# Inspect the filter classification attached to the single-post result.
result.filter_classification

<SciFilterClassfication.RESEARCH: 'research'>

In [18]:
# Inspect the labels produced for the post by the reference tagger.
result.reference_tagger

['recommendation', 'reading']

In [19]:
# Inspect the topics assigned to the post.
result.topics

['economics', 'research']