In [1]:
# nest_asyncio.apply() patches asyncio so an already-running event loop can be
# re-entered. Presumably required because the notebook kernel runs its own loop
# while the parser makes async calls -- TODO confirm.
import nest_asyncio
nest_asyncio.apply()

# Add the parent directory to the import path so the `desci_sense` package
# resolves; assumes the notebook is launched from a directory one level below
# the package root -- TODO confirm.
import sys
sys.path.append("../")

from desci_sense.shared_functions.init import init_multi_chain_parser_config
from desci_sense.shared_functions.parsers.multi_chain_parser import MultiChainParser

from desci_sense.shared_functions.dataloaders import (
    scrape_post,
    convert_text_to_ref_post,
)

In [2]:
# Build the multi-chain parser configuration: choose the LLM by its model id
# and request the "combined" post-processing type.
config = init_multi_chain_parser_config(
    llm_type="google/gemma-7b-it",
    post_process_type="combined",
)

In [3]:
# Instantiate the parser from the config built above; the log output recorded
# below shows it initializing its individual post-parser chains.
multi_chain_parser = MultiChainParser(config)

[32m2024-04-30 15:41:32.755[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36m__init__[0m:[36m65[0m - [1mInitializing MultiChainParser. PostProcessType=combined[0m
[32m2024-04-30 15:41:32.759[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36m__init__[0m:[36m72[0m - [1mInitializing post parsers...[0m
[32m2024-04-30 15:41:32.759[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.post_parser_chain[0m:[36m__init__[0m:[36m26[0m - [1mInitializing parser chain 'refs_tagger' [0m
[32m2024-04-30 15:41:32.807[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.post_parser_chain[0m:[36m__init__[0m:[36m26[0m - [1mInitializing parser chain 'topics' [0m
[32m2024-04-30 15:41:32.833[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.post_parser_chain[0m:[36m__init__[0m:[36m26[0m - [1mInitializing parser chain 'keywords' [0m
[32m202

In [4]:
# Example raw-text posts: both cite the same arXiv paper but express opposite
# opinions. Each string begins and ends with a newline.
TEST_POST_TEXT_W_REF = (
    "\n"
    "I really liked this paper!\n"
    "https://arxiv.org/abs/2402.04607\n"
)

TEST_POST_2_TEXT_W_REF = (
    "\n"
    "I really disagree with this paper!\n"
    "https://arxiv.org/abs/2402.04607\n"
)

# The example posts collected in order.
TEXT_POSTS = [TEST_POST_TEXT_W_REF, TEST_POST_2_TEXT_W_REF]

In [5]:
# Wrap every raw text post with convert_text_to_ref_post so it can be handed
# to the parser as a RefPost.
inputs = list(map(convert_text_to_ref_post, TEXT_POSTS))

In [12]:
# Alternative input source: scrape posts directly from their URLs.
# NOTE(review): `posts` is not passed to the parser in the cells visible here
# (the batch call uses `inputs`) -- confirm whether this cell is still needed.
urls = [
    "https://mastodon.social/@psmaldino@qoto.org/111405098400404613",
    "https://mastodon.social/@UlrikeHahn@fediscience.org/111732713776994953",
    "https://mastodon.social/@ronent/111687038322549430",
]
posts = [scrape_post(post_url) for post_url in urls]

In [6]:
# batch process: hand all converted text posts (`inputs`) to the parser in a
# single call; the recorded log shows the posts being processed together by
# the configured parser chains.
results = multi_chain_parser.batch_process_ref_posts(inputs)

[32m2024-04-24 15:41:10.622[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.web_extractors.citoid[0m:[36mfetch_citation[0m:[36m54[0m - [34m[1mfetching citoid data for: https://arxiv.org/abs/2402.04607[0m
[32m2024-04-24 15:41:12.813[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36mbatch_process_ref_posts[0m:[36m231[0m - [34m[1mProcessing 2 posts with parsers: ['refs_tagger', 'topics', 'keywords', 'hashtags'][0m
[32m2024-04-24 15:41:12.815[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36mbatch_process_ref_posts[0m:[36m233[0m - [34m[1mInstantiating prompts...[0m
[32m2024-04-24 15:41:12.816[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36mbatch_process_ref_posts[0m:[36m243[0m - [34m[1mInvoking parallel chain...[0m
[32m2024-04-24 15:41:16.924[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.share

In [6]:
# scrape a single post by url (used as input for the single-post example)
url = "https://twitter.com/mbauwens/status/1779543397528740338"
post = scrape_post(url)

In [7]:
# single process: run only the one scraped `post` through the parser
# (as opposed to the batch call over a list of posts).
result = multi_chain_parser.process_ref_post(post)

[32m2024-04-30 15:42:45.141[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.web_extractors.citoid[0m:[36mfetch_citation_async[0m:[36m20[0m - [34m[1mtarget_url=https://icandecide.org/v-safe-data/[0m
[32m2024-04-30 15:42:45.164[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.web_extractors.citoid[0m:[36mfetch_citation_async[0m:[36m20[0m - [34m[1mtarget_url=https://icandecide.org/pfizer-documents/[0m
[32m2024-04-30 15:43:04.309[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36mprocess_ref_post[0m:[36m184[0m - [34m[1mProcessing post with parsers: ['refs_tagger', 'topics', 'keywords', 'hashtags'][0m
[32m2024-04-30 15:43:04.310[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36mprocess_ref_post[0m:[36m186[0m - [34m[1mInstantiating prompts...[0m
[32m2024-04-30 15:43:04.311[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_funct

In [8]:
# Inspect the topics attached to the parsed post (a list of topic strings in
# the recorded output).
result.topics

['sustainability',
 'software & hardware',
 'academia',
 'climate',
 'news',
 'health',
 'technology',
 'research']

In [16]:
# Inspect the filter classification of the parsed post (an enum-like value in
# the recorded output).
result.filter_classification

<SciFilterClassfication.RESEARCH: 'research'>

In [18]:
# Inspect the reference-tagger labels of the parsed post (a list of strings in
# the recorded output).
result.reference_tagger

['recommendation', 'reading']

In [19]:
# NOTE(review): this repeats an earlier `result.topics` cell, and the two
# recorded outputs differ, so they were captured at different points of the
# session -- re-run top to bottom and keep a single copy.
result.topics

['economics', 'research']