In [1]:
# Patch asyncio so that event loops can be nested. Presumably required because
# the parser runs async code while the notebook kernel's own loop is already
# running -- TODO confirm.
import nest_asyncio
nest_asyncio.apply()

# Add the parent directory to the import path; presumably this is where the
# `desci_sense` package imported below lives -- verify if the notebook is moved.
import sys
sys.path.append("../")

from desci_sense.shared_functions.init import init_multi_chain_parser_config
from desci_sense.shared_functions.parsers.multi_chain_parser import MultiChainParser

from desci_sense.shared_functions.dataloaders import (
    scrape_post,
    convert_text_to_ref_post,
)

In [2]:
# Build the configuration consumed by MultiChainParser: pick the LLM to use
# and the post-processing mode applied to the parser output.
config = init_multi_chain_parser_config(
    llm_type="google/gemma-7b-it",
    post_process_type="combined",
)

In [3]:
# Instantiate the parser from the config. Construction logs each parser chain
# as it is initialised (see the INFO lines in the cell output).
multi_chain_parser = MultiChainParser(config)

[32m2024-04-24 15:40:59.519[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36m__init__[0m:[36m65[0m - [1mInitializing MultiChainParser. PostProcessType=combined[0m
[32m2024-04-24 15:40:59.523[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36m__init__[0m:[36m72[0m - [1mInitializing post parsers...[0m
[32m2024-04-24 15:40:59.524[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.post_parser_chain[0m:[36m__init__[0m:[36m26[0m - [1mInitializing parser chain 'refs_tagger' [0m
[32m2024-04-24 15:40:59.571[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.post_parser_chain[0m:[36m__init__[0m:[36m26[0m - [1mInitializing parser chain 'topics' [0m
[32m2024-04-24 15:40:59.600[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.post_parser_chain[0m:[36m__init__[0m:[36m26[0m - [1mInitializing parser chain 'keywords' [0m
[32m202

In [17]:
# Display the resolved configuration as a plain dict: one entry per parser
# chain plus the API and metadata-extraction settings.
config.model_dump()

{'openrouter_api_config': {'openrouter_api_base': 'https://openrouter.ai/api/v1'},
 'parser_configs': [{'name': 'refs_tagger',
   'type': <ParserChainType.REFERENCE_TAGGER: 'reference_tagger'>,
   'llm_config': {'llm_type': 'google/gemma-7b-it', 'temperature': '0.6'},
   'use_metadata': True,
   'max_keywords': 6},
  {'name': 'topics',
   'type': <ParserChainType.TOPICS: 'topics'>,
   'llm_config': {'llm_type': 'google/gemma-7b-it', 'temperature': '0.6'},
   'use_metadata': True,
   'max_keywords': 6},
  {'name': 'keywords',
   'type': <ParserChainType.KEYWORDS: 'keywords'>,
   'llm_config': {'llm_type': 'google/gemma-7b-it', 'temperature': '0.6'},
   'use_metadata': True,
   'max_keywords': 6},
  {'name': 'hashtags',
   'type': <ParserChainType.HASHTAGS: 'hashtags'>,
   'llm_config': {'llm_type': 'mistralai/mistral-7b-instruct',
    'temperature': '0.6'},
   'use_metadata': False,
   'max_hashtags': 20}],
 'metadata_extract_config': {'extraction_method': <MetadataExtractionType.CITOID

In [4]:
# Sample raw-text posts used as parser input. Both cite the same arXiv
# preprint but express opposite opinions about it.

# Post with a positive stance towards the referenced paper.
TEST_POST_TEXT_W_REF = """
I really liked this paper!
https://arxiv.org/abs/2402.04607
"""

# Post with a negative stance towards the same paper.
TEST_POST_2_TEXT_W_REF = """
I really disagree with this paper!
https://arxiv.org/abs/2402.04607
"""

# Ordered collection of the sample posts.
TEXT_POSTS = [
    TEST_POST_TEXT_W_REF,
    TEST_POST_2_TEXT_W_REF,
]

In [5]:
# Wrap each raw text post in the RefPost format expected by the parser,
# preserving the order of TEXT_POSTS.
inputs = list(map(convert_text_to_ref_post, TEXT_POSTS))

In [12]:
# Alternative input source: scrape existing posts directly from their URLs
# instead of building them from raw text.
urls = [
    "https://mastodon.social/@psmaldino@qoto.org/111405098400404613",
    "https://mastodon.social/@UlrikeHahn@fediscience.org/111732713776994953",
    "https://mastodon.social/@ronent/111687038322549430",
]
# One scraped post per URL, in the same order as `urls`.
posts = list(map(scrape_post, urls))

In [6]:
# Batch mode: parse all converted text posts in a single call. Per the debug
# log in the cell output, reference metadata is fetched first and the parser
# chains are then invoked in parallel.
results = multi_chain_parser.batch_process_ref_posts(inputs)

[32m2024-04-24 15:41:10.622[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.web_extractors.citoid[0m:[36mfetch_citation[0m:[36m54[0m - [34m[1mfetching citoid data for: https://arxiv.org/abs/2402.04607[0m
[32m2024-04-24 15:41:12.813[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36mbatch_process_ref_posts[0m:[36m231[0m - [34m[1mProcessing 2 posts with parsers: ['refs_tagger', 'topics', 'keywords', 'hashtags'][0m
[32m2024-04-24 15:41:12.815[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36mbatch_process_ref_posts[0m:[36m233[0m - [34m[1mInstantiating prompts...[0m
[32m2024-04-24 15:41:12.816[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36mbatch_process_ref_posts[0m:[36m243[0m - [34m[1mInvoking parallel chain...[0m
[32m2024-04-24 15:41:16.924[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.share

In [10]:
# Dump the first batch result (the "liked this paper" post) to a plain dict
# and display it. `dd` is reused by the following cell.
dd = results[0].model_dump()
dd

{'research_keyword': 'academic',
 'filter_classification': <SciFilterClassfication.RESEARCH: 'research'>,
 'item_types': ['preprint'],
 'reference_urls': ['https://arxiv.org/abs/2402.04607'],
 'reference_tagger': ['agrees', 'recommendation'],
 'keywords': ['AcademicMetrics',
  'GoogleScholar',
  'CitationManipulation',
  'SelfCitations',
  'CitationCartels',
  'ResearchIntegrity'],
 'topics': ['technology', 'academia', 'research'],
 'hashtags': [],
 'metadata_list': [{'citoid_url': 'http://arxiv.org/abs/2402.04607',
   'url': 'https://arxiv.org/abs/2402.04607',
   'item_type': 'preprint',
   'title': 'Google Scholar is manipulatable',
   'summary': "Citations are widely considered in scientists' evaluation. As such, scientists may be incentivized to inflate their citation counts. While previous literature has examined self-citations and citation cartels, it remains unclear whether scientists can purchase citations. Here, we compile a dataset of ~1.6 million profiles on Google Scholar t

In [12]:
# Debug record kept for the reference tagger chain; it includes the allowed
# tags, the raw LLM response text and the prompt that was sent.
dd["debug"]["reference_tagger"]

{'allowed_tags': ['endorses',
  'disagrees',
  'agrees',
  'watching',
  'reading',
  'listening',
  'default',
  'review',
  'recommendation',
  'question',
  'quote',
  'discussion',
  'event',
  'job',
  'announce'],
 'full_text': "**Reasoning Steps:**\n\n* The author expressed a positive sentiment towards the paper.\n* The post contains a direct link to the paper.\n\n\n**Candidate Tags:**\n\n* <agrees> - The author explicitly expressed a liking of the paper.\n* <recommendation> - The author clearly recommends the paper.\n\n\n**Final Answer:**\n\n**<agrees> <recommendation>** \n\n ##Allowed terms: ['endorses', 'disagrees', 'agrees', 'watching', 'reading', 'listening', 'default', 'review', 'recommendation', 'question', 'quote', 'discussion', 'event', 'job', 'announce']",
 'prompt': '\n    You are an expert annotator tasked with converting social media posts about scientific research to a structured semantic format. The input post contains a reference to an external URL. Your job is t

In [14]:
# Single-post mode: parse only the first scraped post instead of a batch.
result = multi_chain_parser.process_ref_post(posts[0])

[32m2024-04-16 10:21:28.353[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.web_extractors.citoid[0m:[36mfetch_citation[0m:[36m54[0m - [34m[1mfetching citoid data for: https://royalsocietypublishing.org/doi/10.1098/rstb.2022.0267[0m
[32m2024-04-16 10:21:29.502[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36mprocess_ref_post[0m:[36m173[0m - [34m[1mProcessing post with parsers: ['refs_tagger', 'topics', 'keywords'][0m
[32m2024-04-16 10:21:29.503[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36mprocess_ref_post[0m:[36m175[0m - [34m[1mInstantiating prompts...[0m
[32m2024-04-16 10:21:29.504[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36mprocess_ref_post[0m:[36m180[0m - [34m[1mInvoking parallel chain...[0m


In [16]:
# Filter classification assigned to the single parsed post.
result.filter_classification

<SciFilterClassfication.RESEARCH: 'research'>

In [18]:
# Tags produced for the post by the reference tagger.
result.reference_tagger

['recommendation', 'reading']

In [19]:
# Topics assigned to the post.
result.topics

['economics', 'research']