In [1]:
# Patch asyncio so an event loop can be re-entered from inside the notebook
# kernel (presumably required by the async metadata fetchers used further
# down — TODO confirm).
import nest_asyncio
nest_asyncio.apply()

import json
import traceback
import sys
# Put the parent directory on the import path so the `desci_sense` package
# imported below can be resolved when the notebook runs from its own folder.
sys.path.append("../")

from loguru import logger
from operator import itemgetter
from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnableLambda
from langchain_core.outputs import Generation
from langchain.output_parsers import PydanticOutputParser
from langchain_core.output_parsers import StrOutputParser
from typing import List, Union, Any, Dict
from langchain_core.pydantic_v1 import BaseModel, Field

from desci_sense.shared_functions.configs import MultiParserChainConfig, MultiRefTaggerChainConfig, MetadataExtractionConfig, LLMConfig, ParserChainType, RefTaggerChainConfig
from desci_sense.shared_functions.init import init_multi_chain_parser_config
from desci_sense.shared_functions.parsers.multi_chain_parser import MultiChainParser
from desci_sense.shared_functions.parsers.multi_reference_tagger import normalize_labels, normalize_references
from desci_sense.shared_functions.web_extractors.metadata_extractors import extract_posts_ref_metadata_dict, RefMetadata
from desci_sense.shared_functions.dataloaders import (
    scrape_post,
    convert_text_to_ref_post,
)
from desci_sense.shared_functions.utils import _find_json_object
from desci_sense.shared_functions.postprocessing import ParserChainOutput

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Model id used by every parser chain in this configuration.
# Other model id noted by the author: "openai/gpt-3.5-turbo"
LLM_TYPE = "mistralai/mixtral-8x7b-instruct:nitro"

# Two parser chains ("multi_ref_tagger" and "ref_tagger") run on the same
# post; post-processing mode is "combined" and reference metadata is
# extracted with the "citoid" method.
multi_config = MultiParserChainConfig(
    parser_configs=[
        MultiRefTaggerChainConfig(
            name="multi_ref_tagger",
            llm_config=LLMConfig(llm_type=LLM_TYPE),
        ),
        RefTaggerChainConfig(
            name="ref_tagger",
            llm_config=LLMConfig(llm_type=LLM_TYPE),
        ),
    ],
    post_process_type="combined",
    metadata_extract_config=MetadataExtractionConfig(extraction_method="citoid"),
)

SyntaxError: invalid syntax. Maybe you meant '==' or ':=' instead of '='? (326804237.py, line 7)

In [3]:
# Build the multi-chain parser from `multi_config`.
# NOTE(review): the saved log below reports PostProcessType=none although the
# config cell sets post_process_type="combined" — the output looks stale;
# re-run this cell after the config cell.
multi_chain_parser = MultiChainParser(multi_config)

[32m2024-05-03 14:40:02.692[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36m__init__[0m:[36m66[0m - [1mInitializing MultiChainParser. PostProcessType=none[0m
[32m2024-05-03 14:40:02.699[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36m__init__[0m:[36m73[0m - [1mInitializing post parsers...[0m
[32m2024-05-03 14:40:02.700[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.post_parser_chain[0m:[36m__init__[0m:[36m26[0m - [1mInitializing parser chain 'multi_ref_tagger' [0m
[32m2024-05-03 14:40:02.768[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.post_parser_chain[0m:[36m__init__[0m:[36m26[0m - [1mInitializing parser chain 'ref_tagger' [0m


In [12]:
# Inspect the configuration object held by the "multi_ref_tagger" chain.
multi_chain_parser.pparsers["multi_ref_tagger"].parser_config

KeywordPParserChainConfig(name='multi_ref_tagger', type=<ParserChainType.MULTI_REF_TAGGER: 'multi_reference_tagger'>, llm_config=LLMConfig(llm_type='mistralai/mixtral-8x7b-instruct:nitro', temperature='0.6'), use_metadata=True, max_keywords=6)

In [4]:
# Post that recommends papers without linking to them.
TEST_POST_TEXT_W_NO_REFS = "\nThese 2 papers are highly recommended!\n"

# Six separately converted copies of the same post, processed as one batch.
batch = [
    convert_text_to_ref_post(TEST_POST_TEXT_W_NO_REFS)
    for _ in range(6)
]

# Single-post variant, kept for reference:
# post_nr = convert_text_to_ref_post(TEST_POST_TEXT_W_NO_REFS)
# md_dict = extract_posts_ref_metadata_dict([post_nr], md_type=multi_config.metadata_extract_config.extraction_method)

In [7]:
# Process the whole batch in one call; batch_size equals the 6 posts in `batch`.
res = multi_chain_parser.batch_process_ref_posts(batch, batch_size=6)

[32m2024-05-01 17:03:30.984[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36mbatch_process_ref_posts[0m:[36m232[0m - [34m[1mProcessing 6 posts with parsers: ['multi_ref_tagger'][0m
[32m2024-05-01 17:03:30.984[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36mbatch_process_ref_posts[0m:[36m234[0m - [34m[1mInstantiating prompts...[0m
  0%|          | 0/6 [00:00<?, ?it/s][32m2024-05-01 17:03:30.986[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36mbatch_process_ref_posts[0m:[36m245[0m - [34m[1mInvoking parallel chain...[0m
100%|██████████| 6/6 [00:07<00:00,  1.28s/it]
[32m2024-05-01 17:03:38.636[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36mbatch_process_ref_posts[0m:[36m256[0m - [34m[1mPost processing 6 results...[0m
[32m2024-05-01 17:03:38.637[0m | [34m[

In [9]:
# Print the final tags of the first sub-answer for every post in the batch.
for post_result in res:
    first_sub_answer = post_result['multi_ref_tagger'].answer.sub_answers[0]
    print(first_sub_answer.final_answer)

['missing-ref']
['missing-ref']
['missing-ref']
['missing-ref']
['missing-ref']
['missing-ref']


In [5]:
# Run the configured parser chains on the link-less post; `res` is indexed by
# chain name in the cells that follow.
res = multi_chain_parser.process_text(TEST_POST_TEXT_W_NO_REFS)

[32m2024-05-03 14:40:26.256[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36mprocess_ref_post[0m:[36m185[0m - [34m[1mProcessing post with parsers: ['multi_ref_tagger', 'ref_tagger'][0m
[32m2024-05-03 14:40:26.257[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36mprocess_ref_post[0m:[36m187[0m - [34m[1mInstantiating prompts...[0m
[32m2024-05-03 14:40:26.258[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36mprocess_ref_post[0m:[36m192[0m - [34m[1mInvoking parallel chain...[0m


In [10]:
# Take the structured answer produced by the "multi_ref_tagger" chain and
# display it.
answer = res["multi_ref_tagger"].answer
answer

Answer(sub_answers=[SubAnswer(ref_number=0, reasoning_steps='The post is recommending two scientific papers, but it does not provide a link to them, so the <missing-ref> tag is appropriate.', candidate_tags=['<missing-ref>', '<dg-observation>', '<reading>'], final_answer=['missing-ref'], ref_url=None)], debug={})

In [13]:
# Keep a handle on the "multi_ref_tagger" chain so it can be driven step by
# step in the following cells.
parser = multi_chain_parser.pparsers["multi_ref_tagger"]

In [10]:
# Build the chain input by hand: convert the text to a post, gather reference
# metadata using the configured extraction method, then render the prompt.
post = convert_text_to_ref_post(TEST_POST_TEXT_W_NO_REFS)
extraction_method = multi_config.metadata_extract_config.extraction_method
md_dict = extract_posts_ref_metadata_dict(
    [post],
    md_type=extraction_method,
)
prompt = parser.instantiate_prompt(post, md_dict)

In [12]:
# Pass the rendered "multi_ref_tagger_input" prompt through parser.chat; the
# saved output below shows the result is the model's raw text reply.
# NOTE(review): this rebinds `res` (previously the per-chain parser output) to
# a plain string. Later cells that index res["multi_ref_tagger"] need
# process_text to be run again first.
res = parser.chat(prompt["multi_ref_tagger_input"])

In [13]:
# Display the raw reply text.
res

' Here\'s the structured semantic format for the given social media post:\n\n```json\n{\n  "sub_answers": [\n    {\n      "reasoning_steps": "The post recommends two papers, but does not provide a URL link to them. Therefore, it does not fall under the <missing-ref> tag.",\n      "candidate_tags": [],\n      "final_answer": ["<reading>"]\n    }\n  ]\n}\n```\n\nThe post recommends two papers, which means the author has either read them in the past, is reading them in the present, or is looking forward to reading them in the future. Therefore, the <reading> tag is the most suitable tag for this post.'

In [17]:
# Extract the JSON object embedded in the raw reply; the saved output shows
# the surrounding prose and code fence removed.
_find_json_object(res)

'{\n  "sub_answers": [\n    {\n      "reasoning_steps": "The post recommends two papers, but does not provide a URL link to them. Therefore, it does not fall under the <missing-ref> tag.",\n      "candidate_tags": [],\n      "final_answer": ["<reading>"]\n    }\n  ]\n}'

In [14]:
# Probe how the chain's pydantic parser reacts to a missing (None) completion.
# The saved output shows this call raising a ValidationError, so the error is
# caught and printed instead of aborting a top-to-bottom run of the notebook.
try:
    parsed_none = parser.pydantic_parser.parse(None)
except Exception as err:  # demonstration cell: surface whatever is raised
    print(f"{type(err).__name__}: {err}")
else:
    # No error raised: show what the parser produced instead.
    print(parsed_none)

ValidationError: 1 validation error for Generation
text
  none is not an allowed value (type=type_error.none.not_allowed)

In [None]:
# Re-derive the structured answer and display the chain's `extra` payload
# (the saved output shows it contains the rendered prompt).
# NOTE(review): in top-to-bottom order `res` is still the raw reply string
# assigned by the parser.chat cell, so this lookup only works after
# multi_chain_parser.process_text(...) has been run again. The execution
# count of this cell is None.
answer = res["multi_ref_tagger"].answer
res["multi_ref_tagger"].extra

{'prompt': '\nYou are an expert annotator tasked with converting social media posts about scientific research to a structured semantic format. For an input post, your job is to select the tags most suitable to that post, from a predefined set of tags. \n\n  The available tag types are:\n  <missing-ref>: this post seems to be referring to a reference by name but has not explicitly provided a URL link to the reference. For example, a post that discusses a book and mentions it by title, but contains no link to the book.\n  <dg-question>: this post is raising a research question.\n  <listening>: this post describes the listening status of the author in relation to a reference, such as a podcast or radio station. The author may have listened to the content in the past, is listening to the content in the present, or is looking forward to listening the content in the future.\n  <call-for-papers>: this post contains a call for research papers, for example to a journal, conference or workshop.\

In [None]:
# Display the structured answer.
answer

Answer(sub_answers=[SubAnswer(ref_number=0, reasoning_steps='The post is recommending 2 scientific papers, but is not providing direct links to them. I will tag this as <missing-ref> because it seems to be referring to references by name, but has not explicitly provided URL links to the references.', candidate_tags=['<missing-ref>', '<dg-observation>', '<reading>'], final_answer=['missing-ref'], ref_url=None)], debug={})

In [None]:
# Final tags of the first sub-answer from the "multi_ref_tagger" chain.
res["multi_ref_tagger"].answer.sub_answers[0].final_answer

['dg-claim', 'reading']

In [7]:
# Post that recommends two papers, each linked on its own line.
TEST_POST_TEXT_W_2_REFS = (
    "\n"
    "These 2 papers are highly recommended!\n"
    "https://arxiv.org/abs/2402.04607\n"
    "https://royalsocietypublishing.org/doi/10.1098/rstb.2022.0267\n"
)

# Manual conversion and metadata lookup, kept for reference:
# post_2 = convert_text_to_ref_post(TEST_POST_TEXT_W_2_REFS)
# md_dict = extract_posts_ref_metadata_dict([post_2], md_type=multi_config.metadata_extract_config.extraction_method)

In [6]:
# Run the parser on the two-link post; the saved log shows citation metadata
# being fetched for both URLs before the chains are invoked.
res = multi_chain_parser.process_text(TEST_POST_TEXT_W_2_REFS)

[32m2024-05-01 09:38:34.876[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.web_extractors.citoid[0m:[36mfetch_citation_async[0m:[36m20[0m - [34m[1mtarget_url=https://arxiv.org/abs/2402.04607[0m
[32m2024-05-01 09:38:34.898[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.web_extractors.citoid[0m:[36mfetch_citation_async[0m:[36m20[0m - [34m[1mtarget_url=https://royalsocietypublishing.org/doi/10.1098/rstb.2022.0267[0m
[32m2024-05-01 09:38:36.977[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36mprocess_ref_post[0m:[36m184[0m - [34m[1mProcessing post with parsers: ['multi_ref_tagger'][0m
[32m2024-05-01 09:38:36.979[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36mprocess_ref_post[0m:[36m186[0m - [34m[1mInstantiating prompts...[0m
[32m2024-05-01 09:38:36.981[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.parser

In [8]:
# Final tags assigned to the first sub-answer.
print(res["multi_ref_tagger"].answer.sub_answers[0].final_answer)

['recommendation', 'reading']


In [9]:
# URL of the reference the first sub-answer is attached to.
print(res["multi_ref_tagger"].answer.sub_answers[0].ref_url)

https://arxiv.org/abs/2402.04607


In [10]:
# Final tags assigned to the second sub-answer.
print(res["multi_ref_tagger"].answer.sub_answers[1].final_answer)

['recommendation', 'reading']


In [8]:
# Per-reference reasoning text from the "multi_ref_tagger" chain; the saved
# output shows a dict keyed by reference index. Bound to `reasoning` rather
# than `answer` so the name matches the attribute being read and does not
# shadow the structured answer object.
reasoning = res["multi_ref_tagger"].reasoning
reasoning

{0: 'The post recommends this preprint, which suggests that Google Scholar is manipulatable through citation fraud. The author of the post likely agrees with the findings of the preprint and finds it interesting or important.',
 1: 'The post recommends this journal article, which presents a model of group selection with a global externality. The author of the post likely finds the article interesting or important, but there is no explicit agreement or disagreement expressed regarding its contents.'}

In [7]:
# Structured answer of the "multi_ref_tagger" chain for the two-link post.
answer = res["multi_ref_tagger"].answer

In [8]:
# Convert the answer to its combined format; the saved output is a list with
# one list of tags per reference.
answer.to_combined_format()

[['recommendation'], ['recommendation']]

In [13]:
# Take the per-reference sub-answers out of the structured answer and display
# them.
sub_answers = answer.sub_answers
sub_answers

[SubAnswer(ref_number=0, reasoning_steps="The post recommends this preprint titled 'Google Scholar is manipulatable'.", candidate_tags='[<recommendation>, <reading>, <announce>]', final_answer=['recommendation', 'reading'], ref_url='https://arxiv.org/abs/2402.04607'),
 SubAnswer(ref_number=1, reasoning_steps="The post recommends this journal article titled 'Reducing global inequality increases local cooperation: a simple model of group selection with a global externality'.", candidate_tags='[<recommendation>, <reading>, <announce>]', final_answer=['recommendation', 'reading'], ref_url='https://royalsocietypublishing.org/doi/10.1098/rstb.2022.0267')]

In [14]:
# Order the sub-answers by their `ref_number` and display the sorted copy.
sorted(
    sub_answers,
    key=lambda sub_answer: sub_answer.ref_number,
)

[SubAnswer(ref_number=0, reasoning_steps="The post recommends this preprint titled 'Google Scholar is manipulatable'.", candidate_tags='[<recommendation>, <reading>, <announce>]', final_answer=['recommendation', 'reading'], ref_url='https://arxiv.org/abs/2402.04607'),
 SubAnswer(ref_number=1, reasoning_steps="The post recommends this journal article titled 'Reducing global inequality increases local cooperation: a simple model of group selection with a global externality'.", candidate_tags='[<recommendation>, <reading>, <announce>]', final_answer=['recommendation', 'reading'], ref_url='https://royalsocietypublishing.org/doi/10.1098/rstb.2022.0267')]

In [2]:
# Second setup: only the "ref_tagger" chain, with "citoid" metadata
# extraction. Rebinds `multi_config` and `multi_chain_parser`.
ref_tagger_config = RefTaggerChainConfig(
    name="ref_tagger",
    llm_config=LLMConfig(llm_type="mistralai/mixtral-8x7b-instruct:nitro"),
)
citoid_extraction = MetadataExtractionConfig(extraction_method="citoid")

multi_config = MultiParserChainConfig(
    parser_configs=[ref_tagger_config],
    metadata_extract_config=citoid_extraction,
)
multi_chain_parser = MultiChainParser(multi_config)


[32m2024-05-01 18:22:32.929[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36m__init__[0m:[36m66[0m - [1mInitializing MultiChainParser. PostProcessType=none[0m
[32m2024-05-01 18:22:32.932[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36m__init__[0m:[36m73[0m - [1mInitializing post parsers...[0m
[32m2024-05-01 18:22:32.932[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.post_parser_chain[0m:[36m__init__[0m:[36m26[0m - [1mInitializing parser chain 'ref_tagger' [0m


In [5]:
# Run the "ref_tagger"-only parser on the link-less post.
res = multi_chain_parser.process_text(TEST_POST_TEXT_W_NO_REFS)

[32m2024-05-01 18:22:49.330[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36mprocess_ref_post[0m:[36m185[0m - [34m[1mProcessing post with parsers: ['ref_tagger'][0m
[32m2024-05-01 18:22:49.331[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36mprocess_ref_post[0m:[36m187[0m - [34m[1mInstantiating prompts...[0m
[32m2024-05-01 18:22:49.332[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36mprocess_ref_post[0m:[36m192[0m - [34m[1mInvoking parallel chain...[0m


In [6]:
# Display the full result: a mapping from chain name to its ParserChainOutput.
res

{'ref_tagger': ParserChainOutput(answer=[['missing-ref', 'reading']], pparser_type=<ParserChainType.REFERENCE_TAGGER: 'reference_tagger'>, reasoning='[Reasoning Steps]\n\n1. The author is recommending two papers, which implies that they have read or are familiar with the content of these papers.\n2. The author does not provide explicit URL links to these papers, but they are clearly identified by name.\n\n[Candidate Tags]\n\n- <reading>: The author has likely read or is familiar with the content of the two papers they are recommending.\n- <missing-ref>: The author does not provide explicit URL links to the two papers they are recommending, but they are identified by name.', extra={'allowed_tags': ['missing-ref', 'dg-question', 'listening', 'call-for-papers', 'dg-observation', 'dg-claim', 'funding', 'watching', 'reading', 'event', 'job'], 'full_text': " Reasoning Steps:\n1. The author is recommending two papers, which implies that they have read or are familiar with the content of these

In [8]:
# Run the "ref_tagger"-only parser on the two-link post; the saved log shows
# metadata being fetched for both URLs first.
res = multi_chain_parser.process_text(TEST_POST_TEXT_W_2_REFS)

[32m2024-05-01 18:23:25.734[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.web_extractors.citoid[0m:[36mfetch_citation_async_retry[0m:[36m76[0m - [34m[1mtarget_url=https://arxiv.org/abs/2402.04607[0m
[32m2024-05-01 18:23:25.748[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.web_extractors.citoid[0m:[36mfetch_citation_async_retry[0m:[36m76[0m - [34m[1mtarget_url=https://royalsocietypublishing.org/doi/10.1098/rstb.2022.0267[0m
[32m2024-05-01 18:23:27.938[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36mprocess_ref_post[0m:[36m185[0m - [34m[1mProcessing post with parsers: ['ref_tagger'][0m
[32m2024-05-01 18:23:27.939[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36mprocess_ref_post[0m:[36m187[0m - [34m[1mInstantiating prompts...[0m
[32m2024-05-01 18:23:27.940[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.

In [9]:
# Display the full result for the two-link post (chain name -> ParserChainOutput).
res

{'ref_tagger': ParserChainOutput(answer=[['recommendation'], ['recommendation']], pparser_type=<ParserChainType.REFERENCE_TAGGER: 'reference_tagger'>, reasoning='[Reasoning Steps]\n\n1. The author of the post is recommending two scientific papers, indicating a <recommendation> tag.\n2. The post does not contain any disagreement or agreement with the papers, so no <disagrees> or <agrees> tags are applicable.\n3. The post does not contain a review, announcement, or discussion of the papers, so no <review>, <announce>, or <discussion> tags are applicable.\n4. The post does not indicate that the author is listening to, watching, or funding the papers, so no <listening>, <watching>, or <funding> tags are applicable.\n5. The post does not contain a question about the papers, so no <question> tag is applicable.\n6. The post does not mention an event, job, or indicate interest using the 👀 emoji, so no <event>, <job>, or <indicates-interest> tags are applicable.\n\n[Candidate Tags]\n\n* <recomm