In [2]:
# Patch asyncio first so nested event loops work inside the notebook kernel.
import nest_asyncio
nest_asyncio.apply()

import json
import sys
import traceback
from operator import itemgetter
from typing import List, Union, Any, Dict

# Add the parent directory to the import path before the desci_sense imports below.
sys.path.append("../")

from loguru import logger
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from langchain_core.exceptions import OutputParserException
from langchain_core.output_parsers import StrOutputParser
from langchain_core.outputs import Generation
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.runnables import RunnableLambda

from desci_sense.shared_functions.configs import MultiParserChainConfig, MultiRefTaggerChainConfig, MetadataExtractionConfig, LLMConfig, ParserChainType
from desci_sense.shared_functions.init import init_multi_chain_parser_config
from desci_sense.shared_functions.parsers.multi_chain_parser import MultiChainParser
from desci_sense.shared_functions.web_extractors.metadata_extractors import extract_posts_ref_metadata_dict, RefMetadata
from desci_sense.shared_functions.dataloaders import (
    scrape_post,
    convert_text_to_ref_post,
)
from desci_sense.shared_functions.postprocessing import ParserChainOutput

In [3]:
# Parser configuration: one multi-reference tagger chain backed by a Mixtral
# model, with reference metadata fetched through the "citoid" extraction method.
multi_config = MultiParserChainConfig(
    parser_configs=[
        MultiRefTaggerChainConfig(
            name="multi_ref_tagger",
            llm_config=LLMConfig(
                llm_type="mistralai/mixtral-8x7b-instruct:nitro",
            ),
        ),
    ],
    metadata_extract_config=MetadataExtractionConfig(
        extraction_method="citoid",
    ),
)

# llm_type values noted for this experiment:
#   mistralai/mixtral-8x7b-instruct:nitro
#   openai/gpt-3.5-turbo

In [4]:
# Build the multi-chain parser from multi_config; the log output below shows
# it initialising the 'multi_ref_tagger' parser chain.
multi_chain_parser = MultiChainParser(multi_config)

[32m2024-04-30 08:04:35.580[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36m__init__[0m:[36m65[0m - [1mInitializing MultiChainParser. PostProcessType=none[0m
[32m2024-04-30 08:04:35.589[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36m__init__[0m:[36m72[0m - [1mInitializing post parsers...[0m
[32m2024-04-30 08:04:35.590[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.post_parser_chain[0m:[36m__init__[0m:[36m26[0m - [1mInitializing parser chain 'multi_ref_tagger' [0m


In [13]:
# Bind the tagger chain to a name: later cells call `multi_ref_parser`
# (instantiate_prompt, chat, parser_config), so it has to exist when the
# notebook is run top to bottom on a fresh kernel.
multi_ref_parser = multi_chain_parser.pparsers["multi_ref_tagger"]

# Show the configuration the tagger chain was built with.
multi_ref_parser.parser_config

KeywordPParserChainConfig(name='multi_ref_tagger', type=<ParserChainType.MULTI_REF_TAGGER: 'multi_reference_tagger'>, llm_config=LLMConfig(llm_type='mistralai/mixtral-8x7b-instruct:nitro', temperature='0.6'), use_metadata=True, max_keywords=6)

In [5]:
# Sample post that mentions papers but contains no URLs.
TEST_POST_TEXT_W_NO_REFS = """
These 2 papers are highly recommended!
"""
# Disabled manual path: convert the text to a ref post and extract metadata by hand.
# post_nr = convert_text_to_ref_post(TEST_POST_TEXT_W_NO_REFS)
# md_dict = extract_posts_ref_metadata_dict([post_nr], md_type=multi_config.metadata_extract_config.extraction_method)

In [6]:
# Run the parser on the raw post text. Per the cells below, the result is
# keyed by parser chain name.
res = multi_chain_parser.process_text(TEST_POST_TEXT_W_NO_REFS)

[32m2024-04-30 08:05:53.648[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36mprocess_ref_post[0m:[36m184[0m - [34m[1mProcessing post with parsers: ['multi_ref_tagger'][0m
[32m2024-04-30 08:05:53.649[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36mprocess_ref_post[0m:[36m186[0m - [34m[1mInstantiating prompts...[0m
[32m2024-04-30 08:05:53.649[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36mprocess_ref_post[0m:[36m191[0m - [34m[1mInvoking parallel chain...[0m


In [8]:
# List which parser chains produced an entry in the result.
res.keys()

dict_keys(['multi_ref_tagger'])

In [10]:
# Final tags of the first sub-answer returned by the tagger chain.
res["multi_ref_tagger"].answer.sub_answers[0].final_answer

['missing-ref']

In [14]:
# Keep a handle on the tagger chain's answer object for closer inspection.
answer = res["multi_ref_tagger"].answer

In [16]:
# Export the answer object via .dict().
# NOTE(review): the recorded output below is a JSON *string*, which looks like
# the result of `.json()` rather than `.dict()` - the output may be stale; confirm.
answer.dict()

'{"sub_answers": [{"ref_number": 1, "reasoning_steps": "The post is recommending two scientific papers, but it does not provide any links to the papers. I will tag this post as <missing-ref> because the papers are only mentioned by name.", "candidate_tags": ["<missing-ref>", "<dg-observation>", "<reading>"], "final_answer": ["missing-ref"]}], "debug": {}}'

In [56]:

# Invoke the chain with the prompt plus an explicit "allowed_terms" list.
# NOTE(review): `chain` is not assigned in any visible cell and `prompt` is only
# assigned further down, so this cell relies on kernel state left over from
# out-of-order execution - confirm before relying on Restart & Run All.
res = chain.invoke({"input_prompt": prompt, "allowed_terms": ["1"]})
# Display the raw result; the recorded output is a dict with 'answer_chain' and 'allowed_terms'.
res

{'answer_chain': Answer(sub_answers=[SubAnswer(ref_number=1, reasoning_steps='The post is recommending two scientific papers, but is not providing links to them. I will tag this as <missing-ref> because the papers are only identified by name.', candidate_tags=['<missing-ref>'], final_answer=['<missing-ref>']), SubAnswer(ref_number=1, reasoning_steps='The post is recommending two scientific papers, which implies that the author has read them in the past. I will tag this as <reading>.', candidate_tags=['<reading>'], final_answer=['<reading>'])], debug={}),
 'allowed_terms': ['1']}

In [32]:
# The preceding chain call is recorded as returning a dict that wraps the
# Answer under "answer_chain", while other invocations in this notebook return
# the Answer object directly. Unwrap when needed so this cell works either way
# instead of failing with AttributeError on the dict.
tagger_answer = res["answer_chain"] if isinstance(res, dict) else res

# Final tags of the first sub-answer.
tagger_answer.sub_answers[0].final_answer

['<missing-ref>']

In [None]:
# https://twitter.com/akhila_yerukola/status/1784011263078478200

In [33]:
# Scrape a Mastodon post to use as parser input.
url = "https://mastodon.social/@psmaldino@qoto.org/111405098400404613"
post = scrape_post(url)

In [34]:
# Fetch metadata for the URLs referenced by the post, using the extraction
# method set in multi_config ("citoid").
md_dict = extract_posts_ref_metadata_dict([post], md_type=multi_config.metadata_extract_config.extraction_method)
# Display the result; the recorded output maps each reference URL to a RefMetadata entry.
md_dict

[32m2024-04-29 08:24:20.935[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.web_extractors.citoid[0m:[36mfetch_citation[0m:[36m54[0m - [34m[1mfetching citoid data for: https://royalsocietypublishing.org/doi/10.1098/rstb.2022.0267[0m


{'https://royalsocietypublishing.org/doi/10.1098/rstb.2022.0267': RefMetadata(citoid_url='https://royalsocietypublishing.org/doi/10.1098/rstb.2022.0267', url='https://royalsocietypublishing.org/doi/10.1098/rstb.2022.0267', item_type='journalArticle', title='Reducing global inequality increases local cooperation: a simple model of group selection with a global externality', summary='', image='')}

In [35]:
# Instantiate the tagger prompt for this post and its reference metadata.
# NOTE(review): `multi_ref_parser` is presumably the 'multi_ref_tagger' chain of
# multi_chain_parser (its parser_config repr further down matches) - confirm.
prompt_dict = multi_ref_parser.instantiate_prompt(post, md_dict)

In [8]:
# Keys available in the instantiated prompt dict.
prompt_dict.keys()

dict_keys(['multi_ref_tagger_input', 'multi_ref_tagger_input_allowed_terms'])

In [36]:
# Pull out the tagger prompt text and print it for inspection.
prompt = prompt_dict["multi_ref_tagger_input"]
print(prompt)


You are an expert annotator tasked with converting social media posts about scientific research to a structured semantic format. The input post contains a reference to an external URL. Your job is to select the tags best characterizing the relation of the post to the external reference, from a predefined set of tags. 

The available tag types are:
<disagrees>: this post disputes or expresses disagreement with statements, ideas or conclusions presented in the mentioned reference.
<review>: this post contains a review of another reference, such as a book, article or movie. The review could be positive or negative. A review can be detailed or a simple short endorsement.
<announce>: this post contains an announcement of new research. The announcement is likely made by the authors but may be a third party. We use a broad definition of research that includes classic and non-traditional outputs. Classic outputs include papers, datasets or code. Non traditional outputs can include a podcast, 

In [37]:
# Run the chain on the single-reference prompt.
# NOTE(review): `chain` is not assigned in any visible cell - confirm where it comes from.
res = chain.invoke({"input_prompt": prompt})
# Display the result; the recorded output is an Answer with one SubAnswer.
res

Answer(sub_answers=[SubAnswer(ref_number=1, reasoning_steps='The post is written by Paul Smaldino, who is also an author of the mentioned paper. The post contains a link to the paper and a brief description of its content. This suggests that the post is related to the paper and is most likely announcing or endorsing it.', candidate_tags=[{'tag': '<announce>', 'reason': "Paul Smaldino is one of the authors of the paper and the post contains a link to the paper, indicating that it's a new research output."}, {'tag': '<endorses>', 'reason': 'Paul Smaldino is one of the authors of the paper and is sharing it on social media, which can be seen as an endorsement of the work.'}], final_answer=['<announce>', '<endorses>'])], debug={})

In [38]:
# Candidate tags for the first reference; in the recorded run this is a list
# of {'tag', 'reason'} dicts.
res.sub_answers[0].candidate_tags

[{'tag': '<announce>',
  'reason': "Paul Smaldino is one of the authors of the paper and the post contains a link to the paper, indicating that it's a new research output."},
 {'tag': '<endorses>',
  'reason': 'Paul Smaldino is one of the authors of the paper and is sharing it on social media, which can be seen as an endorsement of the work.'}]

In [39]:
# Final tags chosen for the first reference.
res.sub_answers[0].final_answer

['<announce>', '<endorses>']

In [41]:
# Send the same prompt through the parser's chat method and print the text it returns.
result = multi_ref_parser.chat(prompt)
print(result)

```json
{
	"reasoning_steps": "The post by Paul Smaldino announces a new paper published in the journal Phil Trans B with Karolina Safarzynska. The post provides a brief description of the paper's content and includes a link to the publication. Therefore, the post falls under the <announce> tag.",
	"candidate_tags": {
		"<announce>": "The post announces the publication of a new research paper in a journal, providing details about the content and a link to the paper.",
		"<endorses>": "The post implicitly endorses the mentioned reference by sharing information about it and highlighting the significance of the model discussed in the paper."
	},
	"final_answer": ["<announce>"]
}
```


In [40]:
# Show the configuration the tagger parser is running with.
multi_ref_parser.parser_config

KeywordPParserChainConfig(name='multi_ref_tagger', type=<ParserChainType.MULTI_REF_TAGGER: 'multi_reference_tagger'>, llm_config=LLMConfig(llm_type='mistralai/mixtral-8x7b-instruct:nitro', temperature='0.6'), use_metadata=True, max_keywords=6)

In [41]:
# Build a tagger config with only a name to see the remaining defaults
# (the recorded repr shows llm_type 'mistralai/mistral-7b-instruct').
cfg = MultiRefTaggerChainConfig(name="multi_ref_tagger")
cfg

MultiRefTaggerChainConfig(name='multi_ref_tagger', type=<ParserChainType.MULTI_REF_TAGGER: 'multi_reference_tagger'>, llm_config=LLMConfig(llm_type='mistralai/mistral-7b-instruct', temperature='0.6'), use_metadata=True)

In [42]:
# Sample post that recommends two papers and links both by URL.
TEST_POST_TEXT_W_2_REFS = """
These 2 papers are highly recommended!
https://arxiv.org/abs/2402.04607
https://royalsocietypublishing.org/doi/10.1098/rstb.2022.0267
"""
# Wrap the raw text as a ref post, then fetch metadata for its URLs.
# Note: this rebinds `md_dict`, replacing the metadata extracted for the earlier post.
post_2 = convert_text_to_ref_post(TEST_POST_TEXT_W_2_REFS)
md_dict = extract_posts_ref_metadata_dict([post_2], md_type=multi_config.metadata_extract_config.extraction_method)

[32m2024-04-29 08:25:41.824[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.web_extractors.citoid[0m:[36mfetch_citation_async[0m:[36m20[0m - [34m[1mtarget_url=https://arxiv.org/abs/2402.04607[0m
[32m2024-04-29 08:25:41.845[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.web_extractors.citoid[0m:[36mfetch_citation_async[0m:[36m20[0m - [34m[1mtarget_url=https://royalsocietypublishing.org/doi/10.1098/rstb.2022.0267[0m


In [43]:
# Instantiate the tagger prompt for the two-reference post and keep its text.
# Note: this rebinds `prompt_dict` from the earlier single-reference post.
prompt_dict = multi_ref_parser.instantiate_prompt(post_2, md_dict)
prompt_2 = prompt_dict["multi_ref_tagger_input"]

In [44]:
# Run the chain on the two-reference prompt.
# NOTE(review): `chain` is not assigned in any visible cell - confirm where it comes from.
res = chain.invoke({"input_prompt": prompt_2})
# Display the result; the recorded output has one SubAnswer per reference.
res

Answer(sub_answers=[SubAnswer(ref_number=1, reasoning_steps='The author recommends this preprint, which calls into question the validity of Google Scholar as a metric for scientific evaluation.', candidate_tags='[<recommendation>, <announce>, <discussion>]', final_answer=['<recommendation>', '<announce>', '<discussion>']), SubAnswer(ref_number=2, reasoning_steps='The author recommends this journal article, which presents a model of group selection with a global externality.', candidate_tags='[<recommendation>, <announce>, <discussion>]', final_answer=['<recommendation>', '<announce>', '<discussion>'])], debug={})

In [46]:
# Final tags for the first reference.
print(res.sub_answers[0].final_answer)

['<recommendation>', '<announce>', '<discussion>']


In [45]:
# Final tags for the second reference.
print(res.sub_answers[1].final_answer)

['<recommendation>', '<announce>', '<discussion>']


In [19]:
# Run the same prompt through `db_chain`; the recorded output is an AIMessage
# whose content is JSON text, not a parsed Answer.
# NOTE(review): `db_chain` is not assigned in any visible cell - confirm where it comes from.
res = db_chain.invoke({"input_prompt": prompt_2})
res

AIMessage(content='\n{\n  "sub_answers": [\n    {\n      "ref_number": 1,\n      "reasoning_steps": "The post explicitly states that the paper is highly recommended, indicating an endorsement or recommendation. The paper discusses the issue of citation fraud on Google Scholar, which aligns with the content of the post.",\n      "candidate_tags": "[<endorses>, <discussion>, <recommendation>]",\n      "final_answer": [\n        "<endorses>",\n        "<recommendation>"\n      ]\n    },\n    {\n      "ref_number": 2,\n      "reasoning_steps": "The post explicitly states that the paper is highly recommended, indicating an endorsement or recommendation. The summary of the paper suggests it explores the relationship between global inequality and local cooperation, which could lead to discussions or endorsement.",\n      "candidate_tags": "[<endorses>, <discussion>, <recommendation>]",\n      "final_answer": [\n        "<endorses>",\n        "<recommendation>"\n      ]\n    }\n  ]\n}', respon

In [20]:
# Text content of the returned message.
res.content

'\n{\n  "sub_answers": [\n    {\n      "ref_number": 1,\n      "reasoning_steps": "The post explicitly states that the paper is highly recommended, indicating an endorsement or recommendation. The paper discusses the issue of citation fraud on Google Scholar, which aligns with the content of the post.",\n      "candidate_tags": "[<endorses>, <discussion>, <recommendation>]",\n      "final_answer": [\n        "<endorses>",\n        "<recommendation>"\n      ]\n    },\n    {\n      "ref_number": 2,\n      "reasoning_steps": "The post explicitly states that the paper is highly recommended, indicating an endorsement or recommendation. The summary of the paper suggests it explores the relationship between global inequality and local cooperation, which could lead to discussions or endorsement.",\n      "candidate_tags": "[<endorses>, <discussion>, <recommendation>]",\n      "final_answer": [\n        "<endorses>",\n        "<recommendation>"\n      ]\n    }\n  ]\n}'

In [59]:
# Pass the result through `output_wrangler`; the recorded output is a compact JSON string.
# NOTE(review): `output_wrangler` is not defined in any visible cell, and the
# recorded text differs from `res.content` above (e.g. "<endorse>" vs "<endorses>"),
# so this output appears to come from a different run - confirm.
output_wrangler(res)

'{"sub_answers": [{"ref_number": 1, "reasoning_steps": "The post explicitly states that the paper at the first link is highly recommended, indicating an endorsement. The paper discusses the manipulatability of Google Scholar and citation fraud, which is a significant issue in scientific research.", "candidate_tags": "[<endorse>, <discussion>]", "final_answer": ["<endorse>"]}, {"ref_number": 2, "reasoning_steps": "The post explicitly states that the paper at the second link is highly recommended, indicating an endorsement. The paper discusses reducing global inequality and its effect on local cooperation, which is a relevant topic in social science research.", "candidate_tags": "[<endorse>, <discussion>]", "final_answer": ["<endorse>"]}]}'

In [37]:
# Send the two-reference prompt through the parser's chat method and print the
# returned text; in the recorded run it is a top-level JSON list, one entry per reference.
result = multi_ref_parser.chat(prompt_2)
print(result)

[
    {
        "ref_number": 1,
        "reasoning_steps": "The post explicitly states that the first paper is highly recommended, indicating an endorsement of the content. The paper discusses the manipulability of Google Scholar in terms of citation fraud, which is a significant issue in scientific research.",
        "candidate_tags": "[<endorses>, <discussion>, <recommendation>]",
        "final_answer": [
            "<endorses>",
            "<recommendation>"
        ]
    },
    {
        "ref_number": 2,
        "reasoning_steps": "The post explicitly states that the second paper is highly recommended, indicating an endorsement of the content. The paper discusses reducing global inequality to increase local cooperation, which is a relevant topic in social sciences.",
        "candidate_tags": "[<endorses>, <discussion>, <recommendation>]",
        "final_answer": [
            "<endorses>",
            "<recommendation>"
        ]
    }
]


In [57]:
# Print the parser's format instructions (the JSON schema the model output should follow).
# NOTE(review): `pydantic_output_parser` is not assigned in any visible cell - confirm.
print(pydantic_output_parser.get_format_instructions())

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"properties": {"ref_number": {"title": "Ref Number", "description": "ID number of current reference", "type": "integer"}, "reasoning_steps": {"title": "Reasoning Steps", "description": "your reasoning steps", "type": "string"}, "candidate_tags": {"title": "Candidate Tags", "description": "For potential each tag you choose, explain why you chose it.", "type": "string"}, "final_answer": {"title": "Final Answer", "description": "Set of final tags, based on the Candidate Tags. The final tags must be included in the Candidate Tags list!", "type": 

In [9]:
# Synthetic model reply: a fenced JSON list (with a single object) surrounded
# by filler text, used as parser input in the cells below.
test_output = """
bla bla bla ```
[{
        "ref_number": 2,
        "reasoning_steps": "The post explicitly states that the second paper is highly recommended, indicating an endorsement of the content. The paper discusses reducing global inequality to increase local cooperation, which is a relevant topic in social sciences.",
        "candidate_tags": "[<endorses>, <discussion>, <recommendation>]",
        "final_answer": [
            "<endorses>",
            "<recommendation>"
        ]
    }]
```
bla blas
"""

In [10]:
# Parse the JSON embedded in the noisy text; the recorded result is a list
# containing one dict.
# NOTE(review): `parse_json_substring` is not defined or imported in any visible cell - confirm.
res = parse_json_substring(test_output)
res

[{'ref_number': 2,
  'reasoning_steps': 'The post explicitly states that the second paper is highly recommended, indicating an endorsement of the content. The paper discusses reducing global inequality to increase local cooperation, which is a relevant topic in social sciences.',
  'candidate_tags': '[<endorses>, <discussion>, <recommendation>]',
  'final_answer': ['<endorses>', '<recommendation>']}]

In [15]:
# Reference number of the first parsed entry.
res[0]["ref_number"]

2

In [8]:
# Check the type of the first parsed entry (recorded as dict).
type(res[0])

dict

In [7]:
# Check that the parsed result is a list; isinstance is the idiomatic type
# test and, unlike an exact type comparison, also accepts list subclasses.
isinstance(res, list)

True

In [61]:
# Demonstrates a known failure: the parser validates a single SubAnswer object,
# but `test_output` holds a top-level JSON list, so validation rejects it
# ("SubAnswer expected dict not list"). Catch the error and show it, so the
# notebook still runs end to end instead of stopping on an uncaught exception.
try:
    parsed_sub_answer = pydantic_output_parser.parse(test_output)
except OutputParserException as parse_err:
    parsed_sub_answer = None
    print(f"OutputParserException: {parse_err}")

# Displays the parsed object on success; shows nothing when parsing failed.
parsed_sub_answer

OutputParserException: Failed to parse SubAnswer from completion [{'ref_number': 2, 'reasoning_steps': 'The post explicitly states that the second paper is highly recommended, indicating an endorsement of the content. The paper discusses reducing global inequality to increase local cooperation, which is a relevant topic in social sciences.', 'candidate_tags': '[<endorses>, <discussion>, <recommendation>]', 'final_answer': ['<endorses>', '<recommendation>']}]. Got: 1 validation error for SubAnswer
__root__
  SubAnswer expected dict not list (type=type_error)