In [1]:
import nest_asyncio
nest_asyncio.apply()

In [2]:
import sys
sys.path.append("../")

import asyncio
import time
from typing import List, Optional, Dict
from pydantic import (
    Field,
    BaseModel,
    validator,
    ConfigDict,
    field_validator,
    field_serializer,
)
from langchain_core.runnables import RunnableParallel

from desci_sense.configs import default_init_parser_config
from desci_sense.shared_functions.dataloaders import scrape_post
from desci_sense.shared_functions.schema.post import RefPost
from desci_sense.shared_functions.parsers.firebase_api_parser import FirebaseAPIParser, PromptCase
from desci_sense.shared_functions.web_extractors.metadata_extractors import (
    RefMetadata,
)

In [3]:
# Use one model name for both the semantics and the keyword stages.
model_name = "mistralai/mistral-7b-instruct"
config = default_init_parser_config(semantics_model=model_name, kw_model=model_name)
# Override the general reference-metadata method to Citoid.
config["general"]["ref_metadata_method"] = "citoid"

In [4]:
# Three Mastodon post URLs used as parser input in this notebook.
urls = [
    "https://mastodon.social/@psmaldino@qoto.org/111405098400404613",
    "https://mastodon.social/@UlrikeHahn@fediscience.org/111732713776994953",
    "https://mastodon.social/@ronent/111687038322549430",
]
# Scrape every URL into a post, keeping the same order as `urls`.
posts = list(map(scrape_post, urls))

In [5]:
len(posts)

3

In [6]:
parser = FirebaseAPIParser(config=config)

[32m2024-03-19 21:20:45.638[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.firebase_api_parser[0m:[36mset_md_extract_method[0m:[36m156[0m - [1mSetting metadata extraction method to citoid...[0m
[32m2024-03-19 21:20:45.639[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.firebase_api_parser[0m:[36m__init__[0m:[36m116[0m - [1mLoading parser model (type=mistralai/mistral-7b-instruct)...[0m
  warn_deprecated(
[32m2024-03-19 21:20:45.898[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.firebase_api_parser[0m:[36mset_kw_md_extract_method[0m:[36m160[0m - [1mSetting keywords metadata extraction method to citoid...[0m
[32m2024-03-19 21:20:45.898[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.firebase_api_parser[0m:[36minit_keyword_extraction_chain[0m:[36m450[0m - [1mLoading keyword model (type=mistralai/mistral-7b-instruct)...[0m
[32m2024-03-19 21:20:45.924[0m | [1mINFO    [0m | [36m

In [7]:
parser.md_extract_method

<MetadataExtractionType.CITOID: 'citoid'>

In [10]:
res = parser.abatch_process_ref_post(posts)

[32m2024-03-19 21:23:04.692[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.web_extractors.citoid[0m:[36mfetch_citation_async[0m:[36m20[0m - [34m[1mtarget_url=https://write.as/ulrikehahn/some-thoughts-on-social-media-for-science[0m
[32m2024-03-19 21:23:04.693[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.web_extractors.citoid[0m:[36mfetch_citation_async[0m:[36m20[0m - [34m[1mtarget_url=https://paragraph.xyz/@sense-nets/sense-nets-intro[0m
[32m2024-03-19 21:23:04.693[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.web_extractors.citoid[0m:[36mfetch_citation_async[0m:[36m20[0m - [34m[1mtarget_url=https://royalsocietypublishing.org/doi/10.1098/rstb.2022.0267[0m
[32m2024-03-19 21:23:04.694[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.web_extractors.citoid[0m:[36mfetch_citation_async[0m:[36m20[0m - [34m[1mtarget_url=https://paragraph.xyz/@sense-nets/2-project-plan[0m


In [12]:
res[2].item_types

['webpage', 'webpage']

In [10]:
md_dict

{'https://paragraph.xyz/@sense-nets/sense-nets-intro': RefMetadata(citoid_url='https://paragraph.xyz/@sense-nets/sense-nets-intro', url='https://paragraph.xyz/@sense-nets/sense-nets-intro', item_type='webpage', title='Sensemaking Networks: Project Introduction', summary='Incorporating science social media into the scientific process', image=''),
 'https://write.as/ulrikehahn/some-thoughts-on-social-media-for-science': RefMetadata(citoid_url='https://write.as/ulrikehahn/some-thoughts-on-social-media-for-science', url='https://write.as/ulrikehahn/some-thoughts-on-social-media-for-science', item_type='webpage', title='Some Thoughts on Social Media for Science', summary='What follows are some more or less connected thoughts on what social media for science could and should be. There are excellent articulat...', image=''),
 'https://paragraph.xyz/@sense-nets/2-project-plan': RefMetadata(citoid_url='https://paragraph.xyz/@sense-nets/2-project-plan', url='https://paragraph.xyz/@sense-nets/2-p

In [10]:
config

{'general': {'parser_type': 'multi_stage',
  'ref_metadata_method': 'none',
  'max_summary_length': 500},
 'keyword_extraction': {'enabled': True,
  'template': 'keywords_extraction.j2',
  'ref_metadata_method': 'citoid',
  'max_keywords': 6,
  'model': {'model_name': 'mistralai/mistral-7b-instruct',
   'temperature': 0.6}},
 'model': {'model_name': 'mistralai/mistral-7b-instruct', 'temperature': 0.6},
 'ontology': {'versions': None, 'notion_db_id': None},
 'openai_api': {'openai_api_base': 'https://openrouter.ai/api/v1',
  'openai_api_key': '<REDACTED>',
  'openai_api_referer': 'http://localhost:3000'},
 'wandb': {'entity': 'common-sense-makers', 'project': 'st-demo-sandbox'}}

In [11]:
res[2].semantic_tags

['indicates-interest']

In [25]:
class StreamlitParserResults(BaseModel):
    """Flat, per-post parser result produced by `convert_raw_output_to_st_format`."""

    # Taken from output["keywords"]["academic_kw"] by the converter.
    research_filter: str
    # One item type per reference URL, looked up in the reference-metadata dict.
    item_types: List[str]
    # The post's reference URLs (post.ref_urls).
    reference_urls: List[str]
    # Taken from output["semantics"]["multi_tag"].
    semantic_tags: List[str]
    # Taken from output["keywords"]["valid_keywords"].
    keywords: List[str]
    # Prompts and model reasoning kept for inspection; defaults to a fresh empty dict.
    debug: Optional[Dict] = Field(default_factory=dict)


In [26]:
def convert_raw_output_to_st_format(
    post: RefPost,
    sem_prompt: str,
    kw_prompt: str,
    output: dict,
    md_dict: Dict[str, RefMetadata],
) -> StreamlitParserResults:
    """Convert the raw parser output for one post into a StreamlitParserResults.

    Args:
        post: Post whose ``ref_urls`` become the reported reference URLs.
        sem_prompt: Semantics prompt, stored under ``debug["semantics"]["prompt"]``.
        kw_prompt: Keywords prompt, stored under ``debug["kw_prompt"]["prompt"]``.
        output: Raw result with a ``"semantics"`` entry (``multi_tag``,
            ``reasoning``, ``allowed_tags``) and a ``"keywords"`` entry
            (``valid_keywords``, ``academic_kw``, ``reasoning``).
        md_dict: Reference metadata keyed by URL; every URL in
            ``post.ref_urls`` is looked up in it.

    Returns:
        The populated StreamlitParserResults.

    Raises:
        KeyError: If a reference URL is missing from ``md_dict`` or an
            expected key is missing from ``output`` (for plain dicts).
    """
    ref_urls = post.ref_urls

    # Item types are reported in the same order as the reference URLs.
    ref_item_types = []
    for ref_url in ref_urls:
        ref_item_types.append(md_dict[ref_url].item_type)

    semantics_out = output["semantics"]
    tags = semantics_out["multi_tag"]

    keywords_out = output["keywords"]
    valid_keywords = keywords_out["valid_keywords"]
    academic_filter = keywords_out["academic_kw"]

    # Keep prompts and model reasoning alongside the result for inspection.
    semantics_debug = {
        "prompt": sem_prompt,
        "reasoning": semantics_out["reasoning"],
        "allowed_tags": semantics_out["allowed_tags"],
    }
    keywords_debug = {
        "prompt": kw_prompt,
        "reasoning": keywords_out["reasoning"],
    }

    return StreamlitParserResults(
        research_filter=academic_filter,
        item_types=ref_item_types,
        reference_urls=ref_urls,
        semantic_tags=tags,
        keywords=valid_keywords,
        debug={"semantics": semantics_debug, "kw_prompt": keywords_debug},
    )

In [27]:
def convert_raw_outputs_to_st_format(
    posts: List[RefPost], outputs: List[dict], prompts, md_dict: Dict[str, RefMetadata]
) -> List[StreamlitParserResults]:
    """Convert raw parser outputs for several posts, one result per post.

    Args:
        posts: Posts, aligned by index with ``outputs`` and ``prompts``.
        outputs: Raw parser output dicts, one per post.
        prompts: One mapping per post; its ``"input"`` and ``"kw_input"``
            entries are passed on as the semantics and keywords prompts.
        md_dict: Reference metadata keyed by URL, shared by all posts.

    Returns:
        The converted results, in the same order as ``posts``.

    Raises:
        AssertionError: If ``prompts`` or ``posts`` differ in length from
            ``outputs``.
    """
    # The three sequences are zipped below, so they must line up exactly.
    assert len(prompts) == len(outputs)
    assert len(posts) == len(outputs)
    return [
        convert_raw_output_to_st_format(
            post,
            prompt_dict["input"],
            prompt_dict["kw_input"],
            output,
            md_dict,
        )
        for post, output, prompt_dict in zip(posts, outputs, prompts)
    ]









In [16]:
item = res[0]

In [21]:
item

{'semantics': {'reasoning': "[Reasoning Steps]\n\n1. The author, Paul Smaldino, is an expert in the field of public goods and social choice theory.\n2. The content refers to a new paper that Smaldino has co-authored with Karolina Safarzynska.\n3. The paper discusses a simple model of a group-structured public good with externalities shifting benefits toward more inequality or more equity.\n4. The paper also explores the impact of redistributional externalities on cooperation in the model.\n\n[Candidate Tags]\n\n1. <call-for-papers>: This tag is not suitable as the post does not contain a call for research papers.\n2. <endorses>: This tag is not suitable as the post does not explicitly endorse the referenced paper.\n3. <disagrees>: This tag is not suitable as the post does not dispute or express disagreement with the referenced paper.\n4. <agrees>: This tag is not suitable as the post does not express agreement with the referenced paper.\n5. <indicates-interest>: This tag is not suitabl

In [18]:
item["semantics"]["multi_tag"]

['announce']

In [14]:

run_cost = 0.00021 * 2

In [15]:
2000 * run_cost

0.8400000000000001

In [8]:
print(res[2]['keywords']['raw_text'])

## Reasoning Steps:

Based on the input post, the following keywords can be assigned:

* #SensemakingNetworks
* #AsteraInstitute
* #ScienceResearch
* #BlogPost
* #ScienceCommunity
* #NewYear
* #Excited

## Candidate Keywords:

1. #SensemakingNetworks: The post mentions the author's upcoming project on Sensemaking Networks, which is likely related to scientific research.
2. #AsteraInstitute: The post mentions the author's work at AsteraInstitute, which suggests that the post is related to academic research.
3. #ScienceResearch: The post mentions "science research," indicating that the author is working on a research project.
4. #BlogPost: The post mentions that the author is crossposting from Birdsite, which suggests that the post is a blog post.
5. #ScienceCommunity: The post mentions the science community, indicating that the author is part of the scientific community.
6. #NewYear: The post mentions the new year, which suggests that the author is looking forward to the future.
7. #Exc

In [9]:
res[2]['keywords']

{'reasoning': '[Reasoning Steps]\n\nBased on the input post, the following keywords can be assigned:\n\n* #SensemakingNetworks\n* #AsteraInstitute\n* #ScienceResearch\n* #BlogPost\n* #ScienceCommunity\n* #NewYear\n* #Excited\n\n##\n\n[Candidate Keywords]\n\n1. #SensemakingNetworks: The post mentions the author\'s upcoming project on Sensemaking Networks, which is likely related to scientific research.\n2. #AsteraInstitute: The post mentions the author\'s work at AsteraInstitute, which suggests that the post is related to academic research.\n3. #ScienceResearch: The post mentions "science research," indicating that the author is working on a research project.\n4. #BlogPost: The post mentions that the author is crossposting from Birdsite, which suggests that the post is a blog post.\n5. #ScienceCommunity: The post mentions the science community, indicating that the author is part of the scientific community.\n6. #NewYear: The post mentions the new year, which suggests that the author is 

In [16]:
res[2]["semantics"]['multi_tag']

['announce']

In [8]:
chain = parser.prompt_case_dict[PromptCase.SINGLE_REF]["chain"]

In [9]:
input_prompts = [parser.create_semantics_prompt_by_case(post, PromptCase.SINGLE_REF) for post in posts]

In [10]:
inputs = [{"input": prompt} for prompt in input_prompts]

In [11]:
results = await chain.abatch(inputs)

In [12]:
results[0]

{'reasoning': "[Reasoning Steps]\n\n1. The post is about a new paper published in Phil Trans B.\n2. The paper is authored by Paul Smaldino and Karolina Safarzynska.\n3. The paper is about a simple model of a group-structured public good with externalities shifting benefits toward more inequality or more equity.\n4. The post does not explicitly endorse or disagree with the paper's content.\n5. The post does not contain any calls for papers, funding, or job listings.\n\n[Candidate Tags]\n\n<paper>, <public-good>, <equity>, <inequality>, <group-structure>, <externalities>",
 'final_answer': '<paper>, <public-good>, <equity>, <inequality>, <group-structure>, <externalities>',
 'single_tag': [],
 'multi_tag': []}

In [17]:
print(results[0]["reasoning"])

[Reasoning Steps]

1. The post is about a new paper published in Phil Trans B.
2. The paper is authored by Paul Smaldino and Karolina Safarzynska.
3. The paper is about a simple model of a group-structured public good with externalities shifting benefits toward more inequality or more equity.
4. The post does not explicitly endorse or disagree with the paper's content.
5. The post does not contain any calls for papers, funding, or job listings.

[Candidate Tags]

<paper>, <public-good>, <equity>, <inequality>, <group-structure>, <externalities>


In [18]:
# https://python.langchain.com/docs/expression_language/how_to/functions
from langchain_core.runnables import RunnableConfig

In [19]:
# NOTE(review): this rebinds `config`, which earlier held the parser config dict
# passed to FirebaseAPIParser(config=config). Re-running that cell after this one
# would hand the parser this RunnableConfig instead; a distinct name would avoid it.
config = RunnableConfig(max_concurrency=3)

In [20]:
# make list of 6 inputs 
inputs = inputs * 2
len(inputs)

6

In [28]:
# Baseline: invoke the chain sequentially, one input at a time, timing each call.
times = []
for chain_input in inputs:
    # perf_counter() is monotonic, so the interval cannot be skewed by
    # wall-clock adjustments the way time.time() differences can.
    start_time = time.perf_counter()
    # The result is discarded on purpose: binding it to `res` would overwrite
    # the batch result from parser.abatch_process_ref_post that other cells index.
    chain.invoke(chain_input)
    elapsed_time = time.perf_counter() - start_time
    times.append(elapsed_time)
    print(elapsed_time)
print(f"Total time: {sum(times)}")


7.781785011291504
13.621722221374512
10.330399990081787
9.885744094848633
21.70015788078308
18.53901195526123
Total time: 81.85882115364075


In [24]:
# 1 batch of 6
start_time = time.time()  
results = await chain.abatch(inputs)
end_time = time.time()
elapsed_time = end_time - start_time
print("\nAll tasks completed in {:.2f} seconds".format(elapsed_time))



All tasks completed in 21.54 seconds


In [23]:
# 2 batches of 3
start_time = time.time()  
results = await chain.abatch(inputs, config=config)
end_time = time.time()
elapsed_time = end_time - start_time
print("\nAll tasks completed in {:.2f} seconds".format(elapsed_time))



All tasks completed in 29.17 seconds


# Can we run a RunnableParallel chain in abatch mode?

In [25]:
# create parallel chain
# Both branches use the same SINGLE_REF chain, so the `keywords` branch here is
# not a keyword-extraction chain; this cell only checks that a RunnableParallel
# can be driven with abatch.
single_ref_chain = parser.prompt_case_dict[PromptCase.SINGLE_REF]["chain"]
chain_1 = single_ref_chain
chain_2 = single_ref_chain

map_chain = RunnableParallel(semantics=chain_1, keywords=chain_2)

In [26]:
results = await map_chain.abatch(inputs[:3])

In [27]:
results[0]

{'semantics': {'reasoning': '[Reasoning Steps]\n\n1. The post is about a new paper published in Phil Trans B.\n2. The paper is about a group-structured public good with externalities.\n3. The paper discusses how externalities affect cooperation and how only redistributional externalities increase cooperation.\n\n[Candidate Tags]\n\n1. <endorses>: This tag could be used to indicate that the author is endorsing the paper. However, since there is no explicit statement of endorsement, this tag is not the best fit.\n2. <agrees>: This tag could be used to indicate that the author agrees with the conclusions of the paper. However, since there is no explicit statement of agreement, this tag is not the best fit.\n3. <disagrees>: This tag could be used to indicate that the author disagrees with the conclusions of the paper. However, since there is no explicit statement of disagreement, this tag is not the best fit.\n4. <indicates-interest>: This tag could be used to indicate that the author is i

Looks like yes!