In [1]:
# Add the parent directory to the import path so the `desci_sense` package
# below can be imported. "../" is a relative entry, so this presumably assumes
# the kernel's working directory is one level below the repo root — TODO confirm.
import sys
sys.path.append("../")
from pathlib import Path

# Parent of the current working directory; used to locate the config file.
ROOT = Path.cwd().parent

from enum import Enum
from confection import Config
from desci_sense.parsers.multi_stage_parser import MultiStageParser
from desci_sense.configs import init_multi_stage_parser_config
from desci_sense.schema.post import RefPost
from desci_sense.dataloaders import convert_text_to_ref_post, scrape_post
from desci_sense.web_extractors.metadata_extractors import MetadataExtractionType, RefMetadata, extract_metadata_by_type

In [2]:
# Load the parser configuration from the notion_dev config file under ROOT.
config_path = ROOT / "etc/configs/notion_dev.cfg"
config = Config().from_disk(str(config_path))
# Bare expression: display the parsed settings as the cell output.
config

{'general': {'parser_type': 'multi_stage', 'ref_metadata_method': 'none'},
 'keyword_extraction': {'enabled': True,
  'template': 'keywords_extraction.j2',
  'ref_metadata_method': 'citoid',
  'max_keywords': 6,
  'model': {'model_name': 'mistralai/mistral-7b-instruct',
   'temperature': 0.6000000000000001}},
 'model': {'model_name': 'mistralai/mistral-7b-instruct',
  'temperature': 0.6000000000000001},
 'ontology': {'versions': ['v0'],
  'notion_db_id': 'ba31bb4887624222be6ec622650514c9'},
 'prompt': {'template_dir': 'desci_sense/prompting/jinja/',
  'zero_ref_template_name': 'zero_ref_template.j2',
  'single_ref_template_name': 'single_ref_template.j2',
  'multi_ref_template_name': 'multi_ref_template.j2'},
 'wandb': {'entity': 'common-sense-makers', 'project': 'st-demo-sandbox'}}

In [3]:

# Build the multi-stage parser from the loaded config. Per the recorded log
# output, construction loads both the parser model and the keyword model.
parser = MultiStageParser(config)

[32m2024-02-02 16:15:41.319[0m | [1mINFO    [0m | [36mdesci_sense.parsers.multi_stage_parser[0m:[36mset_md_extract_method[0m:[36m141[0m - [1mSetting metadata extraction method to none...[0m
[32m2024-02-02 16:15:41.320[0m | [1mINFO    [0m | [36mdesci_sense.parsers.multi_stage_parser[0m:[36m__init__[0m:[36m106[0m - [1mLoading parser model (type=mistralai/mistral-7b-instruct)...[0m
                    headers was transferred to model_kwargs.
                    Please confirm that headers is what you intended.
[32m2024-02-02 16:15:41.324[0m | [1mINFO    [0m | [36mdesci_sense.parsers.multi_stage_parser[0m:[36mset_kw_md_extract_method[0m:[36m145[0m - [1mSetting keywords metadata extraction method to citoid...[0m
[32m2024-02-02 16:15:41.327[0m | [1mINFO    [0m | [36mdesci_sense.parsers.multi_stage_parser[0m:[36minit_keyword_extraction_chain[0m:[36m265[0m - [1mLoading keyword model (type=mistralai/mistral-7b-instruct)...[0m
                    h

In [4]:
# Post used for the URL-based keyword extraction further down.
post_url = "https://twitter.com/jasonhickel/status/1753519824754311466"
# Disabled debug aid: scrape the post manually and view it as JSON.
# Not needed for the main flow, which passes `post_url` straight to the parser.
# ref_post = scrape_post(post_url)
# ref_post.to_json()



In [5]:
# Disabled: extended import that also brings in extract_all_metadata_by_type,
# which is only referenced by commented-out code in this notebook.
# from desci_sense.web_extractors.metadata_extractors import MetadataExtractionType, RefMetadata, extract_metadata_by_type, extract_all_metadata_by_type

In [6]:
# Disabled debug check: presumably fetches metadata for each reference URL of
# the post using the parser's keyword-metadata extraction method.
# NOTE(review): if re-enabled, this needs `ref_post` (the scrape call earlier is
# commented out) and `extract_all_metadata_by_type` (its import is commented
# out too), so it would fail on a fresh kernel as-is.
# extract_all_metadata_by_type(ref_post.ref_urls, parser.kw_md_extract_method)

In [7]:
# Run keyword extraction directly from the post URL. The returned dict is
# indexed below by 'full_prompt' and 'answer'; the recorded output also shows
# a 'post' entry holding the RefPost.
result = parser.kw_process_post(post_url)
# Bare expression: display the full result dict.
result

{'post': RefPost(author='Jason Hickel', content="This new study finds that US Americans prefer workplace democracy (where workers own shares, are represented on boards, and elect their managers), even while recognizing this requires more responsibility. It's a core socialist policy and it's popular. https://www.cambridge.org/core/journals/american-political-science-review/article/what-do-americans-want-from-private-government-experimental-evidence-demonstrates-that-americans-want-workplace-democracy/D9C1DBB6F95D9EEA35A34ABF016511F4", url='https://twitter.com/jasonhickel/status/1753519824754311466', created_at=datetime.datetime(2024, 2, 2, 20, 44, 39, tzinfo=datetime.timezone.utc), metadata={'communityNote': None, 'conversationID': '1753519824754311466', 'date': 'Fri Feb 02 20:44:39 +0000 2024', 'date_epoch': 1706906679, 'hashtags': [], 'likes': 21, 'mediaURLs': [], 'media_extended': [], 'possibly_sensitive': False, 'qrtURL': None, 'replies': 0, 'retweets': 7, 'text': "This new study fi

In [8]:
# print() rather than a bare expression so the prompt's newlines are rendered
# instead of being shown escaped in the string repr.
print(result["full_prompt"])

You are an expert annotator tasked with assigning keywords to social media posts. The keywords should represent the most salient topics discussed by the post.   The post also includes references to external content. Details about the external references will be provided alongside the input post under "Reference Metadata". The keywords should also represent the external references! 

Rules:
- You should choose up to 6 keywords, not more!
- Keywords should be prefixed with a hashtag, e.g., #AI
- Your final answer should be structured as follows:
    - Reasoning Steps: (your reasoning steps)
    - Candidate Keywords: (For potential each keyword you choose, explain why you chose it.)
    - Final Answer: (a set of 6 final keywords, based on the Candidate Keywords. The final keywords must be included in the Candidate Keywords list!)


# Input post text:
Author: Jason Hickel
Content: This new study finds that US Americans prefer workplace democracy (where workers own shares, are represented o

In [9]:
# Show the keywords the parser accepted for this post.
# NOTE(review): the recorded output shows entries with trailing ',' / '.'
# (e.g. 'ExperimentalStudy,'), so it looks like punctuation is not stripped
# during keyword post-processing for this run — confirm in the parser.
print(result["answer"]["valid_keywords"])

['ExperimentalStudy,', 'WorkplaceDemocracy,', 'Dictatorship.', 'CorporateRegimeType,', 'Socialism,', 'PrivateGovernment,']


In [10]:
# Hand-written sample post (no scraping): descriptive text plus one reference URL.
text = """Our new paper describes that "The Identity-based Model of Political Belief" and explains how social identity shapes information processing and contributes to the belief and spread of #misinformation
Partisanship involves cognitive and motivational aspects that shape party members' beliefs and actions. This includes whether they seek further evidence, where they seek that evidence, and which sources they trust. 
Understanding the interplay between social identity and accuracy is crucial in addressing misinformation.
To read the full paper:  https://www.sciencedirect.com/science/article/pii/S2352250X23002324"""
# Wrap the raw text in a RefPost; per the recorded output of the next cell, the
# URL in the text ends up in `ref_urls`.
ref_post = convert_text_to_ref_post(text)

In [11]:
# Keyword extraction on an already-built RefPost (instead of a URL).
# NOTE: this rebinds `result`, replacing the output of the URL-based run.
result = parser.extract_post_topics_w_metadata(ref_post)
# Bare expression: display the full result dict.
result

{'post': RefPost(author='deafult_author', content='Our new paper describes that "The Identity-based Model of Political Belief" and explains how social identity shapes information processing and contributes to the belief and spread of #misinformation\nPartisanship involves cognitive and motivational aspects that shape party members\' beliefs and actions. This includes whether they seek further evidence, where they seek that evidence, and which sources they trust. \nUnderstanding the interplay between social identity and accuracy is crucial in addressing misinformation.\nTo read the full paper:  https://www.sciencedirect.com/science/article/pii/S2352250X23002324', url='', source_network='default_source', ref_urls=['https://www.sciencedirect.com/science/article/pii/S2352250X23002324']),
 'full_prompt': 'You are an expert annotator tasked with assigning keywords to social media posts. The keywords should represent the most salient topics discussed by the post.   The post also includes refe

In [12]:
# Minimal variant of the sample post: same reference URL, almost no descriptive text.
# NOTE: rebinds `text` and `ref_post` from the longer example.
text = """Check out our new paper on identity politics!  https://www.sciencedirect.com/science/article/pii/S2352250X23002324"""
ref_post = convert_text_to_ref_post(text)

In [13]:
# Same extraction on the minimal post. With so little post text, the keywords
# presumably have to come mostly from the reference metadata — compare with the
# keywords printed at the end of the notebook.
result = parser.extract_post_topics_w_metadata(ref_post)
# Bare expression: display the full result dict.
result

{'post': RefPost(author='deafult_author', content='Check out our new paper on identity politics!  https://www.sciencedirect.com/science/article/pii/S2352250X23002324', url='', source_network='default_source', ref_urls=['https://www.sciencedirect.com/science/article/pii/S2352250X23002324']),
 'full_prompt': 'You are an expert annotator tasked with assigning keywords to social media posts. The keywords should represent the most salient topics discussed by the post.   The post also includes references to external content. Details about the external references will be provided alongside the input post under "Reference Metadata". The keywords should also represent the external references! \n\nRules:\n- You should choose up to 6 keywords, not more!\n- Keywords should be prefixed with a hashtag, e.g., #AI\n- Your final answer should be structured as follows:\n    - Reasoning Steps: (your reasoning steps)\n    - Candidate Keywords: (For potential each keyword you choose, explain why you chose 

In [17]:
# Compact summary of the latest run: the post text followed by its accepted keywords.
# NOTE(review): the recorded execution count jumps from 13 to 17 here; re-run
# with Restart & Run All to confirm this cell reads the `result` produced by
# the cell directly above.
print(
    "post: {}\n\nExtracted Keywords: {}".format(
        result["post"].content,
        result["answer"]["valid_keywords"],
    )
)

post: Check out our new paper on identity politics!  https://www.sciencedirect.com/science/article/pii/S2352250X23002324

Extracted Keywords: ['SocialIdentity', 'Misinformation', 'PartisanIdentity', 'DemocraticSocieties', 'IdentityPolitics', 'AccuracyGoals']
