In [1]:
# load env with api keys https://stackoverflow.com/a/54028874
%load_ext dotenv
%dotenv ../etc/config.env

import sys
sys.path.append("../")

from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import ChatPromptTemplate
from langchain.schema import BaseOutputParser
from langchain.schema import (
    HumanMessage,
)



from desci_sense.twitter import scrape_tweet

from desci_sense.postprocessing.output_parsers import TypeTagParser

In [2]:
# Sample tweet URLs used as inputs while experimenting in this notebook.
TEST_TWEET_1 = "https://twitter.com/danwilliamsphil/status/1719436704602275858"
TEST_TWEET_2 = "https://twitter.com/pwang/status/1719720728184910195"
TEST_TWEET_3 = "https://twitter.com/BlancheMinerva/status/1719714881081954409"
TEST_TWEET_4 = "https://twitter.com/sucholutsky/status/1719725087681569189"
TEST_TWEET_5 = "https://twitter.com/TBSocialist/status/1719676110785421807"
TEST_TWEET_6 = "https://twitter.com/DG_Rand/status/1719372459344888032"
TEST_TWEET_7 = "https://twitter.com/yanaiela/status/1719755578409619740"  # the one passed to scrape_tweet in this notebook
# presumably an example of a retweet (going by the "RT" in the name) — TODO confirm
TEST_RT_TWEET = "https://twitter.com/IntuitMachine/status/1720443327663714559"

In [3]:
# Fetch TEST_TWEET_7 and display the result; the run shown here returned a dict
# whose 'text' entry is what gets passed to the tagging chain.
tweet = scrape_tweet(TEST_TWEET_7)
tweet

{'conversationID': '1719755578409619740',
 'date': 'Wed Nov 01 16:37:36 +0000 2023',
 'date_epoch': 1698856656,
 'hashtags': [],
 'likes': 68,
 'mediaURLs': ['https://pbs.twimg.com/media/F93KgeQbEAABmcy.png'],
 'media_extended': [{'altText': None,
   'size': {'height': 415, 'width': 835},
   'thumbnail_url': 'https://pbs.twimg.com/media/F93KgeQbEAABmcy.png',
   'type': 'image',
   'url': 'https://pbs.twimg.com/media/F93KgeQbEAABmcy.png'}],
 'possibly_sensitive': False,
 'qrtURL': None,
 'replies': 1,
 'retweets': 31,
 'text': "What's In My Big Data?\n\nA question we've been asking ourselves for a while.\nHere is our attempt to answer it.\n\n🧵\nPaper - \nhttps://arxiv.org/abs/2310.20707\n\nDemo-\nhttps://wimbd.apps.allenai.org/ https://t.co/ViP84AsKCY",
 'tweetID': '1719755578409619740',
 'tweetURL': 'https://twitter.com/yanaiela/status/1719755578409619740',
 'user_name': 'Yanai Elazar',
 'user_screen_name': 'yanaiela'}

In [4]:
# Create the chat model. Requests go to OpenRouter through ChatOpenAI by
# overriding the API base URL and supplying the OpenRouter API key.
from desci_sense.configs import environ

OPENROUTER_API_BASE = "https://openrouter.ai/api/v1"

model_name = "mistralai/mistral-7b-instruct" # currently free on OpenRouter (https://openrouter.ai/docs#models)
# model_name = "openai/gpt-3.5-turbo"


chat = ChatOpenAI(
        model=model_name,
        temperature=0.6,
        openai_api_key=environ["OPENROUTER_API_KEY"],
        openai_api_base=OPENROUTER_API_BASE,
        # `headers` is not a declared ChatOpenAI field: passed as a bare keyword,
        # LangChain moves it into model_kwargs and emits a "headers was
        # transferred to model_kwargs" warning. Supplying it through model_kwargs
        # directly sends the same request parameters without the warning.
        model_kwargs={
            # To identify your app. Can be set to e.g. http://localhost:3000 for testing
            "headers": {"HTTP-Referer": environ["OPENROUTER_REFERRER"]},
        },
    )

                    headers was transferred to model_kwargs.
                    Please confirm that headers is what you intended.


In [15]:
# based on https://python.langchain.com/docs/get_started/quickstart#prompttemplate--llm--outputparser

class CommaSeparatedListOutputParser(BaseOutputParser):
    """Parse the output of an LLM call into a list of comma-separated items."""

    def parse(self, text: str):
        """Split ``text`` on commas and return the trimmed, non-empty items.

        The split is done on the bare comma rather than the literal ", " so the
        result is still correct when the model omits the space after a comma
        or adds extra whitespace around an item. Empty fragments are dropped,
        which means blank output yields ``[]`` instead of ``['']``.

        Args:
            text: Raw text returned by the LLM.

        Returns:
            List of item strings in their original order.
        """
        trimmed_items = (item.strip() for item in text.split(","))
        return [item for item in trimmed_items if item]
    
class StripOutputParser(BaseOutputParser):
    """Output parser that returns the LLM text with surrounding whitespace removed."""


    def parse(self, text: str) -> str:
        """Return ``text`` stripped of leading and trailing whitespace."""
        return text.strip()

# System prompt for the tagging model: lists the allowed tags and fixes the
# three-part layout of the reply (Reasoning Steps / Candidate Tags / Final Answer).
template = """You are an expert annotator who tags social media posts related to academic research, according to a predefined set of tags. 
The available tag types are:
<announce>: this post contains an announcement of new research. The announcement is likely made by the authors but may be a third party. The research should be a paper, dataset or other type of research output that is being announced publicly.
<review>: this post contains a review of another reference, such as a book, article or movie. The review could be positive or negative. A review can be detailed or a simple short endorsement.
<other>: This is a special tag. Use this tag if none of the tags above are suitable. If you tag a post with <other>, no other tag should be assigned to the post.

A user will pass in a post, and you should think step by step, before returning a list of comma separated tags that best match the post.

Your final answer should be structured as follows:
Reasoning Steps: (your reasoning steps)
Candidate Tags: (For each potential tag you choose, explain why you chose it.)
Final Answer: (a final list of tags, based on the Candidate Tags. Should include only Tags from the Candidate Tags list!)

Remember:
The final answer should ONLY include tags from the list above, nothing more. Do not make up any new tags that are not in the list above!
If the <other> tag is included in the answer, no other tag should be included!"""
# The human message is the post text, inserted through the {text} placeholder.
human_template = "{text}"

# <endorsement>: this post is endorsing another piece of content, such as a book, article, podcast, or other post. An endorsement is less detailed than a review.

# Pipeline: fill in the prompt -> query the chat model -> parse the reply with TypeTagParser
chat_prompt = ChatPromptTemplate.from_messages(
    [("system", template), ("human", human_template)]
)
chain = chat_prompt | chat | TypeTagParser()
answer = chain.invoke({"text": tweet["text"]})

# Print every entry of the parsed answer as "<key>: <value> "
for k, v in answer.items():
    print(f"{k}: {v} ")

reasoning: 1. Based on the post, it seems like the author is announcing a new research output.
2. The research output is a paper that is available at a URL.
3. There is also a demo associated with the paper.

Candidate Tags:

1. <announce>: This tag is suitable for an announcement of new research, which is what the post seems to be about.
2. <paper>: This tag is specific to research papers, which is what the post is about.

Final Answer: <announce>, <paper> 
candidates: 1. <announce>: This tag is suitable for an announcement of new research, which is what the post seems to be about.
2. <paper>: This tag is specific to research papers, which is what the post is about.

Final Answer: <announce>, <paper> 
final_answer: <announce>, <paper> 


In [18]:
# Print only the 'reasoning' entry of the parsed answer.
# NOTE(review): in the recorded output this entry still contains the
# "Candidate Tags" and "Final Answer" text, so the reasoning section does not
# appear to be isolated by the parser — TODO confirm in TypeTagParser.
print(answer['reasoning'])

1. Based on the post, it seems like the author is announcing a new research output.
2. The research output is a paper that is available at a URL.
3. There is also a demo associated with the paper.

Candidate Tags:

1. <announce>: This tag is suitable for an announcement of new research, which is what the post seems to be about.
2. <paper>: This tag is specific to research papers, which is what the post is about.

Final Answer: <announce>, <paper>


In [28]:
import re

# Example of a raw model reply in the three-section layout requested by the prompt
text = """
Reasoning Steps:
1. Based on the post, it seems like the author is announcing a new research output.
2. The research output is a paper that is available at a URL.
3. There is also a demo associated with the paper.

Candidate Tags:

1. <announce>: This tag is suitable for an announcement of new research, which is what the post seems to be about.
2. <paper>: This tag is specific to research papers, which is what the post is about.

Final Answer: <announce>, <paper>
"""

# One pattern captures the three sections in order; DOTALL lets "." match
# newlines so each capture can span several lines.
sections_re = re.compile(r"Reasoning Steps:(.*?)Candidate Tags:(.*?)Final Answer:(.*)", re.DOTALL)

sections_match = sections_re.search(text)

# Trim the whitespace surrounding each captured section.
reasoning_steps, candidate_tags, final_answer = (
    section.strip() for section in sections_match.groups()
)

# Collect the sections into a dict keyed by section name.
extracted_content = dict(
    reasoning=reasoning_steps,
    candidates=candidate_tags,
    final_answer=final_answer,
)

for k, v in extracted_content.items():
    print(f"{k}: {v}")




reasoning: 1. Based on the post, it seems like the author is announcing a new research output.
2. The research output is a paper that is available at a URL.
3. There is also a demo associated with the paper.
candidates: 1. <announce>: This tag is suitable for an announcement of new research, which is what the post seems to be about.
2. <paper>: This tag is specific to research papers, which is what the post is about.
final_answer: <announce>, <paper>


In [29]:
def convert_string_to_list(input_string):
    """Split a comma-separated string into a list of trimmed, non-empty items.

    Args:
        input_string: Text with items separated by commas.

    Returns:
        The items in their original order, each stripped of surrounding
        whitespace; fragments that are empty after stripping are left out.
    """
    trimmed_pieces = (piece.strip() for piece in input_string.split(','))
    return [piece for piece in trimmed_pieces if piece]

# Example usage: whitespace around each comma-separated item is removed.
input_string = " <x> , <y> "
result_list = convert_string_to_list(input_string)
print(result_list)

['<x>', '<y>']
