In [1]:
import nest_asyncio
nest_asyncio.apply()
from pathlib import Path

import sys
sys.path.append("../")

from desci_sense.shared_functions.init import init_multi_chain_parser_config
from desci_sense.shared_functions.parsers.multi_chain_parser import MultiChainParser
from desci_sense.evaluation.utils import get_dataset

from desci_sense.shared_functions.dataloaders import (
    scrape_post,
    convert_text_to_ref_post,
)

In [2]:
config = init_multi_chain_parser_config(llm_type="google/gemma-7b-it",
                                        post_process_type="combined")

In [3]:
multi_chain_parser = MultiChainParser(config)

[32m2024-05-06 14:24:17.604[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36m__init__[0m:[36m66[0m - [1mInitializing MultiChainParser. PostProcessType=combined[0m
[32m2024-05-06 14:24:17.613[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36m__init__[0m:[36m73[0m - [1mInitializing post parsers...[0m
[32m2024-05-06 14:24:17.613[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.post_parser_chain[0m:[36m__init__[0m:[36m26[0m - [1mInitializing parser chain 'refs_tagger' [0m
[32m2024-05-06 14:24:17.675[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.post_parser_chain[0m:[36m__init__[0m:[36m26[0m - [1mInitializing parser chain 'multi_refs_tagger' [0m
[32m2024-05-06 14:24:17.691[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.post_parser_chain[0m:[36m__init__[0m:[36m26[0m - [1mInitializing parser chain 'topics' [0m

In [4]:
# example raw text posts
TEST_POST_TEXT_W_REF = """
I really liked this paper!
https://arxiv.org/abs/2402.04607
I really disagree with this paper!
https://mastodon.social/@UlrikeHahn@fediscience.org/111732713776994953
"""

TEST_POST_2_TEXT_W_REF = """
I really disagree with this paper!
https://arxiv.org/abs/2402.04607
"""

TEXT_POSTS = [TEST_POST_TEXT_W_REF, TEST_POST_2_TEXT_W_REF]

In [5]:
# convert raw text to RefPost format for input to the parser
inputs = [convert_text_to_ref_post(p) for p in TEXT_POSTS]

In [12]:
# or scrape posts by url
urls = [
        "https://mastodon.social/@psmaldino@qoto.org/111405098400404613",
        "https://mastodon.social/@UlrikeHahn@fediscience.org/111732713776994953",
        "https://mastodon.social/@ronent/111687038322549430",
    ]
posts = [scrape_post(url) for url in urls]

In [10]:
# batch process
results = multi_chain_parser.batch_process_ref_posts(inputs,active_list=['multi_refs_tagger'],batch_size=10)

[32m2024-05-06 14:30:13.143[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.web_extractors.citoid[0m:[36mfetch_citation_async_retry[0m:[36m111[0m - [34m[1mtarget_url=https://arxiv.org/abs/2402.04607[0m
[32m2024-05-06 14:30:13.144[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.web_extractors.citoid[0m:[36mfetch_citation_async_retry[0m:[36m111[0m - [34m[1mtarget_url=https://mastodon.social/redirect/statuses/111732713776994953[0m
[32m2024-05-06 14:30:15.477[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36mbatch_process_ref_posts[0m:[36m232[0m - [34m[1mProcessing 2 posts with parsers: ['multi_refs_tagger'][0m
[32m2024-05-06 14:30:15.479[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36mbatch_process_ref_posts[0m:[36m234[0m - [34m[1mInstantiating prompts...[0m


  0%|          | 0/2 [00:00<?, ?it/s]

[32m2024-05-06 14:30:15.496[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36mbatch_process_ref_posts[0m:[36m245[0m - [34m[1mInvoking parallel chain...[0m
{
"answer": {
    "sub_answers": [
        {
            "reasoning_steps": "The author clearly expresses disagreement with the conclusions of the referenced paper. The post contains a strong negative sentiment.",
            "candidate_tags": [
                "disagrees",
                "discussion"
            ],
            "final_answer": ["disagrees"]
        }
    ]
}
```[0m
[32m2024-05-06 14:30:20.082[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36mbatch_process_ref_posts[0m:[36m256[0m - [34m[1mPost processing 2 results...[0m
[32m2024-05-06 14:30:20.086[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36mbatch_process_ref_posts[0m:[36m268[0m - [34m[1mDone!

In [6]:
#
url = "https://twitter.com/mbauwens/status/1779543397528740338"
post = scrape_post(url)

In [7]:
# single process
result = multi_chain_parser.process_ref_post(post)

[32m2024-04-30 15:42:45.141[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.web_extractors.citoid[0m:[36mfetch_citation_async[0m:[36m20[0m - [34m[1mtarget_url=https://icandecide.org/v-safe-data/[0m
[32m2024-04-30 15:42:45.164[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.web_extractors.citoid[0m:[36mfetch_citation_async[0m:[36m20[0m - [34m[1mtarget_url=https://icandecide.org/pfizer-documents/[0m
[32m2024-04-30 15:43:04.309[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36mprocess_ref_post[0m:[36m184[0m - [34m[1mProcessing post with parsers: ['refs_tagger', 'topics', 'keywords', 'hashtags'][0m
[32m2024-04-30 15:43:04.310[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36mprocess_ref_post[0m:[36m186[0m - [34m[1mInstantiating prompts...[0m
[32m2024-04-30 15:43:04.311[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_funct

In [8]:
result.topics

['sustainability',
 'software & hardware',
 'academia',
 'climate',
 'news',
 'health',
 'technology',
 'research']

In [16]:
result.filter_classification

<SciFilterClassfication.RESEARCH: 'research'>

In [43]:
results[0].reference_tagger

['review', 'disagrees', 'agrees']

In [19]:
result.topics

['economics', 'research']

In [13]:
print(results[0])

research_keyword='not-detected' filter_classification=<SciFilterClassfication.RESEARCH: 'research'> item_types=['preprint', 'webpage'] reference_urls=['https://arxiv.org/abs/2402.04607', 'https://mastodon.social/redirect/statuses/111732713776994953'] reference_tagger=None multi_reference_tagger=[['agrees'], ['disagrees']] keywords=[] topics=[] hashtags=[] metadata_list=[RefMetadata(citoid_url='http://arxiv.org/abs/2402.04607', url='https://arxiv.org/abs/2402.04607', item_type='preprint', title='Google Scholar is manipulatable', summary="Citations are widely considered in scientists' evaluation. As such, scientists may be incentivized to inflate their citation counts. While previous literature has examined self-citations and citation cartels, it remains unclear whether scientists can purchase citations. Here, we compile a dataset of ~1.6 million profiles on Google Scholar to examine instances of citation fraud on the platform. We survey faculty at highly-ranked universities, and confirm

In [14]:
import wandb
import pandas as pd

wandb.login()

api = wandb.Api()

#TODO move from testing
run = wandb.init(project="testing", job_type="evaluation")

# get artifact path

dataset_artifact_id = (
        'common-sense-makers/filter_evaluation/labeled_tweets_no_threads:v1'
    )

# set artifact as input artifact
dataset_artifact = run.use_artifact(dataset_artifact_id)

# initialize table path
# add the option to call table_path =  arguments.get('--dataset')

# download path to table
a_path = dataset_artifact.download()
print("The path is",a_path)

# get dataset file name

table_path = Path(f"{a_path}/labeled_data_table_no_threads.table.json")


# return the pd df from the table
#remember to remove the head TODO
df = get_dataset(table_path)

table_path = Path(f"{a_path}/handles_chart.table.json")

df_handles = get_dataset(table_path)


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mshahar-r-oriel[0m ([33mcommon-sense-makers[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m:   2 of 2 files downloaded.  


The path is /Users/shaharorielkagan/sensemakers/nlp/notebooks/artifacts/labeled_tweets_no_threads:v1


In [15]:
import concurrent.futures
from tqdm import tqdm

def process_text(text):
    return convert_text_to_ref_post(text)

# Assuming df['Text'] is your DataFrame column
with concurrent.futures.ThreadPoolExecutor() as executor:
    inputs = list(tqdm(executor.map(process_text, df['Text']), total=len(df['Text'])))


100%|██████████| 467/467 [00:49<00:00,  9.41it/s]


In [16]:
inputs = inputs[:10]

In [31]:
# batch process
results = multi_chain_parser.batch_process_ref_posts(inputs,active_list=['multi_refs_tagger'],batch_size=10)

[32m2024-05-06 15:52:15.512[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.web_extractors.citoid[0m:[36mfetch_citation_async_retry[0m:[36m111[0m - [34m[1mtarget_url=https://civilizationemerging.com/[0m
[32m2024-05-06 15:52:15.526[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.web_extractors.citoid[0m:[36mfetch_citation_async_retry[0m:[36m111[0m - [34m[1mtarget_url=https://en.wikipedia.org/wiki/Epic_of_evolution[0m
[32m2024-05-06 15:52:15.528[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.web_extractors.citoid[0m:[36mfetch_citation_async_retry[0m:[36m111[0m - [34m[1mtarget_url=https://wiki.p2pfoundation.net/Category:Peerproduction#With_the_advent_of_the_P2P_Mode_of_Production%2C_the_community_and_its_common_is_now_the_appropriate_scale[0m
[32m2024-05-06 15:52:15.536[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.web_extractors.citoid[0m:[36mfetch_citation_async_retry[0m:[36m111[0m - [34

  0%|          | 0/10 [00:00<?, ?it/s]

[32m2024-05-06 15:52:22.695[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.parsers.multi_chain_parser[0m:[36mbatch_process_ref_posts[0m:[36m245[0m - [34m[1mInvoking parallel chain...[0m

sub_answer = SubAnswer()


sub_answer.reasoning_steps = "The post describes a quote of the day that talks about the multiplicity of ownership designs and the need to ecologize finance. The post also discusses the importance of redesigning social architectures and making finance more inclusive. The post references a category page on the P2P Foundation website. The post quotes text from the referenced page. Therefore, the tags that best characterize the post's relation to the reference are 'quote' and 'reference'."


sub_answer.candidate_tags = ["quote", "reference", "ecologize", "inclusive", "social architectures"]


sub_answer.final_answer = ["quote", "reference"]


answer = Answer()


answer.sub_answers = [sub_answer]
```

**Final Answer:**

```
{
  "sub_answers": [
    {
   

In [19]:
df=df[:10]

In [30]:
df["Predicted Labels"] = [x.multi_reference_tagger for x in results]
df["urls"] = [x.reference_urls for x in results]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Predicted Labels"] = [x.multi_reference_tagger for x in results]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["urls"] = [x.reference_urls for x in results]


[['https://en.wikipedia.org/wiki/Epic_of_evolution', 'https://en.wikipedia.org/wiki/Epic_of_evolution'], ['https://wiki.p2pfoundation.net/Category:Thermodynamic_Efficiencies'], ['https://lifehacker.com/tech/ai-is-running-out-of-internet'], ['https://www.journaloffreespeechlaw.org/'], ['https://wiki.p2pfoundation.net/Andrew_Targowski%27s_Classification_of_the_Civilizational_Approaches_To_Human_History'], ['https://civilizationemerging.com/', 'http://emergentcivilization.com/'], ['https://integralpermaculture.wordpress.com/peter-pogany/', 'https://integralpermaculture.wordpress.com/peter-pogany/'], ['https://wiki.p2pfoundation.net/Category:Peerproduction#With_the_advent_of_the_P2P_Mode_of_Production,_the_community_and_its_common_is_now_the_appropriate_scale'], ['https://wiki.p2pfoundation.net/Category:Peerproperty#Short_Citations'], ['https://autocatallaxy.com/']]
+++++
[['https://en.wikipedia.org/wiki/Epic_of_evolution', 'https://en.wikipedia.org/wiki/Epic_of_evolution'], ['https://wiki