In [2]:
import nest_asyncio
nest_asyncio.apply()
from pathlib import Path

import sys
sys.path.append("../")
from typing import Optional, Union, List
import re
import requests
from datetime import datetime

from desci_sense.shared_functions.dataloaders import (
    scrape_post,
    convert_text_to_ref_post,
)
from desci_sense.shared_functions.configs import MetadataExtractionType
from desci_sense.shared_functions.web_extractors.metadata_extractors import extract_all_metadata_to_dict
from desci_sense.shared_functions.dataloaders.twitter.twitter_utils import convert_vxtweet_to_quote_ref_post, convert_vxtweet_to_ref_post
from desci_sense.shared_functions.schema.post import QuoteRefPost, ThreadRefPost, RefPost
from desci_sense.shared_functions.interface import ThreadPostInterface
from desci_sense.shared_functions.utils import remove_dups_ordered, find_last_occurence_of_any, extract_and_expand_urls, extract_external_urls_from_status_tweet

In [3]:
# Sample thread payload used by the cells below.
# "content" holds the text of every post in the thread joined by "\n---\n";
# "quotedPosts" lists the tweets quoted by the thread, whose URLs also appear
# inline in "content".
MULTI_QUOTE_THREAD = {
    "url": "https://x.com/JohnDoe/status/1798166864398271",
    "content": "After careful consideration, the FDA advisory commission voted today 9:2 that MDMA has *not* been shown to be effective for treating PTSD, given massive concerns around validity threats in this literature. They also voted 10:1 that MDMA has *not* shown to be safe. https://twitter.com/FDAadcomms/status/1798104612635070611\n---\n📄Many mentioned reasons overlap with those we summarized recently in our review paper: \nhttps://journals.sagepub.com/doi/10.1177/20451253231198466\n\n📺 I also summarize them for a lay audience in this YouTube video: \nhttps://youtu.be/WknlkmJee4E?si=kjMtNR1Hwe7NZ8as\n---\nSome pretty wild things in the meeting honestly, thanks to @JaneSmith for live tweeting.\n\nEg folks who were paid by the sponsor (big pharma) to speak on behalf of the product to be marketed did *not* have to declare they were being paid.\n---\n@JaneSmith Here is the full thread: https://twitter.com/JaneSmith/status/1798046087737180395\n---\n@JaneSmith Here the second vote on benefits and risks: https://twitter.com/FDAadcomms/status/1798107142219796794",
    # author of the thread itself
    "author": {
        "id": "16861042",
        "name": "John Doe",
        "username": "JohnDoe",
        "platformId": "twitter",
    },
    # one entry per quoted tweet; each "url" here matches a URL embedded in "content"
    "quotedPosts": [
        {
            "url": "https://twitter.com/FDAadcomms/status/1798104612635070611",
            "content": "@JaneSmith #MDMAadcomm VOTE 1/2: Do the available data show that the drug is effective in patients with posttraumatic\nstress disorder?\n2-Yes\n9-No\n0-Abstain https://twitter.com/FDAadcomms/status/1798104612635070611/photo/1",
            "author": {
                "id": "148067168202752",
                "name": "FDAadcomms",
                "username": "FDAadcomms",
                "platformId": "twitter",
            },
        },
        {
            "url": "https://twitter.com/JaneSmith/status/1798046087737180395",
            "content": 'Next up on the AdComm agenda, when they come back from lunch at the top of hour, is the Open Public Hearing. For reasons mentioned below, don\'t be surprised if the "public" consists more of advocates for approval, and we hear from relatively few with reservations. https://twitter.com/JaneSmith/status/1797349211849245178',
            "author": {
                "id": "42893834",
                "name": "Jane Smith",
                "username": "JaneSmith",
                "platformId": "twitter",
            },
        },
        {
            "url": "https://twitter.com/FDAadcomms/status/1798107142219796794",
            "content": "@JaneSmith #MDMAadcomm VOTE 2/2: Do the benefits of midomafetamine with FDA’s proposed risk evaluation and mitigation strategy (REMS) outweigh its risks for the treatment of patients with PTSD?\n1-Yes\n10-No\n0-Abstain https://twitter.com/FDAadcomms/status/1798107142219796794/photo/1",
            "author": {
                "id": "1480844662752",
                "name": "FDAadcomms",
                "username": "FDAadcomms",
                "platformId": "twitter",
            },
        },
    ],
}

In [4]:
# Turn the raw sample payload into a ThreadPostInterface object.
thread_interface = ThreadPostInterface.model_validate(
    MULTI_QUOTE_THREAD
)

In [5]:
# The thread text concatenates its posts with a "---" line between them;
# cut it back into one string per post and show the result.
thread_posts_content = thread_interface.content.split(sep="\n---\n")
thread_posts_content

['After careful consideration, the FDA advisory commission voted today 9:2 that MDMA has *not* been shown to be effective for treating PTSD, given massive concerns around validity threats in this literature. They also voted 10:1 that MDMA has *not* shown to be safe. https://twitter.com/FDAadcomms/status/1798104612635070611',
 '📄Many mentioned reasons overlap with those we summarized recently in our review paper: \nhttps://journals.sagepub.com/doi/10.1177/20451253231198466\n\n📺 I also summarize them for a lay audience in this YouTube video: \nhttps://youtu.be/WknlkmJee4E?si=kjMtNR1Hwe7NZ8as',
 'Some pretty wild things in the meeting honestly, thanks to @JaneSmith for live tweeting.\n\nEg folks who were paid by the sponsor (big pharma) to speak on behalf of the product to be marketed did *not* have to declare they were being paid.',
 '@JaneSmith Here is the full thread: https://twitter.com/JaneSmith/status/1798046087737180395',
 '@JaneSmith Here the second vote on benefits and risks: htt

In [6]:
# Convert every quoted post of the thread into a RefPost, then index the
# converted posts by their URL for lookup.
converted_quoted_posts = list(
    map(RefPost.from_basic_post_interface, thread_interface.quotedPosts)
)
quote_post_dict = {
    ref_post.url: ref_post for ref_post in converted_quoted_posts
}

In [7]:
# Build one QuoteRefPost for each post of the thread.
quote_ref_posts = []
for post_content in thread_posts_content:
    # Ask the helper which of the known quoted-post URLs applies to this post.
    quoted_post_url = find_last_occurence_of_any(
        post_content,
        quote_post_dict.keys(),
    )
    # A URL that is not a key of the lookup (e.g. nothing matched) yields None.
    quoted_post = quote_post_dict.get(quoted_post_url)
    # Every post carries the thread's author and URL, but its own text and links.
    quote_ref_post = QuoteRefPost(
        author=thread_interface.author.name,
        url=thread_interface.url,
        content=post_content,
        ref_urls=extract_and_expand_urls(post_content),
        quoted_post=quoted_post,
    )
    quote_ref_posts.append(quote_ref_post)


In [8]:
# Inspect the URL -> RefPost lookup built from the thread's quoted posts.
quote_post_dict

{'https://twitter.com/FDAadcomms/status/1798104612635070611': RefPost(author='FDAadcomms', content='@JaneSmith #MDMAadcomm VOTE 1/2: Do the available data show that the drug is effective in patients with posttraumatic\nstress disorder?\n2-Yes\n9-No\n0-Abstain https://twitter.com/FDAadcomms/status/1798104612635070611/photo/1', url='https://twitter.com/FDAadcomms/status/1798104612635070611', source_network='twitter'),
 'https://twitter.com/JaneSmith/status/1798046087737180395': RefPost(author='Jane Smith', content='Next up on the AdComm agenda, when they come back from lunch at the top of hour, is the Open Public Hearing. For reasons mentioned below, don\'t be surprised if the "public" consists more of advocates for approval, and we hear from relatively few with reservations. https://twitter.com/JaneSmith/status/1797349211849245178', url='https://twitter.com/JaneSmith/status/1798046087737180395', source_network='twitter', ref_urls=['https://twitter.com/JaneSmith/status/179734921184924517

In [8]:
# Quoted post attached to the first post of the thread
# (in the recorded output: the FDAadcomms "VOTE 1/2" tweet).
quote_ref_posts[0].quoted_post

RefPost(author='FDAadcomms', content='@JaneSmith #MDMAadcomm VOTE 1/2: Do the available data show that the drug is effective in patients with posttraumatic\nstress disorder?\n2-Yes\n9-No\n0-Abstain https://twitter.com/FDAadcomms/status/1798104612635070611/photo/1', url='https://twitter.com/FDAadcomms/status/1798104612635070611', source_network='twitter', ref_urls=['https://twitter.com/FDAadcomms/status/1798104612635070611/photo/1'])

In [9]:
# Quoted post attached to the fourth post of the thread
# (in the recorded output: Jane Smith's tweet).
quote_ref_posts[3].quoted_post

RefPost(author='Jane Smith', content='Next up on the AdComm agenda, when they come back from lunch at the top of hour, is the Open Public Hearing. For reasons mentioned below, don\'t be surprised if the "public" consists more of advocates for approval, and we hear from relatively few with reservations. https://twitter.com/JaneSmith/status/1797349211849245178', url='https://twitter.com/JaneSmith/status/1798046087737180395', source_network='twitter', ref_urls=['https://twitter.com/JaneSmith/status/1797349211849245178'])

In [10]:
# Quoted post attached to the fifth post of the thread
# (in the recorded output: the FDAadcomms "VOTE 2/2" tweet).
quote_ref_posts[4].quoted_post

RefPost(author='FDAadcomms', content='@JaneSmith #MDMAadcomm VOTE 2/2: Do the benefits of midomafetamine with FDA’s proposed risk evaluation and mitigation strategy (REMS) outweigh its risks for the treatment of patients with PTSD?\n1-Yes\n10-No\n0-Abstain https://twitter.com/FDAadcomms/status/1798107142219796794/photo/1', url='https://twitter.com/FDAadcomms/status/1798107142219796794', source_network='twitter', ref_urls=['https://twitter.com/FDAadcomms/status/1798107142219796794/photo/1'])

In [9]:
# Flatten the per-post URL lists into one list, keeping thread order
# (duplicates, if any, are kept).
all_ref_urls = [
    ref_url
    for thread_post in quote_ref_posts
    for ref_url in thread_post.ref_urls
]

In [10]:
# Assemble the thread-level post from the thread's own metadata, the URLs
# gathered from all of its posts, and the per-post QuoteRefPost objects.
thread_ref_post = ThreadRefPost(
    author=thread_interface.author.name,
    url=thread_interface.url,
    content=thread_interface.content,
    source_network=thread_interface.author.platformId,
    ref_urls=all_ref_urls,
    posts=quote_ref_posts,
)

In [11]:
# URLs reported by md_ref_urls() for the whole thread. In the recorded output
# this covers the links in the posts, the quoted tweets, and a link that only
# appears inside a quoted tweet's text.
thread_ref_post.md_ref_urls()

['https://twitter.com/FDAadcomms/status/1798104612635070611',
 'https://journals.sagepub.com/doi/10.1177/20451253231198466',
 'https://www.youtube.com/watch?feature=youtu.be&si=kjMtNR1Hwe7NZ8as&v=WknlkmJee4E',
 'https://twitter.com/JaneSmith/status/1798046087737180395',
 'https://twitter.com/JaneSmith/status/1797349211849245178',
 'https://twitter.com/FDAadcomms/status/1798107142219796794']

In [2]:
# Load a single tweet by URL through scrape_post.
# NOTE(review): presumably this performs a network request and returns a
# QuoteRefPost — confirm. It also rebinds `quote_ref_post`, the name used as
# the loop variable when building `quote_ref_posts` earlier in the notebook.
tweet_url = "https://x.com/StephensonJones/status/1799035911042482210"
quote_ref_post = scrape_post(tweet_url)

In [4]:
# create dict of metadata
# `ordered_refs` was not assigned by any cell of this notebook, so this cell
# raised NameError after Restart Kernel -> Run All. Derive it from the post
# scraped in the previous cell instead.
# NOTE(review): assumes the object returned by scrape_post exposes
# md_ref_urls() the same way ThreadRefPost does — confirm.
ordered_refs = quote_ref_post.md_ref_urls()
# NOTE(review): the meaning of the third positional argument (500) is not
# visible from this notebook — confirm against extract_all_metadata_to_dict.
md_dict = extract_all_metadata_to_dict(ordered_refs, MetadataExtractionType.CITOID, 500)

[32m2024-06-12 13:17:56.505[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.web_extractors.citoid[0m:[36mfetch_citation_async_retry[0m:[36m104[0m - [34m[1mskipping citoid for https://x.com/biorxiv_neursci/status/1798962015148576815[0m
[32m2024-06-12 13:17:56.506[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.web_extractors.citoid[0m:[36mfetch_citation_async_retry[0m:[36m111[0m - [34m[1mtarget_url=https://www.biorxiv.org/content/10.1101/2024.06.05.597547v1[0m
