In [26]:
import sys
sys.path.append("../")

import nest_asyncio
nest_asyncio.apply()

from urllib.parse import urlparse
import pandas as pd
from typing import List, Dict
from mastodon import Mastodon
from datetime import datetime, time

from desci_sense.shared_functions.schema.post import RefPost
from desci_sense.shared_functions.postprocessing import StreamlitParserResults
from desci_sense.shared_functions.postprocessing.output_parsers import AllowedTermsParser, ALLOWED_TAGS_DELIMITER
from desci_sense.shared_functions.utils import flatten
from desci_sense.shared_functions.web_extractors.metadata_extractors import (extract_all_metadata_by_type, MetadataExtractionType, RefMetadata, extract_all_metadata_to_dict,)
from desci_sense.shared_functions.dataloaders.mastodon.mastodon_loader import MastodonLoader

from langchain_core.runnables import RunnableParallel

from desci_sense.configs import default_init_parser_config
from desci_sense.shared_functions.dataloaders import scrape_post
from desci_sense.shared_functions.parsers.firebase_api_parser import FirebaseAPIParser, PromptCase

In [17]:
start_date = datetime(2024, 1, 1)
end_date = datetime(2024, 1, 30)
mloader = MastodonLoader()
acct = "@ronent@mastodon.social"
posts = mloader.load_profile_timeline(
    acct,
    max_toots=30,
    start_date=start_date,
    end_date=end_date,
    exclude_replies=True,
    exclude_reposts=True,
)
len(posts)

2

In [18]:
config = default_init_parser_config(semantics_model="mistralai/mistral-7b-instruct",
                                    kw_model="mistralai/mistral-7b-instruct")


In [19]:
parser = FirebaseAPIParser(config=config)

[32m2024-03-19 19:01:03.663[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.firebase_api_parser[0m:[36mset_md_extract_method[0m:[36m155[0m - [1mSetting metadata extraction method to none...[0m
[32m2024-03-19 19:01:03.664[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.firebase_api_parser[0m:[36m__init__[0m:[36m115[0m - [1mLoading parser model (type=mistralai/mistral-7b-instruct)...[0m
[32m2024-03-19 19:01:03.701[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.firebase_api_parser[0m:[36mset_kw_md_extract_method[0m:[36m159[0m - [1mSetting keywords metadata extraction method to citoid...[0m
[32m2024-03-19 19:01:03.701[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.firebase_api_parser[0m:[36minit_keyword_extraction_chain[0m:[36m449[0m - [1mLoading keyword model (type=mistralai/mistral-7b-instruct)...[0m
[32m2024-03-19 19:01:03.731[0m | [1mINFO    [0m | [36mdesci_sense.shared_fu

In [20]:
parser.set_md_extract_method("citoid")

[32m2024-03-19 19:01:06.216[0m | [1mINFO    [0m | [36mdesci_sense.shared_functions.parsers.firebase_api_parser[0m:[36mset_md_extract_method[0m:[36m155[0m - [1mSetting metadata extraction method to citoid...[0m


In [21]:
results = await parser.abatch_process_ref_post(posts)

[32m2024-03-19 19:01:09.916[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.web_extractors.citoid[0m:[36mfetch_citation_async[0m:[36m20[0m - [34m[1mtarget_url=https://paragraph.xyz/@sense-nets/sense-nets-intro[0m
[32m2024-03-19 19:01:09.916[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.web_extractors.citoid[0m:[36mfetch_citation_async[0m:[36m20[0m - [34m[1mtarget_url=https://paragraph.xyz/@sense-nets/2-project-plan[0m
[32m2024-03-19 19:01:09.917[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.web_extractors.citoid[0m:[36mfetch_citation_async[0m:[36m20[0m - [34m[1mtarget_url=https://arxiv.org/abs/2401.13782[0m
[32m2024-03-19 19:01:09.917[0m | [34m[1mDEBUG   [0m | [36mdesci_sense.shared_functions.web_extractors.citoid[0m:[36mfetch_citation_async[0m:[36m20[0m - [34m[1mtarget_url=https://twitter.com/deliprao/status/1750732070014337101[0m


In [22]:
len(results)

2

In [23]:
def render_posts_to_df(posts: List[RefPost]) -> pd.DataFrame:
    """
    Renders posts as dataframe with columns `author`, `url`, `content`, and `created_at` fields.
    """
    # Create a list of dictionaries, each representing a row in the resulting DataFrame
    data = [
        {
            "author": post.author,
            "content": post.content,
            "url": post.url,
            "created_at": post.created_at,
        }
        for post in posts
    ]

    # Convert the list of dictionaries into a DataFrame
    df = pd.DataFrame(data)

    return df

In [24]:
df = render_posts_to_df(posts)
df

Unnamed: 0,author,content,url,created_at
0,Ronen Tamari,Also relevant for discussions about science so...,https://mastodon.social/@ronent/11182257120421...,2024-01-26 13:50:26.443000+00:00
1,Ronen Tamari,"*crosspost from birdsite* \nNew year, new way...",https://mastodon.social/@ronent/11168703832254...,2024-01-02 15:22:38.781000+00:00


In [25]:
df["science_filter"] = [r.research_filter for r in results]

df

Unnamed: 0,author,content,url,created_at,science_filter
0,Ronen Tamari,Also relevant for discussions about science so...,https://mastodon.social/@ronent/11182257120421...,2024-01-26 13:50:26.443000+00:00,not-detected
1,Ronen Tamari,"*crosspost from birdsite* \nNew year, new way...",https://mastodon.social/@ronent/11168703832254...,2024-01-02 15:22:38.781000+00:00,academic


In [29]:
df["reference_urls"] = [r.reference_urls for r in results]

In [30]:
df

Unnamed: 0,author,content,url,created_at,science_filter,research_filter,item_types,reference_urls
0,Ronen Tamari,Also relevant for discussions about science so...,https://mastodon.social/@ronent/11182257120421...,2024-01-26 13:50:26.443000+00:00,not-detected,not-detected,,"[https://arxiv.org/abs/2401.13782, https://twi..."
1,Ronen Tamari,"*crosspost from birdsite* \nNew year, new way...",https://mastodon.social/@ronent/11168703832254...,2024-01-02 15:22:38.781000+00:00,academic,,,[https://paragraph.xyz/@sense-nets/sense-nets-...


In [None]:
# def add_results_to_df(df: pd.DataFrame, results: List[StreamlitParserResults]) -> pd.DataFrame:
    # """
    # Add columns to the dataframe `df` corresponding to each `StreamlitParserResults` field.
    # Each row in `results` corresponds to a row in `df` and should extend it by adding those columns
    # """

In [27]:
def add_results_to_df(df: pd.DataFrame, results: List[StreamlitParserResults]) -> pd.DataFrame:
    # Check if df is not empty and has the same number of rows as the length of results
    if not df.empty and len(df) == len(results):
        # Iterate through each result and the corresponding index
        for idx, result in enumerate(results):
            # For each attribute in result, add it as a new column in df
            for field in result.model_dump().keys():
                if field != "debug":
                    df.loc[idx, field] = result.model_dump()[field]
    else:
        # Handling case where df is empty or row counts do not match
        # This could be an error or you might want to append rows based on results
        raise ValueError("DataFrame is empty or does not match the number of results provided.")

    return df

In [28]:
new_df = add_results_to_df(df, results)

ValueError: Must have equal len keys and value when setting with an iterable

In [36]:
def add_results_to_df(df: pd.DataFrame, results: List[StreamlitParserResults]) -> pd.DataFrame:
    # Ensure the DataFrame and results list have the same length
    if not df.empty and len(df) == len(results):
        # Iterate over the schema of StreamlitParserResults to get all field names
        for field_name in StreamlitParserResults.model_fields.keys():
            if field_name != "debug":
                # For each field, create a new column in df with the values from results
                df[field_name] = [getattr(r, field_name) for r in results]
    else:
        raise ValueError("DataFrame is empty or does not match the number of results provided.")
    
    return df

In [37]:
new_df = add_results_to_df(df, results)
new_df

Unnamed: 0,author,content,url,created_at,science_filter,research_filter,item_types,reference_urls,semantic_tags,keywords,debug
0,Ronen Tamari,Also relevant for discussions about science so...,https://mastodon.social/@ronent/11182257120421...,2024-01-26 13:50:26.443000+00:00,not-detected,not-detected,"[preprint, unknown]","[https://arxiv.org/abs/2401.13782, https://twi...",[discussion],"[citation-counts, nanopub, social-media, resea...",{'semantics': {'prompt': ' You are an expert a...
1,Ronen Tamari,"*crosspost from birdsite* \nNew year, new way...",https://mastodon.social/@ronent/11168703832254...,2024-01-02 15:22:38.781000+00:00,academic,academic,"[webpage, webpage]",[https://paragraph.xyz/@sense-nets/sense-nets-...,[call-for-papers],"[FragmentedScienceSocialMedia, ScienceSocialMe...",{'semantics': {'prompt': ' You are an expert a...
