In [1]:
# Load API keys from a local env file using the `dotenv` IPython extension
# (approach from https://stackoverflow.com/a/54028874), so that no key is
# hardcoded in the notebook itself.
%load_ext dotenv
%dotenv ../etc/config.env



In [2]:
import sys

import pandas as pd
from confection import Config
from sklearn.metrics import precision_recall_fscore_support
from tqdm import tqdm

sys.path.append("../")

from desci_sense.configs import init_config
from desci_sense.parsers.base_parser import BaseParser

In [3]:
# Build the parser configuration, selecting the Mistral 7B instruct model by name.
config = init_config(
    model_name="mistralai/mistral-7b-instruct",
)

In [4]:

# Alternative model id kept for reference (presumably a drop-in value for
# init_config's `model_name` — TODO confirm):
# model_name = "openai/gpt-3.5-turbo"

# Instantiate the tweet parser from the `config` object created in the previous cell.
tweet_parser = BaseParser(config=config)

                    headers was transferred to model_kwargs.
                    Please confirm that headers is what you intended.


# Test prediction on a single tweet

In [5]:
# URL of the tweet used for the single-prediction check below: a job-posting tweet
# whose tweetID (1719789465739333972) also appears as row 7 of the labelled dataset.
TEST_TWEET_7 = "https://twitter.com/bingbrunton/status/1719789465739333972"

In [8]:
# Run the parser on the tweet URL. The returned dict has a "tweet" entry (the tweet's
# metadata and text) and an "answer" entry, both of which are read in the cells below.
result = tweet_parser.process_tweet_url(TEST_TWEET_7)

In [9]:
# Display the raw result dict returned by the parser.
result

{'tweet': {'conversationID': '1719789465739333972',
  'date': 'Wed Nov 01 18:52:15 +0000 2023',
  'date_epoch': 1698864735,
  'hashtags': [],
  'likes': 123,
  'mediaURLs': ['https://pbs.twimg.com/media/F92_tj1asAAAoR-.jpg'],
  'media_extended': [{'altText': None,
    'size': {'height': 1279, 'width': 3199},
    'thumbnail_url': 'https://pbs.twimg.com/media/F92_tj1asAAAoR-.jpg',
    'type': 'image',
    'url': 'https://pbs.twimg.com/media/F92_tj1asAAAoR-.jpg'}],
  'possibly_sensitive': False,
  'qrtURL': None,
  'replies': 2,
  'retweets': 69,
  'text': 'My department at Univ Washington in Seattle is searching for a tenure-track assistant prof in "Quantitative Understanding of Collective Behavior" @UWBiology\n\nSee ad for more info about the search vision and to apply; happy to answer questions!\n\nhttps://apply.interfolio.com/130336 https://t.co/ydKZVuAeeY',
  'tweetID': '1719789465739333972',
  'tweetURL': 'https://twitter.com/bingbrunton/status/1719789465739333972',
  'user_name': '

In [10]:
# Show the model's reasoning text, including the candidate tags it considered.
print(result["answer"]["reasoning"])

[Reasoning Steps]

1. The post is about a job listing.
2. The job is for a tenure-track assistant professor in "Quantitative Understanding of Collective Behavior" at Univ Washington in Seattle.
3. The post includes a link to the job ad and encourages applicants to apply.

[Candidate Tags]

<job>
<other>


In [11]:
# Show the single tag the model settled on.
print(result["answer"]["final_answer"])

<job>


In [12]:
# Show the tweet text that was classified, for comparison with the answer above.
print(result["tweet"]["text"])

My department at Univ Washington in Seattle is searching for a tenure-track assistant prof in "Quantitative Understanding of Collective Behavior" @UWBiology

See ad for more info about the search vision and to apply; happy to answer questions!

https://apply.interfolio.com/130336 https://t.co/ydKZVuAeeY


# Load data

In [9]:
# Load the gold-labelled tweets from a Google Sheet exported as CSV.
# Rows that pandas cannot parse are skipped rather than raising.
df = pd.read_csv(
    "https://docs.google.com/spreadsheets/d/1edkPVtGwaCeQCMtooPD5veNrPeUOYUa8hbeFocNa0LU/export?gid=1445397290&format=csv",
    on_bad_lines="skip",
)

In [10]:
# Display the labelled dataset: tweet URL, text, tweetID and the gold tag per row.
df

Unnamed: 0.1,Unnamed: 0,url,tweet_text,tweetID,gold
0,0,https://twitter.com/danwilliamsphil/status/171...,"I enjoyed this, although I was already sympath...",1719436704602275858,<review>
1,1,https://twitter.com/pwang/status/1719720728184...,100% with Stella @BlancheMinerva (and @AndrewY...,1719720728184910195,<other>
2,2,https://twitter.com/BlancheMinerva/status/1719...,There are hundreds of researches around the wo...,1719714881081954409,<other>
3,3,https://twitter.com/sucholutsky/status/1719725...,🧵 Excited to share another new paper with @coc...,1719725087681569189,<announce>
4,4,https://twitter.com/TBSocialist/status/1719676...,If you went to or watched the network state co...,1719676110785421807,<other>
5,5,https://twitter.com/DG_Rand/status/17193724593...,🚨New WP🚨\nHow can more Republicans be convince...,1719372459344888032,<announce>
6,6,https://twitter.com/yanaiela/status/1719755578...,What's In My Big Data?\n\nA question we've bee...,1719755578409619740,<announce>
7,7,https://twitter.com/bingbrunton/status/1719789...,My department at Univ Washington in Seattle is...,1719789465739333972,<job>


# Run parser on data

In [11]:
# Run the parser on every labelled tweet URL, collecting one result dict per row.
#
# Only the "url" column is needed, so iterate over it directly instead of using
# iterrows() with an unused index. The per-row value is appended without binding
# it to `result`, so the single-tweet `result` shown earlier in the notebook is
# not overwritten by this loop.
#
# NOTE(review): the single-tweet cell calls `process_tweet_url` with a URL, while
# this loop passes a URL to `process_tweet` — confirm this is the intended method.
results = []

for url in tqdm(df["url"], total=len(df)):
    results.append(tweet_parser.process_tweet(url))

  0%|          | 0/8 [00:00<?, ?it/s]

100%|██████████| 8/8 [00:43<00:00,  5.48s/it]


In [12]:

# Flatten each parser result into a record holding only the fields needed for
# evaluation, then assemble the records into a DataFrame in one step.
processed_results = [
    {
        "tweetID": parsed["tweet"]["tweetID"],
        "tweet_text": parsed["tweet"]["text"],
        "answer": parsed["answer"]["final_answer"],
        "reasoning": parsed["answer"]["reasoning"],
    }
    for parsed in results
]

res_df = pd.DataFrame(processed_results)

In [13]:
# The parser returns tweetID as a string (see the single-tweet result above); cast it
# to int64 so it can be used as the join key against the labelled dataset's tweetID
# column (presumably read as integers by read_csv — verify with df.dtypes).
res_df = res_df.astype({"tweetID": 'int64'})

In [14]:
# Check which columns the results frame contains.
res_df.columns

Index(['tweetID', 'tweet_text', 'answer', 'reasoning'], dtype='object')

In [15]:
# Show the predictions without the full tweet text column.
res_df[['tweetID', 'answer', 'reasoning']]

Unnamed: 0,tweetID,answer,reasoning
0,1719436704602275858,<review>,"1. The post is a review of a reference, as it ..."
1,1719720728184910195,<announce>,1. The post discusses a research-related topic...
2,1719714881081954409,<announce>,1. The post discusses safety-critical research...
3,1719725087681569189,<announce>,1. The post contains a new research paper that...
4,1719676110785421807,<other>,1. The post appears to be a personal opinion o...
5,1719372459344888032,<announce>,1. The post is about a new study related to cl...
6,1719755578409619740,<other>,1. The post is asking a question related to bi...
7,1719789465739333972,<job>,The post is about a job listing and is looking...


In [16]:
# Attach the parser's answer and reasoning to the gold-labelled rows, matching on
# tweetID. This is an inner join (the pandas default), so a tweet missing from
# either frame is dropped from the evaluation set.
predictions = res_df[["tweetID", "answer", "reasoning"]]
merged_df = df.merge(predictions, on="tweetID", how="inner")
merged_df

Unnamed: 0.1,Unnamed: 0,url,tweet_text,tweetID,gold,answer,reasoning
0,0,https://twitter.com/danwilliamsphil/status/171...,"I enjoyed this, although I was already sympath...",1719436704602275858,<review>,<review>,"1. The post is a review of a reference, as it ..."
1,1,https://twitter.com/pwang/status/1719720728184...,100% with Stella @BlancheMinerva (and @AndrewY...,1719720728184910195,<other>,<announce>,1. The post discusses a research-related topic...
2,2,https://twitter.com/BlancheMinerva/status/1719...,There are hundreds of researches around the wo...,1719714881081954409,<other>,<announce>,1. The post discusses safety-critical research...
3,3,https://twitter.com/sucholutsky/status/1719725...,🧵 Excited to share another new paper with @coc...,1719725087681569189,<announce>,<announce>,1. The post contains a new research paper that...
4,4,https://twitter.com/TBSocialist/status/1719676...,If you went to or watched the network state co...,1719676110785421807,<other>,<other>,1. The post appears to be a personal opinion o...
5,5,https://twitter.com/DG_Rand/status/17193724593...,🚨New WP🚨\nHow can more Republicans be convince...,1719372459344888032,<announce>,<announce>,1. The post is about a new study related to cl...
6,6,https://twitter.com/yanaiela/status/1719755578...,What's In My Big Data?\n\nA question we've bee...,1719755578409619740,<announce>,<other>,1. The post is asking a question related to bi...
7,7,https://twitter.com/bingbrunton/status/1719789...,My department at Univ Washington in Seattle is...,1719789465739333972,<job>,<job>,The post is about a job listing and is looking...


# Evaluation

In [17]:
# Overall accuracy: the fraction of rows where the predicted tag equals the gold tag.
total = len(merged_df)
correct = merged_df["answer"].eq(merged_df["gold"]).sum()
acc = correct / total
print(f"Performance summary: {correct}/{total} = {acc}")

Performance summary: 5/8 = 0.625


In [22]:
# Per-tag precision, recall, F-score and support (average=None keeps one value per
# label). The result is a tuple of four arrays in that order, each aligned with the
# order of `labels` below.
labels = ["<announce>", "<review>", "<job>", "<other>"]
precision_recall_fscore_support(
    merged_df["gold"],
    merged_df["answer"],
    labels=labels,
    average=None,
)

(array([0.5, 1. , 1. , 0.5]),
 array([0.66666667, 1.        , 1.        , 0.33333333]),
 array([0.57142857, 1.        , 1.        , 0.4       ]),
 array([3, 1, 1, 3]))