In [11]:
import json
import sys
from typing import List
sys.path.append("../")

import pandas as pd

from desci_sense.schema.post import RefPost
from desci_sense.dataloaders.twitter.twitter_archive_loader import TwitterArchiveLoader
from desci_sense.dataloaders.twitter.twitter_archive_parser import parse_tweets, PathConfig, extract_username, read_json_from_js_file
from desci_sense.dataloaders.twitter.twitter_utils import convert_archive_tweet_to_ref_post


In [3]:
# Directory of the Twitter archive export to load.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
archive_path = "/home/rkl25/dev/common_sense/desci-sense/etc/data/twitter_archive_rt"
# Loader instance used in the next cell to read posts from the archive directory.
tw_archive_loader = TwitterArchiveLoader()

In [5]:
# Load posts from the archive directory.
# NOTE(review): cutoff_date presumably keeps only posts dated on/after 2024-01-01
# (the recorded dates are all within January 2024) — confirm against the loader.
ref_posts = tw_archive_loader.load_ref_posts_from_archive_dir(archive_path, cutoff_date="2024-01-01")
# Display how many posts were loaded.
len(ref_posts)

Parsing /home/rkl25/dev/common_sense/desci-sense/etc/data/twitter_archive_rt/data/account.js...
Parsing /home/rkl25/dev/common_sense/desci-sense/etc/data/twitter_archive_rt/data/tweets.js...


91

In [14]:
def create_dataframe_from_refposts(ref_posts: List[RefPost]):
    """
    Tabulate a list of RefPost objects as a pandas DataFrame.

    Args:
    ref_posts (List[RefPost]): Posts to tabulate. Each post must expose the
        `url`, `content`, `created_at` and `ref_urls` attributes.

    Returns:
    pd.DataFrame: One row per post, in input order, with the columns
        'post_url', 'text', 'date' and 'ref_urls'.
    """
    # (output column, RefPost attribute it is read from), in column order.
    column_sources = (
        ('post_url', 'url'),
        ('text', 'content'),
        ('date', 'created_at'),
        ('ref_urls', 'ref_urls'),
    )

    # One plain dict per post; the frame is built once from the full list.
    rows = []
    for ref_post in ref_posts:
        rows.append({
            column: getattr(ref_post, attribute)
            for column, attribute in column_sources
        })

    # Passing the column names explicitly keeps them present for an empty input.
    column_names = [column for column, _ in column_sources]
    return pd.DataFrame(rows, columns=column_names)


In [15]:
# Tabulate the loaded posts (one row per post) and display the resulting frame.
df = create_dataframe_from_refposts(ref_posts)
df

Unnamed: 0,post_url,text,date,ref_urls
0,https://twitter.com/rtk254/status/175014931339...,"""“We’ll have flying cars before we will have A...",2024-01-24 13:31:27+00:00,[https://www.edsurge.com/news/2024-01-22-a-tec...
1,https://twitter.com/rtk254/status/174998851351...,RT @CultureDevelops: Our developmental perspec...,2024-01-24 02:52:29+00:00,[]
2,https://twitter.com/rtk254/status/174964249224...,https://t.co/CZSWlkPtgz https://t.co/K5ZmmvEmNe,2024-01-23 03:57:31+00:00,[https://twitter.com/brightabyss/status/164543...
3,https://twitter.com/rtk254/status/174964148174...,The human era will soon be over if we don't st...,2024-01-23 03:53:30+00:00,[]
4,https://twitter.com/rtk254/status/174958553109...,RT @BenzionSanders: “My name is Roni. I lost m...,2024-01-23 00:11:10+00:00,[]
...,...,...,...,...
86,https://twitter.com/rtk254/status/174184164097...,@AsteraInstitute Nanopublications @nanopub_org...,2024-01-01 15:19:43+00:00,[]
87,https://twitter.com/rtk254/status/174184163337...,@AsteraInstitute (1) Lack of support for diver...,2024-01-01 15:19:41+00:00,[]
88,https://twitter.com/rtk254/status/174184162201...,@AsteraInstitute But first - why do we even ne...,2024-01-01 15:19:39+00:00,[]
89,https://twitter.com/rtk254/status/174184161251...,@AsteraInstitute Sensemaking Networks is takin...,2024-01-01 15:19:36+00:00,[https://twitter.com/chazfirestone/status/1727...


In [2]:
# Re-declares the same archive directory used earlier in the notebook.
archive_path = "/home/rkl25/dev/common_sense/desci-sense/etc/data/twitter_archive_rt"

# Path configuration for the archive; later cells read `paths.files_input_tweets` from it.
paths = PathConfig(dir_archive=archive_path)
# Extract the archive owner's username from data/account.js
username = extract_username(paths)
# `users` is only referenced by the commented-out parse_tweets call below.
users = {}
# tweets = parse_tweets(username, users, paths)

Parsing /home/rkl25/dev/common_sense/desci-sense/etc/data/twitter_archive_rt/data/account.js...


In [3]:
# Parse every tweets .js file of the archive and keep the records grouped by
# the file they came from: {file path -> records parsed from that file}.
#
# Later cells look records up as tweets[<path to tweets.js>][<index>], so the
# container has to be a mapping keyed by file path rather than one flat list.
tweets = {}
for tweets_js_filename in paths.files_input_tweets:
    # Do not name the parsed result `json`: that would rebind the `json`
    # module imported at the top of the notebook for every later cell.
    # str() keeps the key a plain string even if the path config yields
    # path objects (for a str it is a no-op).
    tweets[str(tweets_js_filename)] = read_json_from_js_file(tweets_js_filename)

Parsing /home/rkl25/dev/common_sense/desci-sense/etc/data/twitter_archive_rt/data/tweets.js...


In [4]:
# Look up the records of tweets.js by file path, take the first one and display it.
# This requires `tweets` to be a mapping keyed by file path.
tweet = tweets['/home/rkl25/dev/common_sense/desci-sense/etc/data/twitter_archive_rt/data/tweets.js'][0]
tweet

{'tweet': {'edit_info': {'initial': {'editTweetIds': ['1750149313399832818'],
    'editableUntil': '2024-01-24T14:31:27.000Z',
    'editsRemaining': '5',
    'isEditEligible': True}},
  'retweeted': False,
  'source': '<a href="https://mobile.twitter.com" rel="nofollow">Twitter Web App</a>',
  'entities': {'hashtags': [],
   'symbols': [],
   'user_mentions': [],
   'urls': [{'url': 'https://t.co/HK6aOfX4Dm',
     'expanded_url': 'https://www.edsurge.com/news/2024-01-22-a-technologist-spent-years-building-an-ai-chatbot-tutor-he-decided-it-can-t-be-done',
     'display_url': 'edsurge.com/news/2024-01-2…',
     'indices': ['216', '239']}]},
  'display_text_range': ['0', '239'],
  'favorite_count': '0',
  'id_str': '1750149313399832818',
  'truncated': False,
  'retweet_count': '0',
  'id': '1750149313399832818',
  'possibly_sensitive': False,
  'created_at': 'Wed Jan 24 13:31:27 +0000 2024',
  'favorited': False,
  'full_text': '"“We’ll have flying cars before we will have AI tutors,” he

In [5]:
# Convert the raw archive record into a post object and display its fields as a dict.
ref_post = convert_archive_tweet_to_ref_post(tweet, username)
ref_post.dict()

{'author': 'rtk254',
 'content': '"“We’ll have flying cars before we will have AI tutors,” he says. “It is a  deeply human process that AI is hopelessly incapable of meeting in a  meaningful way. It’s like being a therapist or like being a nurse.”"\n\nhttps://t.co/HK6aOfX4Dm',
 'url': 'https://twitter.com/rtk254/status/1750149313399832818',
 'created_at': datetime.datetime(2024, 1, 24, 13, 31, 27, tzinfo=datetime.timezone.utc),
 'metadata': {},
 'source_network': 'twitter',
 'type': 'ReferencePost',
 'ref_urls': ['https://www.edsurge.com/news/2024-01-22-a-technologist-spent-years-building-an-ai-chatbot-tutor-he-decided-it-can-t-be-done']}

In [8]:
# Same conversion for the second record of tweets.js (index 1), shown as a dict.
# NOTE(review): the recorded output has 'created_at' as an unparsed string, unlike the
# datetime shown for the first record — possibly output from an older run; confirm.
convert_archive_tweet_to_ref_post(tweets['/home/rkl25/dev/common_sense/desci-sense/etc/data/twitter_archive_rt/data/tweets.js'][1], username).dict()

{'author': 'rtk254',
 'content': 'RT @CultureDevelops: Our developmental perspective is founded on a “New Truth,” which is now becoming increasingly evident:\n\nCulture and co…',
 'url': 'https://twitter.com/rtk254/status/1749988513519481012',
 'created_at': 'Wed Jan 24 02:52:29 +0000 2024',
 'metadata': {},
 'source_network': 'twitter',
 'type': 'ReferencePost',
 'ref_urls': []}

In [10]:
# Keep the raw second record (index 1) of tweets.js for inspection; it is passed
# to is_retweet further down. Display it as-is.
post_json = tweets['/home/rkl25/dev/common_sense/desci-sense/etc/data/twitter_archive_rt/data/tweets.js'][1]
post_json

{'tweet': {'edit_info': {'initial': {'editTweetIds': ['1749988513519481012'],
    'editableUntil': '2024-01-24T03:52:29.494Z',
    'editsRemaining': '5',
    'isEditEligible': False}},
  'retweeted': False,
  'source': '<a href="https://mobile.twitter.com" rel="nofollow">Twitter Web App</a>',
  'entities': {'hashtags': [],
   'symbols': [],
   'user_mentions': [{'name': 'Steve McIntosh',
     'screen_name': 'CultureDevelops',
     'indices': ['3', '19'],
     'id_str': '1215013239102115840',
     'id': '1215013239102115840'}],
   'urls': []},
  'display_text_range': ['0', '140'],
  'favorite_count': '0',
  'id_str': '1749988513519481012',
  'truncated': False,
  'retweet_count': '0',
  'id': '1749988513519481012',
  'created_at': 'Wed Jan 24 02:52:29 +0000 2024',
  'favorited': False,
  'full_text': 'RT @CultureDevelops: Our developmental perspective is founded on a “New Truth,” which is now becoming increasingly evident:\n\nCulture and co…',
  'lang': 'en'}}

In [14]:
def is_retweet(post):
    """
    Check if the given archive post is a retweet.

    The check is purely textual: a post counts as a retweet when its
    'full_text' starts with 'RT @'. The 'retweeted' flag of the record is
    not consulted (the sample retweet in this notebook carries
    'retweeted': False).

    Args:
    post (dict): An already-parsed archive record, i.e. a dict with a
        'tweet' key whose value is a dict holding the tweet fields
        (including 'full_text'). No JSON parsing happens here.

    Returns:
    bool: True if the post is a retweet, False otherwise. Malformed input
        (not a dict, missing or non-dict 'tweet', missing or non-string
        'full_text') yields False instead of raising.
    """
    # Guard each level explicitly so malformed records give False rather
    # than an AttributeError on `.get` / `.startswith`.
    if not isinstance(post, dict):
        return False

    tweet = post.get('tweet')
    if not isinstance(tweet, dict):
        return False

    full_text = tweet.get('full_text')
    if not isinstance(full_text, str):
        return False

    # Check if the full_text starts with the retweet pattern
    return full_text.startswith('RT @')
    
 

In [15]:
# Sanity check on the record stored in post_json; prints the boolean result.
print(is_retweet(post_json))

True


In [19]:
from datetime import datetime

# Sample timestamp in the same form as the 'created_at' field of the archive records.
created_at_str = "Wed Jan 24 02:52:29 +0000 2024"
# Format: abbreviated weekday, abbreviated month, day, HH:MM:SS, UTC offset, 4-digit year.
# %z makes the parsed datetime timezone-aware.
created_at_dt = datetime.strptime(created_at_str, '%a %b %d %H:%M:%S %z %Y')
# Show the parsed value in its default string form.
str(created_at_dt)

'2024-01-24 02:52:29+00:00'

In [20]:
# Confirm the type of the value returned by strptime.
type(created_at_dt)

datetime.datetime