In [20]:
import pandas as pd
import os
import json
import datetime as dt
from dateutil import parser 
import re
from concurrent.futures import ThreadPoolExecutor
from utilities.preprocessors import normalize_and_clean, extract_keys_values, clean_and_split_data

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
# Folder holding the raw datasets, relative to the notebook's working directory.
input_dir = './SisyphusAI input'
# os.listdir() returns entries in arbitrary order; sort them so the listing
# (and everything derived from it) is the same on every run and platform.
files = sorted(os.listdir(input_dir))
files

['Dataset#5642tweets-sample.txt',
 'Dataset#5643dataset_twitter-scraper_2024-11-02_16-42-39-127.json']

In [22]:
def read_json_files(input_dir: str, files: list[str]) -> dict:
    """Load several JSON files concurrently.

    Args:
        input_dir: Directory that contains the files.
        files: File names (relative to ``input_dir``) to load.

    Returns:
        A dict mapping each file name, with a trailing ``.json`` extension
        removed, to the parsed JSON content of that file. An empty ``files``
        list yields an empty dict.

    Raises:
        OSError: If a file cannot be opened.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    suffix = '.json'

    def helper(file_name):
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # default encoding, which fails on emoji / non-ASCII text on Windows.
        path = os.path.join(input_dir, file_name)
        with open(path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        # Strip only the trailing extension; str.replace() would also remove
        # any '.json' occurring in the middle of the name.
        if file_name.endswith(suffix):
            new_name = file_name[:-len(suffix)]
        else:
            new_name = file_name
        return new_name, data

    # concurrently read and load all .json files
    with ThreadPoolExecutor() as exe:
        jsons = dict(exe.map(helper, files))
    return jsons

In [23]:
# Pick out the JSON datasets from the directory listing and load them together.
json_files = list(filter(lambda entry: entry.endswith('.json'), files))
jsons = read_json_files(input_dir, json_files)
jsons

{'Dataset#5643dataset_twitter-scraper_2024-11-02_16-42-39-127': [{'id': 0,
   'location': '',
   'conversation_id_str': '1424036915980943368',
   'created_at': 'Sat Aug 07 15:57:12 +0000 2021',
   'display_text_range': [0, 32],
   'entities': {'user_mentions': [],
    'urls': [],
    'hashtags': [],
    'symbols': [],
    'media': []},
   'favorite_count': 1148,
   'favorited': False,
   'full_text': 'One must imagine Sisyphus happy.',
   'id_str': '1424036915980943368',
   'lang': 'en',
   'permalink': '/0xSisyphus/status/1424036915980943368',
   'possibly_sensitive': False,
   'quote_count': 25,
   'reply_count': 218,
   'retweet_count': 131,
   'retweeted': False,
   'text': 'One must imagine Sisyphus happy.',
   'user': {'blocking': False,
    'created_at': 'Wed Mar 03 01:58:19 +0000 2021',
    'default_profile': True,
    'default_profile_image': False,
    'description': 'Roll boulder up hill, it rolls back down.',
    'entities': {'description': {'urls': []}, 'url': {}},
    'fa

In [24]:
# Count the records loaded for the scraped dataset.
len(jsons['Dataset#5643dataset_twitter-scraper_2024-11-02_16-42-39-127'])

100

In [25]:
# Re-key every dataset that is still a list of records by position:
# [rec0, rec1, ...] -> {0: rec0, 1: rec1, ...}.
# The isinstance guard makes the cell safe to re-run: zipping over an already
# converted dict would iterate its keys and replace each record with its index.
jsons = {
    name: dict(enumerate(records)) if isinstance(records, list) else records
    for name, records in jsons.items()
}

In [26]:
# Build (dataset_name, extracted strings) pairs for every loaded dataset.
# The iteration variable must not be named `json`: at cell scope that rebinds
# the imported `json` module, so a later call to read_json_files() would fail
# on `json.load`. A comprehension also keeps the variable from leaking.
outputs = [
    (dataset_name, extract_keys_values(records))
    for dataset_name, records in jsons.items()
]

In [27]:
# Display the (dataset_name, extracted strings) pairs collected in `outputs`.
outputs

[('Dataset#5643dataset_twitter-scraper_2024-11-02_16-42-39-127',
  ['id is 0',
   'location is ',
   'conversation_id_str is 1424036915980943368',
   'created_at is Sat Aug 07 15:57:12 +0000 2021',
   'display_text_range is 0',
   'display_text_range is 32',
   'favorite_count is 1148',
   'favorited is False',
   'full_text is One must imagine Sisyphus happy.',
   'id_str is 1424036915980943368',
   'lang is en',
   'permalink is /0xSisyphus/status/1424036915980943368',
   'possibly_sensitive is False',
   'quote_count is 25',
   'reply_count is 218',
   'retweet_count is 131',
   'retweeted is False',
   'text is One must imagine Sisyphus happy.',
   'blocking is False',
   'created_at is Wed Mar 03 01:58:19 +0000 2021',
   'default_profile is True',
   'default_profile_image is False',
   'description is Roll boulder up hill, it rolls back down.',
   'fast_followers_count is 0',
   'favourites_count is 93910',
   'follow_request_sent is False',
   'followed_by is False',
   'followe

In [29]:
# Clean each extracted dataset and hand it to clean_and_split_data,
# targeting the shared output folder.
output_dir = './SisyphusAI output'
for dataset_name, sentences in outputs:
    clean_and_split_data(
        dataset_name,
        sentences,
        output_dir=output_dir,
        cleaner=normalize_and_clean,
    )

id is 0
location is
conversation id str is
created at is sat aug 07 15:57:12 0000 2021
display text range is 0
display text range is 32
favorite count is 1148
favorited is false
full text is one must imagine sisyphus happy.
id str is
lang is en
permalink is 0xsisyphus status
possibly sensitive is false
quote count is 25
reply count is 218
retweet count is 131
retweeted is false
text is one must imagine sisyphus happy.
blocking is false
created at is wed mar 03 01:58:19 0000 2021
default profile is true
default profile image is false
description is roll boulder up hill, it rolls back down.
fast followers count is 0
favourites count is 93910
follow request sent is false
followed by is false
followers count is 136966
following is false
friends count is 400
has custom timelines is false
id is 0
id str is
is translator is false
listed count is 3012
location is
media count is 1502
name is sisyphus
normal followers count is 136966
notifications is false
profile banner url is profile banners
p

In [31]:
def read_txt_files(input_dir: str, files: list[str], encoding: str = 'utf-8') -> dict:
    """Read several text files concurrently, line by line.

    Args:
        input_dir: Directory that contains the files.
        files: File names (relative to ``input_dir``) to read.
        encoding: Text encoding used to decode the files. Defaults to UTF-8 so
            the result does not depend on the platform's default encoding.

    Returns:
        A dict mapping each file name, with a trailing ``.txt`` extension
        removed, to the list of its lines (line endings preserved, as returned
        by ``readlines()``). An empty ``files`` list yields an empty dict.

    Raises:
        OSError: If a file cannot be opened.
        UnicodeDecodeError: If a file is not valid in ``encoding``.
    """
    suffix = '.txt'

    def helper(file_name):
        path = os.path.join(input_dir, file_name)
        # The context manager closes the file; no explicit close() is needed.
        with open(path, 'r', encoding=encoding) as file:
            data = file.readlines()

        # Strip only the trailing extension; str.replace() would also remove
        # any '.txt' occurring in the middle of the name.
        if file_name.endswith(suffix):
            new_name = file_name[:-len(suffix)]
        else:
            new_name = file_name
        return new_name, data

    # concurrently read all .txt files
    with ThreadPoolExecutor() as exe:
        txts = dict(exe.map(helper, files))
    return txts

In [32]:
# Pick out the plain-text datasets from the directory listing and read them.
txt_files = list(filter(lambda entry: entry.endswith('.txt'), files))
txts = read_txt_files(input_dir, txt_files)
txts

{'Dataset#5642tweets-sample': ['One must imagine Sisyphus happy\n',
  'Do crackheads ever say Im too broke to smoke crack tonight  No They get up and make it happen  Lesson in there\n',
  'Ive fumbled generational wealth like 6 times this week\n',
  'The most evergreen meme will always be the founders pet  If Vitalik had a pet the resultant coin would likely be in the top 20 We will simply have to get Rajs one there instead\n',
  'some levels im watching on the 450year chart for stocks  think pretty strong support at 1930s levels if we break there back to 1812\n',
  'SBF mentions a video game in the NYT piece called Storybook Brawl   It turns out FTX actually owns this game he used the article as a promoted ad basically\n',
  'The Bahamian Supreme Court is now one of the largest holders of Ethereum\n',
  'The key to cryptocurrency investing is to be the guy on the right and sell for real world assets at appropriate times\n',
  'FTX not disabling trading remaining users are trading wild

In [33]:
# Run the same clean-and-split step over the plain-text datasets.
output_dir = './SisyphusAI output'
for dataset_name, raw_lines in txts.items():
    clean_and_split_data(
        dataset_name,
        raw_lines,
        output_dir=output_dir,
        cleaner=normalize_and_clean,
    )

one must imagine sisyphus happy
do crackheads ever say im too broke to smoke crack tonight no they get up and make it happen lesson in there
ive fumbled generational wealth like 6 times this week
the most evergreen meme will always be the founders pet if vitalik had a pet the resultant coin would likely be in the top 20 we will simply have to get rajs one there instead
some levels im watching on the 450year chart for stocks think pretty strong support at 1930s levels if we break there back to 1812
sbf mentions a video game in the nyt piece called storybook brawl it turns out ftx actually owns this game he used the article as a promoted ad basically
the bahamian supreme court is now one of the largest holders of ethereum
the key to cryptocurrency investing is to be the guy on the right and sell for real world assets at appropriate times
ftx not disabling trading remaining users are trading wildly with monopoly money and it affects prices on all other exchanges
memecoins are making peopl

In [1]:
from utilities.utilities import rename_all 

In [2]:
from pathlib import Path

# Point rename_all at the same relative output folder the rest of the notebook
# writes to, instead of a hard-coded path inside one user's home directory.
# resolve() makes it absolute and as_posix() keeps forward slashes, so the
# argument keeps the same form as the previous literal.
# NOTE(review): assumes the notebook runs from the project root, as the other
# relative './SisyphusAI ...' paths already do.
rename_all(Path('./SisyphusAI output').resolve().as_posix())