# Parses Pages into DB

In [None]:
!pip install asyncpg

In [44]:
import psycopg2
from psycopg2 import sql, pool
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT
import psycopg2.extras

import asyncpg

import httpx
import asyncio
import json
from bs4 import BeautifulSoup
import datetime
import time
import os
import re
from dotenv import load_dotenv

load_dotenv()
password = os.getenv("DB_PASSWORD")
postgres_srv = os.getenv("POSTGRES_SRV")
redis_srv = os.getenv("REDIS_SRV")
openai_key = os.getenv("OPENAI_KEY")

In [47]:
# db_params = {
#     "database": "star_wars_data",
#     "user": "postgres",
#     "password": password,
#     "host": postgres_srv,
#     "port": "5432",
# }
# conn_pool = psycopg2.pool.SimpleConnectionPool(1, 1000, **db_params)
# def run_sql(sql_command, data=None):
#     try:
#         conn = conn_pool.getconn()
#         conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
#         cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
#         if data is not None:
#             # print("data is not None")
#             cursor.execute(sql_command, data)
#         else:
#             # print("data is None")
#             cursor.execute(sql_command)

#         if sql_command.strip().upper().startswith('SELECT'):
#             result = cursor.fetchall()
#         else:
#             result = None
#         # print(result)
#         cursor.close()

#         # print("psql completed")
#         return result
#     except Exception as e:
#         print(f"A database error occurred: {e}")
#     finally:
#         conn_pool.putconn(conn)

pool = None

async def get_pool():
    global pool
    if pool is None:
        try:
            pool = await asyncpg.create_pool(
                database="star_wars_data",
                user="postgres",
                password=password,
                host=postgres_srv,
                port="5432",
                min_size=1,
                max_size=100,
            )
        except Exception as e:
            print(f"An error occurred while creating the pool: {e}")
            raise
    return pool

async def run_async_sql(sql_command, *params):
    pool = await get_pool()
    async with pool.acquire() as conn:
        try:
            result = await conn.fetch(sql_command, *params)
            return result
        except Exception as e:
            print(f"A database error occurred: {e}")
            return None



In [8]:

async def call_gpt(message):
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {openai_key}",
    }

    request_body = {
        "prompt": f"""You are a Star Wars fan trying to determine if an array of words \
are Star Wars words or not. You must review the list provided to you and return an array \
of all the words with a parameter indicating if they are Star Wars words or not.

Your only output should be the JSON similar to:
[{{"word": "Anankin", "starwars": "true"}}, {{"word": "apple", "starwars": "false"}}]

The list of words you must review is: {message}
""",
        "model": "gpt-3.5-turbo-instruct",
        "temperature": 0,
        "top_p": 0.95,
        "frequency_penalty": 0,
        "presence_penalty": 0,
        "max_tokens": 2000,
        "stream": False,
    }

    openai_url = "https://api.openai.com/v1/completions"
    async with httpx.AsyncClient() as client:
        response = await client.post(openai_url, json=request_body, headers=headers, timeout=60)
        if response.status_code != 200:
            print(response.status_code, response.reason_phrase)
            return {}
        print(response.json()['usage'])
    return response.json()['choices'][0]['text']


# call_gpt(json.dumps(["Anakin", "apple", "Luke", "Mando", "light", "John"]))

In [4]:
# count_db = run_sql("SELECT COUNT(*) FROM public.""all_data""")[0][0]
# print(count_db)

# batch_size = 100
# all_unique_words = set()
# for i in range(0, count_db, batch_size):
#     data = run_sql(f"SELECT * FROM public.""all_data"" ORDER BY id LIMIT %s OFFSET %s", (batch_size, i))
#     print(f"Batch {i} to {i + batch_size}")
#     print(f"ID: {data[0]['id']} TITLE: {data[0]['title']}")
#     for row in data:
#         bs = BeautifulSoup(row['content'], 'html.parser')
#         body_text = ' '.join(bs.find('main').stripped_strings)
#         words = re.findall(r'\b\w+\b', body_text)
#         all_unique_words.update([words.lower() for words in words])
#         print(len(all_unique_words), 'unique words')

all_unique_words = set()

async def process_batch(min_id, batch_size):
    global all_unique_words
    print(f"processing batch from id {min_id} to {min_id + batch_size}")
    data = await run_async_sql(f"""SELECT * FROM public."all_data" \
WHERE ID >= $1 \
ORDER BY id \
LIMIT $2""", min_id, batch_size)
    print(f"ID: {data[0]['id']} TITLE: {data[0]['title']}")

    for row in data:
        bs = BeautifulSoup(row['content'], 'html.parser')
        body_text = ' '.join(bs.find('main').stripped_strings)
        words = re.findall(r"\b[a-zA-Z0-9'-]+\b", body_text)
        all_unique_words.update([word.lower() for word in words])

    print(len(all_unique_words), 'unique words')
    return all_unique_words

async def main():
    count_db = await run_async_sql("SELECT MAX(id), MIN(id) FROM public.all_data")
    max_id, min_id = count_db[0]
    batch_size = 200
    tasks = []
    for i in range(min_id, max_id, batch_size):
        print(f'loading batch from id {i} to {i + batch_size}')
        tasks.append(process_batch(i, batch_size))

        if len(tasks) == 10:
            await asyncio.gather(*tasks)
            tasks = []

    if tasks:
        await asyncio.gather(*tasks)

await main()
print(len(all_unique_words), 'unique words')


loading batch from id 1227 to 1427
loading batch from id 1427 to 1627
loading batch from id 1627 to 1827
loading batch from id 1827 to 2027
loading batch from id 2027 to 2227
loading batch from id 2227 to 2427
loading batch from id 2427 to 2627
loading batch from id 2627 to 2827
loading batch from id 2827 to 3027
loading batch from id 3027 to 3227
processing batch from id 1227 to 1427
processing batch from id 1427 to 1627
processing batch from id 1627 to 1827
processing batch from id 1827 to 2027
processing batch from id 2027 to 2227
processing batch from id 2227 to 2427
processing batch from id 2427 to 2627
processing batch from id 2627 to 2827
processing batch from id 2827 to 3027
processing batch from id 3027 to 3227
ID: 2627 TITLE: Del Rey
23727 unique words
ID: 3028 TITLE: Antar
32770 unique words
ID: 2427 TITLE: Brentaal
42310 unique words
ID: 2027 TITLE: Transparisteel
51431 unique words
ID: 2227 TITLE: Ord Cestus
55240 unique words
ID: 1227 TITLE: Brianna
72776 unique words
ID:

In [5]:
import json

with open('outputs.json', 'w') as f:
    json.dump(outputs, f)

with open('all_unique_words.json', 'w') as f:
    json.dump(list(all_unique_words), f)

# with open('outputs.json', 'r') as f:
#     outputs = json.load(f)

# with open('all_unique_words.json', 'r') as f:
#     all_unique_words = set(json.load(f))

In [41]:
# ws = [w for w in list(all_unique_words) if len(w) > 15]
# ws = [w for w in list(all_unique_words) if re.match(r'^[\d-]*$', w)]
# ws = [w.replace("'s","") for w in list(all_unique_words) if "'s" in w]
# ws = [w for w in list(all_unique_words) if "'s" in w]

ws = [w for w in list(all_unique_words)]
print(len(ws))
print(ws)

# all_unique_words = all_unique_words - set(ws)
# all_unique_words.update(['bacta-kolto', 'point-defenses', 'devastator-anh', 'llona', 'rippled', 'gymsnor-2', 'lonegran', 'ackbar-slash', 'loof', 'meebler', 'aorspecial-pham', "ottdell's", 'jewels', 'disputem', 'clanker', 'exgal-4', 'r5-d23', 'yeaahhh', 'killingsworth', 'cyndel', 'gizkas', "heskal's", '91a', 'comprized', 'ibadan', 'epping', 'electro-eels', 'bri1-2', 'britarros', 'dissatisifying', "bursthed's", 'european', 'jekporkins-db', 'sarabwintersart', 'force-pulls', 'wolren', 'vapor', 'wingrove', 'hethrir', 'kwi', 'weber', 'thrilk', 'aziz', 'swlegacy17', "liner's", 'whiny', 'furnaces', "adni's", 'que-mars', 'greej', 'sowers', 'kaiyaadrimetrum', 'outcher', 'tts', 'antruse', 'panorama', 'ebonics', 'aptitude', 'schrei', 'colram', 'rori-space', 'pican', 'trunnel', "adolescent's", 'thefeuding', "carreras's", 'presumably', 'trash-covered', 'rump-state', 'energy-focusing', 'rainn', 'stitar', 'tomo', 'mb-12-ee', 'gangsterhappy', 'off-menu', 'rebellion', 'tis-home', "sho's", 'self-isolated', 'aquillia', '47-b', 'zagreb', 'consolidates', 'crouch', 'holo-gyms', 'tching', 'afterburners', 'brawlkers', 'sunpetals', 'vistis', 'cyborged', 'dande', 'davan-stop', 'tk-7834', 'k-ob7', 'fras', 'voxscrambler', 'v-11', 'jus', 'tentac', 'yncha', 'manta-ray', 'dive-ball', 'enlarge', "arok's", 'kuthic', 'jfk', 'warstalker', 'constraints', 'airmen', 'phattro', 'lusaanda', 'gadbois', 'power-intense', 'wrexham', 'negwtvvcover', 'micro-implants', 'lifters', 'taun', 'ninety-four', 'gripof', 'k-grade', 'fuelshipment', 'retrieving', 'oxixo', "engineering's", 'remebered', 'hardcell', 'foothill', 'tk-127', 'swang', 'odar', 'czx-4', 'transluscent', 'schoolchild', 'imperceptibly', 'mephisto', 'kander', '200th', 'tooba', 'admiral-turned', 'tri-wheeler', 'sss', 'jullisa', 'bala-bala', 'human-hair', 'juro', 'sweet-treat', 'moseley', 'labor-force', 'kelleranbeq', 'hauum', 'freher', 'absconded', 'lxr-6', 'p-04', 'ervekk', 'space-dragon', "thos's", 'knifed', 'brice', 'dabbled', 'gr', 'kathlin', 'shiredroids', 'side-board', 'alm1-4', 'outcastben', 'rec', 'calagott', 'manakron', 'truned', 'omnicompatible', 'locations2', "lexi's", 'winter-swf', 'marimba', 'kneesaa', 'kalikori', 'converter', 'berinato', 'tendril-moss', 'nas', 'skel-rat', "hu'un", 'kaidan', 'acquisitor', "zeen's", 'greyish', '2j-3po', 'betsy', 'messert', 'quincunx', 'real-word', "tymo's", 'jarequanil', 'cuando', 'sarreti', 'cadmir', 'tk-290', 'luitt', 'callum', 'fac5-6', 'travelto', 'downwind', 'cannister', 'bonderium', 'schlanski', 'light-minutes', "rsf's", 'siltcrawlers', 'sondgroth', 'incineration', 'violate', 'gray-tan', 'groundcrawler', 'anddesperate', 'free-lanced', 'jedi-general', 'loveof', 'iasa', 'pale-gray', "sandbag's", 'darkswords', 'aarocket', 'dory', 'hawk-class', 'kayven', 'prule', 'fd3', '0035-791x', 'kisheb', "noctyss's", 'bottom-front', 'pontite', 'swelling', 'grantgoboom', 'gigorans', 'blau', 'tukalda', 'collisioncourse', 'daini', 'towersey', 'movable', 'scoundreless', "bight's", 'b2-battle', 'tal-yannar', 'koros', 'imperialforces', 'kadril', 'megaliter', 'abeksstation', "kli'aari", 'saskapeg', 'fanry1', 'microgarments', 'wars-y', 'hookworms', 'shipwrights', 'kerred', 'nuh', 'archaeo-prime', "adalric's", "adi's", 'kuthara', 'xiz5-6', 'battlesimulator', 'uniform', 'woodhouse', 'waar', 'segal', 'winera', 'bidepal', 'bodya', 'pictural', "clarr'upi'ovmos", "lazlik's", 'ankkishor-xim', 'vukurah', 'vorskl', 'iwasawa', 'snuk', 'surpluses', 'lurr', 'vdsiay', 'leapfar', 'tropas', 'jerba-skin', 'ultra-sonic', 'btmf42', 'edvar', 'liuv-bf2', 'sameness', 'chastina', '169th', 'dataworker', 'lotarenct', 'herald-class', 'mellcrawler', 'd-7s1', 'egresses', 'basdor', 'caspar', 'budget-minded', 'ovat', "zephee's", "vado's", 'yotuu', 'flagston', 'cohden', 'sleep-cocoons', 'thermoweave', 'pherins', 'sheeli', 'inaugurating', 'thrantas', 'yardle', "ulf's", 'organising', "sardu's", 'trolleys', 'sheldrew', 'religions', 'untergang', 'rosters', 'axing', 'open-source', "breeder's", 'matsumoto', 'ma-35', 'hernon-class', 'rrhr', "sansia's", 'burn-treatment', 'encyclical', 'nishimoto', 'isb', 'odera', 'plitex', 'squirm', 'xlv', 'deletedscene', 'herselfto', 'magnett', 'yeti', 'readimeal', 'lyonel', 'muck-lakes', 'weak-points', 'electro-baffler', 'chittering', 'bonded', 'ocean-side', "herd's", "fellow's", 'skyhawks', 'lord-in-waiting', 'brega', 'skocha', 'fangers', 'bridget', 'colonizers', 'curiel', "murr's", 'seance', 'churruma', 'leesa', 'jedidiahstrike', 'spikers', "navicomputer's", 'laurie', 'rc-3423', 'haetsal', 'crossrail', 'reunify', 'onimo', 'snarlap', 'djovi', "r-55's", 'frangelic', 'orderhas', 'holoposters', 'div2', 'spindle', 'pending', 'one-man', 'jenndevaad1', 'wdtvpress', "rading's", "gio's", 'jixuan', 'antique', 'done-in-one', 'awings', 'astrogating', "jelcho's", 'cani', 'lois', 'swd18', 'aaaaaaargh', 'mclassinflight', 'keicho', "redath-gom's", 'velken', 'rescuevictory', 'ep1-locations', 'anati', 'epoxatal', 'over-turned', 'ion-prod', 'lumineux', 'thallium', 'f-23', 'rii', 'inflatable', 'crustacean', 'orcel', 'aj10', 'worldview', 'zierfub', 'bombo', "who've", 'b135', 'talir', "neek's", "thrass's", 'expansionism', 'rope-making', 'plagiarized', 'powerlock', 'early-mid', 'hahahaha', 'arboretums', 'cloudflowers', 'stassia', 'sires', 'eyebrow', 'companions', 'endoskeleton', 'b-14', 'shadowsmini4', 'shabell', 'caracters', 'ex-fighter', 'limitless', "recruiter's", 'up-chuck', 'landspeeders', 'thrillers', 'werba', 'knife-dance', 'algar', 'jian', 'changed', 'betshishi', 'alzahi', 'roadbump', 'aered', 'prye', 'dookuandmoralo', 'war-chief', 'alpha-four', "in's", "tm-171's", 'dwelt', 'jianxu', 'caferaduli-tlj', 'assumin', 'thrust-boosters', 'shetler', 'murttoc-yine', 'sikorsky', 'maxsec', 'grandmother', "foot's", 'sk-z38-close', 'rampart-class', 'slow', 'balka', 'first-contact', 'zahn-questions', 'tinoktin', 'sharp-cornered', 'holo-targeting', 'readerlink', 'jaksivrak-aabr', 'tradeco', 'piekna', 'grey', 'petrovia', 'comm-net', 'upoloaded', 'bloodscars', 'vos-centric', 'nrc-077', 'entrail', 'demobilize', 'canopus', 'floatcar', 'rayshields', 'palp-frieze', 'xaisnow', "turnik'otta", 'phantrong', 'rebeccasoler', 'splint', 'mchecc', 'rb-1', 'sharma', 'braz', 'lien-tsai', 'well-developed', 'rekks', 'verisimilitude', 'seasonofrevolt', 'text-doc', 'script-accurate', 'charqacters', "m3-ob's", 'spokesman', 'echuu-shen', 'para-sailing', 'idk', 'sureness', 'syward', 'shrouded', 'caracterized', "aarp's", 'orentiis', 'lbk', 'dag12', 'nastyboy', 'mission400', 'humbabaweg', 'filli', 'forces', 'vxl', 'c5h4', 'onacanda', 'involed', 'polta', 'devoid', 'earthshaking', 'sub-foundations', 'gurk', 'shyon', "pantora's", 'drakesurrounded', 'oret', 'hopeto', 'unrelentingly', 'goen', 'oquan', 'brain-barrel', 'thebigevent', 'prolongthe', 'plainlink', 'sequentially', 'organises', 'splay-toed', 'smooches', 'nagnol', 'tawas', 'wreacked', "bacon's", 'bloodtype', 'paxisylo', 'ujek', 'anti-treaty', 'synonym', 'erodes', 'prison-like', 'zakthok', 'genteel', "threedee's", 'jfos2', 'fleetjust', "film's", 'computer', 'royalpalaceraid', 'ken5', "drassk's", "damutek's", 'courteous', 'cryo', 'thibs', 'argul', "shaunessy's", 'sylvarcathar', 'bootolubble', '4814758ca', "svorno's", 'forcewindu', 'dimma', 'pleni', 'outspokenness', 'slumber', 'osborne', 'caple', 'stonebone', 'nhis', 'sand-encrusted', 's60', 'havinavonreg', 'dafi', 'tasteless', 'malgan', 'streicher', "nana's", 'willrow', 'crosman', 'shepperton', 'delphidians', 'fal', 'gorryl', "blevin's", 'jeditreasure', 'mcnichol', 'bustcollection8', 'yaag', 'okyaab', 'paley', 'nestle-comics', 'cru1-4', 'accarrgm', 'susaut', 'fattremayne', 'tekar', 'misdeeds', 'explanations', 'class-pet', 'shodu-wgf', 'worldcarnelion', 'recon-a', 'rowr', 'executorxwing', '13-k', 'ar-72', 'lirin', 'qwisp', 'insulation', 'republicana', 'snorts', 'huycks', 'jedidarksidetpb', 'still-loyal', 'legacyii', 'korvella', 'cun', 'hexa-crystals', 'hepta-mag', 'edru', 'ships--most', 'architectually', 'newly-staffed', 'istcav', 'hyperspace', 'houston-hoth', 'treftite', 'acklay', 'hypochondriac', 'lfi', 'demicot', 'chiu', "dk-9's", 'granola', 'till-ee', 'non-ithorian', 'dead-reckoning', "azard's", 'drummers', 'friedrich', 'hist', 'importers', 'asw4', 'unsuspectingly', 'upgrad', "ghymnon's", 'di', 'kreesis', 'epicly', 'ques', 'force-carrying', 'toride', 'fair', 'cartariun', 'stumble', 'hemm', "kind's", "croom's", 'mushk', '90-year-old', 'll', 'exactlywho', 'loki', 'gharsa', 'yu-yan', 'ryndellia', "ludi's", 'pdfs', 'ingvild', 'drinker', 'vinecrawlers', 'payable', 'crazyman300', 'palmyra', 'physiologist', 'rialla', "whett's", 's1ep2', 'ingruber', 'permanence', 'coming-of-age', 'ashfordly', 'algarian', 'circulating', 'tablink', 'matric', "tc-72's", 'bra-a-a-ats', 'chedd', "solusar's", 'sepma', 'zipo', 'afghanistan', 're-power', '02bonus', 'granule', 'three-world', "baton's", 're-certified', 'sithapr', 'inducer-droids', 'sw52coverfinal', 'qlint', '326-kk', 'ferret-class', 'nihilusatris2', 'unbroadcasted', 'taug', 'kamaboko', "tu'lus", 'sixty-five', "kinyov's", 'elee', 'nacronis', 'yahn', 'geometric', "char'shen", 'cor11-12', 'lunacca-tsotw', "hk-01's", 'mililiter', "kegan's", 'takes', 'galerha', 'rumpled', 'byhdee', "thar'lyat", 'bd20', 'meer-rat', 'f-f-for', 'manux', 'diehard1', 'vintollo', 'swinsider159iii', 'shoehorns', 'detracted', 'lower', 'thomsen', 'plure', 'awing-tswa', 'ordomire', "tigo's", 'seanan', 'rots32', 'frivolous', 'zhurst', 'munificents', 'forges', 'cream-filled', 'boo', 'speakeasy', 'wbh5parel', 'starwars1977-27', 'bellew', 'fodro', "zgorth'sth", "jerserra's", 'geelanalen', 'self-alteration', 'grinjer', 'st-o5', 'maligning', 'bruner', 'panatic', 'kracken', 'grackelcat'])
# all_unique_words.update(ws)

outputs = []

273507


In [6]:
output_words = []
for o in outputs:
  output_words.extend(json.loads(o))
output_words = [word['word'] for word in output_words]

missing_words = [word for word in all_unique_words if word not in output_words]
print(len(outputs))
print(len(output_words), output_words[:10])
print(len(all_unique_words))
print(len(missing_words))

310
30998 ['gratuitous', 'raxusduel', '', 'lonchan', 'base-bound', 'cere-ious', 'thepowerful', 'jasmine', 'hierarchic', 'crossbreed']
273507
242509


In [15]:
import asyncio

async def main():
    global outputs
    gpt_batch = 100
    cnt = 0
    tasks = []

    print('finding missing words')
    output_words = []
    for o in outputs:
      output_words.extend(json.loads(o))
    output_words = [word['word'] for word in output_words]
    missing_words = [word for word in all_unique_words if word not in output_words]
    total = len(list(missing_words))
    print(f"processing {total} words from {len(all_unique_words)}")
    while cnt < total:
        print(f"getting batch {cnt+gpt_batch}")
        tasks.append(call_gpt(missing_words[cnt:cnt+gpt_batch]))
        cnt += gpt_batch

        if len(tasks) == 10:
          outputs.extend(await asyncio.gather(*tasks))
          tasks = []

    outputs.extend(await asyncio.gather(*tasks))

# Create a task for the main coroutine and await it to run it to completion
await main()
print(f'total outputs: {len(outputs)}')

finding missing words


TypeError: the JSON object must be str, bytes or bytearray, not dict

In [29]:

for i in range(0, len(outputs), 1):
  o = outputs[i]
  try:
    json.loads(o)
  except Exception as e:
    print(e)
    print(i, o)



In [48]:
words = []
for o in outputs:
    words.extend(json.loads(o))

display(f"{len(outputs)} responses with {len(words)} total words")
display(f"{len([o for o in words if o['starwars'] == 'true'])} star wars words")
display(f"{len([o for o in words if o['starwars'] == 'false'])} non star wars words")

display([o for o in words if o['starwars'] == 'true'])

async def inserts():
    tasks = []
    for i in range(0, len(words),1):
        o = words[i]
        tasks.append(run_async_sql("INSERT INTO public.words (word, word_length, is_star_wars, first_letter) VALUES ($1, $2, $3, $4)", o['word'], len(o['word']), o['starwars'] == 'true', o['word'][:1]))
        if len(tasks) == 10:
            print(f'writing {i}')
            await asyncio.gather(*tasks)
            tasks = []

    await asyncio.gather(*tasks)


await inserts()

'2732 responses with 273127 total words'

'16955 star wars words'

'256172 non star wars words'

[{'word': 'raxusduel', 'starwars': 'true'},
 {'word': 'fo-4', 'starwars': 'true'},
 {'word': 'mpo-1400', 'starwars': 'true'},
 {'word': 'win-6', 'starwars': 'true'},
 {'word': 'stormtroooper', 'starwars': 'true'},
 {'word': 'prosecutor-rc', 'starwars': 'true'},
 {'word': 's-4qd', 'starwars': 'true'},
 {'word': 'kl-2b', 'starwars': 'true'},
 {'word': "tra'tade", 'starwars': 'true'},
 {'word': "kith'araquia", 'starwars': 'true'},
 {'word': 'an-21', 'starwars': 'true'},
 {'word': 'b-7h9', 'starwars': 'true'},
 {'word': 'devastator-anh', 'starwars': 'true'},
 {'word': 'gymsnor-2', 'starwars': 'true'},
 {'word': 'clanker', 'starwars': 'true'},
 {'word': 'exgal-4', 'starwars': 'true'},
 {'word': 'r5-d23', 'starwars': 'true'},
 {'word': 'gizkas', 'starwars': 'true'},
 {'word': 'electro-eels', 'starwars': 'true'},
 {'word': 'bri1-2', 'starwars': 'true'},
 {'word': 'britarros', 'starwars': 'true'},
 {'word': 'jekporkins-db', 'starwars': 'true'},
 {'word': 'force-pulls', 'starwars': 'true'},
 {'

writing 9
writing 19
writing 29
writing 39
writing 49
writing 59
writing 69
writing 79
writing 89
writing 99
writing 109
writing 119
writing 129
writing 139
writing 149
writing 159
writing 169
writing 179
writing 189
writing 199
writing 209
writing 219
writing 229
writing 239
writing 249
writing 259
writing 269
writing 279
writing 289
writing 299
writing 309
writing 319
writing 329
writing 339
writing 349
writing 359
writing 369
writing 379
writing 389
writing 399
writing 409
writing 419
writing 429
writing 439
writing 449
writing 459
writing 469
writing 479
writing 489
writing 499
writing 509
writing 519
writing 529
writing 539
writing 549
writing 559
writing 569
writing 579
writing 589
writing 599
writing 609
writing 619
writing 629
writing 639
writing 649
writing 659
writing 669
writing 679
writing 689
writing 699
writing 709
writing 719
writing 729
writing 739
writing 749
writing 759
writing 769
writing 779
writing 789
writing 799
writing 809
writing 819
writing 829
writing 839
wri

In [35]:
anakin = data[1]
print(anakin['title'], anakin['url'])

Anakin Skywalker https://starwars.fandom.com/wiki/Anakin_Skywalker


In [36]:
bs = BeautifulSoup(anakin['content'], 'html.parser')


In [41]:
appearances_html = bs.find(id="Appearances").find_next('table')

appearances = appearances_html.find_all('li')
for row in appearances[:10]:
    print(row.text)

The High Republic (2023) 7 (Variant cover only)
Star Wars: Force Arena
Star Wars: Galactic Defense
Mace Windu 3 (Variant cover only)
Master & Apprentice
Master & Apprentice audiobook
Darth Maul – Black, White & Red 1 (Mentioned only; in the opening crawl)
Phantom Menace 25th Anniversary Special 1
Queen's Peril
Queen's Peril audiobook


In [None]:
body_text = bs.find('main').text
words = re.findall(r'\b\w+\b', body_text)
# print(len(words), words[:10])
unique_words = set(words)
print(len(unique_words), 'unique words')


batch = 50
cnt = 0
outputs = []
while cnt < len(unique_words):
    output = call_gpt(list(unique_words)[cnt:cnt+batch])
    cnt += batch
    print(f'retrieved {len(json.loads(output))}')
    outputs.extend(json.loads(output))

print(f'total outputs: {len(outputs)}')

In [52]:
# put all words that are 5 letters in length in a new array
five_letter_words = [word for word in unique_words if len(word) == 5]
print(len(five_letter_words), 'five letter words')

1545 five letter words
