In [7]:
# imports
import mwclient  # for downloading example Wikipedia articles
import mwparserfromhell  # for splitting Wikipedia articles into sections
import openai  # for generating embeddings
import os  # for environment variables
import pandas as pd  # for DataFrames to store article sections and embeddings
import re  # for cutting <ref> links out of Wikipedia articles
import tiktoken  # for counting tokens
from dotenv import dotenv_values

  and should_run_async(code)


In [8]:
# pip install tiktoken

  and should_run_async(code)


In [9]:
api_key = dotenv_values("api_data.env")
openai.api_key = api_key['OPEN_AI_KEY']

  and should_run_async(code)


KeyError: 'OPEN_AI_KEY'

In [70]:
def titles_from_category(
    category: mwclient.listing.Category, max_depth: int
) -> set():
    """Return a set of page titles in a given Wiki category and its subcategories."""
    titles = set()
    for cm in category.members():
        if type(cm) == mwclient.page.Page:
            # ^type() used instead of isinstance() to catch match w/ no inheritance
            titles.add(cm.name)
        elif isinstance(cm, mwclient.listing.Category) and max_depth > 0:
            deeper_titles = titles_from_category(cm, max_depth=max_depth - 1)
            titles.update(deeper_titles)
    return titles

  and should_run_async(code)


In [71]:
CATEGORY_TITLE = "Category:Financial risk"
WIKI_SITE = "en.wikipedia.org"
# https://en.wikipedia.org/wiki/Category:Finance

site = mwclient.Site(WIKI_SITE)
category_page = site.pages[CATEGORY_TITLE]
titles = titles_from_category(category_page, max_depth=1)
# ^note: max_depth=1 means we go one level deep in the category tree
#print(f"Found {len(titles)} article titles in {CATEGORY_TITLE}.")

  and should_run_async(code)


In [73]:
print(f"Found {len(titles)} article titles in {CATEGORY_TITLE}.")

Found 334 article titles in Category:Financial risk.


  and should_run_async(code)


In [77]:
titles

  and should_run_async(code)


{'1991 Indian economic crisis',
 '2007–2008 financial crisis',
 '80:125 rule',
 'Absorbing barrier (finance)',
 'Acceptance set',
 'Actuarial science',
 'Additional insured',
 'Advanced IRB',
 'Adverse selection',
 'Alternative risk transfer',
 'Altman Z-score',
 'Annualized loss expectancy',
 'Asset and liability management',
 'Asset-backed commercial paper program',
 'Automotive warranty',
 'Bank run',
 'Banking book',
 'Barnewall Two-way Model',
 'Basel III',
 'Basis risk',
 'Beneish M-score',
 'Betavexity',
 'Bielard, Biehl and Kaiser five-way model',
 'Bond credit rating',
 'Bonus–malus',
 'CAMELS rating system',
 'CS01',
 'Cancellation (insurance)',
 'Capital asset pricing model',
 'Capital requirement',
 'Cascades in financial networks',
 'Cascading failure',
 'Cash flow at risk',
 'Cash flow hedge',
 'Cash value',
 'Cashflow matching',
 'Cell captive',
 'Central Insurance of Iran',
 'Central counterparty clearing',
 'Chan–Karolyi–Longstaff–Sanders process',
 'Climate risk',
 'C

In [81]:
titles1

  and should_run_async(code)


{'1991 Indian economic crisis'}

In [98]:
SECTIONS_TO_IGNORE = [
    "See also",
    "References",
    "External links",
    "Further reading",
    "Footnotes",
    "Bibliography",
    "Sources",
    "Citations",
    "Literature",
    "Footnotes",
    "Notes and references",
    "Photo gallery",
    "Works cited",
    "Photos",
    "Gallery",
    "Notes",
    "References and sources",
    "References and notes",
]

  and should_run_async(code)


In [113]:
def all_subsections_from_section(
    section: mwparserfromhell.wikicode.Wikicode,
    parent_titles: list,
    sections_to_ignore: set,
) -> list:
    """
    From a Wikipedia section, return a flattened list of all nested subsections.
    Each subsection is a tuple, where:
        - the first element is a list of parent subtitles, starting with the page title
        - the second element is the text of the subsection (but not any children)
    """
    headings = [str(h) for h in section.filter_headings()]
    title = headings[0]
    if title.strip("=" + " ") in sections_to_ignore:
        # ^wiki headings are wrapped like "== Heading =="
        return []
    titles = parent_titles + [title]
    full_text = str(section)
    section_text = full_text.split(title)[1]
    if len(headings) == 1:
        return [(titles, section_text)]
    else:
        first_subtitle = headings[1]
        section_text = section_text.split(first_subtitle)[0]
        results = [(titles, section_text)]
        for subsection in section.get_sections(levels=[len(titles) + 1]):
            results.extend(all_subsections_from_section(subsection, titles, sections_to_ignore))
        return results


def all_subsections_from_title(
    title: str,
    sections_to_ignore: set = SECTIONS_TO_IGNORE,
    site_name: str = WIKI_SITE,
) -> list:
    """From a Wikipedia page title, return a flattened list of all nested subsections.
    Each subsection is a tuple, where:
        - the first element is a list of parent subtitles, starting with the page title
        - the second element is the text of the subsection (but not any children)
    """
    site = mwclient.Site(site_name)
    page = site.pages[title]
    text = page.text()
    print("text: ",text)
    parsed_text = mwparserfromhell.parse(text)
    headings = [str(h) for h in parsed_text.filter_headings()]
    if headings:
        summary_text = str(parsed_text).split(headings[0])[0]
    else:
        summary_text = str(parsed_text)
    results = [([title], summary_text)]
#     for subsection in parsed_text.get_sections(levels=[2]):
#         results.extend(all_subsections_from_section(subsection, [title], sections_to_ignore))
    return results

  and should_run_async(code)


In [114]:
# split pages into sections
# may take ~1 minute per 100 articles
wikipedia_sections = []
for title in titles1:
    wikipedia_sections.extend(all_subsections_from_title(title))
print(f"Found {len(wikipedia_sections)} sections in {len(titles)} pages.")

  and should_run_async(code)


text:  {{Short description|Indian economic crisis of 1991}}
{{Use dmy dates|date=May 2020}}
{{Use Indian English|date=July 2015}}
The '''1991 Indian economic crisis''' was an economic crisis in India resulting from a [[balance of payments]] deficit due to excess reliance on imports and other external factors.<ref>{{cite web | url=https://www.imf.org/external/pubs/ft/staffp/2002/03/pdf/cerra.pdf | title=What Caused the 1991 Currency Crisis in India? | publisher=International Monetary Fund | work=VALERIE CERRA and SWETA CHAMAN SAXENA | access-date=7 May 2023}}</ref> India's economic problems started worsening in 1985 as imports swelled, leaving the country in a [[twin deficit]]: the Indian trade balance was in deficit at a time when the government was running on a huge fiscal deficit (although the twin-deficit hypothesis is disputed).<ref name="originalreport">{{cite web |title=India - Structural Adjustment Credit Project (English) - Presidents report |url=http://documents.worldbank.org/

In [103]:
wikipedia_sections

  and should_run_async(code)


[(['1991 Indian economic crisis'],
  '{{Short description|Indian economic crisis of 1991}}\n{{Use dmy dates|date=May 2020}}\n{{Use Indian English|date=July 2015}}\nThe \'\'\'1991 Indian economic crisis\'\'\' was an economic crisis in India resulting from a [[balance of payments]] deficit due to excess reliance on imports and other external factors.<ref>{{cite web | url=https://www.imf.org/external/pubs/ft/staffp/2002/03/pdf/cerra.pdf | title=What Caused the 1991 Currency Crisis in India? | publisher=International Monetary Fund | work=VALERIE CERRA and SWETA CHAMAN SAXENA | access-date=7 May 2023}}</ref> India\'s economic problems started worsening in 1985 as imports swelled, leaving the country in a [[twin deficit]]: the Indian trade balance was in deficit at a time when the government was running on a huge fiscal deficit (although the twin-deficit hypothesis is disputed).<ref name="originalreport">{{cite web |title=India - Structural Adjustment Credit Project (English) - Presidents re

In [106]:
# clean text
def clean_section(section: tuple) -> tuple:
    """
    Return a cleaned up section with:
        - <ref>xyz</ref> patterns removed
        - leading/trailing whitespace removed
    """
    titles, text = section
    text = re.sub(r"<ref.*?</ref>", "", text)
    text = text.strip()
    return (titles, text)


wikipedia_sections = [clean_section(ws) for ws in wikipedia_sections]

# filter out short/blank sections
def keep_section(section: tuple) -> bool:
    """Return True if the section should be kept, False otherwise."""
    titles, text = section
    if len(text) < 16:
        return False
    else:
        return True


original_num_sections = len(wikipedia_sections)
wikipedia_sections = [ws for ws in wikipedia_sections if keep_section(ws)]
print(f"Filtered out {original_num_sections-len(wikipedia_sections)} sections, leaving {len(wikipedia_sections)} sections.")

Filtered out 1 sections, leaving 6 sections.


  and should_run_async(code)


In [107]:
# print example data
for ws in wikipedia_sections[:5]:
    print(ws[0])
    display(ws[1][:77] + "...")
    print()

['1991 Indian economic crisis']


  and should_run_async(code)


'{{Short description|Indian economic crisis of 1991}}\n{{Use dmy dates|date=May...'


['1991 Indian economic crisis', '== Causes and conscious ==', '=== Context ===']


'During the 1970s, the International Monetary Fund (IMF) began to increasingly...'


['1991 Indian economic crisis', '== Recovery ==']


"{{Further|Economic liberalisation in India}}\nWith India's [[foreign exchange ..."


['1991 Indian economic crisis', '== Recovery ==', '=== Under the Narasimha Rao government===']


'[[P. V. Narasimha Rao]] took over as [[Prime Minister of India|Prime Minister...'


['1991 Indian economic crisis', '== Aftermath ==']


'{{See also|Economic liberalisation in India#Criticisms}}\nSince the implementa...'




In [110]:
wikipedia_sections

  and should_run_async(code)


[(['1991 Indian economic crisis'],
  "{{Short description|Indian economic crisis of 1991}}\n{{Use dmy dates|date=May 2020}}\n{{Use Indian English|date=July 2015}}\nThe '''1991 Indian economic crisis''' was an economic crisis in India resulting from a [[balance of payments]] deficit due to excess reliance on imports and other external factors. India's economic problems started worsening in 1985 as imports swelled, leaving the country in a [[twin deficit]]: the Indian trade balance was in deficit at a time when the government was running on a huge fiscal deficit (although the twin-deficit hypothesis is disputed).\n\nThe fall of the [[Eastern Bloc]], which had trade relations with India and allowed for rupee exchange, posed significant issues. Towards the end of 1990, leading up to the Gulf War, the situation became dire. India's foreign exchange reserves were not enough to finance three weeks' worth of imports. Additionally, the Iraq-Kuwait conflict caused a significant shift in the trad

In [112]:
def split_strings_from_subsection(
    subsection: tuple[list[str], str],
    max_tokens: int = 1000,
    model: str = GPT_MODEL,
    max_recursion: int = 5,
) -> list[str]:
    """
    Split a subsection into a list of subsections, each with no more than max_tokens.
    Each subsection is a tuple of parent titles [H1, H2, ...] and text (str).
    """
    titles, text = subsection
    string = "\n\n".join(titles + [text])
    num_tokens_in_string = num_tokens(string)
    # if length is fine, return string
    if num_tokens_in_string <= max_tokens:
        return [string]
    # if recursion hasn't found a split after X iterations, just truncate
    elif max_recursion == 0:
        return [truncated_string(string, model=model, max_tokens=max_tokens)]
    # otherwise, split in half and recurse
    else:
        titles, text = subsection
        for delimiter in ["\n\n", "\n", ". "]:
            left, right = halved_by_delimiter(text, delimiter=delimiter)
            if left == "" or right == "":
                # if either half is empty, retry with a more fine-grained delimiter
                continue
            else:
                # recurse on each half
                results = []
                for half in [left, right]:
                    half_subsection = (titles, half)
                    half_strings = split_strings_from_subsection(
                        half_subsection,
                        max_tokens=max_tokens,
                        model=model,
                        max_recursion=max_recursion - 1,
                    )
                    results.extend(half_strings)
                return results
    # otherwise no split was found, so just truncate (should be very rare)
    return [truncated_string(string, model=model, max_tokens=max_tokens)]

  and should_run_async(code)


NameError: name 'GPT_MODEL' is not defined

In [111]:
# split sections into chunks
MAX_TOKENS = 1600
wikipedia_strings = []
for section in wikipedia_sections:
    wikipedia_strings.extend(split_strings_from_subsection(section, max_tokens=MAX_TOKENS))

print(f"{len(wikipedia_sections)} Wikipedia sections split into {len(wikipedia_strings)} strings.")

  and should_run_async(code)


NameError: name 'split_strings_from_subsection' is not defined