In [36]:
import yaml 
import requests
import pandas as pd
import sqlite3
from tqdm import tqdm
from bs4 import BeautifulSoup
from fpdf import FPDF
# Location of the project's SQLite file; `conn` is the handle the query below reads from.
DB_PATH = "data/database.db"
conn = sqlite3.connect(DB_PATH)

In [37]:
# Load the distinct company names stored in `master_ticker`. Each name is later
# appended to the Wikipedia base URL, so spaces are turned into underscores.
# Only the "Security" column is requested (every other column was fetched and
# thrown away before), and NULL names are dropped because a missing value has
# no `.replace` method and would abort the comprehension below.
security_names = (
    pd.read_sql_query("SELECT Security FROM master_ticker", conn)["Security"]
    .dropna()
    .drop_duplicates()
    .tolist()
)
tickers = [name.replace(" ", "_") for name in security_names]

In [38]:
def create_pdf(text, filename="output.pdf"):
    """Render ``text`` into a one-font PDF document and write it to ``filename``.

    Args:
        text: Body text handed to ``FPDF.multi_cell``.
        filename: Destination path of the PDF. Missing parent directories are
            created first so the write does not fail on a fresh checkout.

    Returns:
        None. The file is written to disk and a confirmation line is printed.
    """
    import os

    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    # multi_cell lays the text out itself; its return value is not needed here.
    pdf.multi_cell(0, 10, text)

    parent_dir = os.path.dirname(filename)
    if parent_dir:  # a bare file name has no directory part to create
        os.makedirs(parent_dir, exist_ok=True)

    pdf.output(filename)
    print(f"PDF saved as {filename}")

In [39]:
def extract_text_from_wikipedia(ticker, timeout=10):
    """Download an English Wikipedia article and save its paragraph text as a PDF.

    Args:
        ticker: Page title appended to the Wikipedia base URL
            (e.g. ``"Agilent_Technologies"``).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        None. On success the PDF is written to ``data/wikipedia/<ticker>.pdf``.
        When Wikipedia answers with a non-success status (e.g. 404 for a title
        that has no article) a message is printed and no PDF is written, so
        error pages are not stored as if they were articles.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    URL_BASE = "https://en.wikipedia.org/wiki/"
    response = requests.get(URL_BASE + ticker, timeout=timeout)
    if not response.ok:
        print(f"Skipping {ticker}: HTTP {response.status_code}")
        return

    soup = BeautifulSoup(response.text, "html.parser")
    pars = [p.text for p in soup.find_all("p")]

    # Characters outside latin-1 are swapped for "?" before the text reaches
    # the PDF writer (presumably because the chosen font cannot render them).
    text = "".join(pars).encode("latin-1", "replace").decode("latin-1")

    create_pdf(text, filename="data/wikipedia/" + str(ticker) + ".pdf")

In [40]:
class WikipediaPDFGenerator:
    """Fetch English Wikipedia articles and store their paragraph text as PDFs.

    Attributes are plain values set in ``__init__``:
        output_dir -- directory that receives the generated PDF files.
    """

    # Prefix every page title is appended to.
    URL_BASE = "https://en.wikipedia.org/wiki/"

    def __init__(self, output_dir="data/wikipedia/"):
        """Remember where generated PDFs should be written.

        Args:
            output_dir: Target directory, with or without a trailing slash.
        """
        self.output_dir = output_dir

    def create_pdf(self, text, filename="output.pdf"):
        """Render ``text`` into a PDF named ``filename`` inside ``output_dir``.

        Args:
            text: Body text handed to ``FPDF.multi_cell``.
            filename: File name of the PDF, relative to ``output_dir``.

        Returns:
            None. The file is written and a confirmation line is printed.
        """
        import os

        pdf = FPDF()
        pdf.set_auto_page_break(auto=True, margin=15)
        pdf.add_page()
        pdf.set_font("Arial", size=12)
        pdf.multi_cell(0, 10, text)

        # os.path.join instead of plain concatenation: "data/wikipedia" and
        # "data/wikipedia/" now both resolve to the same file.
        full_path = os.path.join(self.output_dir, filename)
        parent_dir = os.path.dirname(full_path)
        if parent_dir:  # nothing to create when writing to the working directory
            os.makedirs(parent_dir, exist_ok=True)

        pdf.output(full_path)
        print(f"PDF saved as {full_path}")

    def extract_text_from_wikipedia(self, ticker, timeout=10):
        """Download the article titled ``ticker`` and save its text as ``<ticker>.pdf``.

        Args:
            ticker: Page title appended to ``URL_BASE``.
            timeout: Seconds to wait for the HTTP response before giving up.

        Returns:
            None. When Wikipedia answers with a non-success status (e.g. 404
            for a title without an article) a message is printed and no PDF is
            written, so error pages are not stored as if they were articles.

        Raises:
            requests.RequestException: If the request fails or times out.
        """
        response = requests.get(self.URL_BASE + ticker, timeout=timeout)
        if not response.ok:
            print(f"Skipping {ticker}: HTTP {response.status_code}")
            return

        soup = BeautifulSoup(response.text, "html.parser")
        pars = [p.text for p in soup.find_all("p")]
        # Characters outside latin-1 are swapped for "?" before the text
        # reaches the PDF writer (presumably a limitation of the chosen font).
        text = "".join(pars).encode("latin-1", "replace").decode("latin-1")
        self.create_pdf(text, filename=f"{ticker}.pdf")

In [41]:
# Build one PDF per page title. A failed request is recorded and the loop moves
# on, so one transient network error does not discard the remaining pages.
generator = WikipediaPDFGenerator()
failed_pages = []
for t in tickers:
    try:
        generator.extract_text_from_wikipedia(ticker=t)
    except requests.RequestException as exc:
        failed_pages.append(t)
        print(f"Failed to fetch {t}: {exc}")

if failed_pages:
    print(f"{len(failed_pages)} page(s) could not be fetched: {failed_pages}")

PDF saved as data/wikipedia/3M.pdf
PDF saved as data/wikipedia/A._O._Smith.pdf
PDF saved as data/wikipedia/Abbott.pdf
PDF saved as data/wikipedia/AbbVie.pdf
PDF saved as data/wikipedia/Accenture.pdf
PDF saved as data/wikipedia/Adobe_Inc..pdf
PDF saved as data/wikipedia/Advanced_Micro_Devices.pdf
PDF saved as data/wikipedia/AES_Corporation.pdf
PDF saved as data/wikipedia/Aflac.pdf
PDF saved as data/wikipedia/Agilent_Technologies.pdf
PDF saved as data/wikipedia/Air_Products_and_Chemicals.pdf
PDF saved as data/wikipedia/Airbnb.pdf
PDF saved as data/wikipedia/Akamai.pdf
PDF saved as data/wikipedia/Albemarle_Corporation.pdf
PDF saved as data/wikipedia/Alexandria_Real_Estate_Equities.pdf
PDF saved as data/wikipedia/Align_Technology.pdf
PDF saved as data/wikipedia/Allegion.pdf
PDF saved as data/wikipedia/Alliant_Energy.pdf
PDF saved as data/wikipedia/Allstate.pdf
PDF saved as data/wikipedia/Alphabet_Inc._(Class_A).pdf
PDF saved as data/wikipedia/Alphabet_Inc._(Class_C).pdf
PDF saved as data/w