In [1]:
# asynchronous function execution 
import pickle 
import asyncio
import nest_asyncio
from playwright.async_api import async_playwright
import time

from pprint import pprint 
from dotenv import load_dotenv
load_dotenv()  # take environment variables from .env.

True

In [2]:
async def say_after(delay, what):
    await asyncio.sleep(delay)
    print(what)

async def main():
    # both tasks are ran at the same time however the second task takes longer to complete without blocking the first task 
    task1 = asyncio.create_task(
        say_after(1, 'hello'))

    task2 = asyncio.create_task(
        say_after(2, 'world'))

    print(f"started at {time.strftime('%X')}")
    await task1
    print(f"finished at {time.strftime('%X')}")
    await task2
    print(f"finished at {time.strftime('%X')}")

# Use await directly
await main()

started at 17:41:14
hello
finished at 17:41:15
world
finished at 17:41:16


In [15]:
# this is nearly the same example as above
# however here we have a function for each unique task 
# where as above the same function was used to create both tasks 

nest_asyncio.apply() # allows nested use of asyncio.run inside notebooks

async def fetch_data():
    print("Fetching data...task 1")
    await asyncio.sleep(1)  # Simulates a non-blocking I/O operation
    return "task 1 Data received!"

async def fetch_more_data():
    print("Fetching more data...task 2")
    await asyncio.sleep(3)  # Simulates a different non-blocking I/O operation
    return "task 2 data received!"

async def main():
    # Start both co-routines concurrently
    task1 = asyncio.create_task(fetch_data())
    task2 = asyncio.create_task(fetch_more_data())

    #### Case 1) 
    # Wait for both tasks to complete before main() terminates 
    # await python keyword indicates that both tasks will be ran asynchronously 
    # await keyword is used inside functions that are declared with with "async def"
    
    # The use of "async" and "await" means that the function will wait for each task to finish running
    # without blocking the execution of other tasks 
    result1 = await task1
    result2 = await task2
    
    #### Case 2)
    # Don't wait for task 2 to complete before main() terminates 
    # Result: Function terminates without "task 2 data received!" print out
    # Function waited for task 1 to terminate only 
    # result1 = await task1
    # result2 =  task2
    
    #### Case 2)
    # Don't wait for task 1 nor task 2 to complete before main() terminates
    # Result: Function terminated without waiting for either task to complete 
    # As soon as the function ran out of lines of code to run, it terminated 
    # Neither "task 1 Data received!" nor "task 2 data received!" printed out 
    # result1 = task1
    # result2 = task2

    print(result1)
    print(result2)

# Run the async function
asyncio.run(main())

Fetching data...task 1
Fetching more data...task 2
task 1 Data received!
task 2 data received!


In [4]:
nest_asyncio.apply() # allows nested use of asyncio.run within jupyter notebooks


async def scrape_tweets(): # main function 
    
    async with async_playwright() as p: 
        
        # Launch the browser and open a new page
        browser = await p.chromium.launch(headless=True)  # Set headless=True if you want to run it in the background
        page = await browser.new_page()

        # Maximize the window
        await page.set_viewport_size({"width": 1920, "height": 1080})
        await page.goto("https://x.com/apify")
        
        # Wait for the page to load completely
        await page.wait_for_timeout(10000)  # Wait for 10 seconds

        tweets = set()  # Use a set to avoid duplicates

        while len(tweets) < 5:  # Continue scrolling until we have at least 20 tweets
            # Locate all the div elements with attribute name "lang"
            tweet = ""
            elements = await page.locator('div[lang]').element_handles()
            for element in elements:
                text = await element.inner_text()
                tweets.add(text)

            # Scroll down
            await page.evaluate('window.scrollBy(0, window.innerHeight)')
            await page.wait_for_timeout(2000)  # Wait for 2 seconds to load more tweets
            
            
        
        await browser.close()
        
    # save tweets
    with open('my_list.pkl', 'wb') as f:
        pickle.dump(tweets, f) 

# Run the async function
asyncio.run(scrape_tweets())


In [5]:
with open('my_list.pkl', 'rb') as f:  # Use 'rb' to read the file in binary mode
    my_list = list(pickle.load(f))

In [6]:
my_list

["And finally there's also an universal #webscraping and #crawling library for #JavaScript / #NodeJS, similar to \n@ScrapyProject\n for #Python that was around for years, but also working with headless Chrome and Puppeteer. Better late than never  https://github.com/apifytech/apify-js…",
 '15,000+ Stars on \n@GitHub\n  \n\nWe are giving away 5 Crawlee hoodies to celebrate this milestone \n\nRules:\n\n- Follow \n@apify\n \n- Retweet this tweet \n- Star us at: https://apify.it/3TDOig8 \n- Reply with your GitHub username \n\n Deadline: October 1st 2024',
 'Today we’re launching Crawlee on Product Hunt \n \nPlease show us some  and support, to encourage the team who spent years building this open-source library!\n \nLearn more about Crawlee in the thread \n \n1/5\n\nhttps://producthunt.com/posts/crawlee',
 '12,500+ Stars on \n@GitHub\n \n\nWe are giving away 3 Crawlee shirts to celebrate this milestone \n\nRules:\n\n- Follow \n@apify\n \n- Retweet this tweet \n- Star us at: https://apify.i

In [22]:
from langchain_community.document_loaders import RecursiveUrlLoader
from bs4 import BeautifulSoup

def custom_extractor(html_content):
    soup = BeautifulSoup(html_content, "html.parser")
    return soup.get_text()

loader = RecursiveUrlLoader(
    "https://glendalehs.gusd.net/",
    max_depth=2,
    use_async=True,
    extractor=None,
    metadata_extractor=None,
    exclude_dirs=(),
    timeout=10,
    check_response_status=True,
    continue_on_failure=True,
    prevent_outside=True,
    base_url=None
)

In [23]:
docs = []
docs_lazy = loader.lazy_load()

# async variant:
# docs_lazy = await loader.alazy_load()

for doc in docs_lazy:
    docs.append(doc)
print(docs[0].page_content[:100])
print(docs[0].metadata)

Unable to load https://glendalehs.gusd.net/8182423161. Received error Received HTTP status 404 of type ValueError
Unable to load https://glendalehs.gusd.net/8182446309. Received error Received HTTP status 404 of type ValueError
Unable to load https://glendalehs.gusd.net/29730_2. Received error Cannot connect to host www.ghsclassof73reunion.com:443 ssl:default [nodename nor servname provided, or not known] of type ClientConnectorDNSError



    <!DOCTYPE html>
<html lang="en">
<head>
    <!-- needed for form submission -->
<script id="goo
{'source': 'https://glendalehs.gusd.net/', 'content_type': 'text/html; charset=UTF-8', 'title': 'Glendale High School - Home', 'description': 'Glendale High School', 'language': 'en'}


In [30]:
# scraps tab names and drop down menu titles 
from bs4 import BeautifulSoup
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader

# Define the URL of your college website
url = "https://glendalehs.gusd.net/"

# Define a custom extractor function to extract text from HTML using BeautifulSoup
def custom_extractor(html_content):
    soup = BeautifulSoup(html_content, "html.parser")
    return soup.get_text()

# Instantiate the RecursiveUrlLoader
# TODO: Sanity Check performance of RecursiveUrlLoader vs. Playwrite for crawling capabilities 
loader = RecursiveUrlLoader(url=url, 
                            extractor=custom_extractor, 
                            max_depth=1, # TODO: incorporate chunking so we can scrap more than a single page 
                            use_async=True)

# Load the data from the website ( 3 options )

##  1) This method is blocking and synchronous. It fully loads all documents into memory in one go before the function returns.
# docs = loader.load()  

## 2) This method returns a generator. Instead of loading all documents at once, it yields documents one by one as they are fetched. 
##    This means you can process each document as soon as it is available, without waiting for all documents to load.
#docs = loader.lazy_load()

## 3) This method is used in an asynchronous context. It allows for non-blocking, asynchronous fetching of documents. 
##    You can perform other tasks while waiting for documents to be fetched.
docs = await loader.alazy_load()

# Define the file path to store the data
output_file = "scraped__data.txt"

# Open the file in write mode with UTF-8 encoding
with open(output_file, "w", encoding="utf-8") as file:
    # Write metadata and content for each document to the file
    for doc in docs:
        title = doc.metadata.get("title")
        source = doc.metadata.get("source")
        content = doc.page_content

        # Ensure that the title, source, and content are string type
        if isinstance(title, str) and isinstance(source, str) and isinstance(content, str):
            file.write("Page Title: " + title + "\n")
            file.write("Page URL: " + source + "\n")
            file.write("Page Content:\n" + content + "\n\n")
        else:
            print("Skipped a document due to non-string content.")

print("Data has been successfully written to", output_file)

TypeError: object async_generator can't be used in 'await' expression

# Build Crawler

#### Version 1 of Crawler works 
- Scraps multiple pages 
- Filters text for activities and events 
- Crafts engaging email for students and includes a link 

#### ToDo
- Move text to VectorDB
- Scrap Images of events 
- Scrap Names and Titles of Administrators 

In [2]:
# imports
import asyncio
import nest_asyncio
from playwright.async_api import async_playwright, Playwright
from langchain.document_loaders import RecursiveUrlLoader
from langchain_community.document_loaders import AsyncChromiumLoader
from bs4 import BeautifulSoup
from langchain_chroma import Chroma
from openai import OpenAI
from uuid import uuid4
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings

# apply nest_asyncio to enable running async functions in Jupyter
nest_asyncio.apply()

In [3]:
### playwright TAKES TOO LONG TO SCRAPE, RETURNS REDUNDANT RESULTS, AND SCRAPS TOO MUCH SUB-PAGES
### TODO: maybe stop using async_playwright to stop redundant results 
### TODO: FILTER NUMBER OF SUB-PAGES IT SCRAPS 
class Crawler():
    def __init__(self):
        self.async_playwright =  async_playwright
    
    def custom_extractor(self, html_content):
        soup = BeautifulSoup(html_content, "html.parser")
        return soup.get_text()

    # Step 3: Define an async function to load documents using loader.lazy_load()
    async def fetch_documents(self, url):
        # Initialize the RecursiveUrlLoader with a sample URL
        # TODO: REPLACE RecursiveUrlLoader WITH PLAYWRITE FOR DYNAMICALLY LOADED WEBSITES 
        loader = RecursiveUrlLoader(url = url, 
                                    extractor=self.custom_extractor, 
                                    max_depth=1, # TODO: incorporate chunking so we can scrap more than a single page 
                                    use_async=True)

            
        return [doc for doc in  loader.lazy_load()]
    
    async def crawl(self, url):
        docs =  await self.fetch_documents(url) 
        return docs
    
    async def fetch_documents_dynamic(self, url, playwright: Playwright):
        chromium = playwright.chromium # or "firefox" or "webkit".
        browser = await chromium.launch()
        page = await browser.new_page()
        await page.goto(url)
        texts = await page.get_by_role("link").all_text_contents()
        await browser.close()
        return texts

    async def crawl_dynamic(self, url):
        async with self.async_playwright() as playwright:
            docs = await self.fetch_documents_dynamic(url, playwright)
            return set(docs)
        

In [43]:
class FilterWebPage():
    def __init__(self, directory=None):
        self.llm = OpenAI()
        self.clean_doc = ""
        
        if directory is None: 
            self.dir = "../scraped_sites"
        else:
            self.dir = directory
        
    def filename_prompt(self, URL):
        return f"""
            Use this URL to create a simple and descriptive file name for a text file in python.  
            The URL will contain the name of a school, focus on extracting the name of the school. 
            URL {URL} 
            Only return the file name.
            """

    def crawler_prompt(self, doc):
        return f"""
            The following document is the results of scrapping a website. Many unless words have been scrapped such as the title of drop down tabs and their sub-titles. 
            For example, "Athletics" is the title of a drop down tab and "Sports Calendar" and "Coach Contact Information" is unhelpful information. 
            Conversely, there is useful information in the document that describes specific events and activities that have taken place or will take place. 
            This useful information is usually, but not always, provided as a phrase, sentence, or paragraph, make sure they have sufficient context 
            in order to understand what kind of event or activity it is, an event or activity without no context isn't useful. For example "Garden Club" on its own is not useful. 

            Your goal is to identify and maintain the useful information while removing the useless information. 
            Attempt to identify related information and output it as a single string. 
            For example, if there is a dance event with dates and contact information, that should all be a single string 
            where as information about a completely different event or activity (such as a fundraiser) should be a separate string that you output. 
            Separate the strings using a newline character. 

            If a document doesn't have any useful information, return a white space.

            Document: {doc}
            """
        
    def get_filename(self, URL):
        completion =  OpenAI().chat.completions.create(
        model="gpt-4o-mini",
        max_tokens=5500,
        temperature=0.0,
        messages=[
            {"role": "system", f"content": self.filename_prompt(URL)}
            ]
        )
        self.filename = completion.choices[0].message.content
        self.school_name = completion.choices[0].message.content.strip(".txt")

    def filter_doc(self, doc):
        for doc in docs: 
            completion = self.llm.chat.completions.create(
                model="gpt-4o-mini",
                max_tokens=5500,
                temperature=0.0,
                messages=[
                    {"role": "system", "content": self.crawler_prompt(doc)}
                ]
            )
            
            self.clean_doc = completion.choices[0].message.content

        ## TODO: NEED TO AGGRAGATE RESULTS 
        
    def save_clean_doc(self):
        fn = "/".join((self.dir, self.filename))
        #doc = " ".join(self.clean_doc)
        with open(fn, "w", encoding="utf-8") as f:
            f.write(self.clean_doc)
            

class VectorStore():
    def __init__(self):
        self.embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
        self.vector_store = Chroma(
                collection_name="school_collection",
                embedding_function=self.embeddings,
                persist_directory="../chroma_langchain_db",  # Where to save data locally, remove if not necessary
            )

    def add_document(self, doc, metadata):
        # TODO
        # need to use hashtags or metadata for actual IDing 
        # then use the query to further filter the type of docs from that school 
        # alternative: create human readable ids like "<school_name>_<doc_type>_<datetime>_<doc number>"
        document = Document(
            page_content=doc,
            metadata={"source": metadata}
        )
        
        documents = [document]
        ids = [str(uuid4()) for _ in documents]
        
        # Add documents with UUIDs as IDs
        self.vector_store.add_documents(documents=documents, ids=ids)
    
    def retrieve_document(self, query, metadata=None, k=1):
        results = self.vector_store.similarity_search_with_relevance_scores(
            query, # for now, ask for events/activities that take place at the school 
            k=k, # number of docs to retrieve
            filter={"source": metadata}, # in the future we will also have documents with names and job titles
        )
        return results
    
    def get_school_names(self):
        # all metadata tags are school names 
        # returns list of unique school names
        return list(set(dic['source']  for dic in self.vector_store.get()['metadatas']))


In [56]:
from datetime import datetime


class PhishingEmail():
    def __init__(self, phishing_link='https://example.org/'):
        pass
        self.today = datetime.today().date()
        self.llm = OpenAI()
        self.phishing_link = phishing_link
        

    def prompt(self, doc, phishing_link):
        if phishing_link is not None:
            self.phishing_link = phishing_link
            

        return f"""
        The following document contains information about multiple school activities and events.  
        Your task is to select one event and use that info to write a persuasive email to encourage a student to click on an event related link.
        Keep the email short, simple, but most of all persuasive and use language that would appeal to a high school student such as use of emojis.
        Keep today's date in mind as you describe event dates in the email, don't provide a past date for a future event.  

        DOCUMENT: {doc}
        LINK: {self.phishing_link}
        TODAY'S DATE: {self.today}
        """

    def create_email(self, doc, phishing_link=None):
        completion = self.llm.chat.completions.create(
            model="gpt-4o-mini",
            max_tokens=5500,
            temperature=0.0,
            messages=[
                {"role": "system", "content": self.prompt(doc, phishing_link)}
            ]
        )

        return completion.choices[0].message.content

In [8]:
urls = ['https://rdwhite.gusd.net/', 
        'https://wilson.gusd.net/', 
        "https://glendalehs.gusd.net/"]

filter_page = FilterWebPage()
crawler = Crawler()
vs = VectorStore()
pe = PhishingEmail()

for url in urls: 
        ### scrape URL
        docs = await crawler.crawl(url) 
        #docs = asyncio.run(crawler.crawl_dynamic(url))
        # filter scaped pages for useful content 
        filter_page.filter_doc(docs) 
        # create filename using school name from URL
        filter_page.get_filename(url)
        # save to file 
        # filter_page.save_clean_doc()
        #TODO: this workflow assumes that only a single page regarding activities is in the results
        # not true once we start scraping names and titles 
        vs.add_document(filter_page.clean_doc, filter_page.school_name) 

In [60]:
# this loop's logic works because there is currently one doc per school 
school_names = vs.get_school_names()


emails = []
for school in school_names:
    doc = vs.retrieve_document("activities and events for glendale high school", school)
    doc = doc[0][0].page_content
    phishing_email = pe.create_email(doc)
    emails.append(phishing_email)

In [61]:
emails

['Subject: 🌟 Exciting Opportunity: Armenian Dual Immersion Program! 🌟\n\nHey there!\n\nAre you ready to take your language skills to the next level? 🗣️✨ R.D. White Elementary School is now accepting applications for the Armenian Dual Immersion Program for the 2025-26 school year! This is an amazing chance to immerse yourself in the Armenian language and culture while making new friends and having fun! 🎉\n\nImagine being part of a vibrant community where you can learn and grow together. Plus, it looks great on college applications! 📚💪\n\nDon’t miss out! Click the link below to learn more and apply today! ⬇️\n\n👉 [Apply Now!](https://example.org/)\n\nLet’s make your high school experience unforgettable! \n\nBest,  \n[Your Name]',
 "Subject: 🎓 Don't Miss Out on Financial Aid Workshop! 💰\n\nHey [Student's Name]! \n\nAre you ready to tackle college expenses? 🏫💸 Join us for the **Understanding Financial Aid for College** workshop on **Wednesday, Dec. 4th from 6:30 - 8:30 pm** at Hoover High 

In [62]:
pprint(emails[-1])

("Subject: 🎉 Don't Miss Out on the Fall Dance! 🕺💃\n"
 '\n'
 "Hey [Student's Name]! \n"
 '\n'
 "I hope you're having an awesome day! 🌟 I wanted to remind you about the Fall "
 'Dance happening on **November 17th** at Woodrow Wilson Middle School! 🎶✨ '
 'It’s going to be a night full of fun, friends, and fantastic music! \n'
 '\n'
 'Imagine dancing the night away with your friends, enjoying delicious snacks, '
 'and making unforgettable memories! 🎉 Plus, it’s a great chance to show off '
 'your best dance moves! 💃🕺\n'
 '\n'
 'You definitely don’t want to miss this! Click the link below for more '
 'details and to get your tickets! 🎟️👇  \n'
 '[Get Your Tickets Here!](https://example.org/)\n'
 '\n'
 'Let’s make this dance the best one yet! Can’t wait to see you there! \n'
 '\n'
 'Best,  \n'
 '[Your Name]  \n'
 'Woodrow Wilson Middle School 🌟')
