## 1. Scraper

The purpose of this notebook is to build a webscraper for r/LocalLlama using beautiful soup

In [23]:
from selenium import webdriver
from bs4 import BeautifulSoup
from time import sleep
import datetime
from datetime import datetime, timedelta

# Set up the Selenium driver (example with Chrome)
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # Run in the background
driver = webdriver.Chrome(options=options)

# URL of the subreddit's new posts
url = 'https://www.reddit.com/r/LocalLLaMA/hot/'

driver.get(url)
sleep(2)  # Wait for the initial page to load

# Scroll down to ensure all posts from the last day are loaded
# Adjust the range or conditions based on your needs
for _ in range(10):  # Example: scroll 10 times
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    sleep(2)  # Wait for more posts to load

def parse_datetime(timestamp):
    return datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S.%f+0000")

# Now that we have the page loaded, let's use BeautifulSoup to parse the HTML
soup = BeautifulSoup(driver.page_source, 'html.parser')

print(soup.title.text)
posts = soup.find_all('article')
print(f"Found {len(posts)} posts on the page.")

# If posts are found, attempt to print the first post's HTML
if posts:
    print(posts[0].prettify())

# Close the Selenium browser
driver.quit()
now = datetime.now()

# Dictionary to hold post details
posts_details = {}

# Loop through each post article
for article in soup.find_all('article'):
    # Extract title
    title_tag = article.find('a', {'slot': 'title'})
    if title_tag:
        title = title_tag.text.strip()
        link = title_tag['href']
        
        # Extract timestamp
        timestamp_tag = article.find('faceplate-timeago')
        if timestamp_tag:
            post_datetime = parse_datetime(timestamp_tag['ts'])
            # Check if the post is from the last day
            if now - post_datetime <= timedelta(days=1):
                # Extract content if available
                content = article.find('div', {'data-post-click-location': 'text-body'})
                content_text = content.text.strip() if content else "No content"
                
                # Store details in dictionary
                posts_details[title] = {'link': link, 'content': content_text}

# Print the details of posts from the last day
for title, details in posts_details.items():
    print(f"Title: {title}\nLink: {details['link']}\nContent: {details['content']}\n")

KeyboardInterrupt: 

In [13]:
posts_details

{}

In [24]:
import requests
from bs4 import BeautifulSoup

# Define the URL of the subreddit
url = 'https://www.reddit.com/r/LocalLLaMA/hot/'

# Add headers to mimic a real browser visit. Reddit checks for User-Agent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

response = requests.get(url, headers=headers)

# Check if the request was successful
if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')
    now = datetime.now()
    posts_details = {}

    # Loop through each post article
    for article in soup.find_all('article'):
        # Extract title
        title_tag = article.find('a', {'slot': 'title'})
        if title_tag:
            title = title_tag.text.strip()
            link = title_tag['href']

            content = article.find('div', {'data-post-click-location': 'text-body'})
            content_text = content.text.strip() if content else "No content"
            
            # Store details in dictionary
            posts_details[title] = {'link': link, 'content': content_text.rstrip().replace("\n", "")}

    # Print the details of posts from the last day
    for title, details in posts_details.items():
        print(f"Title: {title}\nLink: {details['link']}\nContent: {details['content']}\n")
else:
    print(f"Failed to retrieve the webpage: Status code {response.status_code}")


Title: High-VRAM GPUS for us nerds.
Link: /r/LocalLLaMA/comments/1asfe83/highvram_gpus_for_us_nerds/
Content: There are currently no (reasonably priced) graphics cards with a lot of VRAM (>= 64GB) to run large models.      My expectation is, at some point, some manufacturer will make those happen. But I'm wondering if we (as a community) can make it happen sooner.      VRAM is not that expensive (https://www.tomshardware.com/news/gddr6-vram-prices-plummet), so something like a 1060 with 64 or 128GB of RAM shouldn't be too expensive. Unless there is some technical reason this can't be done cheaply (or at all) that I'm missing, please enlighten my naive ass.      Personally, if I'm going to put 900 euros into a graphics card, I'd rather it has fewer CUDA cores than a 3090 but more RAM than a 3090. Not sure about others here.      Here are some solutions I can imagine:  1. Harass large manufacturers.    If we all collectively email (or social-media-spam) large manufacturers of GPUs / grap

In [25]:
posts_details.keys()

dict_keys(['High-VRAM GPUS for us nerds.'])