## 1. Scraper

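The purpose of this notebook is to build a web scraper for r/LocalLLaMA. It tries three approaches in turn: Beautiful Soup on its own, Beautiful Soup combined with Selenium, and the Reddit API via PRAW.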

In [3]:
import os
import datetime
from time import sleep
from datetime import datetime, timedelta

import praw
import pandas as pd
from selenium import webdriver
from bs4 import BeautifulSoup
from dotenv import load_dotenv, find_dotenv

## Just using BeautifulSoup

NOTE: because Reddit names its HTML elements dynamically, it is extremely difficult to extract those elements consistently.

In [None]:
import requests
from bs4 import BeautifulSoup

# Define the URL of the subreddit
url = 'https://www.reddit.com/r/LocalLLaMA/hot/'

# Add headers to mimic a real browser visit. Reddit checks for User-Agent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

# Always pass a timeout: without one, requests waits indefinitely when the
# server stops responding and the cell never finishes.
response = requests.get(url, headers=headers, timeout=10)

# Check if the request was successful
if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')
    now = datetime.now()  # time of this scrape; this cell does not filter posts by date
    posts_details = {}

    # Loop through each post article
    for article in soup.find_all('article'):
        # Articles without a title link cannot be identified, so skip them
        title_tag = article.find('a', {'slot': 'title'})
        if not title_tag:
            continue

        title = title_tag.text.strip()
        link = title_tag['href']

        # The text body is optional (e.g. the element is absent for some posts)
        content = article.find('div', {'data-post-click-location': 'text-body'})
        content_text = content.text.strip() if content else "No content"

        # Store details keyed by title, with the body collapsed onto one line.
        # Posts sharing a title overwrite each other.
        posts_details[title] = {'link': link, 'content': content_text.replace("\n", "")}

    # Print the details of every post that was found on the page
    for title, details in posts_details.items():
        print(f"Title: {title}\nLink: {details['link']}\nContent: {details['content']}\n")
else:
    print(f"Failed to retrieve the webpage: Status code {response.status_code}")


## Just using BeautifulSoup and Selenium

NOTE: this approach got me blocked because the traffic came across as bot activity.

In [None]:

# Set up the Selenium driver (example with Chrome)
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # Run in the background
driver = webdriver.Chrome(options=options)

# URL of the subreddit's "hot" listing
url = 'https://www.reddit.com/r/LocalLLaMA/hot/'

try:
    driver.get(url)
    sleep(2)  # Wait for the initial page to load

    # Scroll down so that more posts are loaded into the page
    # Adjust the range or conditions based on your needs
    for _ in range(10):  # Example: scroll 10 times
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        sleep(2)  # Wait for more posts to load
except Exception:
    # If loading or scrolling fails, shut the browser down before re-raising so
    # that no headless Chrome process is left running in the background.
    driver.quit()
    raise

def parse_datetime(timestamp):
    """Parse an ISO-8601 style timestamp string into a naive UTC datetime.

    Accepts timestamps with or without fractional seconds and with any numeric
    UTC offset (e.g. ``2024-03-01T12:34:56.789000+0000``). The value is
    converted to UTC and returned without tzinfo, so timestamps ending in
    ``+0000`` yield exactly the same result as a plain field-by-field parse.

    Args:
        timestamp: The timestamp string to parse.

    Returns:
        A naive ``datetime`` expressed in UTC.

    Raises:
        ValueError: If the string matches none of the supported formats.
    """
    # Local import: the notebook's import cell only brings in datetime/timedelta.
    from datetime import timezone

    for fmt in ("%Y-%m-%dT%H:%M:%S.%f%z", "%Y-%m-%dT%H:%M:%S%z"):
        try:
            parsed = datetime.strptime(timestamp, fmt)
        except ValueError:
            continue
        # Normalise to UTC, then drop tzinfo so callers keep getting naive values.
        return parsed.astimezone(timezone.utc).replace(tzinfo=None)

    raise ValueError(f"Unrecognised timestamp format: {timestamp!r}")

# Local import: the notebook's import cell only brings in datetime/timedelta.
from datetime import timezone

# Now that we have the page loaded, let's use BeautifulSoup to parse the HTML.
# The browser is only needed to hand over the page source, so close it right
# away, even if parsing fails, rather than after the debug prints below.
try:
    soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    driver.quit()

# A page without a <title> would otherwise raise AttributeError here
print(soup.title.text if soup.title else "(page has no <title>)")
posts = soup.find_all('article')
print(f"Found {len(posts)} posts on the page.")

# If posts are found, attempt to print the first post's HTML
if posts:
    print(posts[0].prettify())

# parse_datetime() yields naive datetimes for timestamps expressed in UTC
# ("+0000"), so "now" must be naive UTC as well. Using local time would shift
# the one-day window by the machine's UTC offset.
now = datetime.now(timezone.utc).replace(tzinfo=None)

# Dictionary to hold post details
posts_details = {}

# Loop through each post article
for article in posts:
    # Extract title; articles without a title link are skipped
    title_tag = article.find('a', {'slot': 'title'})
    if not title_tag:
        continue
    title = title_tag.text.strip()
    link = title_tag['href']

    # Extract timestamp; articles without one cannot be date-filtered
    timestamp_tag = article.find('faceplate-timeago')
    if not timestamp_tag:
        continue
    post_datetime = parse_datetime(timestamp_tag['ts'])

    # Keep only posts from the last day
    if now - post_datetime <= timedelta(days=1):
        # Extract content if available
        content = article.find('div', {'data-post-click-location': 'text-body'})
        content_text = content.text.strip() if content else "No content"

        # Store details in dictionary
        posts_details[title] = {'link': link, 'content': content_text}

# Print the details of posts from the last day
for title, details in posts_details.items():
    print(f"Title: {title}\nLink: {details['link']}\nContent: {details['content']}\n")

In [None]:
# Display the scraped posts: a dict mapping each post title to its link and content.
posts_details

### PRAW For Reddit

1. Go to https://www.reddit.com/prefs/apps
2. Click "create app" or "create another app"
3. Fill out the form:
    - name: Give your app a name.
    - application type: Select "script".
    - redirect uri: Use http://localhost:8080 or a similar placeholder.
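4. After creation, note down the `client_id` (just under the app name) and `client_secret`, and store them in your `.env` file as `REDDIT_APP` and `REDDIT_SECRET`, together with your Reddit username as `REDDIT_USERNAME`.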

In [16]:
# Load variables from the .env file three directories up; the Reddit
# credentials used to build the PRAW client are read from the environment.
# NOTE(review): the relative path presumably resolves against the kernel's
# working directory — confirm.
load_dotenv("../../../.env")

True

In [19]:
# Build the PRAW client. The app id, app secret and Reddit username are all
# taken from environment variables so that no credential lives in the notebook.
reddit = praw.Reddit(
    client_id=os.getenv("REDDIT_APP"),
    client_secret=os.getenv("REDDIT_SECRET"),
    user_agent=f"Comment Extraction (by u/{os.getenv('REDDIT_USERNAME')})",
)


In [68]:
# Scraped results, keyed by post id
posts_data = dict()

# Newest submissions of r/LocalLLaMA; adjust the limit as needed
subreddit = reddit.subreddit("LocalLLaMA")
new_posts = subreddit.new(limit=1000)

def get_comments_with_replies(comment, depth=0):
    """Recursively flatten a comment and its nested replies into text lines.

    Args:
        comment: Object with a ``body`` attribute and, optionally, a
            ``replies`` iterable of the same kind of object.
        depth: Nesting level of ``comment``; each level adds two spaces of
            indentation in front of the ``- `` bullet.

    Returns:
        A list of strings: the comment's own line followed by the lines of all
        of its replies, depth-first. ``MoreComments`` placeholders are skipped.
    """
    spacer = "  " * depth  # Indentation for nested comments
    comments_list = [f"{spacer}- {comment.body}"]

    # Read replies defensively instead of assigning `comment.replies = []`:
    # the caller's object is left untouched, and a missing or None `replies`
    # simply means "no replies".
    for reply in getattr(comment, "replies", None) or []:
        if isinstance(reply, praw.models.MoreComments):
            continue
        comments_list.extend(get_comments_with_replies(reply, depth + 1))
    return comments_list


for post in new_posts:
    # limit=0 does NOT expand the tree: it removes every "load more comments"
    # placeholder without fetching it, so only already-loaded comments remain.
    # Use limit=None to fetch everything (much slower, one request per placeholder).
    post.comments.replace_more(limit=0)

    # Flatten each top-level comment together with its nested replies
    comments = []
    for top_level_comment in post.comments:
        comments.extend(get_comments_with_replies(top_level_comment))

    posts_data[post.id] = {
        "title": post.title,
        "content": post.selftext,
        "comments": comments,
        "link": post.shortlink,
        # `post.flair` is PRAW's helper object for choosing flair, not data;
        # the flair shown on the post is `link_flair_text` (None when unflaired).
        "flair": post.link_flair_text,
        "media": post.media,
    }

In [110]:
# Dump every scraped post, with its comments, into a dated text file.
output_path = f"./_output/new/localllama-new-{datetime.now().strftime('%d-%m-%Y')}.txt"

# open() does not create missing directories, so make sure the target exists
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Reddit text routinely contains emoji and other non-ASCII characters; an
# explicit UTF-8 encoding avoids UnicodeEncodeError on platforms whose default
# encoding is narrower.
with open(output_path, "w", encoding="utf-8") as file:
    for post_id, post_info in posts_data.items():
        file.write(f"Post ID: {post_id}\n")
        file.write(f"Title: {post_info['title']}\n")
        file.write(f"Link: {post_info['link']}\n")
        file.write(f"Content: {post_info['content']}\n")
        if post_info['comments']:
            file.write("Comments:\n")
            # Comments are already formatted with spacers for nesting
            file.writelines(f"{comment}\n" for comment in post_info['comments'])
        file.write("\n---\n")