### Getting Reddit Posts using BeautifulSoup

In [2]:
import requests
import csv
import time
from bs4 import BeautifulSoup

Specify the url for the webpage that we want to scrape. we're using [Reddit's old website](https://old.reddit.com) for simplicity. Before we start writing the script, first we take a look to the website and see its structure 

### Getting the Page with BeautifulSoup

In [19]:
# First, we need to request the web page using the ‘requests’ library.
url = "https://old.reddit.com"

# Headers to simulate a browser visit.
headers = {'User-Agent': 'Mozilla/5.0'}

# Returns a response object which contains the entire source code of the HTML file.
response = requests.get(url, headers=headers)

### Finding the tags

In [20]:
# Create a list to store the posts
posts_data = []

# Loop through the pages
while True:
    # Parse the page content
    soup = BeautifulSoup(response.content, 'html.parser')
    # Find all the posts in the page using the class name 'thing' which is the class name of the div tag that contains the posts.
    posts = soup.find_all('div', class_='thing')

    for post in posts:
        # Create a dictionary to store the post data
        post_data = {}
        # Get the title element of the post
        title_element = post.find('p', class_="title")
        # Check if the post has a title and add it to the dictionary
        post_data['Title'] = title_element.text if title_element else "N/A"
        author_element = post.find('a', class_='author')
        # Check if the post has an author and add it to the dictionary
        post_data['Author'] = author_element.text if author_element else "N/A"
        comments_element = post.find('a', class_='comments')
        # Check if the post has comments and get only the number of comments and add it to the dictionary
        post_data['Comments'] = comments_element.text.split()[0] if comments_element else 0
        likes_element = post.find("div", attrs={"class": "score likes"})
        # Check if the post has likes and add it to the dictionary
        if likes_element and likes_element.text == "•":
            post_data['Likes'] = 0
        elif likes_element:
            post_data['Likes'] = likes_element.text
        else:
            post_data['Likes'] = "N/A"
        # Add the post data to the list
        posts_data.append(post_data)

    # Get the button that takes us to the next page
    next_button = soup.find('span', class_='next-button')
    # Get the url of the next page
    url = next_button.find('a').get('href') if next_button else None

    if url:
        # Make the next request
        response = requests.get(url, headers=headers)
        time.sleep(2)  # Wait for 2 seconds before making the next request
    else:
        break

In [21]:
# Write the posts data to a CSV file
with open('reddit_posts_bs4.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=['Title', 'Author', 'Likes', 'Comments'])
    writer.writeheader()
    writer.writerows(posts_data)

print(f"Total unique posts collected: {len(posts_data)}")

Total unique posts collected: 794


In [7]:
import pandas as pd
reddit_posts = pd.read_csv('reddit_posts_bs4.csv')
reddit_posts.head()

Unnamed: 0,Title,Author,Likes,Comments
0,Russia Warns Israel Over Flooding Tunnels in G...,whosagoodbi,3793,1015
1,Canada's surging cost of living fuels reverse ...,Square-Simple723,433,216
2,"We only use 100% Canadian Beef, and if you ask...",McDonaldsCanada,0,0
3,TIL Tom Cruise ended his marriages with Mimi R...,Motor-Conclusion-260,3433,373
4,Ontario to announce plan to allow beer in corn...,Jetboater111,364,314


In [13]:
# We are using transformers library to get the sentiment of the posts.
from transformers import pipeline

# The sentiment-analysis pipeline from the transformers library is used to get the sentiment of the posts.
# Since we haven't specified the model, it will use the default model which is DistilBERT.
classifier = pipeline('sentiment-analysis')

# Get the titles of the posts and store them in a list.
reddit_title_posts = reddit_posts['Title'].tolist()

# Create lists to store the posts based on their sentiment.
positive_posts = []
negative_posts = []
neutral_posts = []

# Get the sentiment of the posts and store them in the appropriate list.
results = classifier(reddit_title_posts)

# Loop through the posts and the results and store them in the appropriate list.
for post, result in zip(reddit_title_posts, results):
    score = round(result['score'], 4)
    if 0.4 <= score <= 0.6:
        label = 'NEUTRAL'
        neutral_posts.append((post, score))
    elif result['label'] == 'POSITIVE':
        label = 'POSITIVE'
        positive_posts.append((post, score))
    else:
        label = 'NEGATIVE'
        negative_posts.append((post, score))

print(f"Positive posts: {len(positive_posts)}")
print(f"Negative posts: {len(negative_posts)}")
print(f"Neutral posts: {len(neutral_posts)}")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Positive posts: 190
Negative posts: 798
Neutral posts: 38
