### Getting Reddit Posts using BeautifulSoup

In [1]:
import requests
import csv
import time
from bs4 import BeautifulSoup

Specify the url for the webpage that we want to scrape. we're using [Reddit's old website](https://old.reddit.com) for simplicity. Before we start writing the script, first we take a look to the website and see its structure 

### Getting the Page with BeautifulSoup

In [2]:
# First, we need to request the web page using the ‘requests’ library.
url = "https://old.reddit.com"

# Headers to simulate a browser visit.
headers = {'User-Agent': 'Mozilla/5.0'}

# Returns a response object which contains the entire source code of the HTML file.
response = requests.get(url, headers=headers)

### Finding the tags

In [3]:
# Parse the page content
soup = BeautifulSoup(response.content, 'html.parser')
# Find all the posts in the page using the class name 'thing' which is the class name of the div tag that contains the posts.
posts = soup.find_all('div', class_='thing')

In [196]:
# Create a list to store the posts
posts_data = []

while len(posts_data) < 1000: # Get 1000 posts
    for post in posts:
        # Create a dictionary to store the post data
        post_data = {}
        # Get the title element of the post
        title_element = post.find('p', class_="title")
        # Check if the post has a title and add it to the dictionary
        post_data['Title'] = title_element.text if title_element else "N/A"

        author_element = post.find('a', class_='author')
        # Check if the post has an author and add it to the dictionary
        post_data['Author'] = author_element.text if author_element else "N/A"

        comments_element = post.find('a', class_='comments')
        # Check if the post has comments and get only the number of comments and add it to the dictionary
        post_data['Comments'] = comments_element.text.split()[0] if comments_element else 0

        likes_element = post.find("div", attrs={"class": "score likes"})
        # Check if the post has likes and add it to the dictionary
        if likes_element and likes_element.text == "•":
            post_data['Likes'] = 0
        elif likes_element:
            post_data['Likes'] = likes_element.text
        else:
            post_data['Likes'] = "N/A"

        # Add the post data to the list
        posts_data.append(post_data)

    # Get the button that takes us to the next page
    next_button = soup.find('span', class_='next-button')
    # Get the url of the next page
    url = next_button.find('a').get('href') if next_button else None

    if not url:
        break

    time.sleep(2)  # Wait for 2 seconds before making the next request

In [197]:
# Write the posts data to a CSV file
with open('reddit_posts_bs4.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=['Title', 'Author', 'Likes', 'Comments'])
    writer.writeheader()
    writer.writerows(posts_data)

print(len(posts_data))

1026


In [2]:
import pandas as pd

In [4]:
reddit_posts = pd.read_csv('reddit_posts_bs4.csv')
reddit_posts.head()

Unnamed: 0,Title,Author,Likes,Comments
0,"Henry Kissinger, secretary of state to Richard...",MrRedXiii,31.5k,2930
1,Space Karen during interview yesterday where h...,ohnoh18,3697,991
2,One of the best plot twists you’ll read (v.red...,Remarkable_Toe2603,12.0k,333
3,My boss wanted me to make an infographic. How’...,RedditforBusiness,0,0
4,What something that’s completely normal in mov...,covalentcookies,925,1647


In [17]:
# We are using transformers library to get the sentiment of the posts.
from transformers import pipeline

# Create a sentiment analysis pipeline
classifier = pipeline('sentiment-analysis')

results = classifier("We hope you don't hate it.")

score = results[0]['score']
if 0.4 <= score <= 0.6:
    print("The post is neutral")


No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


The post is neutral


In [20]:
positive_posts = []
negative_posts = []
neutral_posts = []

results = classifier(reddit_posts['Title'].tolist())

for post, result in zip(reddit_posts['Title'].tolist(), results):
    score = round(result['score'], 4)
    if 0.4 <= score <= 0.6:
        label = 'NEUTRAL'
        neutral_posts.append((post, score))
    elif result['label'] == 'POSITIVE':
        label = 'POSITIVE'
        positive_posts.append((post, score))
    else:
        label = 'NEGATIVE'
        negative_posts.append((post, score))

print(f"Positive posts: {len(positive_posts)}")
print(f"Negative posts: {len(negative_posts)}")
print(f"Neutral posts: {len(neutral_posts)}")

Positive posts: 114
Negative posts: 912
Neutral posts: 0
