### Getting Reddit Posts using BeautifulSoup

In [10]:
import requests
import csv
import time
from bs4 import BeautifulSoup

Specify the URL of the webpage that we want to scrape. We're using [Reddit's old website](https://old.reddit.com) for simplicity. Before we start writing the script, we first take a look at the website to understand its structure.

### Getting the Page with BeautifulSoup

In [11]:
# First, we need to request the web page using the 'requests' library.
url = "https://old.reddit.com"

# Headers to simulate a browser visit.
headers = {'User-Agent': 'Mozilla/5.0'}

# Returns a response object which contains the entire source code of the HTML file.
# An explicit timeout (in seconds) is passed because requests otherwise waits
# indefinitely for an unresponsive server, which would hang the notebook.
response = requests.get(url, headers=headers, timeout=30)

# Fail loudly on an HTTP error (e.g. rate limiting) instead of silently handing
# an error page to the parser in the following cell.
response.raise_for_status()

### Finding the tags

In [12]:
# Create a list to store the posts
posts_data = []

desired_number_of_posts = 20000  # or any number you want

# Identifiers of the posts collected so far, used to skip posts that show up
# again on a later page.
seen_post_ids = set()


def _parse_comment_count(comments_element):
    """Return the number of comments shown by a post's comments link.

    Args:
        comments_element: the ``<a class="comments">`` tag of a post, or None
            when the post has no such link.

    Returns:
        The leading number of the link text as an int, or 0 when the link is
        missing, empty, or does not start with a number.
    """
    if comments_element is None:
        return 0
    tokens = comments_element.text.split()
    # Guard against empty link text (split() -> []) and against link text that
    # carries no leading count, which would otherwise be stored as a word.
    if tokens and tokens[0].isdigit():
        return int(tokens[0])
    return 0


# Work on a local name so that `response` (the first page, fetched earlier)
# is left untouched and re-running this cell starts from the first page again.
page_response = response

# Loop through the pages
while True:
    # Parse the page content
    soup = BeautifulSoup(page_response.content, 'html.parser')
    # Find all the posts in the page using the class name 'thing' which is the
    # class name of the div tag that contains the posts.
    posts = soup.find_all('div', class_='thing')

    for post in posts:
        # Stop as soon as the requested number of posts is reached so the
        # result never overshoots the limit by part of a page.
        if len(posts_data) >= desired_number_of_posts:
            break

        # NOTE(review): assumes each post div carries a `data-fullname`
        # attribute identifying the post - TODO confirm against the live page.
        # Posts without the attribute are kept and simply not de-duplicated.
        post_id = post.get('data-fullname')
        if post_id is not None:
            if post_id in seen_post_ids:
                continue
            seen_post_ids.add(post_id)

        # Create a dictionary to store the post data
        post_data = {}
        # Get the title element of the post
        title_element = post.find('p', class_="title")
        # Check if the post has a title and add it to the dictionary
        post_data['Title'] = title_element.text if title_element else "N/A"
        author_element = post.find('a', class_='author')
        # Check if the post has an author and add it to the dictionary
        post_data['Author'] = author_element.text if author_element else "N/A"
        # Keep only the number of comments (0 when it cannot be determined)
        post_data['Comments'] = _parse_comment_count(post.find('a', class_='comments'))
        likes_element = post.find("div", attrs={"class": "score likes"})
        # Check if the post has likes and add it to the dictionary.
        # A score of "•" is stored as 0; any other score is kept as the raw
        # text from the page, and "N/A" marks a post with no score element.
        if likes_element and likes_element.text == "•":
            post_data['Likes'] = 0
        elif likes_element:
            post_data['Likes'] = likes_element.text
        else:
            post_data['Likes'] = "N/A"
        # Add the post data to the list
        posts_data.append(post_data)

    # Get the button that takes us to the next page
    next_button = soup.find('span', class_='next-button')
    # Get the url of the next page; the button may be absent or may not
    # contain a link, in which case there is no next page to follow.
    next_link = next_button.find('a') if next_button else None
    next_url = next_link.get('href') if next_link else None

    if not next_url or len(posts_data) >= desired_number_of_posts:
        break

    time.sleep(2)  # Wait for 2 seconds before making the next request
    try:
        # Make the next request; the timeout keeps a stalled connection from
        # hanging the notebook forever.
        page_response = requests.get(next_url, headers=headers, timeout=30)
    except requests.RequestException as error:
        # Keep what has been collected so far instead of losing it to a crash.
        print(f"Stopping early, request failed: {error}")
        break
    if not page_response.ok:
        # An error page contains no posts; report it rather than ending silently.
        print(f"Stopping early, got HTTP status {page_response.status_code}")
        break

In [13]:
# Persist every collected post to a UTF-8 encoded CSV file, one row per post,
# with a header row naming the four columns.
csv_columns = ['Title', 'Author', 'Likes', 'Comments']
with open('new_reddit_posts_bs4.csv', 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.DictWriter(csv_file, fieldnames=csv_columns)
    csv_writer.writeheader()
    for post_row in posts_data:
        csv_writer.writerow(post_row)

# NOTE(review): this cell does no de-duplication itself; whether the count is
# really "unique" depends on how posts_data was built.
print(f"Total unique posts collected: {len(posts_data)}")


Total unique posts collected: 866


In [15]:
# Read the exported CSV back into a DataFrame and preview its first five rows
# to check that the export looks as expected.
import pandas as pd

reddit_posts = pd.read_csv(filepath_or_buffer='new_reddit_posts_bs4.csv')
reddit_posts.head(n=5)

Unnamed: 0,Title,Author,Likes,Comments
0,"Free hotel rooms, meals for refugee applicants...",FancyNewMe,1284,722
1,"Between work, life and the health of your love...",LifeLabsLP,0,0
2,"What's the most ""small town"" thing you've witn...",official_biz,1722,2407
3,[Passan] Shohei Ohtani just posted on Instagra...,mattybabs,760,1045
4,[Passan] Shohei Ohtani's deal with the Dodgers...,TheTurtleShepard,12.8k,4193
