In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from time import sleep, time
import datetime

In [2]:
def get_text_bodys_from_soup(soup):
    """Return every thread ``<tbody>`` element found in a parsed page.

    A ``<tbody>`` is selected when its ``id`` attribute starts with
    ``'normalthread_'`` or ``'stickthread_'``.

    Args:
        soup: Parsed document exposing ``find_all(name, attrs=...)``.

    Returns:
        Whatever ``soup.find_all`` returns for the matching ``<tbody>`` tags.
    """
    thread_id_prefixes = ('normalthread_', 'stickthread_')

    def is_thread_id(tag_id):
        # The matcher can be handed None (a <tbody> without an id). Guard
        # first so neither prefix test is ever attempted on a non-string;
        # the previous `a and b or c` form evaluated `c` on None and raised.
        return isinstance(tag_id, str) and tag_id.startswith(thread_id_prefixes)

    return soup.find_all('tbody', attrs={'id': is_thread_id})

def get_comments_from_text_bodys(text_bodys):
    """Collect the comment text of each thread body.

    For every element of ``text_bodys`` the first ``<a>`` whose ``class``
    is ``'s xst'`` is looked up and its ``.text`` is taken.

    Args:
        text_bodys: Iterable of elements exposing ``find(name, attrs=...)``.

    Returns:
        A list of strings, one per element, in input order.
    """
    comments = []
    for thread_body in text_bodys:
        title_link = thread_body.find('a', attrs={'class': 's xst'})
        comments.append(title_link.text)
    return comments

def get_summaries_from_text_bodys(text_bodys):
    """Collect the summary text of each thread body.

    For every element of ``text_bodys`` the first ``<a>`` whose ``href``
    starts with ``'forum.php?'`` is looked up and its ``.text`` is taken.

    Args:
        text_bodys: Iterable of elements exposing ``find(name, attrs=...)``.

    Returns:
        A list of strings, one per element, in input order.
    """
    def _is_forum_link(href):
        # Short-circuits to a falsy value when href is None or empty, so
        # startswith is only reached for a non-empty string.
        return href and href.startswith('forum.php?')

    summaries = []
    for thread_body in text_bodys:
        forum_link = thread_body.find('a', attrs={'href': _is_forum_link})
        summaries.append(forum_link.text)
    return summaries

def get_comments_from_soup(soup):
    """Extract the comments and summaries of every thread on a parsed page.

    Args:
        soup: Parsed document passed to ``get_text_bodys_from_soup``.

    Returns:
        A ``(comments, summaries)`` tuple of lists when both lists have the
        same length; otherwise a message is printed and ``None`` is returned.
    """
    thread_bodies = get_text_bodys_from_soup(soup)
    comments = get_comments_from_text_bodys(thread_bodies)
    summaries = get_summaries_from_text_bodys(thread_bodies)

    if len(comments) == len(summaries):
        return comments, summaries

    # The two lists cannot be paired up, so report it and signal failure.
    print('len(comments) != len(summaries)')
    return None

In [None]:
def get_comments_summaries_from_page(page_url, page_num, timeout=30):
    """Download one listing page and extract its comments and summaries.

    Args:
        page_url: URL of the page to fetch.
        page_num: Page number; used only in the diagnostic printout.
        timeout: Seconds to wait for the server, forwarded to
            ``requests.get``. Defaults to 30.

    Returns:
        The result of ``get_comments_from_soup`` for the downloaded HTML.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection, which would hang the whole scrape.
    response = requests.get(page_url, timeout=timeout)
    if response.status_code != 200:
        # Report the failure but still parse whatever body came back.
        print('page num: ', page_num, ', status code: ', response.status_code)
        print('page_url: ', page_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    return get_comments_from_soup(soup)

def get_data_from_pages(base_url, start_page=1, end_page=1000):
    """Scrape a range of listing pages into a single DataFrame.

    Page URLs are built as ``base_url + str(page_num) + '.html'`` for every
    page number from ``start_page`` to ``end_page`` inclusive.

    Args:
        base_url: URL prefix that the page number and ``'.html'`` are
            appended to.
        start_page: First page number to fetch.
        end_page: Last page number to fetch (inclusive).

    Returns:
        A ``pandas.DataFrame`` with columns ``'comment'`` and ``'summary'``
        holding the rows gathered before the loop finished or stopped early.
    """
    all_comments = []
    all_summaries = []
    for page_num in range(start_page, end_page + 1):
        page_url = base_url + str(page_num) + '.html'
        page_result = get_comments_summaries_from_page(page_url, page_num)
        # A failed extraction comes back as a bare None, which cannot be
        # tuple-unpacked; check it first and stop, keeping what we have.
        if page_result is None:
            break
        comments, summaries = page_result
        all_comments.extend(comments)
        all_summaries.extend(summaries)
        # Pause a random 3-9 seconds between requests to avoid being blocked
        # (randint's upper bound is exclusive). No pause is needed once the
        # last page has been fetched.
        if page_num < end_page:
            sleep(np.random.randint(3, 10))
    return pd.DataFrame({'comment': all_comments, 'summary': all_summaries})

In [None]:
def main():
    """Scrape pages 1-2, report the elapsed time and save the result as CSV.

    The elapsed time is printed twice: once as raw seconds and once
    formatted through ``datetime.timedelta``. The scraped frame is written
    to ``comment_summary.csv`` without the index column.
    """
    started_at = time()
    base_url = 'fake_urlxxxxxxx.com'  # placeholder, not a real address
    scraped = get_data_from_pages(base_url, start_page=1, end_page=2)
    elapsed_seconds = time() - started_at

    print('Time elapsed: ', elapsed_seconds)
    print('Time elapsed: ', str(datetime.timedelta(seconds=elapsed_seconds)))

    # to_csv replaces any existing file of the same name.
    scraped.to_csv('comment_summary.csv', index=False)
    print('Content saved to comment_summary.csv')

# Entry point: start the scrape only when this code is executed directly
# (``__name__`` is '__main__'), not when it is imported as a module.
if __name__ == '__main__':
    main()

Time elapsed:  9.689262866973877
Time elapsed:  0:00:09.689263
Content saved to comment_summary.csv
