In [11]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import time

class NiniSiteForumCrawler:
    """
    A web crawler for NiniSite forums.

    The crawler downloads forum listing pages with ``requests``, parses them
    with BeautifulSoup and collects one record per topic header.
    """

    # Number of topics assumed per listing page; only used to size the
    # progress bar, so a page with a different count merely skews the estimate.
    TOPICS_PER_PAGE = 30
    # Pause (seconds) after each parsed topic; this throttles the overall crawl.
    TOPIC_DELAY_SECONDS = 0.05
    # Upper bound (seconds) for each HTTP request so a stalled connection
    # cannot hang the notebook forever.
    REQUEST_TIMEOUT = 30
    # Fields produced by ``topic_header_parser``, in output column order.
    TOPIC_COLUMNS = (
        'subject',
        'link',
        'creator_username',
        'creator_userid',
        'number_of_replies',
    )
    # CSS class string of the <div> that holds the topic creator's name and profile link.
    _CREATOR_DIV_CLASS = 'col-xs-12 p-x-0 pull-xs-right last-topic-user hidden-sm-down'

    def __init__(self, forum_number):
        """
        Initializes the NiniSiteForumCrawler instance.

        Builds the forum URL and immediately requests the forum's first page;
        the response is kept in ``self.initial_response``.

        Args:
            forum_number (int): The number of the forum to crawl.
        """
        self.main_link = "https://www.ninisite.com"
        self.forum_link = self.main_link + '/discussion/forum/' + str(forum_number)
        self.initial_response = requests.get(self.forum_link, timeout=self.REQUEST_TIMEOUT)

    def get_number_of_pages_in_forum(self):
        """
        Retrieves the number of pages in the forum.

        The value is read from the second-to-last ``<li>`` of the first
        ``div`` with class ``text-xs-center text-sm-left`` in the response
        fetched by ``__init__``, and is also stored in ``self.number_of_pages``.

        Returns:
            int: The number of pages in the forum.

        Raises:
            requests.HTTPError: If the initial request returned a 4xx/5xx status.
            RuntimeError: If the initial request returned any other non-200 status.
        """
        response = self.initial_response
        # Fail with the real HTTP problem instead of an unrelated AttributeError
        # on ``self.number_of_pages`` further down.
        response.raise_for_status()
        if response.status_code != 200:
            raise RuntimeError(
                f'Unexpected HTTP status {response.status_code} for {self.forum_link}'
            )
        soup = BeautifulSoup(response.text, 'html.parser')
        pagination = soup.find_all('div', {'class': 'text-xs-center text-sm-left'})[0]
        self.number_of_pages = int(pagination.find_all('li')[-2].text)
        return self.number_of_pages

    def topic_header_parser(self, topic):
        """
        Parses the topic header and returns a dictionary containing relevant information.

        Args:
            topic (bs4.element.Tag): The HTML tag representing a topic header.

        Returns:
            dict: Parsed topic information with the keys ``subject``, ``link``,
            ``creator_username``, ``creator_userid`` and ``number_of_replies``.
        """
        # Look the creator block up once; both the username and the id come from it.
        creator_block = topic.find('div', {'class': self._CREATOR_DIV_CLASS})
        replies_text = topic.find('span', {'class': 'topic_number hidden-sm-up'}).text
        parsed_topic = {
            'subject': topic.find('span', {'class': 'topic_subject'}).text,
            'link': self.main_link + topic.find('a')['href'],
            # The username is the last whitespace-separated token of the block's text.
            'creator_username': creator_block.text.split()[-1],
            # The user id is the third '/'-separated segment of the profile href.
            'creator_userid': creator_block.find('a')['href'].split('/')[2],
            # The reply count is the first whitespace-separated token of the span text.
            'number_of_replies': int(replies_text.split()[0]),
        }
        return parsed_topic

    def crawl_topics_in_forum(self, starting_page=1, number_pages=10):
        """
        Crawls topics in the forum and returns a DataFrame with the topic data.

        Pages ``starting_page`` up to, but not including, ``number_pages`` are
        fetched. The collected records are also kept in ``self.topics_data``.

        Args:
            starting_page (int): The first page number to crawl (default: 1).
            number_pages (int): Despite its name, the page number at which
                crawling stops (exclusive), i.e. pages
                ``range(starting_page, number_pages)`` are crawled (default: 10).

        Returns:
            pandas.DataFrame: One row per topic with the parsed header fields
            plus an integer ``topic_id`` column taken from the topic link. An
            empty frame with the same columns is returned when nothing was found.
        """
        # Coerce once so the progress-bar total and the loop use the same bounds.
        page_numbers = range(int(starting_page), int(number_pages))
        self.topics_data = []
        with tqdm(total=self.TOPICS_PER_PAGE * len(page_numbers)) as pbar:
            for pg_number in page_numbers:
                html = requests.get(
                    self.forum_link + f'?page={pg_number}',
                    timeout=self.REQUEST_TIMEOUT,
                ).text
                soup = BeautifulSoup(html, 'html.parser')
                topics = soup.find_all('div', {'class': 'col-xs-12 category--header p-x-0'})
                for topic in topics:
                    self.topics_data.append(self.topic_header_parser(topic))
                    pbar.update(1)
                    time.sleep(self.TOPIC_DELAY_SECONDS)

        # Passing the column names explicitly keeps the frame well-formed even
        # when no topic was collected (otherwise the 'link' column is missing).
        topics_df = pd.DataFrame(self.topics_data, columns=list(self.TOPIC_COLUMNS))
        # The topic id is the sixth '/'-separated segment of the absolute link.
        topics_df['topic_id'] = pd.Series(
            [int(link.split('/')[5]) for link in topics_df['link']],
            index=topics_df.index,
            dtype='int64',
        )
        return topics_df

In [12]:
# Crawl forum 141; the bounds below cover pages 5, 6 and 7, because the
# crawler iterates over range(starting_page, number_pages).
crawler = NiniSiteForumCrawler(forum_number=141)
topics = crawler.crawl_topics_in_forum(
    starting_page=5,
    number_pages=8,
)

100%|██████████████████████████████████████████████████████████████████████████████████| 90/90 [00:08<00:00, 10.12it/s]
