# In [9]:
import pandas as pd
import numpy as np
import requests
import time
from bs4 import BeautifulSoup

class NinisiteTopicCrawler:
    """
    Crawl a NiniSite discussion topic and extract its details and comments.

    Creating an instance downloads and parses the topic's main page right
    away. ``topic_parser`` then extracts the topic metadata and
    ``crawl_comments_in_topic`` walks every page and collects the comments.

    Attributes:
        topic_id: The topic ID passed to the constructor.
        timeout: Timeout (seconds) handed to every HTTP request.
        main_link: Base URL of the site.
        topic_link: URL of the topic's main page.
        topic_html: Raw HTML of the topic's main page.
        topic_soup: Parsed ``BeautifulSoup`` tree of ``topic_html``.
        topic_detail: Dict of topic metadata; set by ``topic_parser``.
        comments: ``article.topic-post`` elements of the most recently
            fetched page; set by ``crawl_comments_in_topic``.
        comment: The element most recently given to ``comment_parser``.
    """

    # Number of comments assumed per page when computing the page count.
    COMMENTS_PER_PAGE = 20

    # Exact ``class`` attribute value of the author link inside a post.
    _NICKNAME_CLASS = 'col-xs-9 col-md-12 text-md-center text-xs-right nickname'

    def __init__(self, topic_id, timeout=30):
        """
        Initializes the crawler and downloads the topic's main page.

        Args:
            topic_id (int): The ID of the topic to crawl.
            timeout (float): Seconds to wait for the server on each HTTP
                request before ``requests`` raises a timeout error.
                Without a timeout a stalled connection would block forever.
        """
        self.topic_id = topic_id
        self.timeout = timeout
        self.main_link = 'https://www.ninisite.com'
        self.topic_link = self.main_link + '/discussion/topic/' + str(topic_id)
        self.topic_html = self._fetch(self.topic_link)
        self.topic_soup = BeautifulSoup(self.topic_html, 'html.parser')

    def _fetch(self, url):
        """
        Download ``url`` and return the response body as text.

        The HTTP status code is not checked, so an error page is returned
        (and later parsed) like any other page.
        """
        return requests.get(url, timeout=self.timeout).text

    def topic_parser(self):
        """
        Parses the main topic page and stores the result in ``topic_detail``.

        ``topic_detail`` gets the keys ``id``, ``title``, ``text``, ``date``,
        ``username``, ``userid``, ``number_comments`` and ``number_pages``.

        Raises:
            AttributeError, TypeError, IndexError, ValueError: If the page
                does not contain the expected elements (nothing is
                validated up front, so the first failing lookup raises).
        """
        soup = self.topic_soup.find('article', {'id': 'topic'})

        title = soup.find('div', {'class': 'col-xs-12 m-b-1 p-x-1 forum__topic--header'}).find('a').text
        text = soup.find('div', {'class': 'post-message topic-post__message col-xs-12 fr-view m-b-1 p-x-1'}).text
        date = soup.find('span', {'class': 'date'}).text

        # Look the author link up once; user id and user name are the
        # segments at index 2 and 3 of its '/'-split href.
        # NOTE(review): comment_parser takes 'username' from the link text
        # instead of the href -- confirm whether that difference is intended.
        author_href = soup.find('a', {'class': self._NICKNAME_CLASS})['href'].split('/')

        # The comment count is the second whitespace-separated token of the
        # last 'pull-xs-right' span.
        number_comments = int(soup.find_all('span', {'class': 'pull-xs-right'})[-1].text.split()[1])

        self.topic_detail = {
            'id': self.topic_id,
            'title': title,
            'text': text,
            'date': date,
            'username': author_href[3],
            'userid': author_href[2],
            'number_comments': number_comments,
        }
        # Integer ceiling division: avoids a float round-trip.
        self.topic_detail['number_pages'] = -(-number_comments // self.COMMENTS_PER_PAGE)

    def comment_parser(self, comment):
        """
        Parses a comment element and extracts comment details.

        Args:
            comment (BeautifulSoup.Tag): The BeautifulSoup tag representing a comment.

        Returns:
            dict: Comment details including id, text, date, username, userid, and isReply.

        Raises:
            AttributeError, TypeError, KeyError, IndexError: If the element
                lacks one of the expected child elements or attributes.
        """
        # Kept so the most recently parsed element stays inspectable.
        self.comment = comment
        author_link = comment.find('a', class_=self._NICKNAME_CLASS)
        return {
            # Last '-'-separated piece of the element's id attribute.
            'id': comment['id'].split('-')[-1],
            'text': comment.find('div', class_='post-message').text.strip(),
            'date': comment.find('span', class_='date').text.strip(),
            'username': author_link.text.strip(),
            'userid': author_link['href'].split('/')[2],
            # True when the comment contains a 'reply-message' div.
            'isReply': bool(comment.find('div', {'class': 'reply-message'})),
        }

    def crawl_comments_in_topic(self, delay=1.5):
        """
        Crawls all comments in the topic and returns a DataFrame containing the comment data.

        Calls ``topic_parser`` first to learn the page count, then fetches
        ``?page=1`` .. ``?page=<number_pages>``, printing each page number
        as progress output.

        Args:
            delay (float): Seconds to pause between consecutive page
                requests.

        Returns:
            pd.DataFrame: DataFrame containing the comment data, one row per
                comment with the columns produced by ``comment_parser``.
        """
        self.topic_parser()
        last_page = self.topic_detail['number_pages']
        comments_data = []
        for pg_number in range(1, last_page + 1):
            print(pg_number)
            html = self._fetch(self.topic_link + f'?page={pg_number}')
            soup = BeautifulSoup(html, 'html.parser')
            self.comments = soup.find_all('article', class_='topic-post')
            # The first article of every page is skipped -- presumably the
            # topic's opening post; confirm against the live markup.
            comments_data.extend(
                self.comment_parser(comment) for comment in self.comments[1:]
            )

            # Pause between requests only; nothing follows the last page.
            if pg_number < last_page:
                time.sleep(delay)

        return pd.DataFrame(comments_data)
