In [1]:
import requests
from bs4 import BeautifulSoup as bs
from openpyxl import Workbook
from openpyxl.drawing.image import Image
from openpyxl.utils.exceptions import IllegalCharacterError
import urllib.request
import urllib.error
import time

to_scrape = "https://www.taptap.com/topic/3805806"

In [2]:
class TapTapScraper:
    def __init__(self, url):
        self.wb = Workbook()
        self.ws = self.wb.active
        self.ws.title = "TapTap"
        self.next_row = 1
        self.row_dict = {1: 1}
        self.url = url

    def parse_page(self, page_num):
        page = self.get_page(page_num)

        # get pages
        if page_num in self.row_dict.keys():
            self.next_row = self.row_dict[page_num]
        while page.status_code != 200:
            time.sleep(1)
            page = self.get_page(page_num)
        page_soup = bs(page.text, 'html.parser')

        commentList = page_soup.find_all(class_="posts-item-text topic-posts-item-text")
        for comment in commentList:
            # parse comment and secondary comments
            parsed_comments = self.parse_primary_comment(comment)
            for parsed_comment in parsed_comments:
                for i in range(len(parsed_comment)):
                    try:
                        self.ws.cell(row=self.next_row, column=(i + 1), value=parsed_comment[i])
                    except IllegalCharacterError:
                        pass
                self.next_row += 1
        self.row_dict[page_num + 1] = self.next_row
        time.sleep(1)

    def parse_primary_comment(self, comment):
        """
        parses primary and secondary comments
        :param comment: 
        :return list of parsed primary and secondary comments: 
        """

        comment_soup = bs(str(comment), 'html.parser')
        parsed_comments = []

        # primary comment
        tap_user_id = str(comment_soup.find_all(class_='taptap-user')[0]).split('"')[3]
        tap_user_name = comment_soup.find_all(class_="taptap-user-name taptap-link")[0].get_text().strip()
        primary_floor_num = int(comment_soup.find_all(class_="pull-right")[0].get_text()[:-1])
        comment_body = comment_soup.find_all(class_="item-text-body bbcode-body js-open-bbcode-image")[0]
        comment_text = comment_body.prettify()
        comment_text = comment_text.replace("<br/>", "")
        comment_text = remove_tags(comment_text).strip()
        parsed_comments.append((tap_user_id, tap_user_name, primary_floor_num, comment_text))

        #secondary comments
        secondary_comment_list = comment.find_all(class_="comment-item-text")
        collapsed_secondary_comment_links = comment.find_all(class_="pagination")
        if not collapsed_secondary_comment_links:
            return parsed_comments
        collapsed_secondary_comment_links = comment.find_all(class_="pagination")[0].find_all("a")  # list of <a> tags
        collapsed_secondary_comment_links = list(map(lambda x: x["href"], collapsed_secondary_comment_links))[
                                            :-1]  # list of links, for some reason there is an extra copy of page 2 at the end
        for link in collapsed_secondary_comment_links:
            response = requests.get(link)
            response_text = response.json()["data"]["html"]
            response_soup = bs(response_text, "html.parser")
            response_comment_list = response_soup.find_all(class_="taptap-comment-item")
            for response_comment in response_comment_list:
                secondary_comment_list.append(response_comment)
        for i, secondary_comment in enumerate(secondary_comment_list):
            parsed_comments.append(
                Scraper.parse_secondary_comment(secondary_comment, str(primary_floor_num) + "_" + str(i + 1)))

        return parsed_comments

    def parse_secondary_comment(self, secondary_comment, floor_num):
        tap_user = secondary_comment.find_all(class_="taptap-user")[0]
        tap_user_id = tap_user["data-user-id"]
        tap_user_name = tap_user.text.strip()
        floor_num = floor_num
        comment_text = secondary_comment.find_all(class_="item-text-body")[0].text.strip()

        return tap_user_id, tap_user_name, floor_num, comment_text


    def save(self, checkpoint=None):
        if not checkpoint:
            self.wb.save('TapTap_Weapon_Description.xlsx')
        else:
            self.wb.save('TapTap_Weapon_Description_%d.xlsx' % checkpoint)

    def get_page(self, page_num):
        url = self.url + "?page=%d#postsList" % page_num
        page = requests.get(url)
        return page

def remove_tags(text):
    while "<" in text:
        start = text.find("<")
        end = text.find(">")
        text = text[:start] + text[end + 1:]
    return text


In [3]:
Scraper = TapTapScraper(to_scrape)

In [4]:
i = 1
while i < 196:
    try:
        Scraper.parse_page(i)
        print("Scraped page %d" % i)
        Scraper.save()
        i += 1
    except urllib.error.HTTPError as e:
        print("HTTPError detected, re-scraping page")
        continue
    except ConnectionError as e:
        time.sleep(20)
        continue
    if i % 30 == 0:
        Scraper.save(i)


Scraped page 1


Scraped page 2


Scraped page 3


Scraped page 4


Scraped page 5


Scraped page 6


Scraped page 7


Scraped page 8


Scraped page 9


Scraped page 10


Scraped page 11


Scraped page 12


Scraped page 13


Scraped page 14


Scraped page 15


Scraped page 16


Scraped page 17


Scraped page 18


Scraped page 19


Scraped page 20


Scraped page 21


Scraped page 22


Scraped page 23


Scraped page 24


Scraped page 25


Scraped page 26


Scraped page 27


Scraped page 28


Scraped page 29


Scraped page 30


Scraped page 31


Scraped page 32


Scraped page 33


Scraped page 34


Scraped page 35


Scraped page 36


Scraped page 37


Scraped page 38


Scraped page 39


Scraped page 40


Scraped page 41


Scraped page 42


Scraped page 43


Scraped page 44


Scraped page 45


Scraped page 46


Scraped page 47


Scraped page 48


Scraped page 49


Scraped page 50


Scraped page 51


Scraped page 52


Scraped page 53


Scraped page 54


Scraped page 55


Scraped page 56


Scraped page 57


Scraped page 58


Scraped page 59


Scraped page 60


Scraped page 61


Scraped page 62


Scraped page 63


Scraped page 64


Scraped page 65


Scraped page 66


Scraped page 67


Scraped page 68


Scraped page 69


Scraped page 70


Scraped page 71


Scraped page 72


Scraped page 73


Scraped page 74


Scraped page 75


Scraped page 76


Scraped page 77


Scraped page 78


Scraped page 79


Scraped page 80


Scraped page 81


Scraped page 82


Scraped page 83


Scraped page 84


Scraped page 85


Scraped page 86


Scraped page 87


Scraped page 88


Scraped page 89


Scraped page 90


Scraped page 91


Scraped page 92


Scraped page 93


Scraped page 94


Scraped page 95


Scraped page 96


Scraped page 97


Scraped page 98


Scraped page 99


Scraped page 100


Scraped page 101


Scraped page 102


Scraped page 103


Scraped page 104


Scraped page 105


Scraped page 106


Scraped page 107


Scraped page 108


Scraped page 109


Scraped page 110


Scraped page 111


Scraped page 112


Scraped page 113


Scraped page 114


Scraped page 115


Scraped page 116


Scraped page 117


Scraped page 118


Scraped page 119


Scraped page 120


Scraped page 121


Scraped page 122


Scraped page 123


Scraped page 124


Scraped page 125


Scraped page 126


Scraped page 127


Scraped page 128


Scraped page 129


Scraped page 130


Scraped page 131


Scraped page 132


Scraped page 133


Scraped page 134


Scraped page 135


Scraped page 136


Scraped page 137


Scraped page 138


Scraped page 139


Scraped page 140


Scraped page 141


Scraped page 142


Scraped page 143


Scraped page 144


Scraped page 145


Scraped page 146


Scraped page 147


Scraped page 148


Scraped page 149


Scraped page 150


Scraped page 151


Scraped page 152


Scraped page 153


Scraped page 154


Scraped page 155


Scraped page 156


Scraped page 157


Scraped page 158


Scraped page 159


Scraped page 160


Scraped page 161


Scraped page 162


Scraped page 163


Scraped page 164


Scraped page 165


Scraped page 166


Scraped page 167


Scraped page 168


Scraped page 169


Scraped page 170


Scraped page 171


Scraped page 172


Scraped page 173


Scraped page 174


Scraped page 175


Scraped page 176


Scraped page 177


Scraped page 178


Scraped page 179


Scraped page 180


Scraped page 181


Scraped page 182


Scraped page 183


Scraped page 184


Scraped page 185


Scraped page 186


Scraped page 187


Scraped page 188


Scraped page 189


Scraped page 190


Scraped page 191


Scraped page 192


Scraped page 193


Scraped page 194


Scraped page 195


In [88]:
page = Scraper.get_page(1)
page_soup = bs(page.text, "html.parser")
# 一级回复
commentList = page_soup.find_all(class_="posts-item-text topic-posts-item-text")
comment = commentList[16]


In [57]:
# 二级回复
secondary_comment_list = comment.find_all(class_="comment-item-text")
collapsed_secondary_comment_links = comment.find_all(class_="pagination")[0].find_all("a")  # list of <a> tags
collapsed_secondary_comment_links = list(map(lambda x: x["href"], collapsed_secondary_comment_links))[
                                    :-1]  # list of links, for some reason there is an extra copy of page 2 at the end
for link in collapsed_secondary_comment_links:
    response = requests.get(link)
    response_text = response.json()["data"]["html"]
    response_soup = bs(response_text,"html.parser")
    response_comment_list = response_soup.find_all(class_="taptap-comment-item")
    for response_comment in response_comment_list:
        secondary_comment_list.append(response_comment)
parsed_secondary_comments = []
for i, secondary_comment in enumerate(secondary_comment_list):
    parsed_secondary_comments.append(Scraper.parse_secondary_comment(secondary_comment, primary_floor_num + str(i+1)))
parsed_secondary_comments

[('16078861', '手机用户16078861', 0, 'em没有第二拳'),
 ('27858300', '真-无聊', 1, '2.一拳:      第一滴血'),
 ('2895597', 'AriTaliao', 2, 'AriTaliao2.一拳    那么..你变秃了嘛?'),
 ('19246', '迪拉达( ¨̮ )', 3, '没有什么是一拳解决不了的，如果有，我也没办法┐（─__─）┌'),
 ('30392201', '黄家欣', 4, '一拳秒杀？  吊炸天BOSS你秒给我看'),
 ('26634634', '雷电萨满杨永信', 5, '一拳：玩胖次！'),
 ('30392814', 'epic sans', 6, '一个光头！'),
 ('4831460', 'Be Alright', 7, '拿小拳拳锤你胸口'),
 ('27177804', 'pencake', 8, '借楼'),
 ('30396492', '手机用户30396492', 9, '我这一拳下来你可能会死'),
 ('18434247', '一点星源吧', 10, '一拳：我变秃了也变强了'),
 ('9791115', 'QQ用户9791115', 11, '只有光头才能使用的武器（手动滑稽）'),
 ('30408645', '小房子', 12, '一拳：闪瞎你的24k纯金狗眼'),
 ('2293495', '我', 13, '一拳：我这一拳下去你可能会死'),
 ('30418258', 'Jesses```Steve`', 14, '浮游炮:我很飘也很吊'),
 ('24297030', '梦逖安', 15, '打什么都一拳'),
 ('1603253', '武禁', 16, '秃头披风侠使用的武器'),
 ('9047436', 'toby papyrus', 17, '一拳   别摘我假发，我怕闪到你'),
 ('30668042', '㏒', 18, '无需多言，只需一拳。。'),
 ('25881571', '🎖🎲Jerry🎲🎖', 19, 'Jerry2.一拳 小拳拳锤爆你胸口')]

In [42]:
# 折叠二级回复
collapsed_secondary_comment_links = comment.find_all(class_="pagination")[0].find_all("a")  # list of <a> tags
collapsed_secondary_comment_links = list(map(lambda x: x["href"], collapsed_secondary_comment_links))[
                                    :-1]  # list of links, for some reason there is an extra copy of page 2 at the end
collapsed_secondary_comment_links


['https://www.taptap.com/ajax/post/comments?id=29805385&page=2',
 'https://www.taptap.com/ajax/post/comments?id=29805385&page=3',
 'https://www.taptap.com/ajax/post/comments?id=29805385&page=4']

('18434247', '一点星源吧', None, '一拳：我变秃了也变强了')