## 一、每个英雄的链接采集

In [1]:
import requests
res = requests.get('https://pvp.qq.com/web201605/js/herolist.json')
herolist_json = res.json()
for i in range(len(herolist_json)):
    herolist_json[i]['ename'] = 'https://pvp.qq.com/web201605/herodetail/{}.shtml'.format(herolist_json[i]['ename'])

herolist_json[0]

{'ename': 'https://pvp.qq.com/web201605/herodetail/105.shtml',
 'cname': '廉颇',
 'title': '正义爆轰',
 'new_type': 0,
 'hero_type': 3,
 'skin_name': '正义爆轰|地狱岩魂',
 'moss_id': 3627}

|herodetail-sort|类型|
|:-:|:-:|
|1|战士|
|2|法师|
|3|坦克|
|4|刺客|
|5|射手|
|6|辅助|

## 存入 MongoDB 

In [39]:
import pymongo
client = pymongo.MongoClient(host='localhost', port=27017)
db = client['HonorOfKings']     # 指定数据库
collection = db['herolist']     # 指定集合
collection.insert_many(herolist_json)   # 插入文档

<pymongo.results.InsertManyResult at 0x1ecc59a8460>

<div align=center>
<img alt="图 1" src="../images/2b51d0a655a9a13bf8bf78c88a38bb75babffa4986031ba2ec4e25d0bd0153c0.png" width=75%/>  
</div>

## 采集每个英雄的详细信息

> 说明

1. **`lxml & XPath`** 采集的数据包括
 - 英雄基本信息
 - 技能介绍
 - 技能加点建议
 - 英雄关系

2. **`Selenium & XPath`** 采集的数据包括

    该部分内容被HTML注释，且`注释内容`与`页面展示（即肉眼所见内容）`不同
- 铭文搭配建议
- 出装建议

### 准备工作

In [426]:
### 导库、读取数据库
import re
import pymongo
import requests
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import ChromeOptions


client = pymongo.MongoClient(host='localhost', port=27017)
db = client['HonorOfKings']
collection = db['herolist']

results = collection.find({})

In [435]:
### ✅英雄基本信息
### ✅技能介绍
### ✅技能加点建议
### ✅英雄关系

class LxmlMode:
    def __init__(self, url):
        res = requests.get(url)
        res.encoding = 'gbk'
        self.html = etree.HTML(res.text)

    ### ✅英雄基本信息
    '''
        返回一个字典
    '''
    def get_cover_info(self):
        # 英雄基本信息
        cover      = self.html.xpath('//div[@class="cover"]')[0]
        
        cover_name = cover.xpath('./h2/text()')[0]      # 英雄名
        herodetail_sort = cover.xpath('./span/i/@class')[0]     # 英雄类型
        cover_list = []
        for li in cover.xpath('.//ul/li'):
            cover_list_name = li.xpath('./em/text()')[0]
            cover_list_bar  = li.xpath('./span/i/@style')[0]
            cover_list.append({'cover_list_name':cover_list_name, 'cover_list_bar':cover_list_bar})
        cover_info = {
            '英雄名称':cover_name, 
            '英雄类型':herodetail_sort[-1:], 
            '英雄基础信息':cover_list
            }
        
        return cover_info

    ### ✅技能介绍
    '''
        返回一个列表，列表内容包括每一个技能的介绍
    '''
    def get_skill_info_details(self):
        # 技能介绍
        skill_info = self.html.xpath('//div[contains(@class, "skill-show")]')[0]
        
        skill_info_details = []
        for div in skill_info.xpath('./div'):
            if len(div.xpath('./p[1]/b/text()')) == 0:
                pass
            else:
                skill_name = div.xpath('./p[1]/b/text()')[0]            # 技能名
                skill_time = div.xpath('./p[1]/span[1]/text()')[0][4:].split('/')  # 技能冷却值
                skill_need = div.xpath('./p[1]/span[2]/text()')[0][3:]  # 技能消耗
                skill_desc = div.xpath('./p[2]/text()')                 # 技能描述
                skill_info_details.append({
                    'skill_name' : skill_name,
                    'skill_time' : skill_time,
                    'skill_need' : skill_need,
                    'skill_desc' : skill_desc,
                })

        return skill_info_details

    ### ✅技能加点建议
    '''
        返回一个字典
    '''
    def get_skill_upgrade_sugg(self):
        sugg_info2 = self.html.xpath('//div[contains(@class, "sugg-info2")]/p[contains(@class, "sugg-name")]')

        major_skill_sugg = sugg_info2[0].xpath('./span/text()')[0]
        minor_skill_sugg = sugg_info2[1].xpath('./span/text()')[0]
        hero_skill_sugg  = sugg_info2[2].xpath('./span/text()')[0].split('/')

        skill_upgrade_sugg = {
            '主升' : major_skill_sugg,
            '副升' : minor_skill_sugg,
            '召唤师技能' : hero_skill_sugg,
        }

        return skill_upgrade_sugg


    ### ✅英雄关系
    '''
        返回一个字典【多层】 
    '''
    def get_hero_relationship(self):
        hero_info_box = self.html.xpath('//div[@class="hero-info-box"]/div/div')

        def hero_info(xpath_):
            ids = [i.split('.')[0] for i in xpath_.xpath('./div[2]/ul/li/a/@href')]
            tips = xpath_.xpath('./div[3]/p/text()')
            
            relationship_dict = []
            for item in list(zip(ids, tips)):
                relationship_dict.append({
                    'id':item[0],
                    'tip':item[1]
                })
            return relationship_dict
        
        hero_relationship = {}
        for i in range(len(hero_info_box)):
            if i == 0:
                hero_relationship['最佳搭档'] = hero_info(xpath_=hero_info_box[i])
            elif i == 1:
                hero_relationship['压制英雄'] = hero_info(xpath_=hero_info_box[i])
            else:
                hero_relationship['被压制英雄'] = hero_info(xpath_=hero_info_box[i])
        
        return hero_relationship


In [433]:
### ✅出装建议
### ✅铭文搭配建议

class SeleniumMode:
    def __init__(self, url):
        option = ChromeOptions()
        option.add_argument('--headless')
        self.browser = webdriver.Chrome(options = option)
        self.browser.get(url)

    ## 出装建议【✅】
    '''
        返回一个元组，包含：
        1. (推荐出装一, Tips)
        2. (推荐出装二, Tips)
    '''
    def get_equip_sugg(self):
        equip_suggs  = self.browser.find_elements(By.CSS_SELECTOR, '.equip-bd > div')
        equip_1_info = []
        equip_2_info = []
        i = 0
        for equip_sugg in equip_suggs:
            i += 1
            tips = equip_sugg.find_element(By.XPATH, './p').text
            for equip in equip_sugg.find_elements(By.XPATH, './ul/li/a/div'):
                name       = equip.find_element(By.XPATH, './div[1]/div/h4').get_attribute('textContent')
                sale_price = int(equip.find_element(By.XPATH, './div[1]/div/p[1]').get_attribute('textContent')[3:])
                price      = int(equip.find_element(By.XPATH, './div[1]/div/p[2]').get_attribute('textContent')[3:])
                features   = equip.find_element(By.XPATH, './div[2]/p[1]').get_attribute('textContent')
                desc       = equip.find_element(By.XPATH, './div[2]/p[2]').get_attribute('textContent')
                equip_info_dict = {
                    '装备名' : name,
                    '售价' : sale_price,
                    '总价' : price,
                    '特性' : features,
                    '描述/被动' : desc,
                }
                if i == 1:
                    equip_1_info.append(equip_info_dict)
                    equip_1_tips = tips
                else:
                    equip_2_info.append(equip_info_dict)
                    equip_2_tips = tips

        return ((equip_1_info, equip_1_tips), (equip_2_info, equip_2_tips))


    ## 铭文搭配建议【✅】
    '''
        返回一个元组，包含：
        1. 铭文信息
        2. 铭文搭配Tips
    '''
    def get_ming_sugg(self):
        sugg_info = self.browser.find_element(By.CSS_SELECTOR, '.sugg-info.info')

        sugg_ming_tips = sugg_info.find_element(By.XPATH, './p').text
        lis = sugg_info.find_elements(By.XPATH, './ul/li')
        sugg_ming = []
        for li in lis:
            name = li.find_element(By.XPATH, './p[1]/em').text
            physical_attack = li.find_element(By.XPATH, './p[2]').text
            physical_penetration = li.find_element(By.XPATH, './p[3]').text
            ming_sugg_dict = {
                '铭文名称' : name,
                '属性一' : physical_attack,
                '属性二' : physical_penetration,
            }
            sugg_ming.append(ming_sugg_dict)
        
        return (sugg_ming, sugg_ming_tips)


In [434]:
### 单个URL测试

url = 'https://pvp.qq.com/web201605/herodetail/105.shtml'

# LxmlMode(url).get_cover_info()            # ✅英雄基本信息
# LxmlMode(url).get_skill_info_details()    # ✅技能介绍
# LxmlMode(url).get_skill_upgrade_suggs()   # ✅技能加点建议
# LxmlMode(url).get_hero_relationship()     # ✅英雄关系

# SeleniumMode(url).get_equip_sugg()    # 出装建议【✅】
# SeleniumMode(url).get_ming_sugg()     # 铭文搭配建议【✅】

### 正式采集

In [430]:
### 遍历每一个英雄的 URL
import pymongo
client = pymongo.MongoClient(host='localhost', port=27017)
db = client['HonorOfkings']
collection_info = db['heroinfo']    # 采集到的每个英雄的信息存入此集合
collection_list = db['herolist']    # 从该集合中读取英雄url进而采集每个英雄的信息

results = collection.find({})

i = 0
for result in results:
    i += 1
    name = result['cname']
    url = result['ename']

    hero_basal_info = LxmlMode(url).get_cover_info()            # ✅英雄基本信息
    LxmlMode(url).get_skill_info_details()    # ✅技能介绍
    LxmlMode(url).get_skill_upgrade_suggs()   # ✅技能加点建议
    LxmlMode(url).get_hero_relationship()     # ✅英雄关系

    if i == 10:
        break

https://pvp.qq.com/web201605/herodetail/108.shtml
https://pvp.qq.com/web201605/herodetail/109.shtml
https://pvp.qq.com/web201605/herodetail/110.shtml
https://pvp.qq.com/web201605/herodetail/111.shtml
https://pvp.qq.com/web201605/herodetail/112.shtml
https://pvp.qq.com/web201605/herodetail/113.shtml
https://pvp.qq.com/web201605/herodetail/114.shtml
https://pvp.qq.com/web201605/herodetail/115.shtml
https://pvp.qq.com/web201605/herodetail/116.shtml
https://pvp.qq.com/web201605/herodetail/117.shtml
https://pvp.qq.com/web201605/herodetail/118.shtml
https://pvp.qq.com/web201605/herodetail/119.shtml
https://pvp.qq.com/web201605/herodetail/120.shtml
https://pvp.qq.com/web201605/herodetail/121.shtml
https://pvp.qq.com/web201605/herodetail/123.shtml
https://pvp.qq.com/web201605/herodetail/124.shtml
https://pvp.qq.com/web201605/herodetail/126.shtml
https://pvp.qq.com/web201605/herodetail/127.shtml
https://pvp.qq.com/web201605/herodetail/128.shtml
https://pvp.qq.com/web201605/herodetail/129.shtml
