# 前置准备

#### 导库

In [107]:
import re
import asyncio
import aiohttp
import time
import random
import requests
import pymongo
from lxml import etree
from parsel import Selector

#### MongoDB 连接

In [12]:
# Open the local MongoDB server and select the scraper's database.
client = pymongo.MongoClient('localhost', 27017)
db = client['indianhospital']

# One collection handle per crawl stage, bound in a single unpacking:
# provinces -> list pages of each province -> hospital links -> hospital details.
province_col, hospital_col, all_page_col, hospital_detail_info = (
    db[collection_name]
    for collection_name in ('province', 'hospital', 'all_page', 'hospital_detail_info')
)

# error_province_col = db['error_province']
# error_hospital_col = db['error_hospital']

#### 请求头 & 随机代理

In [10]:
# Browser-like request headers (Chrome 105 on Windows) sent with every page request.
headers = dict([
    ('sec-ch-ua', '"Google Chrome";v="105", "Not)A;Brand";v="8", "Chromium";v="105"'),
    ('sec-ch-ua-mobile', '?0'),
    ('sec-ch-ua-platform', '"Windows"'),
    (
        'User-Agent',
        # Adjacent literals are concatenated at compile time; the value is one string.
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/105.0.0.0 Safari/537.36',
    ),
])

def random_proxy(timeout=10):
    """Fetch one proxy address from the local proxy-pool service.

    Args:
        timeout: Seconds to wait for the pool service before giving up.
            Without a timeout ``requests`` may block indefinitely.

    Returns:
        The proxy as a URL string, i.e. ``'http://'`` followed by the
        stripped response body of the pool service.

    Raises:
        requests.RequestException: The pool service is unreachable, timed
            out, or answered with an HTTP error status.
        ValueError: The pool service answered with an empty body.
    """
    response = requests.get('http://127.0.0.1:5555/random', timeout=timeout)
    # An error page must not be turned into a bogus proxy URL.
    response.raise_for_status()
    address = response.text.strip()
    if not address:
        raise ValueError('proxy pool returned an empty address')
    return 'http://' + address

def random_proxies(schemes=('http',)):
    """Build a ``proxies`` mapping for ``requests`` from one random proxy.

    Args:
        schemes: URL schemes the proxy should be registered for.  The
            default keeps the original behaviour (``'http'`` only).
            NOTE(review): ``requests`` selects a proxy by the scheme of the
            target URL, so with the default mapping ``https://`` targets are
            not sent through the proxy; pass ``('http', 'https')`` to proxy
            them as well -- confirm the pool's proxies support HTTPS first.

    Returns:
        A dict mapping every scheme in ``schemes`` to the same proxy URL.

    Raises:
        Whatever ``random_proxy`` raises when the pool cannot be queried.
    """
    # Reuse the single-proxy helper instead of duplicating the pool request.
    proxy = random_proxy()
    return {scheme: proxy for scheme in schemes}
# Notebook sanity check: display one sample from each helper.
random_proxy(), random_proxies()

('http://114.116.2.116:8001', {'http': 'http://114.116.2.116:8001'})

# 正式采集

#### 获取 37 个省的信息并存入数据库

- 包含：名字、链接

In [22]:
# Step 1: download the hospital landing page and store every province listed
# on it (name + link) in ``province_col``.  Nothing is returned; provinces that
# are already stored are skipped, so the cell can be re-run safely.
origin_url = 'https://www.medindia.net/patients/hospital_search/hospital_list.asp?utm_source=topnavigation&utm_medium=desktop&utm_content=&utm_campaign=medindia'
try:
    origin_res = requests.get(
        url=origin_url,
        headers=headers,
        proxies=random_proxies(),
        timeout=30,  # never hang forever on a dead proxy / server
    )
    # Fail loudly on an HTTP error instead of parsing an error page and
    # reporting success with zero provinces.
    origin_res.raise_for_status()
    selector = Selector(text=origin_res.text)

    for div in selector.xpath('//ul[@class="list-inline"]/div/div'):
        raw_name = div.xpath('./li/a/span/text()').get()
        province_href = div.xpath('./li/a/@href').get()
        # A cell without a name or a link is not a usable province entry;
        # skip it rather than abort the whole crawl.
        if raw_name is None or province_href is None:
            continue
        province_name = raw_name.title()   # capitalise the first letter of every word
        if province_col.find_one({'province_name': province_name}) is not None:
            continue
        province_col.insert_one({
            'province_name': province_name,
            'province_href': province_href,
            'province_status': None,   # None = list pages of this province not collected yet
        })
    print('【SUCCEED -- CRAWL PROVINCE SUCCEED】')
except Exception as e:
    # ``Exception`` (not a bare ``except``) so Ctrl-C still interrupts the
    # cell; report the cause so a failure can actually be diagnosed.
    print('【ERROR   -- CRAWL PROVINCE FAILED 】')
    print('错误类型是', e.__class__.__name__)
    print('错误明细是', e)

【SUCCEED -- CRAWL PROVINCE SUCCEED】


#### 采集每个省份医院列表的所有分页链接

In [25]:
# Step 2: for every province whose list pages have not been collected yet,
# read the record count and page count from the province's first list page
# and register one document per list-page URL in ``all_page_col``.
province_status_list = []

for province in province_col.find():
    province_name   = province['province_name']
    province_href   = province['province_href']
    province_status = province['province_status']
    # Status 1 means the pages of this province were already collected.
    if province_status == 1:
        continue
    # Status 0 / None: (re)collect the pages of this province.
    try:
        province_res = requests.get(
            province_href,
            headers=headers,
            proxies=random_proxies(),
            timeout=30,  # never hang forever on a dead proxy / server
        )
        province_res.raise_for_status()
        selector = Selector(text=province_res.text)
        # The "Total Records" paragraph carries the record count in its first
        # <b> and the page count in its last <b>.
        records  = int(selector.xpath('//p[@class="searchtext"][contains(text(), "Total Records")]/b[1]/text()').get())
        page     = int(selector.xpath('//p[@class="searchtext"][contains(text(), "Total Records")]/b[last()]/text()').get())
        # With at most 25 records the province has a single list page.
        total_page_of_province = 1 if records <= 25 else page
        # Page URLs are built by inserting "-<n>" before the 4-character
        # extension of the province URL (assumes it ends in ".htm").
        total_page_href_of_province = [
            province_href[:-4] + f'-{i}' + province_href[-4:]
            for i in range(1, total_page_of_province + 1)
        ]
        for href in total_page_href_of_province:
            # Upsert keyed on the page URL: re-running after a partial
            # failure neither duplicates pages nor resets the status of
            # pages that were already processed.
            all_page_col.update_one(
                {'page_of_province': href},
                {'$setOnInsert': {
                    'province_name'          : province_name,
                    'page_of_province_status': None,
                }},
                upsert=True,
            )
        province_status_list.append({'province_name': province_name, 'province_new_status': 1})
    except Exception as e:
        print('错误类型是', e.__class__.__name__)
        print('错误明细是', e)
        province_status_list.append({'province_name': province_name, 'province_new_status': 0})

# Write the collected statuses back to ``province_col``.  Only the status
# field is set; no read-modify-write of the whole document is needed.
for province in province_status_list:
    province_col.update_one(
        {'province_name': province['province_name']},
        {'$set': {'province_status': province['province_new_status']}},
    )

#### 采集每页的医院链接

In [109]:
# Step 3: for each of the first 12 provinces whose list pages were collected
# (status 1), visit every not-yet-processed list page and store the hospital
# links found on it in ``hospital_col``.
for province in province_col.find()[0:12]:
    province_name = province['province_name']
    # Status 0 / None: no list pages are known for this province, skip it.
    if province['province_status'] != 1:
        continue

    print(f'|【PROVINCE】——Start Province {province_name}')
    for page in all_page_col.find({'province_name': province_name}):
        page_of_province        = page['page_of_province']
        page_of_province_status = page['page_of_province_status']

        # Status 1: this page was already processed.
        if page_of_province_status == 1:
            continue

        # Filter used for the status write-back of this page.
        condition = {'page_of_province': page_of_province}

        # Human-readable page label for the progress log; fall back to the
        # raw URL so an unexpected URL shape cannot crash the loop.
        page_label_match = re.search(r'hospital_search/(.*?)\.htm', page_of_province)
        page_label = page_label_match.group(1).title() if page_label_match else page_of_province
        print('|----|【PAGE】——Start Page', page_label)

        try:
            page_res = requests.get(
                url=page_of_province,
                headers=headers,
                proxies=random_proxies(),
                timeout=30,  # never hang forever on a dead proxy / server
            )
            page_res.raise_for_status()
            selector = Selector(text=page_res.text)

            # Parse the whole page BEFORE writing anything, so a parse error
            # cannot leave half a page in the database (which would be
            # duplicated when the page is retried).
            hospital_docs = []
            for hospital in selector.xpath('//h3[@class="vert-small-margin"]/a'):
                link_text = hospital.xpath('./text()')
                # The name is the link text up to the first comma; when the
                # text has no comma, use the whole text instead of failing.
                hospital_name = link_text.re_first(r'(\w+.*?),')
                if hospital_name is None:
                    hospital_name = (link_text.get() or '').strip()
                hospital_docs.append({
                    'hospital_belong_province': province_name,
                    'hospital_name'  : hospital_name.title(),
                    'hospital_href'  : hospital.xpath('./@href').get(),
                    'hospital_status': None,   # None = detail page not collected yet
                })
            # ``insert_many`` rejects an empty list, so guard it.
            if hospital_docs:
                hospital_col.insert_many(hospital_docs)

            # Mark the page as done; only the status field is touched.
            all_page_col.update_one(condition, {'$set': {'page_of_province_status': 1}})
        except Exception as e:
            # Mark the page as failed so a later run retries it, and report
            # the cause (``Exception`` keeps Ctrl-C working).
            all_page_col.update_one(condition, {'$set': {'page_of_province_status': 0}})
            print(f'|----|----|⭕【SCRAW PAGE ERROR】{page_of_province}')
            print('错误类型是', e.__class__.__name__)
            print('错误明细是', e)
    print(f'|✅ {province_name}')

|【PROVINCE】——Start Province Andaman And Nicobar Islands
|✅ Andaman And Nicobar Islands
|【PROVINCE】——Start Province Andhra Pradesh
|✅ Andhra Pradesh
|【PROVINCE】——Start Province Arunachal Pradesh
|✅ Arunachal Pradesh
|【PROVINCE】——Start Province Assam
|✅ Assam
|【PROVINCE】——Start Province Bihar
|✅ Bihar
|【PROVINCE】——Start Province Chandigarh
|✅ Chandigarh
|【PROVINCE】——Start Province Chhattisgarh
|✅ Chhattisgarh
|【PROVINCE】——Start Province Dadra And Nagar Haveli
|✅ Dadra And Nagar Haveli
|【PROVINCE】——Start Province Daman And Diu
|✅ Daman And Diu
|【PROVINCE】——Start Province Delhi
|✅ Delhi


#### 采集医院的详细信息

In [None]:
# Draft: extract the fields of one hospital detail page.
# NOTE(review): this cell does not fetch anything itself; ``selector`` must
# already hold a parsed hospital detail page when the cell is run.

# XPath shared by every field inside the report box of the page.
_REPORT_XPATH = '//div[@class="mi-bg-1"]/div/div/div[contains(@class, "report-content")]'
# Captures a value from its first to its last word character, dropping the
# surrounding whitespace.
_VALUE_RE = r'\s*(\w.*\w)\s*'

hospital_name = selector.xpath('//div[@class="mi-bg-1"]/../h2/text()').re_first(r'Address of (.*)')
# The address is spread over several text nodes of the first paragraph;
# each cleaned fragment is joined with ", ".
hospital_address       = ', '.join(selector.xpath(_REPORT_XPATH + '/p[1]//text()').re(r'\s*(\w.*\w)\s*,*'))
# The director's value is a direct text node of the paragraph; the other
# labelled values sit in a <span> next to their bold label.
hospital_director      = selector.xpath(_REPORT_XPATH + '/p/b[contains(text(), "Director")]/../text()').re_first(_VALUE_RE)
hospital_email         = selector.xpath(_REPORT_XPATH + '/p/b[contains(text(), "Email")]/../span/text()').re_first(_VALUE_RE)
hospital_phone         = selector.xpath(_REPORT_XPATH + '/p/b[contains(text(), "Phone")]/../span/text()').re_first(_VALUE_RE)
# Same pattern as the sibling fields: the previous ``(\w.*\s)`` required and
# captured trailing whitespace, so a value without it yielded None.
hospital_mobile        = selector.xpath(_REPORT_XPATH + '/p/b[contains(text(), "Mobile")]/../span/text()').re_first(_VALUE_RE)

# Named ``hospital_detail_record`` so the dict does not overwrite the
# ``hospital_detail_info`` MongoDB collection handle created during setup.
hospital_detail_record = {
    'hospital_name'    : hospital_name,
    'hospital_address' : hospital_address,
    'hospital_director': hospital_director,
    'hospital_email'   : hospital_email,
    'hospital_phone'   : hospital_phone,
    'hospital_mobile'  : hospital_mobile,
    'hospital_href' : None,   # not filled in by this draft yet
    'hospital_html' : None,   # not filled in by this draft yet
}

In [95]:
# Peek at the first ten stored hospital documents.  The commented-out request
# below is an unfinished draft of the detail-page download.
for hospital in hospital_col.find().limit(10):
    print(hospital)
    # hospital_res = requests.get(
    #     url     = 
    #     headers = headers
    #     proxies = random_proxies()
    # )

{'_id': ObjectId('632c0acfd46a05f1bf70282c'), 'hospital_belong_province': 'Andaman And Nicobar Islands', 'hospital_name': 'Chakraborty Hospital', 'hospital_href': 'https://www.medindia.net/patients/hospital_search/chakraborty-hospital-port-blair-andaman-nicobar-95591-1.htm', 'hospital_status': None}
{'_id': ObjectId('632c0acfd46a05f1bf70282d'), 'hospital_belong_province': 'Andaman And Nicobar Islands', 'hospital_name': 'Chakraborty Multi Speciality Hospital', 'hospital_href': 'https://www.medindia.net/patients/hospital_search/chakraborty-multi-speciality-hospital-south-andaman-andaman-and-nicobar-islands-14966-1.htm', 'hospital_status': None}
{'_id': ObjectId('632c0acfd46a05f1bf70282e'), 'hospital_belong_province': 'Andaman And Nicobar Islands', 'hospital_name': "Dr. Agarwal'S Eye Hospital", 'hospital_href': 'https://www.medindia.net/patients/hospital_search/dragarwals-eye-hospital-port-blair-andaman-nicobar-68192-1.htm', 'hospital_status': None}
{'_id': ObjectId('632c0acfd46a05f1bf702