# 前置准备

#### 导库

In [8]:
import re
import asyncio
import traceback
import aiohttp
import time
import random
import requests
import pymongo
from lxml import etree
from parsel import Selector

#### MongoDB 连接

In [9]:
client = pymongo.MongoClient(host='localhost', port=27017)
db = client['indianhospital']

province_col          = db['province']
hospital_col          = db['hospital']
all_page_col          = db['all_page']
hospital_detail_info  = db['hospital_detail_info']

# error_province_col = db['error_province']
# error_hospital_col = db['error_hospital']

#### 请求头 & 随机代理

In [10]:
# headers = {
#     'sec-ch-ua' : '"Google Chrome";v="105", "Not)A;Brand";v="8", "Chromium";v="105"',
#     'sec-ch-ua-mobile'  : '?0',
#     'sec-ch-ua-platform': '"Windows"',
#     'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
# }

headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
}

def random_proxy():
    proxy = 'http://' + requests.get('http://127.0.0.1:5555/random').text.strip()
    return proxy

def random_proxies():
    proxies = {
        'http' : 'http://' + requests.get('http://127.0.0.1:5555/random').text.strip()
    }
    return proxies
random_proxy(), random_proxies()

('http://89.163.210.203:3128', {'http': 'http://89.208.219.121:8080'})

# 正式采集

#### 获取 37 个省的信息

- 包含：名字、链接

In [11]:
"""
一、获取 37 个省份的链接
返回一个包含 37 个省份链接的列表
"""
origin_url = 'https://www.medindia.net/patients/hospital_search/hospital_list.asp?utm_source=topnavigation&utm_medium=desktop&utm_content=&utm_campaign=medindia'
try:
    origin_res = requests.get(
        url     = origin_url,
        headers = headers,
        proxies = random_proxies()
    )
    selector = Selector(text = origin_res.text)

    for div in selector.xpath('//ul[@class="list-inline"]/div/div'):
        province_name = div.xpath('./li/a/span/text()').get().title()   # 所有单词首字母大写
        province_href = div.xpath('./li/a/@href').get()
        if province_col.find_one({'province_name':province_name}) != None:
            continue
        else:
            province_col.insert_one({
                'province_name'  :province_name,
                'province_href'  :province_href,
                'province_status':None
            })
    print('【SUCCEED -- CRAWL PROVINCE SUCCEED】')
except:
    print('【ERROR   -- CRAWL PROVINCE FAILED 】')

#### 采集医院包含的所有页

In [None]:
province_status_list = []

for province in province_col.find():
    province_name   = province['province_name']
    province_href   = province['province_href']
    province_status = province['province_status']
    # 先判断是否已经采集过某省的医院了
    if province_status == 1:    # 如果 1        跳过该省
        continue
    else:                       # 如果 0 / None 采集该省的页
        try:
            selector = Selector(text = requests.get(province_href, headers=headers, proxies=random_proxies()).text)
            records  = int(selector.xpath('//p[@class="searchtext"][contains(text(), "Total Records")]/b[1]/text()').get())
            page     = int(selector.xpath('//p[@class="searchtext"][contains(text(), "Total Records")]/b[last()]/text()').get())
            if records <= 25:   # 如果该省总医院记录（records）小于 25，则该省总页数只有 1 页
                total_page_of_province = 1
            else:
                total_page_of_province = page
            total_page_href_of_province = [province_href[:-4] + f'-{str(i)}' + province_href[-4:] for i in range(1, total_page_of_province+1)]
            for href in total_page_href_of_province:
                all_page_col.insert_one({
                    'province_name'          : province_name,
                    'page_of_province'       : href,
                    'page_of_province_status': None,
                })
            province_status_list.append({'province_name':province_name,'province_new_status':1})
        except Exception as e:
            print('错误类型是',e.__class__.__name__)
            print('错误明细是',e)
            province_status_list.append({'province_name':province_name,'province_new_status':0})

# 统一 province_col 数据库的状态
for province in province_status_list:
    condition        = {'province_name':province['province_name']}
    update_status    = province_col.find_one(condition)
    update_status['province_status']  = province['province_new_status']
    province_col.update_one(condition,{'$set': update_status})

#### 采集每页的医院链接

In [None]:
for province in province_col.find():
    province_name = province['province_name']
    if province['province_status'] != 1:    
        # 如果状态为 0 / Null —— 说明没有这个省的数据（页），就不采集其医院信息
        continue
    else:           
        # 省的状态为 1 —— 采集该省的信息
        # print(f'|【PROVINCE】——Start Province {province_name}')
        for page in all_page_col.find({'province_name':province_name}):
            page_of_province        = page['page_of_province']
            page_of_province_status = page['page_of_province_status']
            # 更新用
            condition   = {'page_of_province':page_of_province}
            update_page = all_page_col.find_one(condition)

            if page_of_province_status == 1:    # 该页的状态为 1 —— 说明已经采集过了，跳过不采集
                continue
            else:                               # 该页的状态为 0 / None —— 采集
                print('|----|【PAGE】——Start Page', re.search('hospital_search/(.*?).htm', page_of_province).group(1).title())
                try:
                    page_res = requests.get(
                        url     = page_of_province, 
                        headers = headers, 
                        proxies = random_proxies()
                    )
                    selector = Selector(text = page_res.text)
                    # 这个爬虫逻辑似乎有问题，会多爬一些信息
                    hospital_list = selector.xpath('//div[@class="dr-lists"][1]//h3[@class="vert-small-margin"]/a')
                    for hospital in hospital_list:
                        hospital_belong_province = province_name
                        hospital_name   = hospital.xpath('./text()').re_first('(\w+.*?),').title()
                        hospital_href   = hospital.xpath('./@href').get()
                        hospital_status = None
                        hospital_col.insert_one({
                            'hospital_belong_province':hospital_belong_province,
                            'hospital_name'  :hospital_name,
                            'hospital_href'  :hospital_href,
                            'hospital_status':hospital_status,
                        })
                    # 更新页状态
                    update_page['page_of_province_status'] = 1
                    all_page_col.update_one(condition, {'$set':update_page})
                    # print(f'|----|----|【SCRAW PAGE SUCCEED】{page_of_province}')
                except Exception as e:
                    # 更新页状态
                    update_page['page_of_province_status'] = 0
                    all_page_col.update_one(condition, {'$set':update_page})
                    print(f'|----|----|⭕【SCRAW PAGE ERROR】{page_of_province}')
                    traceback.print_exc()
                    continue
        # print(f'|✅ {province_name}')

|----|【PAGE】——Start Page Hospitals-Telangana
|----|【PAGE】——Start Page Hospitals-Tripura


#### 采集医院的详细信息

In [None]:
# hospital_name = selector.xpath('//div[@class="mi-bg-1"]/../h2/text()').re_first('Address of (.*)')
# hospital_address       = ', '.join(selector.xpath('//div[@class="mi-bg-1"]/div/div/div[contains(@class, "report-content")]/p[1]//text()').re('\s*(\w.*\w)\s*,*'))
# hospital_director      = selector.xpath('//div[@class="mi-bg-1"]/div/div/div[contains(@class, "report-content")]/p/b[contains(text(), "Director")]/../text()').re_first('\s*(\w.*\w)\s*')
# hospital_email         = selector.xpath('//div[@class="mi-bg-1"]/div/div/div[contains(@class, "report-content")]/p/b[contains(text(), "Email")]/../span/text()').re_first('\s*(\w.*\w)\s*')
# hospital_phone         = selector.xpath('//div[@class="mi-bg-1"]/div/div/div[contains(@class, "report-content")]/p/b[contains(text(), "Phone")]/../span/text()').re_first('\s*(\w.*\w)\s*')
# hospital_mobile        = selector.xpath('//div[@class="mi-bg-1"]/div/div/div[contains(@class, "report-content")]/p/b[contains(text(), "Mobile")]/../span/text()').re_first('\s*(\w.*\s)\s*')

# hospital_detail_info = {
#     'hospital_name'    : hospital_name,
#     'hospital_address' : hospital_address,
#     'hospital_director': hospital_director,
#     'hospital_email'   : hospital_email,
#     'hospital_phone'   : hospital_phone,
#     'hospital_mobile'  : hospital_mobile,
#     'hospital_href' : None,
#     'hospital_html' : None,
# }

# 详细信息采集

### 准备

In [10]:
### 导库
import re
import json
import asyncio
import aiohttp
import time
import random
import requests
import traceback
import pymongo
from lxml import etree
from parsel import Selector



### UA & 随机代理
# headers = {
#     'sec-ch-ua' : '"Google Chrome";v="105", "Not)A;Brand";v="8", "Chromium";v="105"',
#     'sec-ch-ua-mobile'  : '?0',
#     'sec-ch-ua-platform': '"Windows"',
#     'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
# }

headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
}

def random_proxy():
    proxy = 'http://' + requests.get('http://127.0.0.1:5555/random').text.strip()
    return proxy

def random_proxies():
    proxies = {
        'http' : 'http://' + requests.get('http://127.0.0.1:5555/random').text.strip()
    }
    return proxies
print(random_proxy(), random_proxies())



### MongoDB连接
client = pymongo.MongoClient(host='localhost', port=27017)
db = client['indianhospital']

province_col              = db['province']
all_page_col              = db['all_page']
hospital_col              = db['hospital']
hospital_detail_info_col  = db['hospital_detail_info']

http://45.152.188.62:3128 {'http': 'http://114.116.2.116:8001'}


### 采集

In [3]:
test_set  = hospital_col.find()[:10]
href_list = []
for each in test_set:
    href_list.append(each['hospital_href'])
href_list

['https://www.medindia.net/patients/hospital_search/chakraborty-hospital-port-blair-andaman-nicobar-95591-1.htm',
 'https://www.medindia.net/patients/hospital_search/chakraborty-multi-speciality-hospital-south-andaman-andaman-and-nicobar-islands-14966-1.htm',
 'https://www.medindia.net/patients/hospital_search/dragarwals-eye-hospital-port-blair-andaman-nicobar-68192-1.htm',
 'https://www.medindia.net/patients/hospital_search/inhs-dhanvantri-south-andaman-andaman-and-nicobar-islands-21526-1.htm',
 'https://www.medindia.net/patients/hospital_search/maricar-hospital-port-blair-andaman-and-nicobar-islands-25782-1.htm',
 'https://www.medindia.net/patients/hospital_search/pillar-health-centre-port-blair-andaman-and-nicobar-islands-26347-1.htm',
 'https://www.medindia.net/patients/hospital_search/welcare-ayurvedic-hospital-port-blair-andaman-nicobar-39346-1.htm',
 'https://www.medindia.net/patients/hospital_search/modern-eye-hospital-research-centre-nellore-andhra-pradesh-25693-1.htm',
 'http

# 测试

In [5]:
for href in href_list:
    try:
        res = requests.get(href, proxies=random_proxies(), headers=headers)
        print(res.status_code)
    except Exception as e:
        print(e)

200
200
200
200
200
200
200
200
200
200


In [41]:
import nest_asyncio
nest_asyncio.apply()

async def get_status(href):
    timeout = aiohttp.ClientTimeout(total = 10 )   # 超时设置
    async with aiohttp.ClientSession() as session:
        try:
            proxy = random.choice(proxy_list)
            print(proxy)
            async with session.get(url=href, proxy=proxy, headers=headers, timeout=timeout) as res:
                if res.status == 200:
                    print('PASS')
                    # html = await res.text()
                    # print(json.loads(html)['origin'])
        except Exception as e:
            # print(traceback.print_exc())
            print(e)
            pass
        print('-'*50)

def main():
    loop = asyncio.get_event_loop()
    tasks = [get_status(href) for href in href_list]
    # tasks = [get_status('http://httpbin.org/get') for i in range(100)]
    loop.run_until_complete(asyncio.wait(tasks))

if __name__ == '__main__':
    main()

http://140.250.89.243:20217
http://140.250.144.157:17514
http://125.106.131.203:23269
http://113.239.22.225:18267
http://36.6.159.21:23712
http://36.6.159.21:23712
http://114.237.61.80:19882
http://114.237.61.80:19882
http://60.168.207.71:17507
http://36.6.159.21:23712
Cannot connect to host 125.106.131.203:23269 ssl:default [Connect call failed ('125.106.131.203', 23269)]
--------------------------------------------------
Cannot connect to host 140.250.89.243:20217 ssl:default [Connect call failed ('140.250.89.243', 20217)]
--------------------------------------------------
Cannot connect to host 60.168.207.71:17507 ssl:default [Connect call failed ('60.168.207.71', 17507)]
--------------------------------------------------
Cannot connect to host 113.239.22.225:18267 ssl:default [Connect call failed ('113.239.22.225', 18267)]
--------------------------------------------------
Cannot connect to host 140.250.144.157:17514 ssl:default [Connect call failed ('140.250.144.157', 17514)]
----

In [37]:
iplist = [
    '36.6.159.21:23712',
    '140.250.144.157:17514',
    '183.143.61.223:18695',
    '112.66.244.193:22511',
    '125.106.131.203:23269',
    '140.250.89.243:20217',
    '114.237.61.80:19882',
    '113.239.22.225:18267',
    '60.168.207.71:17507',
    '58.52.48.39:21822',
]
proxy_list   = []
proxies_list = []
for ip in iplist:
    proxies = {
        'http' : 'http://'+ip,
        'https': 'https://'+ip
    }
    proxy = 'http://'+ip
    proxies_list.append(proxies)
    proxy_list.append(proxy)

random.choice(proxy_list)
random.choice(proxies_list)


'http://114.237.61.80:19882'