In [1]:
import logging
import re
from urllib.parse import urljoin,urlencode
import multiprocessing
import time
import json
from os import makedirs
from os.path import exists
import pandas as pd
from lxml import etree

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import ActionChains
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver import ChromeOptions

logging.basicConfig(level=logging.INFO,format='%(asctime)s - %(levelname)s: %(message)s')

# 基础网址和爬取页码数量设定
BASE_URL='https://www.9fzt.com/marketCenter/aStockMarket.html?tab=0'
TOT_PAGEs=3

In [2]:
# %%
def get_browser():
    option=ChromeOptions()
    option.add_experimental_option('excludeSwitches',['enable-automation'])
    option.add_experimental_option('useAutomationExtension',False)
    # 设置不显式地显示浏览器
    option.add_argument('--headless')
    browser=webdriver.Chrome(options=option)
    browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument",
                            {'source': 'Object.defineProperty(navigator,"webdriver",{get:()=>undefind})'})
    browser.implicitly_wait(10)
    return browser

In [3]:
# %%
# 正则表达式测试地址：http://tool.oschina.net/regex/
def parse_page(page_html,protol_type='https:'):
    pattern=re.compile('序号(.*)',re.S)
    stock_urls=re.findall(pattern,page_html)
    pattern=re.compile('<a href="(.*?)".*? class="bluelink ff_din-medium fw-500" target="_blank" rel="noopener">',re.S)
    stock_urls=re.findall(pattern,str(stock_urls))
    stock_urls=[urljoin(protol_type,url) for url in stock_urls]
    return stock_urls

In [4]:
# %%
def scrape_stock_list(browser,page_num):
    browser.get(BASE_URL)
    urls=[]
    WebDriverWait(browser,20,0.5).until(lambda browser:len(
        browser.find_element(By.XPATH,'//*[@id="__next"]/div/div[3]/div[2]/ul[20]/li[13]/span').text)>0)
    for page in range(page_num):
        urls+=(parse_page(browser.page_source))
        time.sleep(0.1)
        ac=ActionChains(browser)
        # 鼠标移动到下一页按钮上
        ac.move_to_element(browser.find_element_by_name('whj_nextPage')).perform()
        # 点击确定跳转至下一页
        ac.click(browser.find_element_by_name('whj_nextPage')).perform()
        time.sleep(0.1)
        WebDriverWait(browser,10,0.5).until(lambda browser: len(
            browser.find_element(By.XPATH,'//*[@id="__next"]/div/div[3]/div[2]/ul[20]/li[13]/span').text)>0)
    print(urls)
    return urls

In [5]:
# %%
def scrape_stockprice_page(browser,url):
    logging.info('scraping %s...',url)
    browser.get(url)
    WebDriverWait(browser,60,2).until(lambda browser: browser.find_element_by_id('stockprice').text != '--')
    return browser.page_source

In [6]:
# %%
def scrape_company_list(browser,urls):
    companys=[]
    for url in urls:
        logging.info('scraping %s...',url)
        browser.get(url)

        logging.info('scraping 公司资料...')
        browser.find_element_by_link_text('公司资料').click()
        browser.switch_to.window(browser.window_handles[1])
        WebDriverWait(browser,10,2).until(lambda browser: len(
            browser.find_element(By.XPATH,'//*[@id="xxqk"]/div[2]/div/div/section/div/ul/li[4]/div[2]/section/div/div').text)>5)
        data=browser.page_source
        browser.close()
        browser.switch_to.window(browser.window_handles[0])
        
        shareholder = None

        companys.append([data,shareholder])
    return companys


In [7]:
# 使用xpath匹配
stock_code_name_xpath='//*[@id="app"]/div/section/div[1]/section/div[1]/div/div/div[1]/h3/i/text()'
company_name_xpath='//*[@id="xxqk"]/div[2]/div/div/section/div/ul/li[1]/div[2]/div/span/text()'
company_time_xpath='//*[@id="xxqk"]/div[2]/div/div/section/div/ul/li[5]/div[2]/div/span/text()'
company_business_xpath='//*[@id="xxqk"]/div[2]/div/div/section/div/ul/li[23]/div[2]/div/span/text()'
company_representative_xpath='//*[@id="xxqk"]/div[2]/div/div/section/div/ul/li[9]/div[2]/div/span/text()'
company_manager_xpath='//*[@id="xxqk"]/div[2]/div/div/section/div/ul/li[10]/div[2]/div/span/text()'
company_secretary_xpath='//*[@id="xxqk"]/div[2]/div/div/section/div/ul/li[11]/div[2]/div/span/text()'
company_phone_xpath='//*[@id="xxqk"]/div[2]/div/div/section/div/ul/li[14]/div[2]/div/span/text()'
company_place_pattern='//*[@id="xxqk"]/div[2]/div/div/section/div/ul/li[18]/div[2]/div/span/text()'

# 使用re匹配
city_pattern=re.compile('(.*?省)?(.*?市)?',re.S)


def parse_company_data(html):
    html=etree.HTML(html)

    code_name = html.xpath(stock_code_name_xpath)[0].split(' ')
    stock_code = code_name[0]
    stock_name = code_name[1]

    company_place=html.xpath(company_place_pattern)[0]

    province='--'
    city='--'
    if re.search(city_pattern,company_place).group(1):
        province=re.search(city_pattern,company_place).group(1)
    if re.search(city_pattern,company_place).group(2):
        city=re.search(city_pattern,company_place).group(2)

    company_name=html.xpath(company_name_xpath)[0]
    company_time=html.xpath(company_time_xpath)[0]
    company_business=html.xpath(company_business_xpath)[0]
    company_representative=html.xpath(company_representative_xpath)[0]
    company_manager=html.xpath(company_manager_xpath)[0]
    company_secretary=html.xpath(company_secretary_xpath)[0]
    company_phone=html.xpath(company_phone_xpath)[0]

    return [stock_code,stock_name,company_name,province,city,company_time,company_business,company_representative,
            company_manager,company_secretary,company_phone]


In [8]:
# %%
def get_table_company_data(data):
    table=pd.DataFrame(columns=['stock_code','stock_name','company_name','province','city',
                                'company_time','company_business','company_representative',
                                'company_manager','company_secretary','company_phone'])
    for item in data:
        table.loc[len(table)]=parse_company_data(item)
    return table


In [9]:
# %%
browser=get_browser()
urls=scrape_stock_list(browser,TOT_PAGEs)
# urls=['https://stock.9fzt.com/index/bj_870204.html']

companys=scrape_company_list(browser,urls)
data=[company[0] for company in companys]

company_data_table=get_table_company_data(data)
company_data_table.to_csv('company_data_table.csv',index=False,encoding='gbk')

2023-07-04 19:00:51,078 - INFO: scraping https://stock.9fzt.com/index/sz_301488.html...


['https://stock.9fzt.com/index/sz_301488.html', 'https://stock.9fzt.com/index/bj_833533.html', 'https://stock.9fzt.com/index/sz_301007.html', 'https://stock.9fzt.com/index/sz_301255.html', 'https://stock.9fzt.com/index/sz_300489.html', 'https://stock.9fzt.com/index/sz_301221.html', 'https://stock.9fzt.com/index/sz_300552.html', 'https://stock.9fzt.com/index/sh_688280.html', 'https://stock.9fzt.com/index/sz_300503.html', 'https://stock.9fzt.com/index/sz_300549.html', 'https://stock.9fzt.com/index/sh_688331.html', 'https://stock.9fzt.com/index/bj_831278.html', 'https://stock.9fzt.com/index/sh_688147.html', 'https://stock.9fzt.com/index/sh_688123.html', 'https://stock.9fzt.com/index/bj_836221.html', 'https://stock.9fzt.com/index/sh_688326.html', 'https://stock.9fzt.com/index/sz_301099.html', 'https://stock.9fzt.com/index/sh_688071.html', 'https://stock.9fzt.com/index/sz_300496.html', 'https://stock.9fzt.com/index/sz_301183.html', 'https://stock.9fzt.com/index/sz_301488.html', 'https://sto

2023-07-04 19:00:53,112 - INFO: scraping 公司资料...
2023-07-04 19:00:57,451 - INFO: scraping https://stock.9fzt.com/index/bj_833533.html...
2023-07-04 19:00:58,232 - INFO: scraping 公司资料...
2023-07-04 19:01:01,195 - INFO: scraping https://stock.9fzt.com/index/sz_301007.html...
2023-07-04 19:01:01,832 - INFO: scraping 公司资料...
2023-07-04 19:01:05,487 - INFO: scraping https://stock.9fzt.com/index/sz_301255.html...
2023-07-04 19:01:06,204 - INFO: scraping 公司资料...
2023-07-04 19:01:09,312 - INFO: scraping https://stock.9fzt.com/index/sz_300489.html...
2023-07-04 19:01:09,969 - INFO: scraping 公司资料...
2023-07-04 19:01:13,628 - INFO: scraping https://stock.9fzt.com/index/sz_301221.html...
2023-07-04 19:01:14,233 - INFO: scraping 公司资料...
2023-07-04 19:01:17,460 - INFO: scraping https://stock.9fzt.com/index/sz_300552.html...
2023-07-04 19:01:18,021 - INFO: scraping 公司资料...
2023-07-04 19:01:21,670 - INFO: scraping https://stock.9fzt.com/index/sh_688280.html...
2023-07-04 19:01:22,272 - INFO: scraping 

2023-07-04 19:04:54,914 - INFO: scraping 公司资料...
2023-07-04 19:04:58,243 - INFO: scraping https://stock.9fzt.com/index/sz_000020.html...
2023-07-04 19:05:00,068 - INFO: scraping 公司资料...
2023-07-04 19:05:03,646 - INFO: scraping https://stock.9fzt.com/index/sh_603015.html...
2023-07-04 19:05:04,488 - INFO: scraping 公司资料...
2023-07-04 19:05:07,629 - INFO: scraping https://stock.9fzt.com/index/sh_603045.html...
2023-07-04 19:05:08,593 - INFO: scraping 公司资料...
2023-07-04 19:05:12,231 - INFO: scraping https://stock.9fzt.com/index/sh_603085.html...
2023-07-04 19:05:13,059 - INFO: scraping 公司资料...
2023-07-04 19:05:16,987 - INFO: scraping https://stock.9fzt.com/index/sz_002403.html...
2023-07-04 19:05:18,132 - INFO: scraping 公司资料...
2023-07-04 19:05:21,573 - INFO: scraping https://stock.9fzt.com/index/sz_301052.html...
2023-07-04 19:05:22,530 - INFO: scraping 公司资料...
2023-07-04 19:05:26,031 - INFO: scraping https://stock.9fzt.com/index/sz_002976.html...
2023-07-04 19:05:26,982 - INFO: scraping 