In [1]:
import logging
import re
from urllib.parse import urljoin,urlencode
import multiprocessing
import time
import json
from os import makedirs
from os.path import exists
import pandas as pd
from lxml import etree

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import ActionChains
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver import ChromeOptions

# Emit timestamped log records at INFO level and above.
logging.basicConfig(level=logging.INFO,format='%(asctime)s - %(levelname)s: %(message)s')

# Base URL that scrape_stock_list opens, and the number of list pages to crawl.
BASE_URL='https://www.9fzt.com/marketCenter/aStockMarket.html?tab=0'
TOT_PAGEs=3

In [2]:
# %%
def get_browser():
    """Create a headless Chrome WebDriver that hides the usual automation markers.

    Returns:
        A ``webdriver.Chrome`` instance with a 10 second implicit wait. The
        caller owns it and is responsible for calling ``quit()``.
    """
    option=ChromeOptions()
    # Drop the "enable-automation" switch and the automation extension.
    option.add_experimental_option('excludeSwitches',['enable-automation'])
    option.add_experimental_option('useAutomationExtension',False)
    # Run without opening a visible browser window.
    option.add_argument('--headless')
    browser=webdriver.Chrome(options=option)
    # Injected before any page script runs so that navigator.webdriver reads as
    # undefined. The identifier must be spelled `undefined`: a misspelling makes
    # the getter throw a ReferenceError whenever a page reads the property.
    browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument",
                            {'source': 'Object.defineProperty(navigator,"webdriver",{get:()=>undefined})'})
    browser.implicitly_wait(10)
    return browser


In [3]:
def parse_page(page_html,protol_type='https:'):
    """Extract the stock detail links that follow the '序号' marker in a list page.

    Args:
        page_html: HTML source of the stock list page.
        protol_type: Base passed to ``urljoin`` for each href, so that
            scheme-relative links ('//host/path') become absolute URLs.

    Returns:
        A list of URLs in document order; empty when the marker or the links
        are not present.
    """
    # Only the part of the page after the first marker is of interest.
    _,marker,table_html=page_html.partition('序号')
    if not marker:
        return []
    # Match on the raw HTML. Going through str(list) would match against a
    # Python repr, where quotes and backslashes inside an href are escaped and
    # therefore come back corrupted.
    link_pattern='<a href="(.*?)".*? class="bluelink ff_din-medium fw-500" target="_blank" rel="noopener">'
    hrefs=re.findall(link_pattern,table_html,re.S)
    return [urljoin(protol_type,href) for href in hrefs]


In [4]:
# %%
def scrape_stock_list(browser,page_num):
    """Collect stock detail URLs from the first ``page_num`` pages of the list.

    Args:
        browser: Selenium WebDriver used to load BASE_URL and page through it.
        page_num: Number of list pages to parse.

    Returns:
        A list of URLs as returned by ``parse_page`` for each visited page,
        concatenated in page order. The list is also printed.

    Raises:
        selenium.common.exceptions.TimeoutException: if the probe cell does
            not get text within the wait limit.
    """
    # Cell used as a "table is populated" probe.
    probe_xpath='//*[@id="__next"]/div/div[3]/div[2]/ul[20]/li[13]/span'

    def table_loaded(driver):
        return len(driver.find_element(By.XPATH,probe_xpath).text)>0

    browser.get(BASE_URL)
    urls=[]
    WebDriverWait(browser,20,0.5).until(table_loaded)
    for page in range(page_num):
        urls+=parse_page(browser.page_source)
        # Nothing left to fetch after the last requested page, so do not click
        # "next" again (it costs a wait and can fail on the final list page).
        if page==page_num-1:
            break
        time.sleep(0.1)
        # Hover over the next-page button, then click it.
        next_button=browser.find_element(By.NAME,'whj_nextPage')
        ActionChains(browser).move_to_element(next_button).click(next_button).perform()
        time.sleep(0.1)
        # NOTE(review): this passes as soon as the probe cell has any text, which
        # may still be the previous page's content -- confirm it really waits
        # for the new page.
        WebDriverWait(browser,10,0.5).until(table_loaded)
    print(urls)
    return urls


In [5]:
# %%
def scrape_stockprice_page(browser,url):
    """Load a stock page and return its HTML once the price has been filled in.

    Args:
        browser: Selenium WebDriver used to load the page.
        url: Address of the page to load.

    Returns:
        The page source after the element with id 'stockprice' shows
        something other than the '--' placeholder.

    Raises:
        selenium.common.exceptions.TimeoutException: if the placeholder is
            still shown after 60 seconds (polled every 2 seconds).
    """
    logging.info('scraping %s...',url)
    browser.get(url)
    # find_element(By.ID, ...) replaces find_element_by_id, which Selenium 4.3
    # removed, and matches the locator style used in the rest of the file.
    WebDriverWait(browser,60,2).until(
        lambda driver: driver.find_element(By.ID,'stockprice').text != '--')
    return browser.page_source


In [6]:
# %%
def scrape_company_list(browser,urls):
    """Fetch the '最新动态' page of every stock and return the raw HTML.

    For each URL the stock page is opened, the '最新动态' link is clicked, and
    the page source of the second window is captured once the target span
    has text.

    Args:
        browser: Selenium WebDriver; it is left focused on its first window.
        urls: Iterable of stock page URLs.

    Returns:
        A list with one single-element list ``[page_source]`` per URL.

    Raises:
        selenium.common.exceptions.TimeoutException: if the target span stays
            empty for 10 seconds.
    """
    concept_span='//*[@id="gsgy"]/div[2]/div/div/section/div/ul/li[2]/div[2]/div/span'
    companys=[]
    for url in urls:
        logging.info('scraping %s...',url)
        browser.get(url)
        logging.info('scraping 最新动态...')
        # find_element(By.LINK_TEXT, ...) replaces find_element_by_link_text,
        # which Selenium 4.3 removed.
        browser.find_element(By.LINK_TEXT,'最新动态').click()
        # NOTE(review): assumes the click has already opened a second window.
        browser.switch_to.window(browser.window_handles[1])
        try:
            WebDriverWait(browser,10,2).until(
                lambda driver: len(driver.find_element(By.XPATH,concept_span).text)>0)
            shareholder=browser.page_source
        finally:
            # Close the popup and refocus the first window even when the wait
            # fails, so a failure does not leave a stray window behind.
            browser.close()
            browser.switch_to.window(browser.window_handles[0])

        companys.append([shareholder])
    return companys


In [7]:
# XPath selecting the text of the span that parse_company_data extracts.
concept_xpath='//*[@id="gsgy"]/div[2]/div/div/section/div/ul/li[2]/div[2]/div/span/text()'

def parse_company_data(html):
    """Pull the first text node matched by ``concept_xpath`` out of a page.

    Args:
        html: HTML source of the page.

    Returns:
        A one-element list holding the first matched text.

    Raises:
        IndexError: if the XPath matches nothing.
    """
    tree=etree.HTML(html)
    matches=tree.xpath(concept_xpath)
    return [matches[0]]


In [8]:
# %%
def get_table_company_data(data):
    """Build a one-column 'concept' table from a sequence of page sources.

    Args:
        data: Iterable of HTML strings, each parsed with ``parse_company_data``.

    Returns:
        A ``pandas.DataFrame`` with a single 'concept' column and one row per
        input item, in input order.
    """
    # Collect all rows first and build the frame once; appending with
    # table.loc[len(table)] re-allocates the frame on every row.
    rows=[parse_company_data(item) for item in data]
    return pd.DataFrame(rows,columns=['concept'])


In [9]:
# %%
browser=get_browser()
try:
    urls=scrape_stock_list(browser,TOT_PAGEs)
    # For a quick single-stock run, replace the line above with:
    # urls=['https://stock.9fzt.com/index/bj_870204.html']

    companys=scrape_company_list(browser,urls)
finally:
    # Always shut the headless browser down; otherwise the Chrome and
    # chromedriver processes outlive the cell, including when scraping fails.
    browser.quit()

# Each entry of companys is a one-element list holding a page source.
data=[company[0] for company in companys]

company_data_table=get_table_company_data(data)

# NOTE(review): 'gbk' raises UnicodeEncodeError on characters outside GBK --
# confirm the consumer of this file really needs GBK rather than UTF-8.
company_data_table.to_csv('concept_data_table.csv',index=False,encoding='gbk')

# TODO: other data worth saving. Anything that needs the browser must run
# inside the try block above, before browser.quit().
# price_table = get_table_price(browser, urls)
# price_table.to_csv('price_table.csv', index=False)

2023-07-04 15:51:45,512 - INFO: scraping https://stock.9fzt.com/index/sz_301488.html...


['https://stock.9fzt.com/index/sz_301488.html', 'https://stock.9fzt.com/index/bj_833533.html', 'https://stock.9fzt.com/index/sz_301007.html', 'https://stock.9fzt.com/index/sz_301255.html', 'https://stock.9fzt.com/index/sz_300489.html', 'https://stock.9fzt.com/index/sz_301221.html', 'https://stock.9fzt.com/index/sz_300552.html', 'https://stock.9fzt.com/index/sh_688280.html', 'https://stock.9fzt.com/index/sz_300503.html', 'https://stock.9fzt.com/index/sz_300549.html', 'https://stock.9fzt.com/index/sh_688331.html', 'https://stock.9fzt.com/index/bj_831278.html', 'https://stock.9fzt.com/index/sh_688147.html', 'https://stock.9fzt.com/index/sh_688123.html', 'https://stock.9fzt.com/index/bj_836221.html', 'https://stock.9fzt.com/index/sh_688326.html', 'https://stock.9fzt.com/index/sz_301099.html', 'https://stock.9fzt.com/index/sh_688071.html', 'https://stock.9fzt.com/index/sz_300496.html', 'https://stock.9fzt.com/index/sz_301183.html', 'https://stock.9fzt.com/index/sz_301023.html', 'https://sto

2023-07-04 15:51:46,939 - INFO: scraping 最新动态...
2023-07-04 15:51:49,443 - INFO: scraping https://stock.9fzt.com/index/bj_833533.html...
2023-07-04 15:51:50,250 - INFO: scraping 最新动态...
2023-07-04 15:51:52,128 - INFO: scraping https://stock.9fzt.com/index/sz_301007.html...
2023-07-04 15:51:52,789 - INFO: scraping 最新动态...
2023-07-04 15:51:54,608 - INFO: scraping https://stock.9fzt.com/index/sz_301255.html...
2023-07-04 15:51:55,173 - INFO: scraping 最新动态...
2023-07-04 15:51:56,539 - INFO: scraping https://stock.9fzt.com/index/sz_300489.html...
2023-07-04 15:51:57,489 - INFO: scraping 最新动态...
2023-07-04 15:51:59,242 - INFO: scraping https://stock.9fzt.com/index/sz_301221.html...
2023-07-04 15:52:00,145 - INFO: scraping 最新动态...
2023-07-04 15:52:01,807 - INFO: scraping https://stock.9fzt.com/index/sz_300552.html...
2023-07-04 15:52:02,582 - INFO: scraping 最新动态...
2023-07-04 15:52:04,428 - INFO: scraping https://stock.9fzt.com/index/sh_688280.html...
2023-07-04 15:52:05,103 - INFO: scraping 