In [2]:
from py2neo import Graph,Node,Relationship,NodeMatcher
import pandas as pd
import numpy as np
from multiprocessing.pool import ThreadPool
import akshare as ak
import logging
from datetime import datetime, timedelta
from tqdm import tqdm
import os

# Shared worker pool of 10 threads.
# NOTE(review): no cell visible in this notebook uses `pool`; confirm it is still needed.
pool = ThreadPool(10)
# Connect to the Neo4j database.
# The connection settings can be overridden through environment variables
# (NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD, NEO4J_DATABASE) so the notebook can be
# pointed at another server without editing code; the fallbacks are the values
# that were previously hard-coded here.
# NOTE(review): the fallback password is still stored in the notebook; consider
# dropping the default once the environment variables are configured.
graph = Graph(
    os.environ.get('NEO4J_URI', 'http://192.168.197.1:7474'),
    auth=(
        os.environ.get('NEO4J_USER', 'neo4j'),
        os.environ.get('NEO4J_PASSWORD', 'finglm-base-on-kg'),
    ),
    name=os.environ.get('NEO4J_DATABASE', 'neo4j'),
)
# Node lookup helper bound to the connection above; used by every loader cell.
matcher = NodeMatcher(graph)

In [None]:
# Build a lazy match for nodes labelled '个股'.
# NOTE(review): no cell visible in this notebook creates the '个股' label (stock
# nodes are created with the '股票' label) and `nodes` is not used afterwards;
# confirm whether this cell is still needed.
nodes = matcher.match('个股')

In [2]:
def get_logger(name):
    """Return a logger that writes to a console stream and to ``log_file.log``.

    The function is safe to call repeatedly for the same name (for example when
    the notebook cell is re-run): handlers are attached only on the first call,
    so messages are never emitted more than once per handler.

    Args:
        name: Logger name passed to ``logging.getLogger``; ``None`` selects the
            root logger.

    Returns:
        The configured ``logging.Logger`` with level INFO, a stream handler at
        INFO and a UTF-8 file handler (``log_file.log``) at DEBUG.
    """
    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)

    # Already configured by an earlier call: attaching the handlers again would
    # duplicate every message, and (for the root logger) the demotion below
    # would hit our own console handler instead of a pre-existing one.
    if getattr(logger, '_notebook_handlers_attached', False):
        return logger

    # The next two lines keep log records from being printed twice in a
    # Jupyter notebook: a handler already present on the root logger is
    # restricted to WARNING and above.
    if logger.root.handlers:
        logger.root.handlers[0].setLevel(logging.WARNING)

    handler_stdout = logging.StreamHandler()
    handler_stdout.setLevel(logging.INFO)
    handler_stdout.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
    logger.addHandler(handler_stdout)

    handler_file = logging.FileHandler('log_file.log', encoding='utf-8')
    handler_file.setLevel(logging.DEBUG)
    handler_file.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    logger.addHandler(handler_file)

    # Marker consulted by the guard above on subsequent calls.
    logger._notebook_handlers_attached = True
    return logger
# Notebook-wide logger used by the loader cells. Passing None makes
# logging.getLogger return the root logger.
log = get_logger(None)

# Construct a financial knowledge graph of stocks

In [3]:
# Fetch the stock code/name table from akshare and keep the 'code' column as a
# plain list; the loader cells below iterate over `stock_code_list`, and the
# stock-node cell iterates over the rows of `df_stock_code`.
df_stock_code = ak.stock_info_a_code_name()
stock_code_list = df_stock_code['code'].to_list()

  0%|          | 0/13 [00:00<?, ?it/s]

In [25]:
# Create the uniqueness constraint on the stock code so that re-running this
# cell cannot produce duplicate stock nodes.
cypher = 'CREATE CONSTRAINT IF NOT EXISTS FOR (n:股票) REQUIRE n.代码 IS UNIQUE'
graph.run(cypher=cypher)

# Create one '股票' node per (code, name) row. Creation is best-effort: a node
# that cannot be created (typically because it already exists and the
# constraint rejects it) is skipped. Only `Exception` is caught so that
# Ctrl-C / kernel interrupts still stop the loop.
skipped_stock_count = 0
for code, name in df_stock_code.values:
    stock_node = Node('股票', 代码=code, 名称=name, name=name)
    try:
        graph.create(stock_node)
    except Exception:
        skipped_stock_count += 1
if skipped_stock_count:
    log.info(f'{skipped_stock_count} stock nodes were not created (already present or rejected)')

## 1. Financial Abstract

In [8]:
# Create the uniqueness constraint for the per-stock financial abstract nodes.
cypher = 'CREATE CONSTRAINT IF NOT EXISTS FOR (n:财务指标) REQUIRE (n.股票代码, n.指标类型) IS UNIQUE'
graph.run(cypher=cypher)
# Use one sample stock to discover the abstract types; each type becomes a node
# label for the per-report-period indicator nodes and gets its own constraint.
df_fin_ab_demo = ak.stock_financial_abstract(symbol='000001')
for abstract_type in df_fin_ab_demo['选项'].unique():
    # The label comes from data, so it is backtick-quoted to stay valid Cypher
    # even if it contains characters that are not legal in a bare identifier.
    cypher = f'CREATE CONSTRAINT IF NOT EXISTS FOR (n:`{abstract_type}`) REQUIRE (n.股票代码, n.报告期) IS UNIQUE'
    graph.run(cypher=cypher)

for code in tqdm(stock_code_list):
    stock = matcher.match("股票", 代码=code).first()
    if stock is None:
        # Without a stock node nothing created below could be attached to the graph.
        log.warning(f'Stock node {code} not found, skip financial abstract')
        continue
    # A stock whose '常用指标' abstract node already exists is treated as loaded.
    if matcher.match("财务指标", 股票代码=code, 指标类型='常用指标').first() is not None:
        continue
    # get financial abstract info
    try:
        df_fin_ab = ak.stock_financial_abstract(symbol=code).copy()
    except Exception:
        log.warning(f'Can not get {code} financial abstract info')
        # Skip this stock: falling through would reuse the frame downloaded for
        # the previous stock (or fail on an undefined name on the first one).
        continue

    for abstract_type, df_group in df_fin_ab.groupby('选项'):
        # create financial abstract node (best-effort; it may already exist)
        fin_ab_node = Node('财务指标', 股票代码=code, 指标类型=abstract_type, name=abstract_type)
        try:
            graph.create(fin_ab_node)
        except Exception as exc:
            log.debug(f'Financial abstract node {code}/{abstract_type} not created: {exc}')

        # Re-read the node from the graph so that a pre-existing one is used too.
        fin_ab = matcher.match("财务指标", 股票代码=code, 指标类型=abstract_type).first()
        if fin_ab is None:
            log.warning(f'Financial abstract node {code}/{abstract_type} is missing, skip it')
            continue

        # create stock - financial abstract relationship
        try:
            graph.create(Relationship(stock, '基本面', fin_ab))
        except Exception as exc:
            log.debug(f'Relationship {code} -> {abstract_type} not created: {exc}')

        # create financial abstract indicators node:
        # transpose so that each report period becomes one row (one node) whose
        # columns are the indicators, keep periods after 2020-12-31 only and
        # store missing values as the string 'null'.
        df = df_group.drop('选项', axis=1).set_index('指标').copy()
        df = df.T.reset_index(drop=False, names='报告期').copy()
        df['股票代码'] = code
        df['报告期'] = pd.to_datetime(df['报告期'])
        df = df[df['报告期'] > datetime(2020,12,31)].copy()
        df['报告期'] = df['报告期'].dt.strftime('%Y-%m-%d')
        df = df.fillna('null').copy()
        for record in df.to_dict('records'):
            record.update({'name': record['报告期']})
            fin_ab_indicator_node = Node(abstract_type, **record)
            try:
                graph.create(fin_ab_indicator_node)
            except Exception as exc:
                log.debug(f'Indicator node {code}/{abstract_type}/{record["报告期"]} not created: {exc}')

        # create financial abstract - financial abstract indicators relationship
        for fin_ab_indicator in matcher.match(abstract_type, 股票代码=code):
            try:
                graph.create(Relationship(fin_ab, '按报告期', fin_ab_indicator))
            except Exception as exc:
                log.debug(f'Relationship {code}/{abstract_type} -> indicator not created: {exc}')

100%|█████████████████████████████████████████████████████████████████████████████| 5353/5353 [00:26<00:00, 201.50it/s]


## 2. Research Report

In [13]:
def _process_reasearch_report(df: pd.DataFrame):
    df['日期'] = pd.to_datetime(df['日期'])
    df = df[df['日期'] >= datetime.now() - timedelta(365)].copy()
    df_rsh_repo = df[['股票代码']].drop_duplicates().copy()
    df_rsh_repo['近一年研报数量'] = len(df)
    df_rsh_repo['买入/增持评级数量'] = len(df[df['东财评级'].isin(['买入', '增持'])])
    if df['2023-盈利预测-收益'].isna().sum() == len(df):
        df_rsh_repo['2023盈利预测-收益'] = 'null'
    else:
        df_rsh_repo['2023盈利预测-收益'] = df[~df['2023-盈利预测-收益'].isna()]['2023-盈利预测-收益'].mean()
    if df['2023-盈利预测-市盈率'].isna().sum() == len(df):
        df_rsh_repo['2023盈利预测-市盈率'] = 'null'
    else:
        df_rsh_repo['2023盈利预测-市盈率'] = df[~df['2023-盈利预测-市盈率'].isna()]['2023-盈利预测-市盈率'].mean()
    if df['2024-盈利预测-收益'].isna().sum() == len(df):
        df_rsh_repo['2024盈利预测-收益'] = 'null'
    else:
        df_rsh_repo['2024盈利预测-收益'] = df[~df['2024-盈利预测-收益'].isna()]['2024-盈利预测-收益'].mean()
    if df['2024-盈利预测-市盈率'].isna().sum() == len(df):
        df_rsh_repo['2024盈利预测-市盈率'] = 'null'
    else:
        df_rsh_repo['2024盈利预测-市盈率'] = df[~df['2024-盈利预测-市盈率'].isna()]['2024-盈利预测-市盈率'].mean()
    return df_rsh_repo

In [7]:
# Fetch the research reports of one sample stock (000001) to try the
# summarising helper on.
df = ak.stock_research_report_em(symbol='000001')

                                                                                                                       

In [14]:
# Display the one-row summary computed from the sample frame `df`.
_process_reasearch_report(df)

Unnamed: 0,股票代码,近一年研报数量,买入/增持评级数量,2023盈利预测-收益,2023盈利预测-市盈率,2024盈利预测-收益,2024盈利预测-市盈率
0,1,31,30,2.594118,4.413125,2.807391,3.991818


In [None]:
# create research report constraint
cypher = 'CREATE CONSTRAINT IF NOT EXISTS FOR (n:个股研报) REQUIRE (n.股票代码) IS UNIQUE'
graph.run(cypher=cypher)


# get research report info
df_rsh_repo_ls = []
for code in tqdm(stock_code_list):
    # Best-effort per stock: a stock whose reports cannot be downloaded or
    # summarised is skipped. Only `Exception` is caught so that interrupting
    # the kernel still stops this long loop.
    try:
        df_rsh_repo = ak.stock_research_report_em(symbol=code).copy()
        df_rsh_repo_ls.append(_process_reasearch_report(df_rsh_repo))
    except Exception as exc:
        log.debug(f'{code} has no research report info: {exc}')
if len(df_rsh_repo_ls) == 0:
    raise ConnectionError('Can not get stock research report from akshare')
df_rsh_repo = pd.concat(df_rsh_repo_ls, ignore_index=True)


for record in tqdm(df_rsh_repo.to_dict('records')):
    # Look the stock up first so that no research report node is left
    # unattached when the stock is not in the graph.
    stock = matcher.match("股票", 代码=record['股票代码']).first()
    if stock is None:
        log.warning(f'Stock node {record["股票代码"]} not found, skip research report')
        continue

    # create research report node (best-effort; it may already exist)
    rsh_report_node = Node('个股研报', name='个股研报', **record)
    try:
        graph.create(rsh_report_node)
    except Exception as exc:
        log.debug(f'Research report node {record["股票代码"]} not created: {exc}')

    # create stock - research report relationship
    rsh_report = matcher.match("个股研报", 股票代码=record['股票代码']).first()
    if rsh_report is None:
        log.warning(f'Research report node {record["股票代码"]} is missing, skip relationship')
        continue
    try:
        graph.create(Relationship(stock, '基本面', rsh_report))
    except Exception as exc:
        log.debug(f'Relationship {record["股票代码"]} -> research report not created: {exc}')

  0%|                                                                                         | 0/5356 [00:00<?, ?it/s]
  0%|                                                                                            | 0/3 [00:00<?, ?it/s][A
 67%|████████████████████████████████████████████████████████                            | 2/3 [00:00<00:00, 13.80it/s][A
  0%|                                                                                 | 1/5356 [00:00<27:36,  3.23it/s][A
  0%|                                                                                            | 0/4 [00:00<?, ?it/s][A
 50%|██████████████████████████████████████████                                          | 2/4 [00:00<00:00, 13.08it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 14.55it/s][A
  0%|                                                                                 | 2/5356 [00:00<30:51,  2.89it/s][A
0it [00:00, ?it/s]

100%|██████████████████████████████████████████████████████████████████████████████| 3135/3135 [00:55<00:00, 56.10it/s]


## 3. Main Business Composition

In [4]:
def assign_exchange_prefix(code: str):
    """Prepend the upper-case exchange tag to a bare stock code.

    Codes starting with '0' or '3' receive 'SZ'; every other code receives 'SH'.

    NOTE: a later cell of this notebook redefines this function with
    lower-case tags; whichever cell was executed last is the one in effect.
    """
    exchange = 'SZ' if code.startswith(('0', '3')) else 'SH'
    return exchange + code

In [5]:
# create main business composition constraint
cypher = 'CREATE CONSTRAINT IF NOT EXISTS FOR (n:主营构成) REQUIRE (n.股票代码, n.报告期) IS UNIQUE'
graph.run(cypher=cypher)

# create main business constraint
cypher = 'CREATE CONSTRAINT IF NOT EXISTS FOR (n:主营业务) REQUIRE (n.股票代码, n.报告期, n.业务名称) IS UNIQUE'
graph.run(cypher=cypher)


for code in tqdm(stock_code_list):
    # The stock node is needed for every relationship below; look it up once
    # and skip the download entirely when it is absent.
    stock = matcher.match("股票", 代码=code).first()
    if stock is None:
        log.warning(f'Stock node {code} not found, skip main business info')
        continue

    # get main business info
    try:
        symbol = assign_exchange_prefix(code)
        df_main_business = ak.stock_zygc_em(symbol=symbol).copy()
        df_main_business = df_main_business.fillna('nan').copy()
    except Exception:
        log.warning(f'Can not get {code} main business info')
        continue

    # Keep the by-product breakdown for report dates after 2020-12-31 and
    # store the date as a 'YYYY-MM-DD' string.
    df_main_business = df_main_business[df_main_business['分类类型'] == '按产品分类'].copy()
    df_main_business['报告日期'] = pd.to_datetime(df_main_business['报告日期'])
    df_main_business = df_main_business[df_main_business['报告日期'] > datetime(2020,12,31)].copy()
    df_main_business['报告日期'] = df_main_business['报告日期'].dt.strftime('%Y-%m-%d')
    for report_time, df_group in df_main_business.groupby('报告日期'):
        # create main business composition node (best-effort; it may already exist)
        main_bus_compo_node = Node('主营构成', 股票代码=code, 报告期=report_time, name=report_time)
        try:
            graph.create(main_bus_compo_node)
        except Exception as exc:
            log.debug(f'Main business composition node {code}/{report_time} not created: {exc}')

        # Re-read the node from the graph so that a pre-existing one is used too.
        main_bus_compo = matcher.match("主营构成", 股票代码=code, 报告期=report_time).first()
        if main_bus_compo is None:
            log.warning(f'Main business composition node {code}/{report_time} is missing, skip it')
            continue

        # create stock - main business composition relationship
        try:
            graph.create(Relationship(stock, '基本面', main_bus_compo))
        except Exception as exc:
            log.debug(f'Relationship {code} -> {report_time} composition not created: {exc}')

        # create main business node
        # NOTE(review): '股票代码' is taken from the akshare frame here while the
        # lookup below filters on `code`; confirm both use the same format.
        df_group = df_group[['股票代码', '报告日期', '主营构成', '主营收入', '收入比例', '主营成本',
                             '成本比例', '主营利润', '利润比例', '毛利率']].copy()
        df_group = df_group.rename(columns={'主营构成': '业务名称', '报告日期': '报告期'})
        for record in df_group.to_dict('records'):
            record.update({'name': record['业务名称']})
            main_business_node = Node('主营业务', **record)
            try:
                graph.create(main_business_node)
            except Exception as exc:
                log.debug(f'Main business node {code}/{report_time}/{record["业务名称"]} not created: {exc}')

        # create main business composition - main business relationship
        for main_bus in matcher.match("主营业务", 股票代码=code, 报告期=report_time):
            try:
                graph.create(Relationship(main_bus_compo, '按产品分类', main_bus))
            except Exception as exc:
                log.debug(f'Relationship {code}/{report_time} composition -> business not created: {exc}')

100%|████████████████████████████████████████████████████████████████████████████| 5353/5353 [1:24:07<00:00,  1.06it/s]


## 4. Main Shareholders

In [4]:
def assign_exchange_prefix(code: str):
    """Prepend the lower-case exchange tag to a bare stock code.

    Codes starting with '0' or '3' receive 'sz'; every other code receives 'sh'.

    NOTE: this redefines the function of the same name from an earlier cell
    (which uses upper-case tags); whichever cell was executed last is the one
    in effect.
    """
    exchange = 'sz' if code.startswith(('0', '3')) else 'sh'
    return exchange + code

In [None]:
# create main shareholders constraint
cypher = 'CREATE CONSTRAINT IF NOT EXISTS FOR (n:主要股东) REQUIRE (n.股票代码, n.报告期) IS UNIQUE'
graph.run(cypher=cypher)

# create shares holding constraint
cypher = 'CREATE CONSTRAINT IF NOT EXISTS FOR (n:持股信息) REQUIRE (n.股票代码, n.报告期, n.股东名称) IS UNIQUE'
graph.run(cypher=cypher)

# Quarter-end report dates to load, in the 'YYYYMMDD' form passed to akshare.
report_time_ls = ['20210331', '20210630', '20210930', '20211231',
                  '20220331', '20220630', '20220930', '20221231',
                  '20230331', '20230630', '20230930', '20231231']

for code in tqdm(stock_code_list):
    # Both values are the same for every report date of a stock, so they are
    # computed once per stock instead of once per (stock, date).
    stock = matcher.match("股票", 代码=code).first()
    if stock is None:
        log.warning(f'Stock node {code} not found, skip main shareholders info')
        continue
    symbol = assign_exchange_prefix(code)

    # get main shareholders info
    for report_time in report_time_ls:
        try:
            df_main_shareholders = ak.stock_gdfx_top_10_em(symbol=symbol, date=report_time).copy()
        except Exception:
            log.warning(f'Can not get {code} {report_time} main shareholders info')
            continue

        # Only the first three rows of the returned table are stored.
        df_main_shareholders = df_main_shareholders[:3].fillna('nan').copy()
        # create main shareholders node (best-effort; it may already exist)
        main_shareholders_node = Node('主要股东', 股票代码=code, 报告期=report_time, name=report_time)
        try:
            graph.create(main_shareholders_node)
        except Exception as exc:
            log.debug(f'Main shareholders node {code}/{report_time} not created: {exc}')

        # Re-read the node from the graph so that a pre-existing one is used too.
        main_shareholders = matcher.match("主要股东", 股票代码=code, 报告期=report_time).first()
        if main_shareholders is None:
            log.warning(f'Main shareholders node {code}/{report_time} is missing, skip it')
            continue

        # create stock - main shareholders relationship
        try:
            graph.create(Relationship(stock, '基本面', main_shareholders))
        except Exception as exc:
            log.debug(f'Relationship {code} -> {report_time} shareholders not created: {exc}')

        # create shares holding node
        for record in df_main_shareholders.to_dict('records'):
            # Stamp every record with the stock code and report period: the
            # uniqueness constraint and the lookup below both key on these
            # properties, and nothing else in this cell adds them (the akshare
            # frame is not assumed to carry them).
            record.update({'股票代码': code, '报告期': report_time, 'name': record['股东名称']})
            shares_holding = Node('持股信息', **record)
            try:
                graph.create(shares_holding)
            except Exception as exc:
                log.debug(f'Shares holding node {code}/{report_time}/{record["股东名称"]} not created: {exc}')

        # create main shareholders - shares holding relationship
        for shares_holding in matcher.match("持股信息", 股票代码=code, 报告期=report_time):
            try:
                graph.create(Relationship(main_shareholders, '按持股比例', shares_holding))
            except Exception as exc:
                log.debug(f'Relationship {code}/{report_time} shareholders -> holding not created: {exc}')

 38%|███████████████████████████▊                                              | 2011/5356 [2:31:49<8:16:48,  8.91s/it]